1 /* 2 * curve25519-donna-c64.c from github.com/agl/curve25519-donna 3 * revision 80ad9b9930c9baef5829dd2a235b6b7646d32a8e 4 */ 5 6 /* Copyright 2008, Google Inc. 7 * All rights reserved. 8 * 9 * Code released into the public domain. 10 * 11 * curve25519-donna: Curve25519 elliptic curve, public key function 12 * 13 * http://code.google.com/p/curve25519-donna/ 14 * 15 * Adam Langley <agl@imperialviolet.org> 16 * 17 * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> 18 * 19 * More information about curve25519 can be found here 20 * http://cr.yp.to/ecdh.html 21 * 22 * djb's sample implementation of curve25519 is written in a special assembly 23 * language called qhasm and uses the floating point registers. 24 * 25 * This is, almost, a clean room reimplementation from the curve25519 paper. It 26 * uses many of the tricks described therein. Only the crecip function is taken 27 * from the sample implementation. 28 */ 29 module botan.pubkey.algo.curve25519_donna; 30 31 import botan.constants; 32 static if (BOTAN_HAS_CURVE25519): 33 import botan.utils.donna128; 34 import botan.utils.mul128; 35 import botan.utils.loadstor; 36 import botan.utils.mem_ops; 37 38 package 39 int curve25519Donna(ubyte* mypublic, const ubyte* secret, const ubyte* basepoint) { 40 limb[5] bp; 41 limb[5] x; 42 limb[5] z; 43 limb[5] zmone; 44 ubyte[32] e; 45 int i; 46 47 for (i = 0;i < 32;++i) e[i] = secret[i]; 48 e[0] &= 248; 49 e[31] &= 127; 50 e[31] |= 64; 51 52 fexpand(bp.ptr, basepoint); 53 cmult(x.ptr, z.ptr, e.ptr, bp.ptr); 54 crecip(zmone.ptr, z.ptr); 55 fmul(z.ptr, x.ptr, zmone.ptr); 56 fcontract(mypublic, z.ptr); 57 return 0; 58 } 59 60 private: 61 62 alias uint128_t = donna128; 63 alias limb = ulong; 64 65 /* Sum two numbers: output += in */ 66 void fsum(limb* output, const limb* input) 67 { 68 output[0] += input[0]; 69 output[1] += input[1]; 70 output[2] += input[2]; 71 output[3] += input[3]; 72 output[4] += input[4]; 73 } 74 75 /* Find the difference of two numbers: output = in - output 76 * (note the order of the arguments!) 77 * 78 * Assumes that out[i] < 2**52 79 * On return, out[i] < 2**55 80 */ 81 void fdifferenceBackwards(limb* output, const limb* input) 82 { 83 /* 152 is 19 << 3 */ 84 immutable limb two54m152 = ((cast(limb)1) << 54) - 152; 85 immutable limb two54m8 = ((cast(limb)1) << 54) - 8; 86 87 output[0] = input[0] + two54m152 - output[0]; 88 output[1] = input[1] + two54m8 - output[1]; 89 output[2] = input[2] + two54m8 - output[2]; 90 output[3] = input[3] + two54m8 - output[3]; 91 output[4] = input[4] + two54m8 - output[4]; 92 } 93 94 /* Multiply a number by a scalar: output = in * scalar */ 95 void fscalarProduct(limb* output, const limb* input, const limb scalar) { 96 uint128_t a = uint128_t(input[0]) * scalar; 97 output[0] = a & 0x7ffffffffffff; 98 99 a = uint128_t(input[1]) * scalar + carry_shift(a, 51); 100 output[1] = a & 0x7ffffffffffff; 101 102 a = uint128_t(input[2]) * scalar + carry_shift(a, 51); 103 output[2] = a & 0x7ffffffffffff; 104 105 a = uint128_t(input[3]) * scalar + carry_shift(a, 51); 106 output[3] = a & 0x7ffffffffffff; 107 108 a = uint128_t(input[4]) * scalar + carry_shift(a, 51); 109 output[4] = a & 0x7ffffffffffff; 110 111 output[0] += carry_shift(a, 51) * 19; 112 } 113 114 /* Multiply two numbers: output = in2 * in 115 * 116 * output must be distinct to both inputs. The inputs are reduced coefficient 117 * form, the output is not. 118 * 119 * Assumes that in[i] < 2**55 and likewise for in2. 120 * On return, output[i] < 2**52 121 */ 122 void fmul(limb* output, const limb* input2, const limb* input) 123 { 124 uint128_t[5] t = void; 125 limb r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c; 126 127 r0 = input[0]; 128 r1 = input[1]; 129 r2 = input[2]; 130 r3 = input[3]; 131 r4 = input[4]; 132 133 s0 = input2[0]; 134 s1 = input2[1]; 135 s2 = input2[2]; 136 s3 = input2[3]; 137 s4 = input2[4]; 138 139 t[0] = uint128_t(r0) * s0; 140 t[1] = uint128_t(r0) * s1 + uint128_t(r1) * s0; 141 t[2] = uint128_t(r0) * s2 + uint128_t(r2) * s0 + uint128_t(r1) * s1; 142 t[3] = uint128_t(r0) * s3 + uint128_t(r3) * s0 + uint128_t(r1) * s2 + uint128_t(r2) * s1; 143 t[4] = uint128_t(r0) * s4 + uint128_t(r4) * s0 + uint128_t(r3) * s1 + uint128_t(r1) * s3 + uint128_t(r2) * s2; 144 145 r4 *= 19; 146 r1 *= 19; 147 r2 *= 19; 148 r3 *= 19; 149 150 t[0] += uint128_t(r4) * s1 + uint128_t(r1) * s4 + uint128_t(r2) * s3 + uint128_t(r3) * s2; 151 t[1] += uint128_t(r4) * s2 + uint128_t(r2) * s4 + uint128_t(r3) * s3; 152 t[2] += uint128_t(r4) * s3 + uint128_t(r3) * s4; 153 t[3] += uint128_t(r4) * s4; 154 155 r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51); 156 t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51); 157 t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51); 158 t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51); 159 t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51); 160 r0 += c * 19; c = carry_shift(uint128_t(r0), 51); r0 = r0 & 0x7ffffffffffff; 161 r1 += c; c = carry_shift(uint128_t(r1), 51); r1 = r1 & 0x7ffffffffffff; 162 r2 += c; 163 164 output[0] = r0; 165 output[1] = r1; 166 output[2] = r2; 167 output[3] = r3; 168 output[4] = r4; 169 } 170 171 void fsquareTimes(limb* output, const limb* input, limb count) 172 { 173 uint128_t[5] t; 174 limb r0,r1,r2,r3,r4,c; 175 limb d0,d1,d2,d4,d419; 176 177 r0 = input[0]; 178 r1 = input[1]; 179 r2 = input[2]; 180 r3 = input[3]; 181 r4 = input[4]; 182 183 do { 184 d0 = r0 * 2; 185 d1 = r1 * 2; 186 d2 = r2 * 2 * 19; 187 d419 = r4 * 19; 188 d4 = d419 * 2; 189 190 t[0] = uint128_t(r0) * r0 + uint128_t(d4) * r1 + uint128_t(d2) * (r3 ); 191 t[1] = uint128_t(d0) * r1 + uint128_t(d4) * r2 + uint128_t(r3) * (r3 * 19); 192 t[2] = uint128_t(d0) * r2 + uint128_t(r1) * r1 + uint128_t(d4) * (r3 ); 193 t[3] = uint128_t(d0) * r3 + uint128_t(d1) * r2 + uint128_t(r4) * (d419 ); 194 t[4] = uint128_t(d0) * r4 + uint128_t(d1) * r3 + uint128_t(r2) * (r2 ); 195 196 r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51); 197 t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51); 198 t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51); 199 t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51); 200 t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51); 201 r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff; 202 r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff; 203 r2 += c; 204 } while(--count); 205 206 output[0] = r0; 207 output[1] = r1; 208 output[2] = r2; 209 output[3] = r3; 210 output[4] = r4; 211 } 212 213 /* Load a little-endian 64-bit number */ 214 limb loadLimb(const ubyte* input) 215 { 216 return loadLittleEndian!ulong(input, 0); 217 } 218 219 void storeLimb(ubyte* output, limb input) 220 { 221 storeLittleEndian!limb(input, output); 222 } 223 224 /* Take a little-endian, 32-byte number and expand it into polynomial form */ 225 void fexpand(limb* output, const ubyte* input) { 226 output[0] = loadLimb(input) & 0x7ffffffffffff; 227 output[1] = (loadLimb(input+6) >> 3) & 0x7ffffffffffff; 228 output[2] = (loadLimb(input+12) >> 6) & 0x7ffffffffffff; 229 output[3] = (loadLimb(input+19) >> 1) & 0x7ffffffffffff; 230 output[4] = (loadLimb(input+24) >> 12) & 0x7ffffffffffff; 231 } 232 233 /* Take a fully reduced polynomial form number and contract it into a 234 * little-endian, 32-byte array 235 */ 236 void fcontract(ubyte* output, const limb* input) 237 { 238 uint128_t[5] t; 239 240 t[0] = input[0]; 241 t[1] = input[1]; 242 t[2] = input[2]; 243 t[3] = input[3]; 244 t[4] = input[4]; 245 246 t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; 247 t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; 248 t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; 249 t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; 250 t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff; 251 252 t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; 253 t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; 254 t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; 255 t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; 256 t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff; 257 258 /* now t is between 0 and 2^255-1, properly carried. */ 259 /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ 260 261 t[0] += 19; 262 263 t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; 264 t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; 265 t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; 266 t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; 267 t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff; 268 269 /* now between 19 and 2^255-1 in both cases, and offset by 19. */ 270 271 t[0] += 0x8000000000000 - 19; 272 t[1] += 0x8000000000000 - 1; 273 t[2] += 0x8000000000000 - 1; 274 t[3] += 0x8000000000000 - 1; 275 t[4] += 0x8000000000000 - 1; 276 277 /* now between 2^255 and 2^256-20, and offset by 2^255. */ 278 279 t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; 280 t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; 281 t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; 282 t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; 283 t[4] &= 0x7ffffffffffff; 284 285 storeLimb(output, combine_lower(t[0], 0, t[1], 51)); 286 storeLimb(output+8, combine_lower(t[1], 13, t[2], 38)); 287 storeLimb(output+16, combine_lower(t[2], 26, t[3], 25)); 288 storeLimb(output+24, combine_lower(t[3], 39, t[4], 12)); 289 } 290 291 /* Input: Q, Q', Q-Q' 292 * Output: 2Q, Q+Q' 293 * 294 * x2 z3: long form 295 * x3 z3: long form 296 * x z: short form, destroyed 297 * xprime zprime: short form, destroyed 298 * qmqp: short form, preserved 299 */ 300 void fmonty(limb* x2, limb* z2, /* output 2Q */ 301 limb* x3, limb* z3, /* output Q + Q' */ 302 limb* x, limb* z, /* input Q */ 303 limb* xprime, limb* zprime, /* input Q' */ 304 const limb* qmqp /* input Q - Q' */) 305 { 306 limb[5] origx; 307 limb[5] origxprime; 308 limb[5] zzz; 309 limb[5] xx; 310 limb[5] zz; 311 limb[5] xxprime; 312 limb[5] zzprime; 313 limb[5] zzzprime; 314 315 copyMem(origx.ptr, x, 5); 316 fsum(x, z); 317 fdifferenceBackwards(z, origx.ptr); // does x - z 318 319 copyMem(origxprime.ptr, xprime, 5); 320 fsum(xprime, zprime); 321 fdifferenceBackwards(zprime, origxprime.ptr); 322 fmul(xxprime.ptr, xprime, z); 323 fmul(zzprime.ptr, x, zprime); 324 copyMem(origxprime.ptr, xxprime.ptr, 5); 325 fsum(xxprime.ptr, zzprime.ptr); 326 fdifferenceBackwards(zzprime.ptr, origxprime.ptr); 327 fsquareTimes(x3, xxprime.ptr, 1); 328 fsquareTimes(zzzprime.ptr, zzprime.ptr, 1); 329 fmul(z3, zzzprime.ptr, qmqp); 330 331 fsquareTimes(xx.ptr, x, 1); 332 fsquareTimes(zz.ptr, z, 1); 333 fmul(x2, xx.ptr, zz.ptr); 334 fdifferenceBackwards(zz.ptr, xx.ptr); // does zz = xx - zz 335 fscalarProduct(zzz.ptr, zz.ptr, 121665); 336 fsum(zzz.ptr, xx.ptr); 337 fmul(z2, zz.ptr, zzz.ptr); 338 } 339 340 // ----------------------------------------------------------------------------- 341 // Maybe swap the contents of two limb arrays (@a and @b), each @len elements 342 // long. Perform the swap iff @swap is non-zero. 343 // 344 // This function performs the swap without leaking any side-channel 345 // information. 346 // ----------------------------------------------------------------------------- 347 void swapConditional(limb* a, limb* b, limb iswap) { 348 const limb swap = cast(limb)(-iswap); 349 350 for (size_t i = 0; i < 5; ++i) { 351 const limb x = swap & (a[i] ^ b[i]); 352 a[i] ^= x; 353 b[i] ^= x; 354 } 355 } 356 357 /* Calculates nQ where Q is the x-coordinate of a point on the curve 358 * 359 * resultx/resultz: the x coordinate of the resulting curve point (short form) 360 * n: a little endian, 32-byte number 361 * q: a point of the curve (short form) 362 */ 363 void cmult(limb* resultx, limb* resultz, const ubyte* n, const limb* q) { 364 limb[5] a; 365 limb[5] b; b[0] = 1; 366 limb[5] c; c[0] = 1; 367 limb[5] d; 368 limb* nqpqx = a.ptr; 369 limb* nqpqz = b.ptr; 370 limb* nqx = c.ptr; 371 limb* nqz = d.ptr; 372 limb* t; 373 limb[5] e; 374 limb[5] f; f[0] = 1; 375 limb[5] g; 376 limb[5] h; h[0] = 1; 377 limb* nqpqx2 = e.ptr; 378 limb* nqpqz2 = f.ptr; 379 limb* nqx2 = g.ptr; 380 limb* nqz2 = h.ptr; 381 382 size_t i, j; 383 384 copyMem(nqpqx, q, 5); 385 386 for (i = 0; i < 32; ++i) { 387 ubyte by = n[31 - i]; 388 for (j = 0; j < 8; ++j) { 389 const limb bit = cast(limb)(by >> 7); 390 391 swapConditional(nqx, nqpqx, bit); 392 swapConditional(nqz, nqpqz, bit); 393 fmonty(nqx2, nqz2, 394 nqpqx2, nqpqz2, 395 nqx, nqz, 396 nqpqx, nqpqz, 397 q); 398 swapConditional(nqx2, nqpqx2, bit); 399 swapConditional(nqz2, nqpqz2, bit); 400 401 t = nqx; 402 nqx = nqx2; 403 nqx2 = t; 404 t = nqz; 405 nqz = nqz2; 406 nqz2 = t; 407 t = nqpqx; 408 nqpqx = nqpqx2; 409 nqpqx2 = t; 410 t = nqpqz; 411 nqpqz = nqpqz2; 412 nqpqz2 = t; 413 414 by <<= 1; 415 } 416 } 417 418 copyMem(resultx, nqx, 5); 419 copyMem(resultz, nqz, 5); 420 } 421 422 423 // ----------------------------------------------------------------------------- 424 // Shamelessly copied from djb's code, tightened a little 425 // ----------------------------------------------------------------------------- 426 void crecip(limb* output, const limb* z) { 427 limb[5] a_ = void; 428 limb[5] t0_ = void; 429 limb[5] b_ = void; 430 limb[5] c_ = void; 431 limb* a = a_.ptr; 432 limb* t0 = t0_.ptr; 433 limb* b = b_.ptr; 434 limb* c = c_.ptr; 435 436 /* 2 */ fsquareTimes(a, z, 1); // a = 2 437 /* 8 */ fsquareTimes(t0, a, 2); 438 /* 9 */ fmul(b, t0, z); // b = 9 439 /* 11 */ fmul(a, b, a); // a = 11 440 /* 22 */ fsquareTimes(t0, a, 1); 441 /* 2^5 - 2^0 = 31 */ fmul(b, t0, b); 442 /* 2^10 - 2^5 */ fsquareTimes(t0, b, 5); 443 /* 2^10 - 2^0 */ fmul(b, t0, b); 444 /* 2^20 - 2^10 */ fsquareTimes(t0, b, 10); 445 /* 2^20 - 2^0 */ fmul(c, t0, b); 446 /* 2^40 - 2^20 */ fsquareTimes(t0, c, 20); 447 /* 2^40 - 2^0 */ fmul(t0, t0, c); 448 /* 2^50 - 2^10 */ fsquareTimes(t0, t0, 10); 449 /* 2^50 - 2^0 */ fmul(b, t0, b); 450 /* 2^100 - 2^50 */ fsquareTimes(t0, b, 50); 451 /* 2^100 - 2^0 */ fmul(c, t0, b); 452 /* 2^200 - 2^100 */ fsquareTimes(t0, c, 100); 453 /* 2^200 - 2^0 */ fmul(t0, t0, c); 454 /* 2^250 - 2^50 */ fsquareTimes(t0, t0, 50); 455 /* 2^250 - 2^0 */ fmul(t0, t0, b); 456 /* 2^255 - 2^5 */ fsquareTimes(t0, t0, 5); 457 /* 2^255 - 21 */ fmul(output, t0, a); 458 }