1 /*
2 * curve25519-donna-c64.c from github.com/agl/curve25519-donna
3 * revision 80ad9b9930c9baef5829dd2a235b6b7646d32a8e
4 */
5 
6 /* Copyright 2008, Google Inc.
7  * All rights reserved.
8  *
9  * Code released into the public domain.
10  *
11  * curve25519-donna: Curve25519 elliptic curve, public key function
12  *
13  * http://code.google.com/p/curve25519-donna/
14  *
15  * Adam Langley <agl@imperialviolet.org>
16  *
17  * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
18  *
19  * More information about curve25519 can be found here
20  *     http://cr.yp.to/ecdh.html
21  *
22  * djb's sample implementation of curve25519 is written in a special assembly
23  * language called qhasm and uses the floating point registers.
24  *
25  * This is, almost, a clean room reimplementation from the curve25519 paper. It
26  * uses many of the tricks described therein. Only the crecip function is taken
27  * from the sample implementation.
28  */
29 module botan.pubkey.algo.curve25519_donna;
30 
31 import botan.constants;
32 static if (BOTAN_HAS_CURVE25519):
33 import botan.utils.donna128;
34 import botan.utils.mul128;
35 import botan.utils.loadstor;
36 import botan.utils.mem_ops;
37 
38 package
39 int curve25519Donna(ubyte* mypublic, const ubyte* secret, const ubyte* basepoint) {
40 	limb[5] bp;
41 	limb[5] x;
42 	limb[5] z;
43 	limb[5] zmone;
44 	ubyte[32] e;
45 	int i;
46 	
47 	for (i = 0;i < 32;++i) e[i] = secret[i];
48 	e[0] &= 248;
49 	e[31] &= 127;
50 	e[31] |= 64;
51 	
52 	fexpand(bp.ptr, basepoint);
53 	cmult(x.ptr, z.ptr, e.ptr, bp.ptr);
54 	crecip(zmone.ptr, z.ptr);
55 	fmul(z.ptr, x.ptr, zmone.ptr);
56 	fcontract(mypublic, z.ptr);
57 	return 0;
58 }
59 
60 private:
61 
62 alias uint128_t = donna128;
63 alias limb = ulong;
64 
65 /* Sum two numbers: output += in */
66 void fsum(limb* output, const limb* input)
67 {
68 	output[0] += input[0];
69 	output[1] += input[1];
70 	output[2] += input[2];
71 	output[3] += input[3];
72 	output[4] += input[4];
73 }
74 
75 /* Find the difference of two numbers: output = in - output
76  * (note the order of the arguments!)
77  *
78  * Assumes that out[i] < 2**52
79  * On return, out[i] < 2**55
80  */
81 void fdifferenceBackwards(limb* output, const limb* input) 
82 {
83 	/* 152 is 19 << 3 */
84 	immutable limb two54m152 = ((cast(limb)1) << 54) - 152;
85 	immutable limb two54m8 = ((cast(limb)1) << 54) - 8;
86 	
87 	output[0] = input[0] + two54m152 - output[0];
88 	output[1] = input[1] + two54m8 - output[1];
89 	output[2] = input[2] + two54m8 - output[2];
90 	output[3] = input[3] + two54m8 - output[3];
91 	output[4] = input[4] + two54m8 - output[4];
92 }
93 
94 /* Multiply a number by a scalar: output = in * scalar */
95 void fscalarProduct(limb* output, const limb* input, const limb scalar) {
96 	uint128_t a = uint128_t(input[0]) * scalar;
97 	output[0] = a & 0x7ffffffffffff;
98 	
99 	a = uint128_t(input[1]) * scalar + carry_shift(a, 51);
100 	output[1] = a & 0x7ffffffffffff;
101 	
102 	a = uint128_t(input[2]) * scalar + carry_shift(a, 51);
103 	output[2] = a & 0x7ffffffffffff;
104 	
105 	a = uint128_t(input[3]) * scalar + carry_shift(a, 51);
106 	output[3] = a & 0x7ffffffffffff;
107 	
108 	a = uint128_t(input[4]) * scalar + carry_shift(a, 51);
109 	output[4] = a & 0x7ffffffffffff;
110 	
111 	output[0] += carry_shift(a, 51) * 19;
112 }
113 
114 /* Multiply two numbers: output = in2 * in
115  *
116  * output must be distinct to both inputs. The inputs are reduced coefficient
117  * form, the output is not.
118  *
119  * Assumes that in[i] < 2**55 and likewise for in2.
120  * On return, output[i] < 2**52
121  */
122 void fmul(limb* output, const limb* input2, const limb* input) 
123 {
124 	uint128_t[5] t = void;
125 	limb r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c;
126 	
127 	r0 = input[0];
128 	r1 = input[1];
129 	r2 = input[2];
130 	r3 = input[3];
131 	r4 = input[4];
132 	
133 	s0 = input2[0];
134 	s1 = input2[1];
135 	s2 = input2[2];
136 	s3 = input2[3];
137 	s4 = input2[4];
138 	
139 	t[0] = uint128_t(r0) * s0;
140 	t[1] = uint128_t(r0) * s1 + uint128_t(r1) * s0;
141 	t[2] = uint128_t(r0) * s2 + uint128_t(r2) * s0 + uint128_t(r1) * s1;
142 	t[3] = uint128_t(r0) * s3 + uint128_t(r3) * s0 + uint128_t(r1) * s2 + uint128_t(r2) * s1;
143 	t[4] = uint128_t(r0) * s4 + uint128_t(r4) * s0 + uint128_t(r3) * s1 + uint128_t(r1) * s3 + uint128_t(r2) * s2;
144 	
145 	r4 *= 19;
146 	r1 *= 19;
147 	r2 *= 19;
148 	r3 *= 19;
149 	
150 	t[0] += uint128_t(r4) * s1 + uint128_t(r1) * s4 + uint128_t(r2) * s3 + uint128_t(r3) * s2;
151 	t[1] += uint128_t(r4) * s2 + uint128_t(r2) * s4 + uint128_t(r3) * s3;
152 	t[2] += uint128_t(r4) * s3 + uint128_t(r3) * s4;
153 	t[3] += uint128_t(r4) * s4;
154 	
155 	r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51);
156 	t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51);
157 	t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51);
158 	t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51);
159 	t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51);
160 	r0 +=     c * 19; c = carry_shift(uint128_t(r0), 51); r0 = r0 & 0x7ffffffffffff;
161 	r1 +=     c;      c = carry_shift(uint128_t(r1), 51); r1 = r1 & 0x7ffffffffffff;
162 	r2 +=     c;
163 	
164 	output[0] = r0;
165 	output[1] = r1;
166 	output[2] = r2;
167 	output[3] = r3;
168 	output[4] = r4;
169 }
170 
171 void fsquareTimes(limb* output, const limb* input, limb count) 
172 {
173 	uint128_t[5] t;
174 	limb r0,r1,r2,r3,r4,c;
175 	limb d0,d1,d2,d4,d419;
176 	
177 	r0 = input[0];
178 	r1 = input[1];
179 	r2 = input[2];
180 	r3 = input[3];
181 	r4 = input[4];
182 	
183 	do {
184 		d0 = r0 * 2;
185 		d1 = r1 * 2;
186 		d2 = r2 * 2 * 19;
187 		d419 = r4 * 19;
188 		d4 = d419 * 2;
189 		
190 		t[0] = uint128_t(r0) * r0 + uint128_t(d4) * r1 + uint128_t(d2) * (r3     );
191 		t[1] = uint128_t(d0) * r1 + uint128_t(d4) * r2 + uint128_t(r3) * (r3 * 19);
192 		t[2] = uint128_t(d0) * r2 + uint128_t(r1) * r1 + uint128_t(d4) * (r3     );
193 		t[3] = uint128_t(d0) * r3 + uint128_t(d1) * r2 + uint128_t(r4) * (d419   );
194 		t[4] = uint128_t(d0) * r4 + uint128_t(d1) * r3 + uint128_t(r2) * (r2     );
195 		
196 		r0 = t[0] & 0x7ffffffffffff; c = carry_shift(t[0], 51);
197 		t[1] += c; r1 = t[1] & 0x7ffffffffffff; c = carry_shift(t[1], 51);
198 		t[2] += c; r2 = t[2] & 0x7ffffffffffff; c = carry_shift(t[2], 51);
199 		t[3] += c; r3 = t[3] & 0x7ffffffffffff; c = carry_shift(t[3], 51);
200 		t[4] += c; r4 = t[4] & 0x7ffffffffffff; c = carry_shift(t[4], 51);
201 		r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff;
202 		r1 += c;      c = r1 >> 51; r1 = r1 & 0x7ffffffffffff;
203 		r2 += c;
204 	} while(--count);
205 	
206 	output[0] = r0;
207 	output[1] = r1;
208 	output[2] = r2;
209 	output[3] = r3;
210 	output[4] = r4;
211 }
212 
213 /* Load a little-endian 64-bit number    */
214 limb loadLimb(const ubyte* input)
215 {
216 	return loadLittleEndian!ulong(input, 0);
217 }
218 
219 void storeLimb(ubyte* output, limb input)
220 {
221 	storeLittleEndian!limb(input, output);
222 }
223 
224 /* Take a little-endian, 32-byte number and expand it into polynomial form */
225 void fexpand(limb* output, const ubyte* input) {
226 	output[0] = loadLimb(input) & 0x7ffffffffffff;
227 	output[1] = (loadLimb(input+6) >> 3) & 0x7ffffffffffff;
228 	output[2] = (loadLimb(input+12) >> 6) & 0x7ffffffffffff;
229 	output[3] = (loadLimb(input+19) >> 1) & 0x7ffffffffffff;
230 	output[4] = (loadLimb(input+24) >> 12) & 0x7ffffffffffff;
231 }
232 
233 /* Take a fully reduced polynomial form number and contract it into a
234  * little-endian, 32-byte array
235  */
236 void fcontract(ubyte* output, const limb* input)
237 {
238 	uint128_t[5] t;
239 	
240 	t[0] = input[0];
241 	t[1] = input[1];
242 	t[2] = input[2];
243 	t[3] = input[3];
244 	t[4] = input[4];
245 	
246 	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
247 	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
248 	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
249 	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
250 	t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff;
251 	
252 	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
253 	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
254 	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
255 	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
256 	t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff;
257 	
258 	/* now t is between 0 and 2^255-1, properly carried. */
259 	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
260 	
261 	t[0] += 19;
262 	
263 	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
264 	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
265 	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
266 	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
267 	t[0] += (t[4] >> 51) * 19; t[4] &= 0x7ffffffffffff;
268 	
269 	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
270 	
271 	t[0] += 0x8000000000000 - 19;
272 	t[1] += 0x8000000000000 - 1;
273 	t[2] += 0x8000000000000 - 1;
274 	t[3] += 0x8000000000000 - 1;
275 	t[4] += 0x8000000000000 - 1;
276 	
277 	/* now between 2^255 and 2^256-20, and offset by 2^255. */
278 	
279 	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff;
280 	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff;
281 	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff;
282 	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff;
283 	t[4] &= 0x7ffffffffffff;
284 	
285 	storeLimb(output,    combine_lower(t[0], 0, t[1], 51));
286 	storeLimb(output+8,  combine_lower(t[1], 13, t[2], 38));
287 	storeLimb(output+16, combine_lower(t[2], 26, t[3], 25));
288 	storeLimb(output+24, combine_lower(t[3], 39, t[4], 12));
289 }
290 
291 /* Input: Q, Q', Q-Q'
292  * Output: 2Q, Q+Q'
293  *
294  * x2 z3: long form
295  * x3 z3: long form
296  * x z: short form, destroyed
297  * xprime zprime: short form, destroyed
298  * qmqp: short form, preserved
299  */
300 void fmonty(limb* x2, limb* z2, /* output 2Q */
301 	limb* x3, limb* z3, /* output Q + Q' */
302 	limb* x, limb* z,     /* input Q */
303 	limb* xprime, limb* zprime, /* input Q' */
304 	const limb* qmqp /* input Q - Q' */) 
305 {
306 	limb[5] origx;
307 	limb[5] origxprime;
308 	limb[5] zzz;
309 	limb[5] xx;
310 	limb[5] zz;
311 	limb[5] xxprime;
312 	limb[5] zzprime;
313 	limb[5] zzzprime;
314 	
315 	copyMem(origx.ptr, x, 5);
316 	fsum(x, z);
317 	fdifferenceBackwards(z, origx.ptr);    // does x - z
318 	
319 	copyMem(origxprime.ptr, xprime, 5);
320 	fsum(xprime, zprime);
321 	fdifferenceBackwards(zprime, origxprime.ptr);
322 	fmul(xxprime.ptr, xprime, z);
323 	fmul(zzprime.ptr, x, zprime);
324 	copyMem(origxprime.ptr, xxprime.ptr, 5);
325 	fsum(xxprime.ptr, zzprime.ptr);
326 	fdifferenceBackwards(zzprime.ptr, origxprime.ptr);
327 	fsquareTimes(x3, xxprime.ptr, 1);
328 	fsquareTimes(zzzprime.ptr, zzprime.ptr, 1);
329 	fmul(z3, zzzprime.ptr, qmqp);
330 	
331 	fsquareTimes(xx.ptr, x, 1);
332 	fsquareTimes(zz.ptr, z, 1);
333 	fmul(x2, xx.ptr, zz.ptr);
334 	fdifferenceBackwards(zz.ptr, xx.ptr);    // does zz = xx - zz
335 	fscalarProduct(zzz.ptr, zz.ptr, 121665);
336 	fsum(zzz.ptr, xx.ptr);
337 	fmul(z2, zz.ptr, zzz.ptr);
338 }
339 
340 // -----------------------------------------------------------------------------
341 // Maybe swap the contents of two limb arrays (@a and @b), each @len elements
342 // long. Perform the swap iff @swap is non-zero.
343 //
344 // This function performs the swap without leaking any side-channel
345 // information.
346 // -----------------------------------------------------------------------------
347 void swapConditional(limb* a, limb* b, limb iswap) {
348 	const limb swap = cast(limb)(-iswap);
349 	
350 	for (size_t i = 0; i < 5; ++i) {
351 		const limb x = swap & (a[i] ^ b[i]);
352 		a[i] ^= x;
353 		b[i] ^= x;
354 	}
355 }
356 
357 /* Calculates nQ where Q is the x-coordinate of a point on the curve
358  *
359  *     resultx/resultz: the x coordinate of the resulting curve point (short form)
360  *     n: a little endian, 32-byte number
361  *     q: a point of the curve (short form)
362  */
363 void cmult(limb* resultx, limb* resultz, const ubyte* n, const limb* q) {
364 	limb[5] a;
365 	limb[5] b; b[0] = 1;
366 	limb[5] c; c[0] = 1;
367 	limb[5] d;
368 	limb* nqpqx = a.ptr;
369 	limb* nqpqz = b.ptr;
370 	limb* nqx = c.ptr;
371 	limb* nqz = d.ptr;
372 	limb* t;
373 	limb[5] e;
374 	limb[5] f; f[0] = 1;
375 	limb[5] g;
376 	limb[5] h; h[0] = 1;
377 	limb* nqpqx2 = e.ptr;
378 	limb* nqpqz2 = f.ptr;
379 	limb* nqx2 = g.ptr;
380 	limb* nqz2 = h.ptr;
381 	
382 	size_t i, j;
383 	
384 	copyMem(nqpqx, q, 5);
385 	
386 	for (i = 0; i < 32; ++i) {
387 		ubyte by = n[31 - i];
388 		for (j = 0; j < 8; ++j) {
389 			const limb bit = cast(limb)(by >> 7);
390 			
391 			swapConditional(nqx, nqpqx, bit);
392 			swapConditional(nqz, nqpqz, bit);
393 			fmonty(nqx2, nqz2,
394 				nqpqx2, nqpqz2,
395 				nqx, nqz,
396 				nqpqx, nqpqz,
397 				q);
398 			swapConditional(nqx2, nqpqx2, bit);
399 			swapConditional(nqz2, nqpqz2, bit);
400 			
401 			t = nqx;
402 			nqx = nqx2;
403 			nqx2 = t;
404 			t = nqz;
405 			nqz = nqz2;
406 			nqz2 = t;
407 			t = nqpqx;
408 			nqpqx = nqpqx2;
409 			nqpqx2 = t;
410 			t = nqpqz;
411 			nqpqz = nqpqz2;
412 			nqpqz2 = t;
413 			
414 			by <<= 1;
415 		}
416 	}
417 	
418 	copyMem(resultx, nqx, 5);
419 	copyMem(resultz, nqz, 5);
420 }
421 
422 
423 // -----------------------------------------------------------------------------
424 // Shamelessly copied from djb's code, tightened a little
425 // -----------------------------------------------------------------------------
426 void crecip(limb* output, const limb* z) {
427 	limb[5] a_ = void;
428 	limb[5] t0_ = void;
429 	limb[5] b_ = void;
430 	limb[5] c_ = void;
431 	limb* a = a_.ptr;
432 	limb* t0 = t0_.ptr;
433 	limb* b = b_.ptr;
434 	limb* c = c_.ptr;
435 
436 	/* 2 */ fsquareTimes(a, z, 1); // a = 2
437 	/* 8 */ fsquareTimes(t0, a, 2);
438 	/* 9 */ fmul(b, t0, z); // b = 9
439 	/* 11 */ fmul(a, b, a); // a = 11
440 	/* 22 */ fsquareTimes(t0, a, 1);
441 	/* 2^5 - 2^0 = 31 */ fmul(b, t0, b);
442 	/* 2^10 - 2^5 */ fsquareTimes(t0, b, 5);
443 	/* 2^10 - 2^0 */ fmul(b, t0, b);
444 	/* 2^20 - 2^10 */ fsquareTimes(t0, b, 10);
445 	/* 2^20 - 2^0 */ fmul(c, t0, b);
446 	/* 2^40 - 2^20 */ fsquareTimes(t0, c, 20);
447 	/* 2^40 - 2^0 */ fmul(t0, t0, c);
448 	/* 2^50 - 2^10 */ fsquareTimes(t0, t0, 10);
449 	/* 2^50 - 2^0 */ fmul(b, t0, b);
450 	/* 2^100 - 2^50 */ fsquareTimes(t0, b, 50);
451 	/* 2^100 - 2^0 */ fmul(c, t0, b);
452 	/* 2^200 - 2^100 */ fsquareTimes(t0, c, 100);
453 	/* 2^200 - 2^0 */ fmul(t0, t0, c);
454 	/* 2^250 - 2^50 */ fsquareTimes(t0, t0, 50);
455 	/* 2^250 - 2^0 */ fmul(t0, t0, b);
456 	/* 2^255 - 2^5 */ fsquareTimes(t0, t0, 5);
457 	/* 2^255 - 21 */ fmul(output, t0, a);
458 }