1 /**
2 * GCM Mode
3 * 
4 * Copyright:
5 * (C) 2013 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.modes.aead.gcm;
12 
13 import botan.constants;
14 
15 static if (BOTAN_HAS_AEAD_GCM):
16 
17 import botan.modes.aead.aead;
18 import botan.block.block_cipher;
19 import botan.stream.stream_cipher;
20 import botan.stream.ctr;
21 import botan.utils.xor_buf;
22 import botan.utils.loadstor;
23 import botan.utils.mem_ops;
24 
25 import botan.utils.simd.immintrin;
26 import botan.utils.simd.wmmintrin;
27 
28 import botan.utils.types;
29 
30 import std.conv : to;
31 import std.algorithm : min;
32 
33 static if (BOTAN_HAS_GCM_CLMUL) {
34     import botan.utils.simd.wmmintrin;
35     import botan.utils.cpuid;
36 }
37 
38 /**
39 * GCM Mode
40 */
abstract class GCMMode : AEADMode, Transformation
{
public:
    ~this() { destroy(m_ctr); destroy(m_ghash); } // TODO: for some reason CTR needs to be destroyed before ghash

    /**
    * Begin processing a message: derive the initial counter block y0 from
    * the nonce and prime GHASH with E(K, y0) — the value GHASH.finished()
    * later XORs into the tag.
    *
    * Params:
    *  nonce = per-message nonce bytes
    *  nonce_len = nonce length in bytes
    * Returns: an empty vector (starting GCM produces no output)
    * Throws: InvalidIVLength if validNonceLength() rejects nonce_len
    */
    override SecureVector!ubyte startRaw(const(ubyte)* nonce, size_t nonce_len)
    {
        if (!validNonceLength(nonce_len))
            throw new InvalidIVLength(name, nonce_len);
        
        SecureVector!ubyte y0 = SecureVector!ubyte(BS);
        
        if (nonce_len == 12)
        {
            // 96-bit nonce fast path: y0 = nonce || 0x000001
            copyMem(y0.ptr, nonce, nonce_len);
            y0[15] = 1;
        }
        else
        {
            // Any other nonce length: y0 = GHASH(nonce)
            y0 = m_ghash.nonceHash(nonce, nonce_len);
        }
        
        m_ctr.setIv(y0.ptr, y0.length);
        
        // Encrypt y0 and hand it to GHASH (it is folded into the final tag).
        // NOTE(review): this is a local despite the member-style m_ prefix.
        SecureVector!ubyte m_enc_y0 = SecureVector!ubyte(BS);
        m_ctr.encipher(m_enc_y0);
        
        m_ghash.start(m_enc_y0.ptr, m_enc_y0.length);
        
        return SecureVector!ubyte();
    }

    /// Forward the associated data (AAD) to the GHASH state.
    override void setAssociatedData(const(ubyte)* ad, size_t ad_len)
    {
        m_ghash.setAssociatedData(ad, ad_len);
    }

    /// Algorithm name, e.g. "AES-128/GCM".
    override @property string name() const
    {
        return (m_cipher_name ~ "/GCM");
    }

    override size_t updateGranularity() const
    {
        return 4096; // CTR-BE's internal block size
    }

    /// Key length requirements are those of the underlying CTR cipher.
    override KeyLengthSpecification keySpec() const
    {
        return m_ctr.keySpec();
    }

    // GCM supports arbitrary nonce lengths
    override bool validNonceLength(size_t) const { return true; }

    override size_t tagSize() const { return m_tag_size; }

    /// Reset both the CTR and GHASH state (keys and buffered data).
    override void clear()
    {
        m_ctr.clear();
        m_ghash.clear();

    }

    override size_t defaultNonceLength() const { return super.defaultNonceLength(); }

protected:
    /**
    * Key both halves of GCM: the CTR stream cipher receives the key, and
    * the GHASH key H is derived by enciphering an all-zero block under a
    * zero IV.
    */
    override void keySchedule(const(ubyte)* key, size_t length)
    {
        m_ctr.setKey(key, length);
        
        const Vector!ubyte zeros = Vector!ubyte(BS);
        m_ctr.setIv(zeros.ptr, zeros.length);
        
        SecureVector!ubyte H = SecureVector!ubyte(BS);
        m_ctr.encipher(H);
        m_ghash.setKey(H);
    }

    /*
    * GCMMode Constructor
    *
    * Params:
    *  cipher = 128-bit block cipher; ownership passes to the CTR wrapper
    *  tag_size = tag length in bytes; only 8 or 16 are accepted
    * Throws: InvalidArgument for a non-128-bit cipher or a bad tag size
    */
    this(BlockCipher cipher, size_t tag_size)
    { 
        m_tag_size = tag_size;
        m_cipher_name = cipher.name;
        if (cipher.blockSize() != BS)
            throw new InvalidArgument("GCM requires a 128 bit cipher so cannot be used with " ~ cipher.name);
        
        m_ghash = new GHASH;

        m_ctr = new CTRBE(cipher); // CTR_BE takes ownership of cipher
        
        if (m_tag_size != 8 && m_tag_size != 16)
            throw new InvalidArgument(name ~ ": Bad tag size " ~ to!string(m_tag_size));
    }

    __gshared immutable size_t BS = 16; // GCM block size (128 bits)

    const size_t m_tag_size;    // authentication tag length in bytes (8 or 16)
    const string m_cipher_name; // cached for name()

    Unique!StreamCipher m_ctr;  // CTR-BE keystream built over the block cipher
    Unique!GHASH m_ghash;       // universal-hash authenticator
}
146 
147 /**
148 * GCM Encryption
149 */
final class GCMEncryption : GCMMode, Transformation
{
public:
    /**
    * Params:
    *  cipher = the 128 bit block cipher to use
    *  tag_size = is how big the auth tag will be
    */
    this(BlockCipher cipher, size_t tag_size = 16) 
    {
        super(cipher, tag_size);
    }

    /// Ciphertext length is the plaintext length plus the appended tag.
    override size_t outputLength(size_t input_length) const
    { return input_length + tagSize(); }

    /// Encryption can finish with any amount of final input.
    override size_t minimumFinalSize() const { return 0; }

    /**
    * Encrypt in place: CTR-encrypt the buffer contents from offset on,
    * then absorb the resulting ciphertext into the GHASH state.
    */
    override void update(ref SecureVector!ubyte buffer, size_t offset = 0)
    {
        assert(buffer.length >= offset, "Offset is sane");
        const size_t sz = buffer.length - offset;
        ubyte* buf = buffer.ptr + offset;
        
        m_ctr.cipher(buf, buf, sz);
        m_ghash.update(buf, sz);
    }

    /**
    * Encrypt any remaining input, then append the (possibly truncated)
    * GHASH tag to the buffer.
    */
    override void finish(ref SecureVector!ubyte buffer, size_t offset = 0)
    {
        // (removed an unused local `import std.algorithm : max;`)
        update(buffer, offset);
        auto mac = m_ghash.finished();
        buffer ~= mac.ptr[0 .. tagSize()];
    }

    // Interface fallthrough
    override string provider() const { return "core"; }
    override SecureVector!ubyte startRaw(const(ubyte)* nonce, size_t nonce_len) { return super.startRaw(nonce, nonce_len); }
    override size_t updateGranularity() const { return super.updateGranularity(); }
    override size_t defaultNonceLength() const { return super.defaultNonceLength(); }
    override bool validNonceLength(size_t nonce_len) const { return super.validNonceLength(nonce_len); }
    override @property string name() const { return super.name; }
    override void clear() { return super.clear(); }
}
195 
196 /**
197 * GCM Decryption
198 */
final class GCMDecryption : GCMMode, Transformation
{
public:
    /**
    * Params:
    *  cipher = the 128 bit block cipher to use
    *  tag_size = is how big the auth tag will be
    */
    this(BlockCipher cipher, size_t tag_size = 16)
    {
        super(cipher, tag_size);
    }

    /// Plaintext length is the ciphertext length minus the appended tag.
    override size_t outputLength(size_t input_length) const
    {
        assert(input_length > tagSize(), "Sufficient input");
        return input_length - tagSize();
    }

    /// The final call must contain at least the full auth tag.
    override size_t minimumFinalSize() const { return tagSize(); }

    /**
    * Decrypt in place: absorb the ciphertext into GHASH first, then
    * CTR-decrypt it (the reverse order of encryption).
    */
    override void update(ref SecureVector!ubyte buffer, size_t offset = 0)
    {
        assert(buffer.length >= offset, "Offset is sane");
        const size_t sz = buffer.length - offset;
        ubyte* buf = buffer.ptr + offset;
        
        m_ghash.update(buf, sz);
        m_ctr.cipher(buf, buf, sz);
    }

    /**
    * Process the final ciphertext bytes, verify the appended tag, and
    * shrink the buffer to the plaintext length.
    * Throws: IntegrityFailure if the computed tag does not match.
    */
    override void finish(ref SecureVector!ubyte buffer, size_t offset)
    {
        assert(buffer.length >= offset, "Offset is sane");
        const size_t sz = buffer.length - offset;

        ubyte* buf = buffer.ptr + offset;
        
        assert(sz >= tagSize(), "Have the tag as part of final input");
        
        const size_t remaining = sz - tagSize();
        
        // handle any final input before the tag
        if (remaining)
        {
            m_ghash.update(buf, remaining);

            m_ctr.cipher(buf, buf, remaining);
        }
        
        auto mac = m_ghash.finished();
        
        // BUG FIX: the tag sits at buffer[offset + remaining]; the previous
        // code indexed buffer[remaining], which reads the wrong bytes (and
        // spuriously rejects valid tags) whenever offset != 0.
        const(ubyte)* included_tag = &buffer[offset + remaining];
        
        // sameMem is used instead of a plain comparison — presumably to
        // avoid early-exit timing leaks; confirm it is constant-time.
        if (!sameMem(mac.ptr, included_tag, tagSize()))
            throw new IntegrityFailure("GCM tag check failed");
        
        buffer.resize(offset + remaining);
    }

    // Interface fallthrough
    override string provider() const { return "core"; }
    override SecureVector!ubyte startRaw(const(ubyte)* nonce, size_t nonce_len) { return super.startRaw(nonce, nonce_len); }
    override size_t updateGranularity() const { return super.updateGranularity(); }
    override size_t defaultNonceLength() const { return super.defaultNonceLength(); }
    override bool validNonceLength(size_t nonce_len) const { return super.validNonceLength(nonce_len); }
    override @property string name() const { return super.name; }
    override void clear() { return super.clear(); }
}
268 
269 /**
270 * GCM's GHASH
271 * Maybe a Transform?
272 */
final class GHASH : SymmetricAlgorithm
{
public:
    /**
    * Hash the associated data (AAD) into m_H_ad, replacing any previous
    * AD. The AD hash and its length persist across messages (start()
    * copies m_H_ad into the working state) until set again or clear()ed.
    */
    void setAssociatedData(const(ubyte)* input, size_t length)
    {
        zeroise(m_H_ad);
        ghashUpdate(m_H_ad, input, length);
        m_ad_len = length;
    }

    /**
    * Compute GHASH(nonce) for nonces that are not 96 bits: hash the nonce
    * bytes and append the final length block (with ad_len = 0).
    * Must be called before start() (i.e. while m_ghash is empty).
    */
    SecureVector!ubyte nonceHash(const(ubyte)* nonce, size_t nonce_len)
    {
        assert(m_ghash.length == 0, "nonceHash called during wrong time");
        SecureVector!ubyte y0 = SecureVector!ubyte(16);
        
        ghashUpdate(y0, nonce, nonce_len);
        addFinalBlock(y0, 0, nonce_len);
        
        return y0.move;
    }

    /**
    * Start authenticating a new message. `nonce` here is E(K, y0), which
    * finished() XORs into the tag; the working hash state begins as the
    * hash of the associated data.
    */
    void start(const(ubyte)* nonce, size_t len)
    {
        // NOTE(review): relies on SecureVector slice-assign to hold `len`
        // bytes in m_nonce — confirm it resizes when lengths differ.
        m_nonce[] = nonce[0 .. len];
        m_ghash = m_H_ad.dup;
    }

    /*
    * Absorb ciphertext into the hash state and track its total length.
    * Assumes input len is multiple of 16 (a shorter final chunk is
    * implicitly zero-padded by ghashUpdate). Requires start() first.
    */
    void update(const(ubyte)* input, size_t length)
    {
        assert(m_ghash.length == 16, "Key was set");
        
        m_text_len += length;
        
        ghashUpdate(m_ghash, input, length);
    }

    /**
    * Complete the tag: append the length block, XOR in E(K, y0), and move
    * the result out. m_text_len is reset but m_ad_len is not, so the same
    * AD applies to subsequent messages; start() must be called again
    * before the next update().
    */
    SecureVector!ubyte finished()
    {
        addFinalBlock(m_ghash, m_ad_len, m_text_len);
        m_ghash ^= m_nonce;
        m_text_len = 0;
        return m_ghash.move;
    }

    KeyLengthSpecification keySpec() const { return KeyLengthSpecification(16); }

    /// Wipe the hash key, AD hash, and working state.
    override void clear()
    {
        zeroise(m_H);
        zeroise(m_H_ad);
        m_ghash.clear();
        m_text_len = m_ad_len = 0;
    }

    @property string name() const { return "GHASH"; }

    /// Store the 128-bit hash key H and reset the AD/text bookkeeping.
    override void keySchedule(const(ubyte)* key, size_t length)
    {
        m_H[] = key[0 .. length];
        m_H_ad.resize(16);
        m_ad_len = 0;
        m_text_len = 0;
    }

private:
    /**
    * Multiply x by the hash key H in GF(2^128) (reduction constant R
    * below), writing the product back into x. Dispatches to the CLMUL
    * implementation when the CPU supports it; otherwise falls back to a
    * bit-at-a-time shift-and-xor multiply.
    * NOTE(review): the software path branches on data-dependent bits
    * ((X >> (63-j)) & 1), so it is not constant-time.
    */
    void gcmMultiply(ref SecureVector!ubyte x)
    {
        import std.algorithm : max; // NOTE(review): unused import
        static if (BOTAN_HAS_GCM_CLMUL) {
            if (CPUID.hasClmul()) {
                return gcmMultiplyClmul(*cast(ubyte[16]*) x.ptr, *cast(ubyte[16]*) m_H.ptr);
            }
        }
        
        // GCM reduction polynomial (top 64 bits of x^128 + x^7 + x^2 + x + 1)
        __gshared immutable ulong R = 0xE100000000000000;
        
        ulong[2] H = [ loadBigEndian!ulong(m_H.ptr, 0), loadBigEndian!ulong(m_H.ptr, 1) ];
        ulong[2] Z = [ 0, 0 ];
        
        // SSE2 might be useful here        
        foreach (size_t i; 0 .. 2)
        {
            const ulong X = loadBigEndian!ulong(x.ptr, i);
            
            foreach (size_t j; 0 .. 64)
            {
                // accumulate H into Z for each set bit of X (MSB first)
                if ((X >> (63-j)) & 1)
                {
                    Z[0] ^= H[0];
                    Z[1] ^= H[1];
                }
                
                // shift H right by one bit, folding in R on carry-out
                const ulong r = (H[1] & 1) ? R : 0;
                
                H[1] = (H[0] << 63) | (H[1] >> 1);
                H[0] = (H[0] >> 1) ^ r;
            }
        }
        
        storeBigEndian!ulong(x.ptr, Z[0], Z[1]);
    }

    /**
    * XOR the input into the hash state 16 bytes at a time, multiplying by
    * H after each block. A final chunk shorter than 16 bytes is XORed
    * over only its own length, i.e. implicitly zero-padded.
    */
    void ghashUpdate(ref SecureVector!ubyte ghash, const(ubyte)* input, size_t length)
    {
        __gshared immutable size_t BS = 16;
        
        /*
        This assumes if less than block size input then we're just on the
        final block and should pad with zeros
        */
        while (length)
        {
            const size_t to_proc = min(length, BS);
            
            xorBuf(ghash.ptr, input, to_proc);
            gcmMultiply(ghash);
            
            input += to_proc;
            length -= to_proc;
        }
    }

    /**
    * Append GCM's closing block: the bit lengths of the AD and the text,
    * each as a big-endian 64-bit value.
    */
    void addFinalBlock(ref SecureVector!ubyte hash,
                       size_t ad_len, size_t text_len)
    {
        SecureVector!ubyte final_block = SecureVector!ubyte(16);
        storeBigEndian!ulong(final_block.ptr, 8*ad_len, 8*text_len);
        ghashUpdate(hash, final_block.ptr, final_block.length);
    }

    SecureVector!ubyte m_H;     // 128-bit hash key (E(K, 0^128))
    SecureVector!ubyte m_H_ad;  // GHASH state after absorbing the AD only
    SecureVector!ubyte m_nonce; // E(K, y0), XORed into the final tag
    SecureVector!ubyte m_ghash; // working hash state for the current message
    size_t m_ad_len = 0, m_text_len = 0; // byte counts for the length block
}
412 
413 
414 
/**
* Carry-less multiplication of x by H in GF(2^128) for GHASH, using the
* PCLMULQDQ instruction; the product (reduced mod the GCM polynomial) is
* written back into x. Both paths implement Algorithms 1 and 5 from
* Intel's CLMUL white paper: the DMD x86-64 inline-asm path, and an
* intrinsics path for other compilers.
*/
static if (BOTAN_HAS_GCM_CLMUL)
    void gcmMultiplyClmul(ref ubyte[16] x, in ubyte[16] H) 
{
    // pshufb mask that reverses byte order (GHASH treats blocks big-endian)
    __gshared immutable(__m128i) BSWAP_MASK = _mm_set1_epi8!([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])();
	version(D_InlineAsm_X86_64) {
		version(DMD) {
			enum USE_ASM = true;
		} else enum USE_ASM = false;
	} else enum USE_ASM = false;

    static if (USE_ASM) {
        __m128i* a = cast(__m128i*) x.ptr;
        __m128i* b = cast(__m128i*) H.ptr;
        __m128i* c = cast(__m128i*) &BSWAP_MASK;

        // Raw `db` sequences below hand-encode pclmulqdq (see the
        // _mm_clmulepi64_si128 comments); register mapping: XMM0..3 = T0..T3,
        // XMM4/XMM5 = T4/T5, XMM6 = scratch, XMM13/14 = a/b, XMM15 = mask.
        asm pure nothrow {
            mov RAX, a;
            mov RBX, b;
            mov RCX, c;
            movdqu XMM13, [RAX];                        // __m128i a = _mm_loadu_si128(cast(const(__m128i*)) x.ptr);
            movdqu XMM14, [RBX];                        // __m128i b = _mm_loadu_si128(cast(const(__m128i*)) H.ptr);
            movdqu XMM15, [RCX];
            pshufb XMM13, XMM15;                         // a = _mm_shuffle_epi8(a, BSWAP_MASK);
            pshufb XMM14, XMM15;                         // b = _mm_shuffle_epi8(b, BSWAP_MASK);
            movdqa XMM0, XMM13; // XMM0 => T0
            movdqa XMM1, XMM13; // XMM1 => T1
            movdqa XMM2, XMM13; // XMM2 => T2
            movdqa XMM3, XMM13; // XMM3 => T3

            db 0x66, 0x41, 0x0F, 0x3A, 0x44, 0xC6, 0x00; // T0 = _mm_clmulepi64_si128!"0x00"(a, b);
            db 0x66, 0x41, 0x0F, 0x3A, 0x44, 0xCE, 0x01; // T1 = _mm_clmulepi64_si128!"0x01"(a, b);
            db 0x66, 0x41, 0x0F, 0x3A, 0x44, 0xD6, 0x10; // T2 = _mm_clmulepi64_si128!"0x10"(a, b);
            db 0x66, 0x41, 0x0F, 0x3A, 0x44, 0xDE, 0x11; // T3 = _mm_clmulepi64_si128!"0x11"(a, b);
            pxor XMM1, XMM2;                             // T1 = _mm_xor_si128(T1, T2);
            movdqa XMM6, XMM1;
            pslldq XMM6, 8;                                 // T2 = _mm_slli_si128!8(T1);
            movdqa XMM2, XMM6;
            psrldq XMM1, 8;                                 // T1 = _mm_srli_si128!8(T1);
            pxor XMM0, XMM2;                             // T0 = _mm_xor_si128(T0, T2);
            pxor XMM3, XMM1;                             // T3 = _mm_xor_si128(T3, T1);
            movdqa XMM6, XMM0;
            psrld XMM6, 31;                                 // T4 = _mm_srli_epi32!31(T0)
            movdqa XMM4, XMM6;
            pslld XMM0, 1;                                 // T0 = _mm_slli_epi32!1(T0);
            movdqa XMM6, XMM3;
            psrld XMM6, 31;
            movdqa XMM5, XMM6;                             // T5 = _mm_srli_epi32!31(T3);
            pslld XMM3, 1;                                 // T3 = _mm_slli_epi32!1(T3);
            movdqa XMM6, XMM4;
            psrldq XMM6, 12;                             // T2 = _mm_srli_si128!12(T4);
            movdqa XMM2, XMM6;
            pslldq XMM5, 4;                                 // T5 = _mm_slli_si128!4(T5);
            pslldq XMM4, 4;                                 // T4 = _mm_slli_si128!4(T4);
            por XMM0, XMM4;                                  // T0 = _mm_or_si128(T0, T4);
            por XMM3, XMM5;                                 // T3 = _mm_or_si128(T3, T5);
            por XMM3, XMM2;                                 // T3 = _mm_or_si128(T3, T2);
            movdqa XMM6, XMM0;
            pslld XMM6, 31;                                 // T4 = _mm_slli_epi32!31(T0);
            movdqa XMM4, XMM6;
            movdqa XMM6, XMM0;
            pslld XMM6, 30;                                 // T5 = _mm_slli_epi32!30(T0);
            movdqa XMM5, XMM6;
            movdqa XMM6, XMM0;
            pslld XMM6, 25;                                 // T2 = _mm_slli_epi32!25(T0);
            movdqa XMM2, XMM6;
            pxor XMM4, XMM5;                             // T4 = _mm_xor_si128(T4, T5);
            pxor XMM4, XMM2;                             // T4 = _mm_xor_si128(T4, T2);
            movdqa XMM6, XMM4;
            psrldq XMM6, 4;                                 // T5 = _mm_srli_si128!4(T4);
            movdqa XMM5, XMM6;
            pxor XMM3, XMM5;                             // T3 = _mm_xor_si128(T3, T5);
            pslldq XMM4, 12;                             // T4 = _mm_slli_si128!12(T4);
            pxor XMM0, XMM4;                             // T0 = _mm_xor_si128(T0, T4);
            pxor XMM3, XMM0;                             // T3 = _mm_xor_si128(T3, T0);
            movdqa XMM6, XMM0;
            psrld XMM6, 1;                                 // T4 = _mm_srli_epi32!1(T0);
            movdqa XMM4, XMM6;
            movdqa XMM6, XMM0;
            psrld XMM6, 2;                                 // T1 = _mm_srli_epi32!2(T0);
            movdqa XMM1, XMM6;
            movdqa XMM6, XMM0;
            psrld XMM6, 7;                                 // T2 = _mm_srli_epi32!7(T0);
            movdqa XMM2, XMM6;
            pxor XMM3, XMM1;                             // T3 = _mm_xor_si128(T3, T1);
            pxor XMM3, XMM2;                             // T3 = _mm_xor_si128(T3, T2);
            pxor XMM3, XMM4;                             // T3 = _mm_xor_si128(T3, T4);
            mov RCX, c;
            movdqu XMM15, [RCX];
            pshufb XMM3, XMM15;                             // T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);
            mov RAX, a;
            movdqu [RAX], XMM3;                             // _mm_storeu_si128(cast(__m128i*) x.ptr, T3);
        }
    }
    else {
        /*
        * Algorithms 1 and 5 from Intel's CLMUL guide
        */        
        __m128i a = _mm_loadu_si128(cast(const(__m128i*)) x.ptr);
        __m128i b = _mm_loadu_si128(cast(const(__m128i*)) H.ptr);
        
        a = _mm_shuffle_epi8(a, BSWAP_MASK);
        b = _mm_shuffle_epi8(b, BSWAP_MASK);
        
        __m128i T0, T1, T2, T3, T4, T5;
        
        // four 64x64 carry-less multiplies forming the 256-bit product
        T0 = _mm_clmulepi64_si128!"0x00"(a, b);
        T1 = _mm_clmulepi64_si128!"0x01"(a, b);
        T2 = _mm_clmulepi64_si128!"0x10"(a, b);
        T3 = _mm_clmulepi64_si128!"0x11"(a, b);
        
        // combine the cross terms into the low (T0) and high (T3) halves
        T1 = _mm_xor_si128(T1, T2);
        T2 = _mm_slli_si128!8(T1);
        T1 = _mm_srli_si128!8(T1);
        T0 = _mm_xor_si128(T0, T2);
        T3 = _mm_xor_si128(T3, T1);
        
        // shift the whole 256-bit product left by one bit
        T4 = _mm_srli_epi32!31(T0);
        T0 = _mm_slli_epi32!1(T0);
        
        T5 = _mm_srli_epi32!31(T3);
        T3 = _mm_slli_epi32!1(T3);
        
        T2 = _mm_srli_si128!12(T4);
        T5 = _mm_slli_si128!4(T5);
        T4 = _mm_slli_si128!4(T4);
        T0 = _mm_or_si128(T0, T4);
        T3 = _mm_or_si128(T3, T5);
        T3 = _mm_or_si128(T3, T2);
        
        // reduce modulo the GCM polynomial (Algorithm 5)
        T4 = _mm_slli_epi32!31(T0);
        T5 = _mm_slli_epi32!30(T0);
        T2 = _mm_slli_epi32!25(T0);
        
        T4 = _mm_xor_si128(T4, T5);
        T4 = _mm_xor_si128(T4, T2);
        T5 = _mm_srli_si128!4(T4);
        T3 = _mm_xor_si128(T3, T5);
        T4 = _mm_slli_si128!12(T4);
        T0 = _mm_xor_si128(T0, T4);
        T3 = _mm_xor_si128(T3, T0);
        
        T4 = _mm_srli_epi32!1(T0);
        T1 = _mm_srli_epi32!2(T0);
        T2 = _mm_srli_epi32!7(T0);
        T3 = _mm_xor_si128(T3, T1);
        T3 = _mm_xor_si128(T3, T2);
        T3 = _mm_xor_si128(T3, T4);
        
        T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);
        
        _mm_storeu_si128(cast(__m128i*) x.ptr, T3);
    }
}