1 /**
2 * AES using AES-NI instructions
3 * 
4 * Copyright:
5 * (C) 2009 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.block.aes_ni;
12 
13 import botan.constants;
14 static if (BOTAN_HAS_AES_NI):
15 import botan.block.block_cipher;
16 import botan.utils.loadstor;
17 import botan.utils.simd.wmmintrin;
18 import botan.utils.mem_ops;
19 import std.format : format;
20 
/**
* AES-128 using AES-NI
*
* m_EK holds the 11 encryption round keys (44 32-bit words). m_DK holds the
* same round keys in reverse order, with the nine inner keys passed through
* _mm_aesimc_si128 so that they can be fed to _mm_aesdec_si128.
*/
final class AES128NI : BlockCipherFixedParams!(16, 16), BlockCipher, SymmetricAlgorithm
{
public:
    /// Number of blocks the unrolled main loops process per iteration.
    override @property size_t parallelism() const { return 4; }

    /*
    * AES-128 Encryption
    *
    * Encrypts `blocks` consecutive 16-byte blocks from `input` into `output`.
    * Only unaligned loads/stores are used on the data buffers.
    * The key schedule must have been run before calling this.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // keySchedule() sizes m_EK to 44 words; catch use before keying
        // (non-release builds) instead of reading through an unset buffer.
        assert(m_EK.length >= 44);

        // The input is only ever read, so keep it const.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);
        
        const(__m128i*) key_mm = cast(const(__m128i*))(m_EK.ptr);
        
        // Load all round keys once, outside the block loops.
        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        
        // Main loop: four independent blocks per iteration.
        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);
            
            // Initial AddRoundKey
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);
            
            // Each mixin applies one round to B0..B3 (names are fixed by the mixin text).
            mixin(AES_ENC_4_ROUNDS!(K1));
            mixin(AES_ENC_4_ROUNDS!(K2));
            mixin(AES_ENC_4_ROUNDS!(K3));
            mixin(AES_ENC_4_ROUNDS!(K4));
            mixin(AES_ENC_4_ROUNDS!(K5));
            mixin(AES_ENC_4_ROUNDS!(K6));
            mixin(AES_ENC_4_ROUNDS!(K7));
            mixin(AES_ENC_4_ROUNDS!(K8));
            mixin(AES_ENC_4_ROUNDS!(K9));
            mixin(AES_ENC_4_LAST_ROUNDS!(K10));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }
        
        // Tail: the remaining 0..3 blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            
            B = _mm_xor_si128(B, K0);
            
            B = _mm_aesenc_si128(B, K1);
            B = _mm_aesenc_si128(B, K2);
            B = _mm_aesenc_si128(B, K3);
            B = _mm_aesenc_si128(B, K4);
            B = _mm_aesenc_si128(B, K5);
            B = _mm_aesenc_si128(B, K6);
            B = _mm_aesenc_si128(B, K7);
            B = _mm_aesenc_si128(B, K8);
            B = _mm_aesenc_si128(B, K9);
            B = _mm_aesenclast_si128(B, K10);
            
            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * AES-128 Decryption
    *
    * Decrypts `blocks` consecutive 16-byte blocks from `input` into `output`
    * using the decryption schedule m_DK.
    * The key schedule must have been run before calling this.
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // keySchedule() sizes m_DK to 44 words; catch use before keying.
        assert(m_DK.length >= 44);

        // The input is only ever read, so keep it const.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);
        
        const(__m128i*) key_mm = cast(const(__m128i*))(m_DK.ptr);
        
        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        
        // Main loop: four independent blocks per iteration.
        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);
            
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);
            
            mixin(AES_DEC_4_ROUNDS!(K1));
            mixin(AES_DEC_4_ROUNDS!(K2));
            mixin(AES_DEC_4_ROUNDS!(K3));
            mixin(AES_DEC_4_ROUNDS!(K4));
            mixin(AES_DEC_4_ROUNDS!(K5));
            mixin(AES_DEC_4_ROUNDS!(K6));
            mixin(AES_DEC_4_ROUNDS!(K7));
            mixin(AES_DEC_4_ROUNDS!(K8));
            mixin(AES_DEC_4_ROUNDS!(K9));
            mixin(AES_DEC_4_LAST_ROUNDS!(K10));
            
            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);
            
            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }
        
        // Tail: the remaining 0..3 blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            
            B = _mm_xor_si128(B, K0);
            
            B = _mm_aesdec_si128(B, K1);
            B = _mm_aesdec_si128(B, K2);
            B = _mm_aesdec_si128(B, K3);
            B = _mm_aesdec_si128(B, K4);
            B = _mm_aesdec_si128(B, K5);
            B = _mm_aesdec_si128(B, K6);
            B = _mm_aesdec_si128(B, K7);
            B = _mm_aesdec_si128(B, K8);
            B = _mm_aesdec_si128(B, K9);
            B = _mm_aesdeclast_si128(B, K10);
            
            _mm_storeu_si128(out_mm + i, B);
        }
    }


    /*
    * Clear memory of sensitive data
    */
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-128"; }
    override BlockCipher clone() const { return new AES128NI; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-128 Key Schedule
    *
    * Expands the 16-byte key at `key` into 11 round keys stored in m_EK and
    * derives the matching decryption schedule in m_DK. The length argument
    * is not used here.
    */
    override void keySchedule(const(ubyte)* key, size_t)
    {
        m_EK.resize(44);
        m_DK.resize(44);
        
        // Round key 0 is the raw key; each following key is derived from the
        // previous one with the round constant given to AES_128_key_exp.
        __m128i K0  = _mm_loadu_si128(cast(const(__m128i*))(key));
        mixin(`__m128i K1  = ` ~ AES_128_key_exp!("K0", 0x01));
        mixin(`__m128i K2  = ` ~ AES_128_key_exp!("K1", 0x02));
        mixin(`__m128i K3  = ` ~  AES_128_key_exp!("K2", 0x04));
        mixin(`__m128i K4  = ` ~  AES_128_key_exp!("K3", 0x08));
        mixin(`__m128i K5  = ` ~  AES_128_key_exp!("K4", 0x10));
        mixin(`__m128i K6  = ` ~  AES_128_key_exp!("K5", 0x20));
        mixin(`__m128i K7  = ` ~  AES_128_key_exp!("K6", 0x40));
        mixin(`__m128i K8  = ` ~  AES_128_key_exp!("K7", 0x80));
        mixin(`__m128i K9  = ` ~  AES_128_key_exp!("K8", 0x1B));
        mixin(`__m128i K10 = ` ~  AES_128_key_exp!("K9", 0x36));

        // K1..K10 are ordinary locals once mixed in, so they can be stored directly.
        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        _mm_storeu_si128(EK_mm      , K0);
        _mm_storeu_si128(EK_mm +  1, K1);
        _mm_storeu_si128(EK_mm +  2, K2);
        _mm_storeu_si128(EK_mm +  3, K3);
        _mm_storeu_si128(EK_mm +  4, K4);
        _mm_storeu_si128(EK_mm +  5, K5);
        _mm_storeu_si128(EK_mm +  6, K6);
        _mm_storeu_si128(EK_mm +  7, K7);
        _mm_storeu_si128(EK_mm +  8, K8);
        _mm_storeu_si128(EK_mm +  9, K9);
        _mm_storeu_si128(EK_mm + 10, K10);

        // Now generate decryption keys: reversed order, inner keys through AESIMC
        
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);
        _mm_storeu_si128(DK_mm      , K10);
        _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
        _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
        _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
        _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
        _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
        _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
        _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
        _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
        _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
        _mm_storeu_si128(DK_mm + 10, K0);
    }


    // Expanded encryption / decryption round keys (44 words each once keyed).
    SecureVector!uint m_EK, m_DK;
}
248 
/**
* AES-192 using AES-NI
*
* m_EK holds the 13 encryption round keys (52 32-bit words). m_DK holds the
* same round keys in reverse order, with the eleven inner keys passed through
* _mm_aesimc_si128 so that they can be fed to _mm_aesdec_si128.
*/
final class AES192NI : BlockCipherFixedParams!(16, 24), BlockCipher, SymmetricAlgorithm
{
public:
    /// Number of blocks the unrolled main loops process per iteration.
    override @property size_t parallelism() const { return 4; }

    /*
    * AES-192 Encryption
    *
    * Encrypts `blocks` consecutive 16-byte blocks from `input` into `output`.
    * The key schedule must have been run before calling this.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // keySchedule() sizes m_EK to 52 words; catch use before keying
        // (non-release builds) instead of reading through an unset buffer.
        assert(m_EK.length >= 52);

        // The input is only ever read, so keep it const.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);
        
        const(__m128i*) key_mm = cast(const(__m128i*))(m_EK.ptr);
        
        // Load all round keys once, outside the block loops.
        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);
        
        // Main loop: four independent blocks per iteration.
        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);
            
            // Initial AddRoundKey
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);
            
            // Each mixin applies one round to B0..B3 (names are fixed by the mixin text).
            mixin(AES_ENC_4_ROUNDS!(K1));
            mixin(AES_ENC_4_ROUNDS!(K2));
            mixin(AES_ENC_4_ROUNDS!(K3));
            mixin(AES_ENC_4_ROUNDS!(K4));
            mixin(AES_ENC_4_ROUNDS!(K5));
            mixin(AES_ENC_4_ROUNDS!(K6));
            mixin(AES_ENC_4_ROUNDS!(K7));
            mixin(AES_ENC_4_ROUNDS!(K8));
            mixin(AES_ENC_4_ROUNDS!(K9));
            mixin(AES_ENC_4_ROUNDS!(K10));
            mixin(AES_ENC_4_ROUNDS!(K11));
            mixin(AES_ENC_4_LAST_ROUNDS!(K12));
            
            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);
            
            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }
        
        // Tail: the remaining 0..3 blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            
            B = _mm_xor_si128(B, K0);
            
            B = _mm_aesenc_si128(B, K1);
            B = _mm_aesenc_si128(B, K2);
            B = _mm_aesenc_si128(B, K3);
            B = _mm_aesenc_si128(B, K4);
            B = _mm_aesenc_si128(B, K5);
            B = _mm_aesenc_si128(B, K6);
            B = _mm_aesenc_si128(B, K7);
            B = _mm_aesenc_si128(B, K8);
            B = _mm_aesenc_si128(B, K9);
            B = _mm_aesenc_si128(B, K10);
            B = _mm_aesenc_si128(B, K11);
            B = _mm_aesenclast_si128(B, K12);
            
            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * AES-192 Decryption
    *
    * Decrypts `blocks` consecutive 16-byte blocks from `input` into `output`
    * using the decryption schedule m_DK.
    * The key schedule must have been run before calling this.
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // keySchedule() sizes m_DK to 52 words; catch use before keying.
        assert(m_DK.length >= 52);

        // The input is only ever read, so keep it const.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);
        
        const(__m128i*) key_mm = cast(const(__m128i*))(m_DK.ptr);
        
        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);
        
        // Main loop: four independent blocks per iteration.
        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);
            
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);
            
            mixin(AES_DEC_4_ROUNDS!(K1));
            mixin(AES_DEC_4_ROUNDS!(K2));
            mixin(AES_DEC_4_ROUNDS!(K3));
            mixin(AES_DEC_4_ROUNDS!(K4));
            mixin(AES_DEC_4_ROUNDS!(K5));
            mixin(AES_DEC_4_ROUNDS!(K6));
            mixin(AES_DEC_4_ROUNDS!(K7));
            mixin(AES_DEC_4_ROUNDS!(K8));
            mixin(AES_DEC_4_ROUNDS!(K9));
            mixin(AES_DEC_4_ROUNDS!(K10));
            mixin(AES_DEC_4_ROUNDS!(K11));
            mixin(AES_DEC_4_LAST_ROUNDS!(K12));
            
            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);
            
            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }
        
        // Tail: the remaining 0..3 blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            
            B = _mm_xor_si128(B, K0);
            
            B = _mm_aesdec_si128(B, K1);
            B = _mm_aesdec_si128(B, K2);
            B = _mm_aesdec_si128(B, K3);
            B = _mm_aesdec_si128(B, K4);
            B = _mm_aesdec_si128(B, K5);
            B = _mm_aesdec_si128(B, K6);
            B = _mm_aesdec_si128(B, K7);
            B = _mm_aesdec_si128(B, K8);
            B = _mm_aesdec_si128(B, K9);
            B = _mm_aesdec_si128(B, K10);
            B = _mm_aesdec_si128(B, K11);
            B = _mm_aesdeclast_si128(B, K12);
            
            _mm_storeu_si128(out_mm + i, B);
        }
    }



    /*
    * Clear memory of sensitive data
    */
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }
    @property string name() const { return "AES-192"; }
    override BlockCipher clone() const { return new AES192NI; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-192 Key Schedule
    *
    * Expands the 24-byte key at `key` into 52 words in m_EK and derives the
    * matching decryption schedule in m_DK. The length argument is not used
    * here.
    */
    override void keySchedule(const(ubyte)* key, size_t)
    {
        m_EK.resize(52);
        m_DK.resize(52);
        
        // K0 = key bytes 0..15. K1 is loaded from bytes 8..23 and shifted
        // right by 8 bytes, leaving key bytes 16..23 in its low 64 bits.
        // The names K0/K1 are referenced by the AES_192_key_exp mixin text.
        __m128i K0 = _mm_loadu_si128(cast(const(__m128i*))(key));
        __m128i K1 = _mm_loadu_si128(cast(const(__m128i*))(key + 8));
        K1 = _mm_srli_si128!8(K1);
        
        // Seed the first 6 schedule words from the raw key
        // (presumably a plain little-endian word copy - see botan.utils.loadstor).
        loadLittleEndian(m_EK.ptr, key, 6);
        
        // Each step writes schedule words starting at the given m_EK offset;
        // the step at offset 48 is flagged as the last one by the mixin.
        mixin(AES_192_key_exp!(0x01, 6));
        mixin(AES_192_key_exp!(0x02, 12));
        mixin(AES_192_key_exp!(0x04, 18));
        mixin(AES_192_key_exp!(0x08, 24));
        mixin(AES_192_key_exp!(0x10, 30));
        mixin(AES_192_key_exp!(0x20, 36));
        mixin(AES_192_key_exp!(0x40, 42));
        mixin(AES_192_key_exp!(0x80, 48));
        
        // Now generate decryption keys: reversed order, inner keys through AESIMC
        const(__m128i*) EK_mm = cast(const(__m128i*))(m_EK.ptr);
        
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);
        _mm_storeu_si128(DK_mm      , _mm_loadu_si128(EK_mm + 12));
        _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
        _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
        _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
        _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
        _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
        _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
        _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
        _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
        _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
        _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
        _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
        _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
    }


    // Expanded encryption / decryption round keys (52 words each once keyed).
    SecureVector!uint m_EK, m_DK;
}
481 
/**
* AES-256 using AES-NI
*
* m_EK holds the 15 encryption round keys (60 32-bit words). m_DK holds the
* same round keys in reverse order, with the thirteen inner keys passed
* through _mm_aesimc_si128 so that they can be fed to _mm_aesdec_si128.
*/
final class AES256NI : BlockCipherFixedParams!(16, 32), BlockCipher, SymmetricAlgorithm
{
public:
    /// Number of blocks the unrolled main loops process per iteration.
    override @property size_t parallelism() const { return 4; }

    /*
    * AES-256 Encryption
    *
    * Encrypts `blocks` consecutive 16-byte blocks from `input` into `output`.
    * The key schedule must have been run before calling this.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // keySchedule() sizes m_EK to 60 words; catch use before keying.
        assert(m_EK.length >= 60);

        // The input is only ever read, so keep it const.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);
        
        const(__m128i*) key_mm = cast(const(__m128i*))(m_EK.ptr);
        
        // Load all round keys once, outside the block loops.
        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);
        __m128i K13 = _mm_loadu_si128(key_mm + 13);
        __m128i K14 = _mm_loadu_si128(key_mm + 14);
        
        // Main loop: four independent blocks per iteration.
        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);
            
            // Initial AddRoundKey
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);
            
            // Each mixin applies one round to B0..B3 (names are fixed by the mixin text).
            mixin(AES_ENC_4_ROUNDS!(K1));
            mixin(AES_ENC_4_ROUNDS!(K2));
            mixin(AES_ENC_4_ROUNDS!(K3));
            mixin(AES_ENC_4_ROUNDS!(K4));
            mixin(AES_ENC_4_ROUNDS!(K5));
            mixin(AES_ENC_4_ROUNDS!(K6));
            mixin(AES_ENC_4_ROUNDS!(K7));
            mixin(AES_ENC_4_ROUNDS!(K8));
            mixin(AES_ENC_4_ROUNDS!(K9));
            mixin(AES_ENC_4_ROUNDS!(K10));
            mixin(AES_ENC_4_ROUNDS!(K11));
            mixin(AES_ENC_4_ROUNDS!(K12));
            mixin(AES_ENC_4_ROUNDS!(K13));
            mixin(AES_ENC_4_LAST_ROUNDS!(K14));
            
            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);
            
            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }
        
        // Tail: the remaining 0..3 blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            
            B = _mm_xor_si128(B, K0);
            
            B = _mm_aesenc_si128(B, K1);
            B = _mm_aesenc_si128(B, K2);
            B = _mm_aesenc_si128(B, K3);
            B = _mm_aesenc_si128(B, K4);
            B = _mm_aesenc_si128(B, K5);
            B = _mm_aesenc_si128(B, K6);
            B = _mm_aesenc_si128(B, K7);
            B = _mm_aesenc_si128(B, K8);
            B = _mm_aesenc_si128(B, K9);
            B = _mm_aesenc_si128(B, K10);
            B = _mm_aesenc_si128(B, K11);
            B = _mm_aesenc_si128(B, K12);
            B = _mm_aesenc_si128(B, K13);
            B = _mm_aesenclast_si128(B, K14);
            
            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * AES-256 Decryption
    *
    * Decrypts `blocks` consecutive 16-byte blocks from `input` into `output`
    * using the decryption schedule m_DK.
    * The key schedule must have been run before calling this.
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // Mirror of the check in encryptN: keySchedule() sizes m_DK to 60 words.
        assert(m_DK.length >= 60);

        // The input is only ever read, so keep it const.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);
        
        const(__m128i*) key_mm = cast(const(__m128i*))(m_DK.ptr);
        
        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);
        __m128i K13 = _mm_loadu_si128(key_mm + 13);
        __m128i K14 = _mm_loadu_si128(key_mm + 14);
        
        // Main loop: four independent blocks per iteration.
        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);
            
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);
            
            mixin(AES_DEC_4_ROUNDS!(K1));
            mixin(AES_DEC_4_ROUNDS!(K2));
            mixin(AES_DEC_4_ROUNDS!(K3));
            mixin(AES_DEC_4_ROUNDS!(K4));
            mixin(AES_DEC_4_ROUNDS!(K5));
            mixin(AES_DEC_4_ROUNDS!(K6));
            mixin(AES_DEC_4_ROUNDS!(K7));
            mixin(AES_DEC_4_ROUNDS!(K8));
            mixin(AES_DEC_4_ROUNDS!(K9));
            mixin(AES_DEC_4_ROUNDS!(K10));
            mixin(AES_DEC_4_ROUNDS!(K11));
            mixin(AES_DEC_4_ROUNDS!(K12));
            mixin(AES_DEC_4_ROUNDS!(K13));
            mixin(AES_DEC_4_LAST_ROUNDS!(K14));
            
            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);
            
            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }
        
        // Tail: the remaining 0..3 blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            
            B = _mm_xor_si128(B, K0);
            
            B = _mm_aesdec_si128(B, K1);
            B = _mm_aesdec_si128(B, K2);
            B = _mm_aesdec_si128(B, K3);
            B = _mm_aesdec_si128(B, K4);
            B = _mm_aesdec_si128(B, K5);
            B = _mm_aesdec_si128(B, K6);
            B = _mm_aesdec_si128(B, K7);
            B = _mm_aesdec_si128(B, K8);
            B = _mm_aesdec_si128(B, K9);
            B = _mm_aesdec_si128(B, K10);
            B = _mm_aesdec_si128(B, K11);
            B = _mm_aesdec_si128(B, K12);
            B = _mm_aesdec_si128(B, K13);
            B = _mm_aesdeclast_si128(B, K14);
            
            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * Clear memory of sensitive data
    */
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-256"; }
    override BlockCipher clone() const { return new AES256NI; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-256 Key Schedule
    *
    * Expands the 32-byte key at `key` into 15 round keys stored in m_EK and
    * derives the matching decryption schedule in m_DK. The length argument
    * is not used here.
    */
    override void keySchedule(const(ubyte)* key, size_t)
    {
        m_EK.resize(60);
        m_DK.resize(60);
        
        // The first two round keys are the two halves of the raw key.
        __m128i K0 = _mm_loadu_si128(cast(const(__m128i*))(key));
        __m128i K1 = _mm_loadu_si128(cast(const(__m128i*))(key + 16));
        
        // Even-numbered keys use the AES-128 style step with a round constant;
        // odd-numbered keys use the AES-256 specific second-half step.
        __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128!0x01(K1));
        __m128i K3 = aes_256_key_expansion(K1, K2);
        
        __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128!0x02(K3));
        __m128i K5 = aes_256_key_expansion(K3, K4);
        
        __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128!0x04(K5));
        __m128i K7 = aes_256_key_expansion(K5, K6);
        
        __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128!0x08(K7));
        __m128i K9 = aes_256_key_expansion(K7, K8);
        
        __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128!0x10(K9));
        __m128i K11 = aes_256_key_expansion(K9, K10);
        
        __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128!0x20(K11));
        __m128i K13 = aes_256_key_expansion(K11, K12);
        
        __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128!0x40(K13));
        
        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        _mm_storeu_si128(EK_mm      , K0);
        _mm_storeu_si128(EK_mm +  1, K1);
        _mm_storeu_si128(EK_mm +  2, K2);
        _mm_storeu_si128(EK_mm +  3, K3);
        _mm_storeu_si128(EK_mm +  4, K4);
        _mm_storeu_si128(EK_mm +  5, K5);
        _mm_storeu_si128(EK_mm +  6, K6);
        _mm_storeu_si128(EK_mm +  7, K7);
        _mm_storeu_si128(EK_mm +  8, K8);
        _mm_storeu_si128(EK_mm +  9, K9);
        _mm_storeu_si128(EK_mm + 10, K10);
        _mm_storeu_si128(EK_mm + 11, K11);
        _mm_storeu_si128(EK_mm + 12, K12);
        _mm_storeu_si128(EK_mm + 13, K13);
        _mm_storeu_si128(EK_mm + 14, K14);
        
        // Now generate decryption keys: reversed order, inner keys through AESIMC
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);
        _mm_storeu_si128(DK_mm      , K14);
        _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
        _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
        _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
        _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
        _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
        _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
        _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
        _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
        _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
        _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
        _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
        _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
        _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
        _mm_storeu_si128(DK_mm + 14, K0);
    }


    // Expanded encryption / decryption round keys (60 words each once keyed).
    SecureVector!uint m_EK, m_DK;
}
751 
/*
* One AES-128 style key schedule step: derives the next round key from
* `key` and the _mm_aeskeygenassist_si128 output `key_with_rcon`.
*/
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
{
    // Replicate 32-bit element 3 of the keygen-assist result into every lane.
    __m128i assist_word = _mm_shuffle_epi32!(_MM_SHUFFLE(3,3,3,3))(key_with_rcon);

    // Three shift-by-one-word/xor passes fold each word into all higher words.
    __m128i folded = key;
    foreach (pass; 0 .. 3)
        folded = _mm_xor_si128(folded, _mm_slli_si128!4(folded));

    return _mm_xor_si128(folded, assist_word);
}
760 
/*
* One AES-192 key schedule step.
*
* Updates *K1 in place and stores it as four words at `output`. Unless `last`
* is set, also updates *K2 and writes its two low words to output[4] and
* output[5].
*/
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint* output, bool last)
{
    // Snapshot both inputs before anything is written back.
    __m128i lo = *K1;
    __m128i hi = *K2;

    // Replicate 32-bit element 1 of the keygen-assist result into every lane.
    __m128i assist_word = _mm_shuffle_epi32!(_MM_SHUFFLE(1,1,1,1))(key2_with_rcon);

    // Three shift-by-one-word/xor passes fold each word into all higher words.
    foreach (pass; 0 .. 3)
        lo = _mm_xor_si128(lo, _mm_slli_si128!4(lo));
    lo = _mm_xor_si128(lo, assist_word);

    *K1 = lo;
    _mm_storeu_si128(cast(__m128i*)(output), lo);

    if (!last)
    {
        // Second part: one fold pass, then mix in the top word of the new K1.
        hi = _mm_xor_si128(hi, _mm_slli_si128!4(hi));
        hi = _mm_xor_si128(hi, _mm_shuffle_epi32!(_MM_SHUFFLE(3,3,3,3))(lo));

        *K2 = hi;
        output[4] = _mm_cvtsi128_si32(hi);
        output[5] = _mm_cvtsi128_si32(_mm_srli_si128!4(hi));
    }
}
786 
/*
* The second half of the AES-256 key expansion (other half same as AES-128).
*
* Derives the next odd-numbered round key from `key`, using `key2` (the
* round key generated just before) as input to the keygen-assist step.
*/
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
{
    // Keygen-assist with a zero round constant, then replicate 32-bit
    // element 2 of the result into every lane.
    __m128i assist_word = _mm_shuffle_epi32!(_MM_SHUFFLE(2,2,2,2))(
        _mm_aeskeygenassist_si128!0x00(key2));

    // Three shift-by-one-word/xor passes fold each word into all higher words.
    __m128i folded = key;
    foreach (pass; 0 .. 3)
        folded = _mm_xor_si128(folded, _mm_slli_si128!4(folded));

    return _mm_xor_si128(folded, assist_word);
}
800 
/*
* Mixin text applying one _mm_aesenc_si128 round with round key K to the
* four locals B0..B3 of the enclosing scope. K is spliced in by identifier.
*/
enum string AES_ENC_4_ROUNDS(alias K) = format(q{
    B0 = _mm_aesenc_si128(B0, %1$s);
    B1 = _mm_aesenc_si128(B1, %1$s);
    B2 = _mm_aesenc_si128(B2, %1$s);
    B3 = _mm_aesenc_si128(B3, %1$s);
}, __traits(identifier, K));
807 
/*
* Mixin text applying the final _mm_aesenclast_si128 round with round key K
* to the four locals B0..B3 of the enclosing scope.
*/
enum string AES_ENC_4_LAST_ROUNDS(alias K) = format(q{
    B0 = _mm_aesenclast_si128(B0, %1$s);
    B1 = _mm_aesenclast_si128(B1, %1$s);
    B2 = _mm_aesenclast_si128(B2, %1$s);
    B3 = _mm_aesenclast_si128(B3, %1$s);
}, __traits(identifier, K));
814 
/*
* Mixin text applying one _mm_aesdec_si128 round with round key K to the
* four locals B0..B3 of the enclosing scope. K is spliced in by identifier.
*/
enum string AES_DEC_4_ROUNDS(alias K) = format(q{
    B0 = _mm_aesdec_si128(B0, %1$s);
    B1 = _mm_aesdec_si128(B1, %1$s);
    B2 = _mm_aesdec_si128(B2, %1$s);
    B3 = _mm_aesdec_si128(B3, %1$s);
}, __traits(identifier, K));
821 
/*
* Mixin text applying the final _mm_aesdeclast_si128 round with round key K
* to the four locals B0..B3 of the enclosing scope.
*/
enum string AES_DEC_4_LAST_ROUNDS(alias K) = format(q{
    B0 = _mm_aesdeclast_si128(B0, %1$s);
    B1 = _mm_aesdeclast_si128(B1, %1$s);
    B2 = _mm_aesdeclast_si128(B2, %1$s);
    B3 = _mm_aesdeclast_si128(B3, %1$s);
}, __traits(identifier, K));
828 
/*
* Mixin text for one AES-128 key schedule step: a call to
* aes_128_key_expansion on the variable named K, with the round constant
* RCON passed as the template argument of _mm_aeskeygenassist_si128.
* The text ends with `;` so it can follow a `__m128i Kn = ` prefix.
*/
enum string AES_128_key_exp(string K, ubyte RCON) =
    `aes_128_key_expansion(` ~ K
    ~ `, _mm_aeskeygenassist_si128!` ~ to!string(RCON)
    ~ `(` ~ K ~ `));`;
831 
/*
* Mixin text for one AES-192 key schedule step. Expects locals K0 and K1 and
* the member m_EK in the enclosing scope. RCON is the round constant and
* EK_OFF the word offset in m_EK where the step writes. The step at offset
* 48 is passed `last == true`.
*/
enum string AES_192_key_exp(ubyte RCON, size_t EK_OFF) = 
    `aes_192_key_expansion(&K0, &K1, 
                                  _mm_aeskeygenassist_si128! ` ~ to!string(RCON) ~ `(K1),
                                  &m_EK[` ~ EK_OFF.stringof ~ `], ` ~ EK_OFF.stringof ~ ` == 48);`;