1 /**
2 * AES using SSSE3
3 * 
4 * Copyright:
5 * (C) 2010 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.block.aes_ssse3;
12 
13 import botan.constants;
14 static if (BOTAN_HAS_AES_SSSE3):
15 
16 import std.range : iota;
17 import botan.block.block_cipher;
18 import botan.utils.types;
19 import botan.utils.mem_ops;
20 import botan.utils.simd.tmmintrin;
21 
22 
/**
* AES-128 block cipher implemented with SSSE3 intrinsics.
*
* Data is processed 16 bytes (one `__m128i`) at a time, running 10 rounds
* over separately stored encryption (`m_EK`) and decryption (`m_DK`)
* round-key tables of 11 entries each.
*/
final class AES128_SSSE3 : BlockCipherFixedParams!(16, 16), BlockCipher, SymmetricAlgorithm
{
public:
    /**
    * Encrypts `blocks` consecutive 16-byte blocks.
    *
    * Params:
    *  input = source bytes, read as `blocks` 128-bit values
    *  output = destination bytes, written as `blocks` 128-bit values
    *  blocks = number of 16-byte blocks to process
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) src = cast(const(__m128i*)) input;
        __m128i* dst = cast(__m128i*) output;
        const(__m128i*) round_keys = cast(const(__m128i*)) m_EK.ptr;

        for (size_t n = 0; n < blocks; ++n)
        {
            __m128i state = _mm_loadu_si128(src + n);
            state = aes_ssse3_encrypt(state, round_keys, 10);
            _mm_storeu_si128(dst + n, state);
        }
    }

    /**
    * Decrypts `blocks` consecutive 16-byte blocks.
    *
    * Params:
    *  input = source bytes, read as `blocks` 128-bit values
    *  output = destination bytes, written as `blocks` 128-bit values
    *  blocks = number of 16-byte blocks to process
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) src = cast(const(__m128i*)) input;
        __m128i* dst = cast(__m128i*) output;
        const(__m128i*) round_keys = cast(const(__m128i*)) m_DK.ptr;

        for (size_t n = 0; n < blocks; ++n)
        {
            __m128i state = _mm_loadu_si128(src + n);
            state = aes_ssse3_decrypt(state, round_keys, 10);
            _mm_storeu_si128(dst + n, state);
        }
    }

    /// Passes both round-key tables to `zap`.
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    /// Returns: the algorithm name, "AES-128".
    @property string name() const
    {
        return "AES-128";
    }

    /// Returns: 1.
    override @property size_t parallelism() const
    {
        return 1;
    }

    /// Returns: a newly constructed `AES128_SSSE3`; the key tables are not copied.
    override BlockCipher clone() const
    {
        return new AES128_SSSE3;
    }

    /// Forwards to the base-class `blockSize`.
    override size_t blockSize() const
    {
        return super.blockSize();
    }

    /// Forwards to the base-class `keySpec`.
    override KeyLengthSpecification keySpec() const
    {
        return super.keySpec();
    }

protected:

    /**
    * Expands a key into 11 encryption round keys (`m_EK`) and 11 decryption
    * round keys (`m_DK`).
    *
    * Params:
    *  keyb = pointer to the key; 16 bytes are read from it
    *
    * The second (unnamed) parameter is ignored.
    */
    override void keySchedule(const(ubyte)* keyb, size_t)
    {
        __m128i round_const = _mm_set_epi32!(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6)();

        __m128i cur = _mm_loadu_si128(cast(const(__m128i*)) keyb);

        // 11 round keys of four 32-bit words each, per direction.
        m_EK.resize(11*4);
        m_DK.resize(11*4);
        __m128i* enc_keys = cast(__m128i*) m_EK.ptr;
        __m128i* dec_keys = cast(__m128i*) m_DK.ptr;

        // m_DK is filled from the back: its last slot gets the byte-permuted raw key.
        _mm_storeu_si128(dec_keys + 10, _mm_shuffle_epi8(cur, sr[2]));

        cur = aes_schedule_transform(cur, k_ipt1, k_ipt2);
        _mm_storeu_si128(enc_keys, cur);

        for (size_t rnd = 1; rnd != 10; ++rnd)
        {
            cur = aes_schedule_round(&round_const, cur, cur);

            _mm_storeu_si128(enc_keys + rnd, aes_schedule_mangle(cur, (12 - rnd) % 4));
            _mm_storeu_si128(dec_keys + (10 - rnd), aes_schedule_mangle_dec(cur, (10 - rnd) % 4));
        }

        // The final round key uses the dedicated "last" mangling in both tables.
        cur = aes_schedule_round(&round_const, cur, cur);
        _mm_storeu_si128(enc_keys + 10, aes_schedule_mangle_last(cur, 2));
        _mm_storeu_si128(dec_keys, aes_schedule_mangle_last_dec(cur));
    }

    SecureVector!uint m_EK, m_DK;
}
113 
/**
* AES-192 block cipher implemented with SSSE3 intrinsics.
*
* Data is processed 16 bytes (one `__m128i`) at a time, running 12 rounds
* over separately stored encryption (`m_EK`) and decryption (`m_DK`)
* round-key tables of 13 entries each.
*/
final class AES192_SSSE3 : BlockCipherFixedParams!(16, 24), BlockCipher, SymmetricAlgorithm
{
public:
    /**
    * Encrypts `blocks` consecutive 16-byte blocks.
    *
    * Params:
    *  input = source bytes, read as `blocks` 128-bit values
    *  output = destination bytes, written as `blocks` 128-bit values
    *  blocks = number of 16-byte blocks to process
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) src = cast(const(__m128i*)) input;
        __m128i* dst = cast(__m128i*) output;
        const(__m128i*) round_keys = cast(const(__m128i*)) m_EK.ptr;

        for (size_t n = 0; n < blocks; ++n)
        {
            __m128i state = _mm_loadu_si128(src + n);
            state = aes_ssse3_encrypt(state, round_keys, 12);
            _mm_storeu_si128(dst + n, state);
        }
    }

    /**
    * Decrypts `blocks` consecutive 16-byte blocks.
    *
    * Params:
    *  input = source bytes, read as `blocks` 128-bit values
    *  output = destination bytes, written as `blocks` 128-bit values
    *  blocks = number of 16-byte blocks to process
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) src = cast(const(__m128i*)) input;
        __m128i* dst = cast(__m128i*) output;
        const(__m128i*) round_keys = cast(const(__m128i*)) m_DK.ptr;

        for (size_t n = 0; n < blocks; ++n)
        {
            __m128i state = _mm_loadu_si128(src + n);
            state = aes_ssse3_decrypt(state, round_keys, 12);
            _mm_storeu_si128(dst + n, state);
        }
    }

    /// Passes both round-key tables to `zap`.
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    /// Returns: the algorithm name, "AES-192".
    @property string name() const
    {
        return "AES-192";
    }

    /// Returns: 1.
    override @property size_t parallelism() const
    {
        return 1;
    }

    /// Returns: a newly constructed `AES192_SSSE3`; the key tables are not copied.
    override BlockCipher clone() const
    {
        return new AES192_SSSE3;
    }

    /// Forwards to the base-class `blockSize`.
    override size_t blockSize() const
    {
        return super.blockSize();
    }

    /// Forwards to the base-class `keySpec`.
    override KeyLengthSpecification keySpec() const
    {
        return super.keySpec();
    }

protected:
    /**
    * Expands a key into 13 encryption round keys (`m_EK`) and 13 decryption
    * round keys (`m_DK`).
    *
    * Params:
    *  keyb = pointer to the key; bytes 0..23 are read (as two overlapping
    *         16-byte loads at offsets 0 and 8)
    *  len = not used by this implementation
    */
    override void keySchedule(const(ubyte)* keyb, size_t len)
    {
        __m128i round_const = _mm_set_epi32!(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6)();

        // 13 round keys of four 32-bit words each, per direction.
        m_EK.resize(13*4);
        m_DK.resize(13*4);

        __m128i* enc_keys = cast(__m128i*) m_EK.ptr;
        __m128i* dec_keys = cast(__m128i*) m_DK.ptr;

        __m128i key1 = _mm_loadu_si128(cast(const(__m128i*)) keyb);
        __m128i key2 = _mm_loadu_si128(cast(const(__m128i*)) (keyb + 8));

        // m_DK is filled from the back: its last slot gets the byte-permuted raw key.
        _mm_storeu_si128(dec_keys + 12, _mm_shuffle_epi8(key1, sr[0]));

        key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
        key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

        _mm_storeu_si128(enc_keys + 0, key1);

        // key2 with its low 8 bytes cleared (byte shift right by 8, then left by 8)
        __m128i t = _mm_slli_si128!8(_mm_srli_si128!8(key2));

        // Each iteration emits three round keys per table.
        for (size_t i = 0; i != 4; ++i)
        {
            const size_t base = 3 * i;

            key2 = aes_schedule_round(&round_const, key2, key1);

            // The same spliced value feeds both the encryption and decryption table.
            __m128i spliced = _mm_alignr_epi8!8(key2, t);
            _mm_storeu_si128(enc_keys + (base + 1), aes_schedule_mangle(spliced, (i+3)%4));
            _mm_storeu_si128(dec_keys + (11 - base), aes_schedule_mangle_dec(spliced, (i+3)%4));

            t = aes_schedule_192_smear(key2, t);

            _mm_storeu_si128(enc_keys + (base + 2), aes_schedule_mangle(t, (i+2)%4));
            _mm_storeu_si128(dec_keys + (10 - base), aes_schedule_mangle_dec(t, (i+2)%4));

            key2 = aes_schedule_round(&round_const, t, key2);

            if (i != 3)
            {
                _mm_storeu_si128(enc_keys + (base + 3), aes_schedule_mangle(key2, (i+1)%4));
                _mm_storeu_si128(dec_keys + (9 - base), aes_schedule_mangle_dec(key2, (i+1)%4));
            }
            else
            {
                // Last round key overall: uses the dedicated "last" mangling.
                _mm_storeu_si128(enc_keys + (base + 3), aes_schedule_mangle_last(key2, (i+1)%4));
                _mm_storeu_si128(dec_keys + (9 - base), aes_schedule_mangle_last_dec(key2));
            }

            key1 = key2;
            key2 = aes_schedule_192_smear(key2, _mm_slli_si128!8(_mm_srli_si128!8(t)));
            t = _mm_slli_si128!8(_mm_srli_si128!8(key2));
        }
    }

    SecureVector!uint m_EK, m_DK;
}
225 
/**
* AES-256 block cipher implemented with SSSE3 intrinsics.
*
* Data is processed 16 bytes (one `__m128i`) at a time, running 14 rounds
* over separately stored encryption (`m_EK`) and decryption (`m_DK`)
* round-key tables of 15 entries each.
*/
final class AES256_SSSE3 : BlockCipherFixedParams!(16, 32), BlockCipher, SymmetricAlgorithm
{
public:
    /**
    * Encrypts `blocks` consecutive 16-byte blocks.
    *
    * Params:
    *  input = source bytes, read as `blocks` 128-bit values
    *  output = destination bytes, written as `blocks` 128-bit values
    *  blocks = number of 16-byte blocks to process
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) src = cast(const(__m128i*)) input;
        __m128i* dst = cast(__m128i*) output;
        const(__m128i*) round_keys = cast(const(__m128i*)) m_EK.ptr;

        for (size_t n = 0; n < blocks; ++n)
        {
            __m128i state = _mm_loadu_si128(src + n);
            state = aes_ssse3_encrypt(state, round_keys, 14);
            _mm_storeu_si128(dst + n, state);
        }
    }

    /**
    * Decrypts `blocks` consecutive 16-byte blocks.
    *
    * Params:
    *  input = source bytes, read as `blocks` 128-bit values
    *  output = destination bytes, written as `blocks` 128-bit values
    *  blocks = number of 16-byte blocks to process
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) src = cast(const(__m128i*)) input;
        __m128i* dst = cast(__m128i*) output;
        const(__m128i*) round_keys = cast(const(__m128i*)) m_DK.ptr;

        for (size_t n = 0; n < blocks; ++n)
        {
            __m128i state = _mm_loadu_si128(src + n);
            state = aes_ssse3_decrypt(state, round_keys, 14);
            _mm_storeu_si128(dst + n, state);
        }
    }

    /// Passes both round-key tables to `zap`.
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    /// Returns: the algorithm name, "AES-256".
    @property string name() const
    {
        return "AES-256";
    }

    /// Returns: 1.
    override @property size_t parallelism() const
    {
        return 1;
    }

    /// Returns: a newly constructed `AES256_SSSE3`; the key tables are not copied.
    override BlockCipher clone() const
    {
        return new AES256_SSSE3;
    }

    /// Forwards to the base-class `blockSize`.
    override size_t blockSize() const
    {
        return super.blockSize();
    }

    /// Forwards to the base-class `keySpec`.
    override KeyLengthSpecification keySpec() const
    {
        return super.keySpec();
    }

protected:
    /**
    * Expands a key into 15 encryption round keys (`m_EK`) and 15 decryption
    * round keys (`m_DK`).
    *
    * Params:
    *  keyb = pointer to the key; 32 bytes are read from it (two 16-byte loads)
    *
    * The second (unnamed) parameter is ignored.
    */
    override void keySchedule(const(ubyte)* keyb, size_t)
    {
        __m128i round_const = _mm_set_epi32!(0x702A9808, 0x4D7C7D81,
                                            0x1F8391B9, 0xAF9DEEB6)();

        // 15 round keys of four 32-bit words each, per direction.
        m_EK.resize(15*4);
        m_DK.resize(15*4);

        __m128i* enc_keys = cast(__m128i*) m_EK.ptr;
        __m128i* dec_keys = cast(__m128i*) m_DK.ptr;

        __m128i key1 = _mm_loadu_si128(cast(const(__m128i*)) keyb);
        __m128i key2 = _mm_loadu_si128(cast(const(__m128i*)) (keyb + 16));

        // m_DK is filled from the back: its last slot gets the byte-permuted raw key.
        _mm_storeu_si128(dec_keys + 14, _mm_shuffle_epi8(key1, sr[2]));

        key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
        key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

        _mm_storeu_si128(enc_keys + 0, key1);
        _mm_storeu_si128(enc_keys + 1, aes_schedule_mangle(key2, 3));

        _mm_storeu_si128(dec_keys + 13, aes_schedule_mangle_dec(key2, 1));

        // Two round keys per iteration: one using the round constant, one without.
        for (size_t i = 2; i != 14; i += 2)
        {
            __m128i prev_key2 = key2;

            key2 = aes_schedule_round(&round_const, key2, key1);
            key1 = key2;

            _mm_storeu_si128(enc_keys + i, aes_schedule_mangle(key2, i % 4));
            _mm_storeu_si128(dec_keys + (14 - i), aes_schedule_mangle_dec(key2, (i+2) % 4));

            // Null round-constant pointer: aes_schedule_round skips its rcon/rotate step.
            __m128i top_word = _mm_shuffle_epi32!0xFF(key2);
            key2 = aes_schedule_round(cast(__m128i*)null, top_word, prev_key2);

            _mm_storeu_si128(enc_keys + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
            _mm_storeu_si128(dec_keys + (13 - i), aes_schedule_mangle_dec(key2, (i+1) % 4));
        }

        // The final round key uses the dedicated "last" mangling in both tables.
        key2 = aes_schedule_round(&round_const, key2, key1);

        _mm_storeu_si128(enc_keys + 14, aes_schedule_mangle_last(key2, 2));
        _mm_storeu_si128(dec_keys + 0, aes_schedule_mangle_last_dec(key2));
    }

    SecureVector!uint m_EK, m_DK;
}
327 
// Module-level lookup tables used by the key schedule and the round
// functions in this module. They are immutable, so their values are
// assigned once, in the shared module constructor below.

// Per-byte mask 0x0F, used to split bytes into nibbles.
__gshared immutable __m128i low_nibs;

// Table pair applied by aes_schedule_transform to incoming key/plaintext data.
__gshared immutable __m128i k_ipt1 ;
__gshared immutable __m128i k_ipt2;

// Table pair used by the nibble-wise lookup sequence shared by the round functions.
__gshared immutable __m128i k_inv1;
__gshared immutable __m128i k_inv2;

// Output table pair used by aes_schedule_round and the encryption rounds.
__gshared immutable __m128i sb1u;
__gshared immutable __m128i sb1t;

// Byte-shuffle masks indexed by (round % 4).
__gshared immutable(__m128i)[4] mc_forward;

// Byte-shuffle masks indexed by a round-dependent value in 0..3.
__gshared immutable(__m128i)[4] sr;

/// Fills in the immutable lookup tables declared above and logs the load.
shared static this()
{
    logTrace("Loading AES SSSE3 ...");

    low_nibs = _mm_set1_epi8!(0x0F)();

    k_ipt1 = _mm_set_epi32!(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000)();
    k_ipt2 = _mm_set_epi32!(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00)();

    k_inv1 = _mm_set_epi32!(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180)();
    k_inv2 = _mm_set_epi32!(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780)();

    sb1u = _mm_set_epi32!(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00)();
    sb1t = _mm_set_epi32!(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300)();

    mc_forward = [
        _mm_set_epi32!(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201)(),
        _mm_set_epi32!(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605)(),
        _mm_set_epi32!(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09)(),
        _mm_set_epi32!(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)()];

    // Built in a mutable temporary first, then copied into the immutable table.
    __m128i[4] shift_masks = [
        _mm_set_epi32!(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)(),
        _mm_set_epi32!(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500)(),
        _mm_set_epi32!(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900)(),
        _mm_set_epi32!(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00)()];
    sr = shift_masks;
}
365 
366 package:
367 
/**
* Applies a pair of lookup tables to `input`, one per nibble.
*
* Every byte of `input` is split into its low nibble and its high nibble
* (moved down by four bits). The low nibbles select from `table_1`, the
* high nibbles select from `table_2` (both through `_mm_shuffle_epi8`),
* and the two lookups are XORed together.
*
* Params:
*  input = value to transform
*  table_1 = table indexed by the low nibble of each byte
*  table_2 = table indexed by the high nibble of each byte
*
* Returns: the XOR of the two table lookups.
*/
__m128i aes_schedule_transform(__m128i input,
                               __m128i table_1,
                               __m128i table_2)
{
    __m128i lo_nibbles = _mm_and_si128(low_nibs, input);

    // andnot keeps only the high nibbles, so the 32-bit lane shift cannot
    // drag bits from a neighbouring byte into the result.
    __m128i hi_nibbles = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, input));

    return _mm_xor_si128(_mm_shuffle_epi8(table_1, lo_nibbles),
                         _mm_shuffle_epi8(table_2, hi_nibbles));
}
380     
/**
* Converts a schedule value into the form stored in the encryption
* round-key table.
*
* The key is XORed with 0x5B in every byte, shuffled by `mc_forward[0]`
* one, two and three times, the three results are XORed together, and the
* combination is finally shuffled by `sr[round_no % 4]`.
*
* Params:
*  k = schedule value to mangle
*  round_no = selects the final shuffle mask (taken modulo 4)
*
* Returns: the mangled round key.
*/
__m128i aes_schedule_mangle(__m128i k, ubyte round_no)
{
    __m128i biased = _mm_xor_si128(k, _mm_set1_epi8!(0x5B)());

    __m128i once = _mm_shuffle_epi8(biased, mc_forward[0]);
    __m128i twice = _mm_shuffle_epi8(once, mc_forward[0]);
    __m128i thrice = _mm_shuffle_epi8(twice, mc_forward[0]);

    __m128i combined = _mm_xor_si128(once, _mm_xor_si128(twice, thrice));

    return _mm_shuffle_epi8(combined, sr[round_no % 4]);
}
393 
/**
* Helper for the 192-bit key schedule: XORs `y` with a 32-bit-lane shuffle
* of `x` (selector 0xFE) and a 32-bit-lane shuffle of `y` (selector 0x80).
*
* Params:
*  x = first schedule value
*  y = second schedule value
*
* Returns: `y ^ shuffle32!0xFE(x) ^ shuffle32!0x80(y)`.
*/
__m128i aes_schedule_192_smear(__m128i x, __m128i y)
{
    __m128i from_x = _mm_shuffle_epi32!0xFE(x);
    __m128i from_y = _mm_shuffle_epi32!0x80(y);

    return _mm_xor_si128(y, _mm_xor_si128(from_x, from_y));
}
399 
/**
* Converts a schedule value into the form stored in the decryption
* round-key table.
*
* Four table pairs from `dsk` are applied, each to the ORIGINAL key `k`
* (not to the result of the previous pair). After each pair the running
* result is XORed in and shuffled by `mc_forward[0]`. The combination is
* finally shuffled by `sr[round_no % 4]`.
*
* Params:
*  k = schedule value to mangle
*  round_no = selects the final shuffle mask (taken modulo 4)
*
* Returns: the mangled decryption round key.
*/
__m128i aes_schedule_mangle_dec(__m128i k, ubyte round_no)
{
    immutable(__m128i)[8] dsk = [
            _mm_set_epi32!(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700)(),
            _mm_set_epi32!(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300)(),
            _mm_set_epi32!(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400)(),
            _mm_set_epi32!(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00)(),
            _mm_set_epi32!(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700)(),
            _mm_set_epi32!(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700)(),
            _mm_set_epi32!(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000)(),
            _mm_set_epi32!(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)()
    ];

    // Starting from zero makes the first pass "shuffle(t)" exactly, because
    // XOR with zero is the identity.
    __m128i output = _mm_setzero_si128();

    foreach (size_t pair; 0 .. 4)
    {
        // Every pass looks up the nibbles of the key itself. Feeding the
        // previous pass's result back in here would chain the tables and
        // yield wrong decryption round keys.
        __m128i t = aes_schedule_transform(k, dsk[2*pair], dsk[2*pair + 1]);
        output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
    }

    return _mm_shuffle_epi8(output, sr[round_no % 4]);
}
427 
/**
* Produces the final entry of the encryption round-key table.
*
* The key is shuffled by `sr[round_no % 4]`, XORed with 0x5B in every
* byte, and passed through `aes_schedule_transform` with a dedicated
* output table pair.
*
* Params:
*  k = schedule value for the last round
*  round_no = selects the shuffle mask (taken modulo 4)
*
* Returns: the last encryption round key.
*/
__m128i aes_schedule_mangle_last(__m128i k, ubyte round_no)
{
    immutable(__m128i) out_tr1 = _mm_set_epi32!(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000)();
    immutable(__m128i) out_tr2 = _mm_set_epi32!(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00)();

    __m128i permuted = _mm_shuffle_epi8(k, sr[round_no % 4]);
    __m128i biased = _mm_xor_si128(permuted, _mm_set1_epi8!(0x5B)());

    return aes_schedule_transform(biased, out_tr1, out_tr2);
}
437 
/**
* Produces the first entry of the decryption round-key table from the
* last schedule value.
*
* The key is XORed with 0x5B in every byte and passed through
* `aes_schedule_transform` with a dedicated table pair.
*
* Params:
*  k = schedule value for the last round
*
* Returns: the transformed round key.
*/
__m128i aes_schedule_mangle_last_dec(__m128i k)
{
    immutable(__m128i) deskew1 = _mm_set_epi32!(0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300)();
    immutable(__m128i) deskew2 = _mm_set_epi32!(0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900)();

    __m128i biased = _mm_xor_si128(k, _mm_set1_epi8!(0x5B)());

    return aes_schedule_transform(biased, deskew1, deskew2);
}
446 
/**
* Performs one step of the key schedule.
*
* When `rcon` is non-null, the current round constant is mixed into
* `input2`, `*rcon` is rotated to the next constant, and `input1` is
* replaced by its top 32-bit word broadcast and rotated by one byte.
* When `rcon` is null that whole step is skipped.
*
* Afterwards `input1` goes through the nibble-wise table lookups
* (`k_inv1`, `k_inv2`, `sb1u`, `sb1t`) and the result is XORed with a
* "smeared" form of `input2`.
*
* Params:
*  rcon = in/out pointer to the round-constant state, or null
*  input1 = value fed through the table lookups
*  input2 = value that is smeared and XORed into the result
*
* Returns: the next schedule value.
*/
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
{
    if (rcon !is null)
    {
        __m128i current_rcon = *rcon;

        input2 = _mm_xor_si128(_mm_alignr_epi8!15(_mm_setzero_si128(), current_rcon), input2);
        *rcon = _mm_alignr_epi8!15(current_rcon, current_rcon); // next rcon

        // rotate: broadcast the top word, then rotate the register by one byte
        input1 = _mm_shuffle_epi32!0xFF(input1);
        input1 = _mm_alignr_epi8!1(input1, input1);
    }

    // Smear input2 across the lanes and fold in the 0x5B byte constant.
    __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128!4(input2));
    __m128i bias = _mm_set1_epi8!(0x5B)();
    smeared = _mm_xor_si128(smeared, _mm_xor_si128(_mm_slli_si128!8(smeared), bias));

    // Split every byte of input1 into its high and low nibble.
    __m128i hi = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, input1));
    __m128i lo = _mm_and_si128(low_nibs, input1);

    __m128i inv_lo = _mm_shuffle_epi8(k_inv2, lo);
    __m128i mixed = _mm_xor_si128(lo, hi);

    __m128i via_hi = _mm_xor_si128(inv_lo, _mm_shuffle_epi8(k_inv1, hi));
    __m128i via_mixed = _mm_xor_si128(inv_lo, _mm_shuffle_epi8(k_inv1, mixed));

    __m128i index_u = _mm_xor_si128(mixed, _mm_shuffle_epi8(k_inv1, via_hi));
    __m128i index_t = _mm_xor_si128(hi, _mm_shuffle_epi8(k_inv1, via_mixed));

    __m128i from_u = _mm_shuffle_epi8(sb1u, index_u);
    __m128i from_t = _mm_shuffle_epi8(sb1t, index_t);

    return _mm_xor_si128(from_u, _mm_xor_si128(from_t, smeared));
}
481 
/**
* Encrypts one 128-bit block.
*
* Params:
*  B = input block
*  keys = pointer to the round-key table; entries 0 .. `rounds` are read
*  rounds = number of rounds; the callers in this module pass 10, 12 or 14
*
* Returns: the encrypted block.
*/
__m128i aes_ssse3_encrypt(__m128i B, const(__m128i*) keys, size_t rounds)
{
    immutable(__m128i) sb2u = _mm_set_epi32!(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400)();
    immutable(__m128i) sb2t = _mm_set_epi32!(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900)();

    immutable(__m128i) sbou = _mm_set_epi32!(0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700)();
    immutable(__m128i) sbot = _mm_set_epi32!(0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00)();

    immutable(__m128i)[4] mc_backward = [
            _mm_set_epi32!(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003)(),
            _mm_set_epi32!(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F)(),
            _mm_set_epi32!(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B)(),
            _mm_set_epi32!(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407)(),
    ];

    // Input step: per-nibble table lookup of the block, XORed with round key 0.
    {
        __m128i lo_lookup = _mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B));
        __m128i hi_lookup = _mm_shuffle_epi8(k_ipt2, _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B)));
        B = _mm_xor_si128(lo_lookup, _mm_xor_si128(hi_lookup, _mm_loadu_si128(keys)));
    }

    size_t r = 1;
    while (true)
    {
        const(__m128i) round_key = _mm_loadu_si128(keys + r);

        // Split every byte of the state into its high and low nibble.
        __m128i hi = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B));
        __m128i lo = _mm_and_si128(low_nibs, B);

        __m128i inv_lo = _mm_shuffle_epi8(k_inv2, lo);
        __m128i mixed = _mm_xor_si128(lo, hi);

        __m128i via_hi = _mm_xor_si128(inv_lo, _mm_shuffle_epi8(k_inv1, hi));
        __m128i via_mixed = _mm_xor_si128(inv_lo, _mm_shuffle_epi8(k_inv1, mixed));

        __m128i index_u = _mm_xor_si128(mixed, _mm_shuffle_epi8(k_inv1, via_hi));
        __m128i index_t = _mm_xor_si128(hi, _mm_shuffle_epi8(k_inv1, via_mixed));

        if (r == rounds)
        {
            // Final round: output tables plus round key, then the sr byte shuffle.
            __m128i last = _mm_xor_si128(_mm_shuffle_epi8(sbou, index_u),
                                         _mm_xor_si128(_mm_shuffle_epi8(sbot, index_t), round_key));
            return _mm_shuffle_epi8(last, sr[r % 4]);
        }

        // Regular round: two table pairs combined through the mc_* byte shuffles.
        const size_t phase = r % 4;

        __m128i first = _mm_xor_si128(_mm_shuffle_epi8(sb1t, index_t),
                                      _mm_xor_si128(_mm_shuffle_epi8(sb1u, index_u), round_key));

        __m128i second = _mm_xor_si128(_mm_shuffle_epi8(sb2t, index_t),
                                       _mm_xor_si128(_mm_shuffle_epi8(sb2u, index_u),
                                                     _mm_shuffle_epi8(first, mc_forward[phase])));

        B = _mm_xor_si128(_mm_shuffle_epi8(second, mc_forward[phase]),
                          _mm_xor_si128(_mm_shuffle_epi8(first, mc_backward[phase]), second));

        ++r;
    }
}
539 
/**
* Decrypts one 128-bit block.
*
* Params:
*  B = input block
*  keys = pointer to the decryption round-key table; entries 0 .. `rounds`
*         are read
*  rounds = number of rounds; the callers in this module pass 10, 12 or 14
*
* Returns: the decrypted block.
*/
__m128i aes_ssse3_decrypt(__m128i B, const(__m128i*) keys, size_t rounds)
{
    immutable(__m128i) k_dipt1 = _mm_set_epi32!(0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00)();
    immutable(__m128i) k_dipt2 = _mm_set_epi32!(0x12771772, 0xF491F194, 0x86E383E6, 0x60056500)();

    immutable(__m128i) sb9u = _mm_set_epi32!(0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600)();
    immutable(__m128i) sb9t = _mm_set_epi32!(0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900)();

    immutable(__m128i) sbeu = _mm_set_epi32!(0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000)();
    immutable(__m128i) sbet = _mm_set_epi32!(0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100)();

    immutable(__m128i) sbdu = _mm_set_epi32!(0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200)();
    immutable(__m128i) sbdt = _mm_set_epi32!(0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00)();

    immutable(__m128i) sbbu = _mm_set_epi32!(0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200)();
    immutable(__m128i) sbbt = _mm_set_epi32!(0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700)();

    // Tables used only by the final round.
    immutable(__m128i) sbou = _mm_set_epi32!(0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000)();
    immutable(__m128i) sbot = _mm_set_epi32!(0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00)();

    // Byte-shuffle mask applied between table lookups; rotated after every round.
    __m128i mc = mc_forward[3];

    // Input step: per-nibble table lookup of the block, XORed with round key 0.
    {
        __m128i hi_lookup = _mm_shuffle_epi8(k_dipt2, _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B)));
        __m128i lo_lookup = _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs));
        B = _mm_xor_si128(hi_lookup, _mm_xor_si128(_mm_loadu_si128(keys), lo_lookup));
    }

    size_t r = 1;
    while (true)
    {
        const(__m128i) round_key = _mm_loadu_si128(keys + r);

        // Split every byte of the state into its high and low nibble.
        __m128i hi = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B));
        __m128i lo = _mm_and_si128(low_nibs, B);

        __m128i inv_lo = _mm_shuffle_epi8(k_inv2, lo);
        __m128i mixed = _mm_xor_si128(lo, hi);

        __m128i via_hi = _mm_xor_si128(inv_lo, _mm_shuffle_epi8(k_inv1, hi));
        __m128i via_mixed = _mm_xor_si128(inv_lo, _mm_shuffle_epi8(k_inv1, mixed));
        __m128i index_u = _mm_xor_si128(mixed, _mm_shuffle_epi8(k_inv1, via_hi));
        __m128i index_t = _mm_xor_si128(hi, _mm_shuffle_epi8(k_inv1, via_mixed));

        if (r == rounds)
        {
            __m128i x = _mm_shuffle_epi8(sbou, index_u);
            __m128i y = _mm_shuffle_epi8(sbot, index_t);
            x = _mm_xor_si128(x, round_key);
            x = _mm_xor_si128(x, y);

            // Index in 0..3 derived from the low two bits of (rounds - 1), inverted.
            const uint which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
            return _mm_shuffle_epi8(x, sr[which_sr]);
        }

        // Four table pairs (9, D, B, E) are folded in one after another, with
        // the running value shuffled by `mc` between them.
        __m128i acc = _mm_xor_si128(_mm_shuffle_epi8(sb9t, index_t),
                                    _mm_xor_si128(_mm_shuffle_epi8(sb9u, index_u), round_key));

        acc = _mm_xor_si128(_mm_shuffle_epi8(acc, mc),
                            _mm_xor_si128(_mm_shuffle_epi8(sbdu, index_u),
                                          _mm_shuffle_epi8(sbdt, index_t)));

        acc = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(acc, mc),
                                          _mm_shuffle_epi8(sbbu, index_u)),
                            _mm_shuffle_epi8(sbbt, index_t));

        B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(acc, mc),
                                        _mm_shuffle_epi8(sbeu, index_u)),
                          _mm_shuffle_epi8(sbet, index_t));

        mc = _mm_alignr_epi8!12(mc, mc);
        ++r;
    }
}