1 /**
2 * Threefish-512 in AVX2
3 * 
4 * Copyright:
5 * (C) 2013 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.block.threefish_avx2;
12 
13 import botan.constants;
14 static if (BOTAN_HAS_THREEFISH_512_AVX2):
15 
16 import botan.block.threefish;
17 import botan.utils.simd.immintrin;
18 import botan.block.block_cipher;
19 import botan.utils.mem_ops;
20 import std.format : format;
21 
/**
* Threefish-512 with the bulk encrypt/decrypt paths written with AVX2
* intrinsics.
*
* The key words and tweak words are read from the base class through
* getK() and getT(). The round bodies are generated by the string-mixin
* templates declared further down in this module; those templates refer to
* the local names X0..X3, T, R, ONE, K0..K8 and ROTATE_1..ROTATE_8 declared
* in the methods below, so these names must not be changed independently.
*/
final class Threefish512AVX2 : Threefish512
{
public:
    /**
    * Encrypt `blocks` consecutive blocks.
    *
    * Each block is read as two unaligned 256-bit loads and written as two
    * unaligned 256-bit stores. Blocks are processed two at a time while at
    * least two remain; a trailing block is handled by the single-block loop.
    *
    * Params:
    *  input = source bytes, two 256-bit vectors per block
    *  output = destination buffer of the same size as the input
    *  blocks = number of blocks to process
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ulong* K = &getK()[0];
        const ulong* T_64 = &getT()[0];
        
        // Per-lane rotation counts, one vector per round of an 8-round group.
        const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
        const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
        const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
        const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
        const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
        const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
        const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
        const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
        
        
        /*
        v1.0 key schedule: 9 ymm registers (only need 2 or 3)
        (0,1,2,3),(4,5,6,7) [8]
        then mutating with vpermq
        */
        // Ki is built from K[i+6], K[i+4], K[i+2], K[i] (indices taken mod 9).
        const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
        const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
        const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
        const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
        const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
        const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
        const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
        const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
        const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
        
        // Increment applied to the round counter R after every key injection.
        const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
        
        const __m256i* in_mm = cast(const __m256i*)(input);
        __m256i* out_mm = cast(__m256i*)(output);
        
        // Two blocks per iteration: (X0, X1) hold the first, (X2, X3) the second.
        while (blocks >= 2)
        {
            __m256i X0 = _mm256_loadu_si256(in_mm++);
            __m256i X1 = _mm256_loadu_si256(in_mm++);
            __m256i X2 = _mm256_loadu_si256(in_mm++);
            __m256i X3 = _mm256_loadu_si256(in_mm++);
            
            const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
            
            // Round counter, starts at zero and is bumped by each key injection.
            __m256i R = _mm256_set_epi64x(0, 0, 0, 0);
            
            interleave_epi64(X0, X1);
            interleave_epi64(X2, X3);
            
            // Initial key injection. The template defined in this module is
            // THREEFISH_ENC_INJECT_KEY_2; the previous spelling
            // THREEFISH_INJECT_KEY_2 named no declared symbol.
            mixin(THREEFISH_ENC_INJECT_KEY_2!(K0, K1, 2, 3));
            
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K1,K2,K3, 1, 2, 3));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K3,K4,K5, 2, 3, 1));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K5,K6,K7, 3, 1, 2));
            
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K7,K8,K0, 1, 2, 3));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K0,K1,K2, 2, 3, 1));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K2,K3,K4, 3, 1, 2));
            
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K4,K5,K6, 1, 2, 3));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K6,K7,K8, 2, 3, 1));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K8,K0,K1, 3, 1, 2));
            
            deinterleave_epi64(X0, X1);
            deinterleave_epi64(X2, X3);
            
            _mm256_storeu_si256(out_mm++, X0);
            _mm256_storeu_si256(out_mm++, X1);
            _mm256_storeu_si256(out_mm++, X2);
            _mm256_storeu_si256(out_mm++, X3);
            
            blocks -= 2;
        }
        
        // At most one block is left here; it goes through the one-block path.
        foreach (size_t i; 0 .. blocks)
        {
            __m256i X0 = _mm256_loadu_si256(in_mm++);
            __m256i X1 = _mm256_loadu_si256(in_mm++);
            
            const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
            
            __m256i R = _mm256_set_epi64x(0, 0, 0, 0);
            
            interleave_epi64(X0, X1);
            
            mixin(THREEFISH_ENC_INJECT_KEY!(K0, K1, 2, 3));
            
            mixin(THREEFISH_ENC_8_ROUNDS!(K1,K2,K3, 1, 2, 3));
            mixin(THREEFISH_ENC_8_ROUNDS!(K3,K4,K5, 2, 3, 1));
            mixin(THREEFISH_ENC_8_ROUNDS!(K5,K6,K7, 3, 1, 2));
            
            mixin(THREEFISH_ENC_8_ROUNDS!(K7,K8,K0, 1, 2, 3));
            mixin(THREEFISH_ENC_8_ROUNDS!(K0,K1,K2, 2, 3, 1));
            mixin(THREEFISH_ENC_8_ROUNDS!(K2,K3,K4, 3, 1, 2));
            
            mixin(THREEFISH_ENC_8_ROUNDS!(K4,K5,K6, 1, 2, 3));
            mixin(THREEFISH_ENC_8_ROUNDS!(K6,K7,K8, 2, 3, 1));
            mixin(THREEFISH_ENC_8_ROUNDS!(K8,K0,K1, 3, 1, 2));
            
            deinterleave_epi64(X0, X1);
            
            _mm256_storeu_si256(out_mm++, X0);
            _mm256_storeu_si256(out_mm++, X1);
        }
    }

    /**
    * Decrypt `blocks` consecutive blocks, one block per iteration.
    *
    * The 8-round groups are applied in the reverse order of encryptN and the
    * round counter R starts at 18 and is decremented by each key removal.
    *
    * Params:
    *  input = source bytes, two 256-bit vectors per block
    *  output = destination buffer of the same size as the input
    *  blocks = number of blocks to process
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ulong* K = &getK()[0];
        const ulong* T_64 = &getT()[0];
        
        // Same rotation counts as in encryptN; the decrypt round rotates the
        // other way by these amounts.
        const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
        const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
        const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
        const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
        const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
        const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
        const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
        const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
        
        /*
        v1.0 key schedule: 9 ymm registers (only need 2 or 3)
        (0,1,2,3),(4,5,6,7) [8]
        then mutating with vpermq
        */
        // Ki is built from K[i+6], K[i+4], K[i+2], K[i] (indices taken mod 9).
        const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
        const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
        const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
        const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
        const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
        const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
        const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
        const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
        const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
        
        // Decrement applied to the round counter R after every key removal.
        const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
        
        const __m256i* in_mm = cast(const __m256i*)(input);
        __m256i* out_mm = cast(__m256i*)(output);
        
        foreach (size_t i; 0 .. blocks)
        {
            __m256i X0 = _mm256_loadu_si256(in_mm++);
            __m256i X1 = _mm256_loadu_si256(in_mm++);
            
            const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
            
            // Encryption leaves the counter at 19 after its last injection, so
            // the first removal here uses 18.
            __m256i R = _mm256_set_epi64x(18, 0, 0, 0);
            
            interleave_epi64(X0, X1);

            mixin(THREEFISH_DEC_8_ROUNDS!(K8,K0,K1, 3, 1, 2));
            mixin(THREEFISH_DEC_8_ROUNDS!(K6,K7,K8, 2, 3, 1));
            mixin(THREEFISH_DEC_8_ROUNDS!(K4,K5,K6, 1, 2, 3));
            mixin(THREEFISH_DEC_8_ROUNDS!(K2,K3,K4, 3, 1, 2));
            mixin(THREEFISH_DEC_8_ROUNDS!(K0,K1,K2, 2, 3, 1));
            mixin(THREEFISH_DEC_8_ROUNDS!(K7,K8,K0, 1, 2, 3));
            mixin(THREEFISH_DEC_8_ROUNDS!(K5,K6,K7, 3, 1, 2));
            mixin(THREEFISH_DEC_8_ROUNDS!(K3,K4,K5, 2, 3, 1));
            mixin(THREEFISH_DEC_8_ROUNDS!(K1,K2,K3, 1, 2, 3));

            // Undo the initial key injection performed by encryptN.
            mixin(THREEFISH_DEC_INJECT_KEY!(K0, K1, 2, 3));
            
            deinterleave_epi64(X0, X1);
            
            _mm256_storeu_si256(out_mm++, X0);
            _mm256_storeu_si256(out_mm++, X1);
        }
        
    }

    /**
    * Returns: a newly constructed Threefish512AVX2; nothing from this
    * instance is passed to it here.
    */
    override BlockCipher clone() const { return new Threefish512AVX2; }
}
201 
202 private:
203     
/**
* Regroup the eight 64-bit words held in X0 and X1 in place so that one
* vector carries the even-indexed words and the other the odd-indexed ones:
* (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
*/
void interleave_epi64(ref __m256i X0, ref __m256i X1) pure
{
    // Both unpacks must read the original inputs before either is overwritten.
    const __m256i lowHalves = _mm256_unpacklo_epi64(X0, X1);
    const __m256i highHalves = _mm256_unpackhi_epi64(X0, X1);

    // Swap the two middle lanes of each intermediate vector.
    X1 = _mm256_permute4x64_epi64(highHalves, _MM_SHUFFLE(3,1,2,0));
    X0 = _mm256_permute4x64_epi64(lowHalves, _MM_SHUFFLE(3,1,2,0));
}
215 
/**
* Undo interleave_epi64 in place: the same two steps applied in the
* opposite order (lane permutation first, then the low/high unpack).
*/
void deinterleave_epi64(ref __m256i X0, ref __m256i X1) pure
{
    const __m256i swappedFirst = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
    const __m256i swappedSecond = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

    X1 = _mm256_unpackhi_epi64(swappedFirst, swappedSecond);
    X0 = _mm256_unpacklo_epi64(swappedFirst, swappedSecond);
}
224 
225 
226 
/**
* Source text for one encryption round on a single block.
*
* The generated statement block expects `X0` and `X1` in the scope it is
* mixed into. It adds X1 into X0, rotates every lane of X1 left by the
* counts in the vector named by `_SHL`, XORs the new X0 into it and then
* permutes the lanes of both vectors.
*/
enum string THREEFISH_ENC_ROUND(alias _SHL) = format(q{
    {
        const __m256i rotRight = _mm256_sub_epi64(_mm256_set1_epi64x(64), %1$s);
        X0 = _mm256_add_epi64(X0, X1);
        const __m256i shiftedUp = _mm256_sllv_epi64(X1, %1$s);
        const __m256i shiftedDown = _mm256_srlv_epi64(X1, rotRight);
        X1 = _mm256_xor_si256(_mm256_or_si256(shiftedUp, shiftedDown), X0);
        X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));
        X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));
    }
}, __traits(identifier, _SHL));
235 
/**
* Source text for one encryption round applied to two blocks at once.
*
* Same steps as THREEFISH_ENC_ROUND, carried out on the pair (X0, X1) and
* on the pair (X2, X3); all four names must exist where the text is mixed
* in. `_SHL` names the vector of per-lane left-rotation counts.
*/
enum string THREEFISH_ENC_ROUND_2(alias _SHL) = format(q{
    {
        const __m256i rotRight = _mm256_sub_epi64(_mm256_set1_epi64x(64), %1$s);
        X0 = _mm256_add_epi64(X0, X1);
        X2 = _mm256_add_epi64(X2, X3);
        const __m256i rotatedFirst = _mm256_or_si256(_mm256_sllv_epi64(X1, %1$s), _mm256_srlv_epi64(X1, rotRight));
        const __m256i rotatedSecond = _mm256_or_si256(_mm256_sllv_epi64(X3, %1$s), _mm256_srlv_epi64(X3, rotRight));
        X1 = _mm256_xor_si256(rotatedFirst, X0);
        X3 = _mm256_xor_si256(rotatedSecond, X2);
        X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));
        X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));
        X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));
        X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));
    }
}, __traits(identifier, _SHL));
249 
/**
* Source text for a subkey injection on a single block during encryption.
*
* The generated block adds the vector named by `_K0` to X0 and the one
* named by `_K1` to X1, adds the round counter `R` to X1, adds two vectors
* derived from `T` by lane permutation (`_T0I` and `_T1I` are the lane
* selectors handed to _MM_SHUFFLE), and finally increments `R` by `ONE`.
* `X0`, `X1`, `T`, `R` and `ONE` are taken from the scope of the mixin.
*/
enum string THREEFISH_ENC_INJECT_KEY(alias _K0, alias _K1, ubyte _T0I, ubyte _T1I) = format(q{
    {
        const __m256i tweakForX0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(%3$s, 0, 0, 0));
        const __m256i tweakForX1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, %4$s, 0, 0));
        X0 = _mm256_add_epi64(_mm256_add_epi64(X0, %1$s), tweakForX0);
        X1 = _mm256_add_epi64(_mm256_add_epi64(_mm256_add_epi64(X1, %2$s), R), tweakForX1);
        R = _mm256_add_epi64(R, ONE);
    }
}, __traits(identifier, _K0), __traits(identifier, _K1), _T0I, _T1I);
260 
/**
* Source text for a subkey injection on two blocks during encryption.
*
* Performs the additions of THREEFISH_ENC_INJECT_KEY on both (X0, X1) and
* (X2, X3): the vector named by `_K0` and the first tweak vector go into
* X0 and X2; the vector named by `_K1`, the second tweak vector and the
* round counter `R` go into X1 and X3. `R` is incremented by `ONE` once.
* `_T0I` and `_T1I` are the lane selectors handed to _MM_SHUFFLE.
*/
enum string THREEFISH_ENC_INJECT_KEY_2(alias _K0, alias _K1, ubyte _T0I, ubyte _T1I) = format(q{
    {
        const __m256i tweakForEven = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(%3$s, 0, 0, 0));
        const __m256i tweakAndRound = _mm256_add_epi64(_mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, %4$s, 0, 0)), R);
        X0 = _mm256_add_epi64(_mm256_add_epi64(X0, %1$s), tweakForEven);
        X2 = _mm256_add_epi64(_mm256_add_epi64(X2, %1$s), tweakForEven);
        X1 = _mm256_add_epi64(_mm256_add_epi64(X1, %2$s), tweakAndRound);
        X3 = _mm256_add_epi64(_mm256_add_epi64(X3, %2$s), tweakAndRound);
        R = _mm256_add_epi64(R, ONE);
    }
}, __traits(identifier, _K0), __traits(identifier, _K1), _T0I.stringof, _T1I.stringof);
275 
/**
* Source text for eight single-block encryption rounds.
*
* Emits rounds with ROTATE_1..ROTATE_4, a key injection using
* (`_K1`, `_K2`, `_T0`, `_T1`), rounds with ROTATE_5..ROTATE_8 and a second
* key injection using (`_K2`, `_K3`, `_T2`, `_T0`). The text consists of
* nested mixin statements that are expanded where it is mixed in.
*/
enum string THREEFISH_ENC_8_ROUNDS(alias _K1, alias _K2, alias _K3, ubyte _T0, ubyte _T1, ubyte _T2) = format(q{
    mixin(THREEFISH_ENC_ROUND!(ROTATE_1));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_2));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_3));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_4));
    mixin(THREEFISH_ENC_INJECT_KEY!(%1$s, %2$s, %4$s, %5$s));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_5));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_6));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_7));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_8));
    mixin(THREEFISH_ENC_INJECT_KEY!(%2$s, %3$s, %6$s, %4$s));
},
    __traits(identifier, _K1), __traits(identifier, _K2), __traits(identifier, _K3),
    _T0.stringof, _T1.stringof, _T2.stringof);
287 
/**
* Source text for eight encryption rounds on two blocks at once.
*
* Same sequence as THREEFISH_ENC_8_ROUNDS, built from the two-block
* templates: rounds with ROTATE_1..ROTATE_4, a key injection using
* (`_K1`, `_K2`, `_T0`, `_T1`), rounds with ROTATE_5..ROTATE_8 and a key
* injection using (`_K2`, `_K3`, `_T2`, `_T0`).
*/
enum string THREEFISH_ENC_2_8_ROUNDS(alias _K1, alias _K2, alias _K3, ubyte _T0, ubyte _T1, ubyte _T2) = format(q{
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_1));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_2));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_3));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_4));
    mixin(THREEFISH_ENC_INJECT_KEY_2!(%1$s, %2$s, %4$s, %5$s));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_5));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_6));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_7));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_8));
    mixin(THREEFISH_ENC_INJECT_KEY_2!(%2$s, %3$s, %6$s, %4$s));
},
    __traits(identifier, _K1), __traits(identifier, _K2), __traits(identifier, _K3),
    _T0.stringof, _T1.stringof, _T2.stringof);
299 
/**
* Source text for one decryption round on a single block.
*
* The generated block permutes the lanes of X0 and X1, XORs X0 into X1,
* rotates every lane of X1 right by the counts in the vector named by
* `_SHR` and subtracts the result from X0. `X0` and `X1` are taken from
* the scope of the mixin.
*/
enum string THREEFISH_DEC_ROUND(alias _SHR) = format(q{
    {
        const __m256i rotLeft = _mm256_sub_epi64(_mm256_set1_epi64x(64), %1$s);
        X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));
        const __m256i unmixed = _mm256_xor_si256(_mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)), X0);
        X1 = _mm256_or_si256(_mm256_sllv_epi64(unmixed, rotLeft), _mm256_srlv_epi64(unmixed, %1$s));
        X0 = _mm256_sub_epi64(X0, X1);
    }
}, __traits(identifier, _SHR));
308 
/**
* Source text for removing a subkey from a single block during decryption.
*
* The generated block subtracts the vector named by `_K0` from X0 and the
* one named by `_K1` from X1, subtracts the round counter `R` from X1,
* decrements `R` by `ONE` and subtracts two vectors derived from `T` by
* lane permutation (`_T0I` and `_T1I` are the lane selectors handed to
* _MM_SHUFFLE). `X0`, `X1`, `T`, `R` and `ONE` come from the mixin scope.
*/
enum string THREEFISH_DEC_INJECT_KEY(alias _K0, alias _K1, ubyte _T0I, ubyte _T1I) = format(q{
    {
        const __m256i tweakForX0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(%3$s, 0, 0, 0));
        const __m256i tweakForX1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, %4$s, 0, 0));
        X0 = _mm256_sub_epi64(_mm256_sub_epi64(X0, %1$s), tweakForX0);
        X1 = _mm256_sub_epi64(_mm256_sub_epi64(_mm256_sub_epi64(X1, %2$s), R), tweakForX1);
        R = _mm256_sub_epi64(R, ONE);
    }
}, __traits(identifier, _K0), __traits(identifier, _K1), _T0I.stringof, _T1I.stringof);
319 
/**
* Source text for eight single-block decryption rounds.
*
* Mirror image of THREEFISH_ENC_8_ROUNDS: a key removal using
* (`_K2`, `_K3`, `_T2`, `_T0`), rounds with ROTATE_8 down to ROTATE_5, a
* key removal using (`_K1`, `_K2`, `_T0`, `_T1`) and rounds with ROTATE_4
* down to ROTATE_1.
*/
enum string THREEFISH_DEC_8_ROUNDS(alias _K1, alias _K2, alias _K3, ubyte _T0, ubyte _T1, ubyte _T2) = format(q{
    mixin(THREEFISH_DEC_INJECT_KEY!(%2$s, %3$s, %6$s, %4$s));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_8));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_7));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_6));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_5));
    mixin(THREEFISH_DEC_INJECT_KEY!(%1$s, %2$s, %4$s, %5$s));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_4));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_3));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_2));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_1));
},
    __traits(identifier, _K1), __traits(identifier, _K2), __traits(identifier, _K3),
    _T0.stringof, _T1.stringof, _T2.stringof);