/**
* Threefish-512 in AVX2
*
* Copyright:
* (C) 2013 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.block.threefish_avx2;

import botan.constants;
static if (BOTAN_HAS_THREEFISH_512_AVX2):

import botan.block.threefish;
import botan.utils.simd.immintrin;
import botan.block.block_cipher;
import botan.utils.mem_ops;
import std.format : format;

/**
* Threefish-512
*
* AVX2 implementation of the bulk encrypt/decrypt routines. The key words
* and tweak words are read through getK() and getT() of the base class.
*
* The round logic is produced by the string-mixin templates declared at the
* bottom of this module. Those mixins refer BY NAME to locals declared in the
* methods below (X0..X3, T, R, ONE, K0..K8, ROTATE_1..ROTATE_8), so these
* identifiers must not be renamed without updating the templates.
*/
final class Threefish512AVX2 : Threefish512
{
public:
    /**
    * Encrypts `blocks` consecutive blocks from `input` into `output`.
    *
    * Each block is read and written as two unaligned 256-bit vectors.
    * Blocks are processed two at a time while at least two remain; any
    * remaining block is processed on its own.
    *
    * Params:
    *  input = source bytes
    *  output = destination bytes
    *  blocks = number of blocks to process
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ulong* K = &getK()[0];
        const ulong* T_64 = &getT()[0];

        // Per-lane rotation counts, one vector per round of an 8-round group.
        const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
        const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
        const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
        const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
        const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
        const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
        const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
        const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

        /*
            v1.0 key schedule: 9 ymm registers (only need 2 or 3)
            (0,1,2,3),(4,5,6,7) [8]
            then mutating with vpermq
        */
        const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
        const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
        const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
        const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
        const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
        const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
        const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
        const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
        const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

        // Increment applied to the subkey counter R after every key injection.
        const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);

        // Tweak words; identical for every block, so built once up front.
        const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

        const __m256i* in_mm = cast(const __m256i*)(input);
        __m256i* out_mm = cast(__m256i*)(output);

        // Two blocks per iteration: (X0, X1) hold one block, (X2, X3) the next.
        while (blocks >= 2)
        {
            __m256i X0 = _mm256_loadu_si256(in_mm++);
            __m256i X1 = _mm256_loadu_si256(in_mm++);
            __m256i X2 = _mm256_loadu_si256(in_mm++);
            __m256i X3 = _mm256_loadu_si256(in_mm++);

            // Subkey counter, restarted for every pair of blocks.
            __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

            interleave_epi64(X0, X1);
            interleave_epi64(X2, X3);

            // Initial key injection. The template defined in this module is
            // THREEFISH_ENC_INJECT_KEY_2; there is no THREEFISH_INJECT_KEY_2.
            mixin(THREEFISH_ENC_INJECT_KEY_2!(K0, K1, 2, 3));

            mixin(THREEFISH_ENC_2_8_ROUNDS!(K1,K2,K3, 1, 2, 3));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K3,K4,K5, 2, 3, 1));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K5,K6,K7, 3, 1, 2));

            mixin(THREEFISH_ENC_2_8_ROUNDS!(K7,K8,K0, 1, 2, 3));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K0,K1,K2, 2, 3, 1));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K2,K3,K4, 3, 1, 2));

            mixin(THREEFISH_ENC_2_8_ROUNDS!(K4,K5,K6, 1, 2, 3));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K6,K7,K8, 2, 3, 1));
            mixin(THREEFISH_ENC_2_8_ROUNDS!(K8,K0,K1, 3, 1, 2));

            deinterleave_epi64(X0, X1);
            deinterleave_epi64(X2, X3);

            _mm256_storeu_si256(out_mm++, X0);
            _mm256_storeu_si256(out_mm++, X1);
            _mm256_storeu_si256(out_mm++, X2);
            _mm256_storeu_si256(out_mm++, X3);

            blocks -= 2;
        }

        // Remaining block (if any), processed singly.
        foreach (size_t i; 0 .. blocks)
        {
            __m256i X0 = _mm256_loadu_si256(in_mm++);
            __m256i X1 = _mm256_loadu_si256(in_mm++);

            // Subkey counter, restarted for every block.
            __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

            interleave_epi64(X0, X1);

            mixin(THREEFISH_ENC_INJECT_KEY!(K0, K1, 2, 3));

            mixin(THREEFISH_ENC_8_ROUNDS!(K1,K2,K3, 1, 2, 3));
            mixin(THREEFISH_ENC_8_ROUNDS!(K3,K4,K5, 2, 3, 1));
            mixin(THREEFISH_ENC_8_ROUNDS!(K5,K6,K7, 3, 1, 2));

            mixin(THREEFISH_ENC_8_ROUNDS!(K7,K8,K0, 1, 2, 3));
            mixin(THREEFISH_ENC_8_ROUNDS!(K0,K1,K2, 2, 3, 1));
            mixin(THREEFISH_ENC_8_ROUNDS!(K2,K3,K4, 3, 1, 2));

            mixin(THREEFISH_ENC_8_ROUNDS!(K4,K5,K6, 1, 2, 3));
            mixin(THREEFISH_ENC_8_ROUNDS!(K6,K7,K8, 2, 3, 1));
            mixin(THREEFISH_ENC_8_ROUNDS!(K8,K0,K1, 3, 1, 2));

            deinterleave_epi64(X0, X1);

            _mm256_storeu_si256(out_mm++, X0);
            _mm256_storeu_si256(out_mm++, X1);
        }
    }

    /**
    * Decrypts `blocks` consecutive blocks from `input` into `output`.
    *
    * Applies the encryption steps in reverse order, one block per
    * iteration: the subkey counter starts at 18 and is decremented after
    * each of the 19 key subtractions.
    *
    * Params:
    *  input = source bytes
    *  output = destination bytes
    *  blocks = number of blocks to process
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ulong* K = &getK()[0];
        const ulong* T_64 = &getT()[0];

        // Per-lane rotation counts, one vector per round of an 8-round group.
        const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
        const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
        const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
        const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
        const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
        const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
        const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
        const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

        /*
            v1.0 key schedule: 9 ymm registers (only need 2 or 3)
            (0,1,2,3),(4,5,6,7) [8]
            then mutating with vpermq
        */
        const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
        const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
        const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
        const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
        const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
        const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
        const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
        const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
        const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

        // Decrement applied to the subkey counter R after every key subtraction.
        const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);

        // Tweak words; identical for every block, so built once up front.
        const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

        const __m256i* in_mm = cast(const __m256i*)(input);
        __m256i* out_mm = cast(__m256i*)(output);

        foreach (size_t i; 0 .. blocks)
        {
            __m256i X0 = _mm256_loadu_si256(in_mm++);
            __m256i X1 = _mm256_loadu_si256(in_mm++);

            // Subkey counter starts at the last subkey index and counts down.
            __m256i R = _mm256_set_epi64x(18, 0, 0, 0);

            interleave_epi64(X0, X1);

            mixin(THREEFISH_DEC_8_ROUNDS!(K8,K0,K1, 3, 1, 2));
            mixin(THREEFISH_DEC_8_ROUNDS!(K6,K7,K8, 2, 3, 1));
            mixin(THREEFISH_DEC_8_ROUNDS!(K4,K5,K6, 1, 2, 3));
            mixin(THREEFISH_DEC_8_ROUNDS!(K2,K3,K4, 3, 1, 2));
            mixin(THREEFISH_DEC_8_ROUNDS!(K0,K1,K2, 2, 3, 1));
            mixin(THREEFISH_DEC_8_ROUNDS!(K7,K8,K0, 1, 2, 3));
            mixin(THREEFISH_DEC_8_ROUNDS!(K5,K6,K7, 3, 1, 2));
            mixin(THREEFISH_DEC_8_ROUNDS!(K3,K4,K5, 2, 3, 1));
            mixin(THREEFISH_DEC_8_ROUNDS!(K1,K2,K3, 1, 2, 3));

            mixin(THREEFISH_DEC_INJECT_KEY!(K0, K1, 2, 3));

            deinterleave_epi64(X0, X1);

            _mm256_storeu_si256(out_mm++, X0);
            _mm256_storeu_si256(out_mm++, X1);
        }

    }

    /// Returns a newly constructed Threefish512AVX2 object.
    override BlockCipher clone() const { return new Threefish512AVX2; }
}

private:

/**
* Interleaves the qwords of X0 and X1 in place:
* (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
*/
void interleave_epi64(ref __m256i X0, ref __m256i X1) pure
{
    const __m256i T0 = _mm256_unpacklo_epi64(X0, X1);
    const __m256i T1 = _mm256_unpackhi_epi64(X0, X1);

    X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
    X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
}

/**
* Applies the steps of interleave_epi64 in the opposite order (permute first,
* then unpack), updating X0 and X1 in place.
*/
void deinterleave_epi64(ref __m256i X0, ref __m256i X1) pure
{
    const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
    const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

    X0 = _mm256_unpacklo_epi64(T0, T1);
    X1 = _mm256_unpackhi_epi64(T0, T1);
}



/**
* Mixin source for one encryption round on the locals X0 and X1.
*
* X0 += X1; each qword of X1 is rotated left by the matching count in the
* vector named by _SHL and XORed with X0; both vectors are then permuted.
*/
enum string THREEFISH_ENC_ROUND(alias _SHL) = q{
    {const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), %1$s);
    X0 = _mm256_add_epi64(X0, X1);
    X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, %1$s), _mm256_srlv_epi64(X1, SHR));
    X1 = _mm256_xor_si256(X1, X0);
    X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));
    X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));}
}.format(__traits(identifier, _SHL));

/**
* Mixin source for one encryption round applied to two blocks at once:
* the same steps as THREEFISH_ENC_ROUND on (X0, X1) and on (X2, X3).
*/
enum string THREEFISH_ENC_ROUND_2(alias _SHL) = q{
    {const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), %1$s);
    X0 = _mm256_add_epi64(X0, X1);
    X2 = _mm256_add_epi64(X2, X3);
    X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, %1$s), _mm256_srlv_epi64(X1, SHR));
    X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, %1$s), _mm256_srlv_epi64(X3, SHR));
    X1 = _mm256_xor_si256(X1, X0);
    X3 = _mm256_xor_si256(X3, X2);
    X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));
    X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));
    X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));
    X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));}
}.format(__traits(identifier, _SHL));

/**
* Mixin source for an encryption key injection on the locals X0 and X1.
*
* Adds the vectors named by _K0 and _K1 to X0 and X1, adds the counter R to
* X1, adds tweak vectors permuted out of T using the selectors _T0I and
* _T1I, then increments R by ONE.
*/
enum string THREEFISH_ENC_INJECT_KEY(alias _K0, alias _K1, ubyte _T0I, ubyte _T1I) = q{
    {const __m256i T0_ = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(%3$s, 0, 0, 0));
    const __m256i T1_ = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, %4$s, 0, 0));
    X0 = _mm256_add_epi64(X0, %1$s);
    X1 = _mm256_add_epi64(X1, %2$s);
    X1 = _mm256_add_epi64(X1, R);
    X0 = _mm256_add_epi64(X0, T0_);
    X1 = _mm256_add_epi64(X1, T1_);
    R = _mm256_add_epi64(R, ONE);}
}.format(__traits(identifier, _K0), __traits(identifier, _K1), _T0I, _T1I);

/**
* Mixin source for an encryption key injection applied to two blocks at
* once. R is folded into T1_ first so the sum is computed a single time and
* then added to both X1 and X3.
*/
enum string THREEFISH_ENC_INJECT_KEY_2(alias _K0, alias _K1, ubyte _T0I, ubyte _T1I) = q{
    {const __m256i T0_ = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(%3$s, 0, 0, 0));
    __m256i T1_ = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, %4$s, 0, 0));
    X0 = _mm256_add_epi64(X0, %1$s);
    X2 = _mm256_add_epi64(X2, %1$s);
    X1 = _mm256_add_epi64(X1, %2$s);
    X3 = _mm256_add_epi64(X3, %2$s);
    T1_ = _mm256_add_epi64(T1_, R);
    X0 = _mm256_add_epi64(X0, T0_);
    X2 = _mm256_add_epi64(X2, T0_);
    X1 = _mm256_add_epi64(X1, T1_);
    X3 = _mm256_add_epi64(X3, T1_);
    R = _mm256_add_epi64(R, ONE);}
}.format(__traits(identifier, _K0), __traits(identifier, _K1), _T0I.stringof, _T1I.stringof);

/**
* Mixin source for eight single-block encryption rounds: rounds with
* ROTATE_1..ROTATE_4, a key injection with (_K1, _K2, _T0, _T1), rounds with
* ROTATE_5..ROTATE_8, then a key injection with (_K2, _K3, _T2, _T0).
*/
enum string THREEFISH_ENC_8_ROUNDS(alias _K1, alias _K2, alias _K3, ubyte _T0, ubyte _T1, ubyte _T2) =
    `mixin(THREEFISH_ENC_ROUND!(ROTATE_1));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_2));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_3));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_4));
    mixin(THREEFISH_ENC_INJECT_KEY!(`~__traits(identifier, _K1)~`, `~__traits(identifier, _K2)~`, `~_T0.stringof~`, `~_T1.stringof~`));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_5));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_6));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_7));
    mixin(THREEFISH_ENC_ROUND!(ROTATE_8));
    mixin(THREEFISH_ENC_INJECT_KEY!(`~__traits(identifier, _K2)~`, `~__traits(identifier, _K3)~`, `~_T2.stringof~`, `~_T0.stringof~`));`;

/**
* Two-block counterpart of THREEFISH_ENC_8_ROUNDS, built from
* THREEFISH_ENC_ROUND_2 and THREEFISH_ENC_INJECT_KEY_2.
*/
enum string THREEFISH_ENC_2_8_ROUNDS(alias _K1, alias _K2, alias _K3, ubyte _T0, ubyte _T1, ubyte _T2) =
    `mixin(THREEFISH_ENC_ROUND_2!(ROTATE_1));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_2));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_3));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_4));
    mixin(THREEFISH_ENC_INJECT_KEY_2!(`~__traits(identifier, _K1)~`, `~__traits(identifier, _K2)~`, `~_T0.stringof~`, `~_T1.stringof~`));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_5));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_6));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_7));
    mixin(THREEFISH_ENC_ROUND_2!(ROTATE_8));
    mixin(THREEFISH_ENC_INJECT_KEY_2!(`~__traits(identifier, _K2)~`, `~__traits(identifier, _K3)~`, `~_T2.stringof~`, `~_T0.stringof~`));`;

/**
* Mixin source for one decryption round on the locals X0 and X1.
*
* Permutes both vectors, XORs X0 into X1, rotates each qword of X1 right by
* the matching count in the vector named by _SHR, then subtracts X1 from X0.
*/
enum string THREEFISH_DEC_ROUND(alias _SHR) = q{
    {const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), %1$s);
    X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));
    X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));
    X1 = _mm256_xor_si256(X1, X0);
    X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, %1$s));
    X0 = _mm256_sub_epi64(X0, X1);}
}.format(__traits(identifier, _SHR));

/**
* Mixin source for a decryption key removal on the locals X0 and X1.
*
* Subtracts the vectors named by _K0 and _K1, the counter R and the permuted
* tweak vectors, and decrements R by ONE.
*/
enum string THREEFISH_DEC_INJECT_KEY(alias _K0, alias _K1, ubyte _T0I, ubyte _T1I) = q{
    {const __m256i T0_ = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(%3$s, 0, 0, 0));
    const __m256i T1_ = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, %4$s, 0, 0));
    X0 = _mm256_sub_epi64(X0, %1$s);
    X1 = _mm256_sub_epi64(X1, %2$s);
    X1 = _mm256_sub_epi64(X1, R);
    R = _mm256_sub_epi64(R, ONE);
    X0 = _mm256_sub_epi64(X0, T0_);
    X1 = _mm256_sub_epi64(X1, T1_);}
}.format(__traits(identifier, _K0), __traits(identifier, _K1), _T0I.stringof, _T1I.stringof);

/**
* Mixin source for eight decryption rounds, i.e. THREEFISH_ENC_8_ROUNDS in
* reverse: key removal with (_K2, _K3, _T2, _T0), rounds with
* ROTATE_8..ROTATE_5, key removal with (_K1, _K2, _T0, _T1), then rounds
* with ROTATE_4..ROTATE_1.
*/
enum string THREEFISH_DEC_8_ROUNDS(alias _K1, alias _K2, alias _K3, ubyte _T0, ubyte _T1, ubyte _T2) =
    `mixin(THREEFISH_DEC_INJECT_KEY!(`~__traits(identifier, _K2)~`, `~__traits(identifier, _K3)~`, `~_T2.stringof~`, `~_T0.stringof~`));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_8));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_7));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_6));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_5));
    mixin(THREEFISH_DEC_INJECT_KEY!(`~__traits(identifier, _K1)~`, `~__traits(identifier, _K2)~`, `~_T0.stringof~`, `~_T1.stringof~`));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_4));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_3));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_2));
    mixin(THREEFISH_DEC_ROUND!(ROTATE_1));`;