/**
* AES using SSSE3
*
* Copyright:
* (C) 2010 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.block.aes_ssse3;

import botan.constants;
static if (BOTAN_HAS_AES_SSSE3):

import std.range : iota;
import botan.block.block_cipher;
import botan.utils.types;
import botan.utils.mem_ops;
import botan.utils.simd.tmmintrin;


/**
* AES-128 using SSSE3
*/
final class AES128_SSSE3 : BlockCipherFixedParams!(16, 16), BlockCipher, SymmetricAlgorithm
{
public:
    /*
    * AES-128 Encryption
    *
    * Encrypts `blocks` consecutive 16-byte blocks from input to output
    * using the expanded encryption key in m_EK (10 rounds).
    * Unaligned SSE loads/stores are used, so no buffer alignment is required.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) in_mm = cast(const(__m128i*))(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) keys = cast(const(__m128i*))(m_EK.ptr);

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
        }
    }

    /*
    * AES-128 Decryption
    *
    * Decrypts `blocks` consecutive 16-byte blocks using the expanded
    * decryption key in m_DK (10 rounds).
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) in_mm = cast(const(__m128i*))(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) keys = cast(const(__m128i*))(m_DK.ptr);

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
        }
    }

    // Securely wipes both key schedules.
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-128"; }
    override @property size_t parallelism() const { return 1; }
    override BlockCipher clone() const { return new AES128_SSSE3; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:

    /*
    * AES-128 Key Schedule
    *
    * Expands a 16-byte key into 11 round keys for encryption (m_EK) and
    * 11 for decryption (m_DK). The decryption schedule is written in
    * reverse order (DK_mm + 10 down to DK_mm + 0). All round keys are
    * kept in the shuffled/transformed basis used by the SSSE3 round
    * functions, not the standard AES key-schedule representation.
    */
    override void keySchedule(const(ubyte)* keyb, size_t)
    {
        // Packed round constants, consumed one byte per round by
        // aes_schedule_round (which rotates rcon after each use).
        __m128i rcon = _mm_set_epi32!(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6)();

        __m128i key = _mm_loadu_si128(cast(const(__m128i*))(keyb));

        m_EK.resize(11*4);
        m_DK.resize(11*4);
        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);

        // Last decryption round key is the raw input key, byte-shuffled.
        _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));

        // Move the key into the transformed (input-transform) basis.
        key = aes_schedule_transform(key, k_ipt1, k_ipt2);

        _mm_storeu_si128(EK_mm, key);

        foreach (size_t i; 1 .. 10)
        {
            key = aes_schedule_round(&rcon, key, key);

            _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key, (12-i) % 4));

            _mm_storeu_si128(DK_mm + (10-i), aes_schedule_mangle_dec(key, (10-i) % 4));
        }

        // Final round keys use the "last" mangles (output transform).
        key = aes_schedule_round(&rcon, key, key);
        _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
        _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));

    }

    // Expanded encryption / decryption key schedules (11 x 4 words each).
    SecureVector!uint m_EK, m_DK;
}

/**
* AES-192 using SSSE3
*/
final class AES192_SSSE3 : BlockCipherFixedParams!(16, 24), BlockCipher, SymmetricAlgorithm
{
public:
    /*
    * AES-192 Encryption
    *
    * Encrypts `blocks` 16-byte blocks using m_EK (12 rounds).
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) in_mm = cast(const(__m128i*))(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) keys = cast(const(__m128i*))(m_EK.ptr);

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
        }
    }

    /*
    * AES-192 Decryption
    *
    * Decrypts `blocks` 16-byte blocks using m_DK (12 rounds).
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) in_mm = cast(const(__m128i*))(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) keys = cast(const(__m128i*))(m_DK.ptr);

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
        }
    }

    // Securely wipes both key schedules.
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-192"; }
    override @property size_t parallelism() const { return 1; }
    override BlockCipher clone() const { return new AES192_SSSE3; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-192 Key Schedule
    *
    * Expands a 24-byte key into 13 round keys each for m_EK and m_DK
    * (m_DK written in reverse). AES-192 advances the schedule by
    * 24 bytes per iteration, so each loop turn produces three 16-byte
    * round keys; `t` carries the 8-byte overlap between iterations.
    * NOTE(review): key2 is loaded at keyb + 8, overlapping key1 —
    * presumably the high half of key2 is the key's final 8 bytes,
    * with the overlapped low half masked off below.
    */
    override void keySchedule(const(ubyte)* keyb, size_t len)
    {
        // Same packed round-constant vector as the other key sizes.
        immutable(__m128i) rcon_imm = _mm_set_epi32!(0x702A9808, 0x4D7C7D81, 0x1F8391B9, 0xAF9DEEB6)();
        __m128i rcon = rcon_imm;
        m_EK.resize(13*4);
        m_DK.resize(13*4);

        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);

        __m128i key1 = _mm_loadu_si128(cast(const(__m128i*))(keyb));
        __m128i key2 = _mm_loadu_si128(cast(const(__m128i*))(keyb + 8));

        // Last decryption round key: raw first key half, byte-shuffled.
        _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));

        key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
        key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

        _mm_storeu_si128(EK_mm + 0, key1);

        // key2 with 8 high bytes masked off
        __m128i t = _mm_slli_si128!8(_mm_srli_si128!8(key2));

        foreach (size_t i; 0 .. 4)
        {
            key2 = aes_schedule_round(&rcon, key2, key1);

            _mm_storeu_si128(EK_mm + 3*i+1, aes_schedule_mangle(_mm_alignr_epi8!8(key2, t), (i+3)%4));
            _mm_storeu_si128(DK_mm + 11-3*i, aes_schedule_mangle_dec(_mm_alignr_epi8!8(key2, t), (i+3)%4));

            // Propagate schedule words across the 24-byte stride.
            t = aes_schedule_192_smear(key2, t);

            _mm_storeu_si128(EK_mm + 3*i+2, aes_schedule_mangle(t, (i+2)%4));
            _mm_storeu_si128(DK_mm + 10-3*i, aes_schedule_mangle_dec(t, (i+2)%4));

            key2 = aes_schedule_round(&rcon, t, key2);

            if (i == 3)
            {
                // Final iteration emits the last round key via the
                // output-transform ("last") mangles.
                _mm_storeu_si128(EK_mm + 3*i+3, aes_schedule_mangle_last(key2, (i+1)%4));
                _mm_storeu_si128(DK_mm + 9-3*i, aes_schedule_mangle_last_dec(key2));
            }
            else
            {
                _mm_storeu_si128(EK_mm + 3*i+3, aes_schedule_mangle(key2, (i+1)%4));
                _mm_storeu_si128(DK_mm + 9-3*i, aes_schedule_mangle_dec(key2, (i+1)%4));
            }

            key1 = key2;
            key2 = aes_schedule_192_smear(key2, _mm_slli_si128!8(_mm_srli_si128!8(t)));
            t = _mm_slli_si128!8(_mm_srli_si128!8(key2));
        }
    }

    // Expanded encryption / decryption key schedules (13 x 4 words each).
    SecureVector!uint m_EK, m_DK;
}

/**
* AES-256 using SSSE3
*/
final class AES256_SSSE3 : BlockCipherFixedParams!(16, 32), BlockCipher, SymmetricAlgorithm
{
public:
    /*
    * AES-256 Encryption
    *
    * Encrypts `blocks` 16-byte blocks using m_EK (14 rounds).
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) in_mm = cast(const(__m128i*))(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) keys = cast(const(__m128i*))(m_EK.ptr);

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
        }
    }

    /*
    * AES-256 Decryption
    *
    * Decrypts `blocks` 16-byte blocks using m_DK (14 rounds).
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const(__m128i*) in_mm = cast(const(__m128i*))(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) keys = cast(const(__m128i*))(m_DK.ptr);

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);
            _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
        }
    }

    // Securely wipes both key schedules.
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-256"; }
    override @property size_t parallelism() const { return 1; }
    override BlockCipher clone() const { return new AES256_SSSE3; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-256 Key Schedule
    *
    * Expands a 32-byte key into 15 round keys each for m_EK and m_DK
    * (m_DK written in reverse). Each loop turn generates two round
    * keys: one full schedule round, then a second round driven by the
    * broadcast high word of key2 with the round constant suppressed
    * (rcon pointer passed as null).
    */
    override void keySchedule(const(ubyte)* keyb, size_t)
    {
        // Packed round constants, consumed by aes_schedule_round.
        __m128i rcon = _mm_set_epi32!(0x702A9808, 0x4D7C7D81,
                                      0x1F8391B9, 0xAF9DEEB6)();

        m_EK.resize(15*4);
        m_DK.resize(15*4);

        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);

        __m128i key1 = _mm_loadu_si128(cast(const(__m128i*))(keyb));
        __m128i key2 = _mm_loadu_si128(cast(const(__m128i*))((keyb + 16)));

        // Last decryption round key: raw first key half, byte-shuffled.
        _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));

        key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
        key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

        _mm_storeu_si128(EK_mm + 0, key1);
        _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));

        _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));

        foreach (size_t i; iota(2, 14, 2))
        {
            __m128i k_t = key2;
            key1 = key2 = aes_schedule_round(&rcon, key2, key1);

            _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
            _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));

            // Second half-round: broadcast the top 32-bit word of key2
            // and run a round without advancing rcon.
            __m128i k_t_0 = _mm_shuffle_epi32!0xFF(key2);
            key2 = aes_schedule_round(cast(__m128i*)null, k_t_0, k_t);
            _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
            _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
        }

        key2 = aes_schedule_round(&rcon, key2, key1);

        // Final round keys use the output-transform mangles.
        _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
        _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
    }

    // Expanded encryption / decryption key schedules (15 x 4 words each).
    SecureVector!uint m_EK, m_DK;
}

/*
* Module constructor: fills the shared immutable lookup vectors used by
* every class and helper below (nibble mask, input transform tables,
* inverse tables, sbox halves, MixColumns-forward shuffles, and the four
* ShiftRows byte permutations).
*/
static this() {
    logTrace("Loading AES SSSE3 ...");

    low_nibs = _mm_set1_epi8!(0x0F)();
    k_ipt1 = _mm_set_epi32!(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000)();
    k_ipt2 = _mm_set_epi32!(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00)();
    k_inv1 = _mm_set_epi32!(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180)();
    k_inv2 = _mm_set_epi32!(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780)();
    sb1u = _mm_set_epi32!(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00)();
    sb1t = _mm_set_epi32!(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300)();
    mc_forward = [
        _mm_set_epi32!(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201)(),
        _mm_set_epi32!(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605)(),
        _mm_set_epi32!(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09)(),
        _mm_set_epi32!(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)()];
    __m128i[4] sr_ = [
        _mm_set_epi32!(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)(),
        _mm_set_epi32!(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500)(),
        _mm_set_epi32!(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900)(),
        _mm_set_epi32!(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00)()];
    sr = sr_;
}

// Mask selecting the low nibble of every byte (0x0F repeated).
immutable __m128i low_nibs;

// Input-transform lookup tables (low/high nibble halves).
immutable __m128i k_ipt1;
immutable __m128i k_ipt2;
// Inverse-map lookup tables used by the nibble-decomposition step in
// every round and in the key schedule.
immutable __m128i k_inv1;
immutable __m128i k_inv2;

// Sbox stage-1 lookup tables (low/high nibble halves).
immutable __m128i sb1u;
immutable __m128i sb1t;

// Four rotations of the MixColumns-forward byte shuffle, indexed by
// round number mod 4.
immutable(__m128i)[4] mc_forward;

// Four ShiftRows byte permutations, indexed by round number mod 4.
immutable(__m128i)[4] sr;

package:

/*
* Apply a two-table nibble transform: the input is split into low and
* high nibbles, each nibble indexes a byte-shuffle lookup, and the two
* results are XORed together.
*
* Params:
*   input   = 16-byte state to transform
*   table_1 = lookup table indexed by the low nibbles
*   table_2 = lookup table indexed by the high nibbles
* Returns: table_1[lo] ^ table_2[hi], per byte
*/
__m128i aes_schedule_transform(__m128i input,
                               __m128i table_1,
                               __m128i table_2)
{
    __m128i i_1 = _mm_and_si128(low_nibs, input);
    __m128i i_2 = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, input));

    // (A dead reassignment of `input` was removed here; the result
    // depends only on i_1 and i_2.)

    return _mm_xor_si128(_mm_shuffle_epi8(table_1, i_1),
                         _mm_shuffle_epi8(table_2, i_2));
}

/*
* Mangle a schedule word into an encryption round key: XOR in the 0x5B
* bias, fold via three applications of the MixColumns-forward shuffle,
* then apply the ShiftRows permutation selected by round_no.
*/
__m128i aes_schedule_mangle(__m128i k, ubyte round_no)
{
    __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8!(0x5B)()), mc_forward[0]);

    __m128i t2 = t;

    t = _mm_shuffle_epi8(t, mc_forward[0]);

    t2 = _mm_xor_si128(t2, _mm_xor_si128(t, _mm_shuffle_epi8(t, mc_forward[0])));

    return _mm_shuffle_epi8(t2, sr[round_no % 4]);
}

/*
* AES-192 smear step: mixes the high words of x and low words of y into
* y, carrying schedule state across the 24-byte key stride.
*/
__m128i aes_schedule_192_smear(__m128i x, __m128i y)
{
    return _mm_xor_si128(y,_mm_xor_si128(_mm_shuffle_epi32!0xFE(x),
                         _mm_shuffle_epi32!0x80(y)));
}

/*
* Mangle a schedule word into a decryption round key: four chained
* two-table transforms (dsk pairs), each folded in with the
* MixColumns-forward shuffle, then the ShiftRows permutation selected
* by round_no.
*/
__m128i aes_schedule_mangle_dec(__m128i k, ubyte round_no)
{
    // Decryption key-schedule transform tables, as four (lo, hi) pairs.
    immutable(__m128i)[8] dsk = [
        _mm_set_epi32!(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700)(),
        _mm_set_epi32!(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300)(),
        _mm_set_epi32!(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400)(),
        _mm_set_epi32!(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00)(),
        _mm_set_epi32!(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700)(),
        _mm_set_epi32!(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700)(),
        _mm_set_epi32!(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000)(),
        _mm_set_epi32!(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)()
    ];

    __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
    __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);

    t = aes_schedule_transform(t, dsk[2], dsk[3]);
    output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

    t = aes_schedule_transform(t, dsk[4], dsk[5]);
    output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

    t = aes_schedule_transform(t, dsk[6], dsk[7]);
    output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

    return _mm_shuffle_epi8(output, sr[round_no % 4]);
}

/*
* Produce the final encryption round key: ShiftRows-permute, XOR the
* 0x5B bias, then apply the output transform (out_tr tables) to leave
* the transformed basis.
*/
__m128i aes_schedule_mangle_last(__m128i k, ubyte round_no)
{
    immutable(__m128i) out_tr1 = _mm_set_epi32!(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000)();
    immutable(__m128i) out_tr2 = _mm_set_epi32!(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00)();

    k = _mm_shuffle_epi8(k, sr[round_no % 4]);
    k = _mm_xor_si128(k, _mm_set1_epi8!(0x5B)());
    return aes_schedule_transform(k, out_tr1, out_tr2);
}

/*
* Produce the final decryption round key: XOR the 0x5B bias and apply
* the deskew transform. No ShiftRows permutation is applied here.
*/
__m128i aes_schedule_mangle_last_dec(__m128i k)
{
    immutable(__m128i) deskew1 = _mm_set_epi32!(0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300)();
    immutable(__m128i) deskew2 = _mm_set_epi32!(0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900)();

    k = _mm_xor_si128(k, _mm_set1_epi8!(0x5B)());
    return aes_schedule_transform(k, deskew1, deskew2);
}

/*
* One key-schedule round.
*
* Params:
*   rcon   = pointer to the packed round-constant vector, rotated to the
*            next constant after use; pass null to suppress the round
*            constant and word rotation (used by the AES-256 half-round)
*   input1 = word source for the sbox substitution
*   input2 = schedule state that is smeared and XORed with the result
* Returns: the next schedule word
*/
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
{
    if (rcon !is null)
    {
        // XOR in the current round-constant byte, then rotate the
        // rcon vector so the next call sees the next constant.
        input2 = _mm_xor_si128(_mm_alignr_epi8!15(_mm_setzero_si128(), *rcon), input2);
        __m128i tmp_rcon = *rcon;
        *rcon = _mm_alignr_epi8!15(tmp_rcon, tmp_rcon); // next rcon

        input1 = _mm_shuffle_epi32!0xFF(input1); // rotate
        input1 = _mm_alignr_epi8!1(input1, input1);
    }

    // Smear input2 across the vector and fold in the 0x5B bias.
    __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128!4(input2));
    smeared = _mm_xor_si128(smeared, _mm_xor_si128(_mm_slli_si128!8(smeared), _mm_set1_epi8!(0x5B)()));

    // Nibble-decomposed sbox substitution on input1.
    __m128i t = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, input1));

    input1 = _mm_and_si128(low_nibs, input1);

    __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

    input1 = _mm_xor_si128(input1, t);

    __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
    __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

    __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
    __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

    return _mm_xor_si128(_mm_shuffle_epi8(sb1u, t5),
                         _mm_xor_si128(_mm_shuffle_epi8(sb1t, t6), smeared));
}

/*
* Encrypt one 16-byte block.
*
* Params:
*   B      = plaintext block
*   keys   = expanded encryption round keys (rounds + 1 vectors)
*   rounds = 10, 12 or 14 for AES-128/192/256
* Returns: the ciphertext block
*/
__m128i aes_ssse3_encrypt(__m128i B, const(__m128i*) keys, size_t rounds)
{
    // Sbox stage-2 tables (low/high nibble halves).
    immutable(__m128i) sb2u = _mm_set_epi32!(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400)();
    immutable(__m128i) sb2t = _mm_set_epi32!(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900)();

    // Output-stage sbox tables for the final round.
    immutable(__m128i) sbou = _mm_set_epi32!(0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700)();
    immutable(__m128i) sbot = _mm_set_epi32!(0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00)();

    // Four rotations of the MixColumns-backward byte shuffle.
    immutable(__m128i)[4] mc_backward = [
        _mm_set_epi32!(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003)(),
        _mm_set_epi32!(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F)(),
        _mm_set_epi32!(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B)(),
        _mm_set_epi32!(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407)(),
    ];

    // Input transform plus initial round-key XOR.
    B = _mm_xor_si128(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
                      _mm_xor_si128(_mm_shuffle_epi8(k_ipt2, _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B))),
                                    _mm_loadu_si128(keys)));

    for (size_t r = 1; ; ++r)
    {
        const(__m128i) K = _mm_loadu_si128(keys + r);

        // Nibble-decomposed sbox substitution on the state.
        __m128i t = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B));

        B = _mm_and_si128(low_nibs, B);

        __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

        B = _mm_xor_si128(B, t);

        __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
        __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));

        __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));

        __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

        if (r == rounds)
        {
            // Final round: output-stage sbox, key XOR, then the
            // ShiftRows permutation for this round — no MixColumns.
            B = _mm_shuffle_epi8(_mm_xor_si128(_mm_shuffle_epi8(sbou, t5),
                                 _mm_xor_si128(_mm_shuffle_epi8(sbot, t6), K)),
                                 sr[r % 4]);

            return B;
        }

        // Full round: two sbox stages combined with forward/backward
        // MixColumns shuffles and the round key.
        __m128i t7 = _mm_xor_si128(_mm_shuffle_epi8(sb1t, t6), _mm_xor_si128( _mm_shuffle_epi8(sb1u, t5), K));

        __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb2t, t6),
                                   _mm_xor_si128(_mm_shuffle_epi8(sb2u, t5),
                                                 _mm_shuffle_epi8(t7, mc_forward[r % 4])));

        B = _mm_xor_si128(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
                          _mm_xor_si128(_mm_shuffle_epi8(t7, mc_backward[r % 4]), t8));
    }
}

/*
* Decrypt one 16-byte block.
*
* Params:
*   B      = ciphertext block
*   keys   = expanded decryption round keys (rounds + 1 vectors)
*   rounds = 10, 12 or 14 for AES-128/192/256
* Returns: the plaintext block
*/
__m128i aes_ssse3_decrypt(__m128i B, const(__m128i*) keys, size_t rounds)
{
    // Decryption input-transform tables.
    immutable(__m128i) k_dipt1 = _mm_set_epi32!(0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00)();
    immutable(__m128i) k_dipt2 = _mm_set_epi32!(0x12771772, 0xF491F194, 0x86E383E6, 0x60056500)();

    // Inverse-sbox stage tables (u = low-nibble half, t = high-nibble half).
    immutable(__m128i) sb9u = _mm_set_epi32!(0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600)();
    immutable(__m128i) sb9t = _mm_set_epi32!(0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900)();

    immutable(__m128i) sbeu = _mm_set_epi32!(0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000)();
    immutable(__m128i) sbet = _mm_set_epi32!(0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100)();

    immutable(__m128i) sbdu = _mm_set_epi32!(0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200)();
    immutable(__m128i) sbdt = _mm_set_epi32!(0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00)();

    immutable(__m128i) sbbu = _mm_set_epi32!(0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200)();
    immutable(__m128i) sbbt = _mm_set_epi32!(0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700)();

    // Rotating inverse-MixColumns shuffle; advanced at the end of each
    // round via a 12-byte alignr.
    __m128i mc = mc_forward[3];

    // Input transform plus initial round-key XOR.
    __m128i t = _mm_shuffle_epi8(k_dipt2, _mm_srli_epi32!4( _mm_andnot_si128(low_nibs, B)));

    B = _mm_xor_si128(t,_mm_xor_si128( _mm_loadu_si128(keys), _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs))));

    for (size_t r = 1; ; ++r)
    {
        const(__m128i) K = _mm_loadu_si128(keys + r);

        // Nibble-decomposed inverse-sbox substitution on the state.
        t = _mm_srli_epi32!4(_mm_andnot_si128(low_nibs, B));

        B = _mm_and_si128(low_nibs, B);

        __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

        B = _mm_xor_si128(B, t);

        __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
        __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
        __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
        __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

        if (r == rounds)
        {
            // Final-round output-stage tables.
            immutable(__m128i) sbou = _mm_set_epi32!(0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000)();
            immutable(__m128i) sbot = _mm_set_epi32!(0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00)();

            __m128i x = _mm_shuffle_epi8(sbou, t5);
            __m128i y = _mm_shuffle_epi8(sbot, t6);
            x = _mm_xor_si128(x, K);
            x = _mm_xor_si128(x, y);

            // Select the final ShiftRows permutation from the round
            // count: yields sr[2] for 10/14 rounds and sr[0] for 12.
            const uint which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
            return _mm_shuffle_epi8(x, sr[which_sr]);
        }

        // Full round: four chained inverse-sbox stages (9, d, b, e)
        // interleaved with the rotating inverse-MixColumns shuffle.
        __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
                                   _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));

        __m128i t9 = _mm_xor_si128(_mm_shuffle_epi8(t8, mc), _mm_xor_si128(_mm_shuffle_epi8(sbdu, t5), _mm_shuffle_epi8(sbdt, t6)));

        __m128i t12 = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t9, mc),
                                                  _mm_shuffle_epi8(sbbu, t5)),
                                    _mm_shuffle_epi8(sbbt, t6));

        B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
                                        _mm_shuffle_epi8(sbeu, t5)),
                          _mm_shuffle_epi8(sbet, t6));

        // Rotate the MixColumns shuffle for the next round.
        mc = _mm_alignr_epi8!12(mc, mc);
    }
}