/**
* AES using AES-NI instructions
*
* Copyright:
* (C) 2009 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.block.aes_ni;

import botan.constants;
static if (BOTAN_HAS_AES_NI):
import botan.block.block_cipher;
import botan.utils.loadstor;
import botan.utils.simd.wmmintrin;
import botan.utils.mem_ops;
// `to!string` is used by the AES_*_key_exp string templates at the end of this
// module; import it explicitly rather than relying on a transitive public import.
import std.conv : to;
import std.format : format;

/**
* AES-128 using AES-NI
*
* The expanded key is held as 11 round keys of four 32-bit words each
* (44 words) in m_EK (encryption) and m_DK (decryption).
*/
final class AES128NI : BlockCipherFixedParams!(16, 16), BlockCipher, SymmetricAlgorithm
{
public:
    /// Matches the number of blocks handled per iteration of the main loop
    /// in encryptN()/decryptN().
    override @property size_t parallelism() const { return 4; }

    /*
    * AES-128 Encryption
    *
    * Encrypts `blocks` consecutive 128-bit blocks read from `input` and writes
    * them to `output`. Blocks are processed four at a time while at least
    * four remain; the remainder is processed one block at a time.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // The round keys are read through a raw pointer below, so make sure
        // keySchedule() has populated them (and clear() has not wiped them).
        assert(m_EK.length >= 44, "AES-128 encryption key schedule is not set");

        // Input is only ever read: keep it const instead of casting const away.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) key_mm = cast(const(__m128i*))(m_EK.ptr);

        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);

        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);

            // Initial round-key addition
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);

            // Each mixin applies one round to B0..B3 with the named round key.
            mixin(AES_ENC_4_ROUNDS!(K1));
            mixin(AES_ENC_4_ROUNDS!(K2));
            mixin(AES_ENC_4_ROUNDS!(K3));
            mixin(AES_ENC_4_ROUNDS!(K4));
            mixin(AES_ENC_4_ROUNDS!(K5));
            mixin(AES_ENC_4_ROUNDS!(K6));
            mixin(AES_ENC_4_ROUNDS!(K7));
            mixin(AES_ENC_4_ROUNDS!(K8));
            mixin(AES_ENC_4_ROUNDS!(K9));
            mixin(AES_ENC_4_LAST_ROUNDS!(K10));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }

        // Remaining (fewer than four) blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);

            B = _mm_xor_si128(B, K0);

            B = _mm_aesenc_si128(B, K1);
            B = _mm_aesenc_si128(B, K2);
            B = _mm_aesenc_si128(B, K3);
            B = _mm_aesenc_si128(B, K4);
            B = _mm_aesenc_si128(B, K5);
            B = _mm_aesenc_si128(B, K6);
            B = _mm_aesenc_si128(B, K7);
            B = _mm_aesenc_si128(B, K8);
            B = _mm_aesenc_si128(B, K9);
            B = _mm_aesenclast_si128(B, K10);

            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * AES-128 Decryption
    *
    * Decrypts `blocks` consecutive 128-bit blocks read from `input` and writes
    * them to `output`, using the decryption round keys in m_DK. Same 4-block
    * main loop / single-block tail structure as encryptN().
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        assert(m_DK.length >= 44, "AES-128 decryption key schedule is not set");

        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) key_mm = cast(const(__m128i*))(m_DK.ptr);

        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);

        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);

            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);

            mixin(AES_DEC_4_ROUNDS!(K1));
            mixin(AES_DEC_4_ROUNDS!(K2));
            mixin(AES_DEC_4_ROUNDS!(K3));
            mixin(AES_DEC_4_ROUNDS!(K4));
            mixin(AES_DEC_4_ROUNDS!(K5));
            mixin(AES_DEC_4_ROUNDS!(K6));
            mixin(AES_DEC_4_ROUNDS!(K7));
            mixin(AES_DEC_4_ROUNDS!(K8));
            mixin(AES_DEC_4_ROUNDS!(K9));
            mixin(AES_DEC_4_LAST_ROUNDS!(K10));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);

            B = _mm_xor_si128(B, K0);

            B = _mm_aesdec_si128(B, K1);
            B = _mm_aesdec_si128(B, K2);
            B = _mm_aesdec_si128(B, K3);
            B = _mm_aesdec_si128(B, K4);
            B = _mm_aesdec_si128(B, K5);
            B = _mm_aesdec_si128(B, K6);
            B = _mm_aesdec_si128(B, K7);
            B = _mm_aesdec_si128(B, K8);
            B = _mm_aesdec_si128(B, K9);
            B = _mm_aesdeclast_si128(B, K10);

            _mm_storeu_si128(out_mm + i, B);
        }
    }


    /*
    * Clear memory of sensitive data
    */
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-128"; }
    override BlockCipher clone() const { return new AES128NI; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-128 Key Schedule
    *
    * Expands the 16-byte key at `key` into 11 encryption round keys (m_EK)
    * and derives the 11 decryption round keys (m_DK) from them.
    */
    override void keySchedule(const(ubyte)* key, size_t)
    {
        m_EK.resize(44);
        m_DK.resize(44);

        // Each round key is derived from the previous one; the template
        // argument of _mm_aeskeygenassist_si128 is the round constant.
        __m128i K0  = _mm_loadu_si128(cast(const(__m128i*))(key));
        __m128i K1  = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128!0x01(K0));
        __m128i K2  = aes_128_key_expansion(K1, _mm_aeskeygenassist_si128!0x02(K1));
        __m128i K3  = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128!0x04(K2));
        __m128i K4  = aes_128_key_expansion(K3, _mm_aeskeygenassist_si128!0x08(K3));
        __m128i K5  = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128!0x10(K4));
        __m128i K6  = aes_128_key_expansion(K5, _mm_aeskeygenassist_si128!0x20(K5));
        __m128i K7  = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128!0x40(K6));
        __m128i K8  = aes_128_key_expansion(K7, _mm_aeskeygenassist_si128!0x80(K7));
        __m128i K9  = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128!0x1B(K8));
        __m128i K10 = aes_128_key_expansion(K9, _mm_aeskeygenassist_si128!0x36(K9));

        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        _mm_storeu_si128(EK_mm     , K0);
        _mm_storeu_si128(EK_mm +  1, K1);
        _mm_storeu_si128(EK_mm +  2, K2);
        _mm_storeu_si128(EK_mm +  3, K3);
        _mm_storeu_si128(EK_mm +  4, K4);
        _mm_storeu_si128(EK_mm +  5, K5);
        _mm_storeu_si128(EK_mm +  6, K6);
        _mm_storeu_si128(EK_mm +  7, K7);
        _mm_storeu_si128(EK_mm +  8, K8);
        _mm_storeu_si128(EK_mm +  9, K9);
        _mm_storeu_si128(EK_mm + 10, K10);

        // Now generate decryption keys: the encryption round keys in reverse
        // order, with _mm_aesimc_si128 applied to all but the first and last.
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);
        _mm_storeu_si128(DK_mm     , K10);
        _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
        _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
        _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
        _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
        _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
        _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
        _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
        _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
        _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
        _mm_storeu_si128(DK_mm + 10, K0);
    }


    // Expanded encryption / decryption round keys (44 words each once set).
    SecureVector!uint m_EK, m_DK;
}

/**
* AES-192 using AES-NI
*
* The expanded key is held as 13 round keys of four 32-bit words each
* (52 words) in m_EK (encryption) and m_DK (decryption).
*/
final class AES192NI : BlockCipherFixedParams!(16, 24), BlockCipher, SymmetricAlgorithm
{
public:
    /// Matches the number of blocks handled per iteration of the main loop
    /// in encryptN()/decryptN().
    override @property size_t parallelism() const { return 4; }

    /*
    * AES-192 Encryption
    *
    * Encrypts `blocks` consecutive 128-bit blocks read from `input` and writes
    * them to `output`. Blocks are processed four at a time while at least
    * four remain; the remainder is processed one block at a time.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // The round keys are read through a raw pointer below, so make sure
        // keySchedule() has populated them (and clear() has not wiped them).
        assert(m_EK.length >= 52, "AES-192 encryption key schedule is not set");

        // Input is only ever read: keep it const instead of casting const away.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) key_mm = cast(const(__m128i*))(m_EK.ptr);

        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);

        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);

            // Initial round-key addition
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);

            // Each mixin applies one round to B0..B3 with the named round key.
            mixin(AES_ENC_4_ROUNDS!(K1));
            mixin(AES_ENC_4_ROUNDS!(K2));
            mixin(AES_ENC_4_ROUNDS!(K3));
            mixin(AES_ENC_4_ROUNDS!(K4));
            mixin(AES_ENC_4_ROUNDS!(K5));
            mixin(AES_ENC_4_ROUNDS!(K6));
            mixin(AES_ENC_4_ROUNDS!(K7));
            mixin(AES_ENC_4_ROUNDS!(K8));
            mixin(AES_ENC_4_ROUNDS!(K9));
            mixin(AES_ENC_4_ROUNDS!(K10));
            mixin(AES_ENC_4_ROUNDS!(K11));
            mixin(AES_ENC_4_LAST_ROUNDS!(K12));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }

        // Remaining (fewer than four) blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);

            B = _mm_xor_si128(B, K0);

            B = _mm_aesenc_si128(B, K1);
            B = _mm_aesenc_si128(B, K2);
            B = _mm_aesenc_si128(B, K3);
            B = _mm_aesenc_si128(B, K4);
            B = _mm_aesenc_si128(B, K5);
            B = _mm_aesenc_si128(B, K6);
            B = _mm_aesenc_si128(B, K7);
            B = _mm_aesenc_si128(B, K8);
            B = _mm_aesenc_si128(B, K9);
            B = _mm_aesenc_si128(B, K10);
            B = _mm_aesenc_si128(B, K11);
            B = _mm_aesenclast_si128(B, K12);

            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * AES-192 Decryption
    *
    * Decrypts `blocks` consecutive 128-bit blocks read from `input` and writes
    * them to `output`, using the decryption round keys in m_DK. Same 4-block
    * main loop / single-block tail structure as encryptN().
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        assert(m_DK.length >= 52, "AES-192 decryption key schedule is not set");

        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) key_mm = cast(const(__m128i*))(m_DK.ptr);

        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);

        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);

            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);

            mixin(AES_DEC_4_ROUNDS!(K1));
            mixin(AES_DEC_4_ROUNDS!(K2));
            mixin(AES_DEC_4_ROUNDS!(K3));
            mixin(AES_DEC_4_ROUNDS!(K4));
            mixin(AES_DEC_4_ROUNDS!(K5));
            mixin(AES_DEC_4_ROUNDS!(K6));
            mixin(AES_DEC_4_ROUNDS!(K7));
            mixin(AES_DEC_4_ROUNDS!(K8));
            mixin(AES_DEC_4_ROUNDS!(K9));
            mixin(AES_DEC_4_ROUNDS!(K10));
            mixin(AES_DEC_4_ROUNDS!(K11));
            mixin(AES_DEC_4_LAST_ROUNDS!(K12));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);

            B = _mm_xor_si128(B, K0);

            B = _mm_aesdec_si128(B, K1);
            B = _mm_aesdec_si128(B, K2);
            B = _mm_aesdec_si128(B, K3);
            B = _mm_aesdec_si128(B, K4);
            B = _mm_aesdec_si128(B, K5);
            B = _mm_aesdec_si128(B, K6);
            B = _mm_aesdec_si128(B, K7);
            B = _mm_aesdec_si128(B, K8);
            B = _mm_aesdec_si128(B, K9);
            B = _mm_aesdec_si128(B, K10);
            B = _mm_aesdec_si128(B, K11);
            B = _mm_aesdeclast_si128(B, K12);

            _mm_storeu_si128(out_mm + i, B);
        }
    }


    /*
    * Clear memory of sensitive data
    */
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-192"; }
    override BlockCipher clone() const { return new AES192NI; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-192 Key Schedule
    *
    * Expands the 24-byte key at `key` into 52 words of encryption round keys
    * (m_EK), six words per expansion step, and derives the decryption round
    * keys (m_DK) from them.
    */
    override void keySchedule(const(ubyte)* key, size_t)
    {
        m_EK.resize(52);
        m_DK.resize(52);

        // K0 = key bytes 0..15. K1 is loaded from key bytes 8..23 and shifted
        // right by 8 bytes, leaving key bytes 16..23 in its low half.
        __m128i K0 = _mm_loadu_si128(cast(const(__m128i*))(key));
        __m128i K1 = _mm_loadu_si128(cast(const(__m128i*))(key + 8));
        K1 = _mm_srli_si128!8(K1);

        // The first six words of the schedule come straight from the key.
        loadLittleEndian(m_EK.ptr, key, 6);

        // Each step updates K0/K1 in place and writes the next words of m_EK
        // starting at the given offset. The final step (offset 48) only has
        // room for four words, hence `last == true`.
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x01(K1), &m_EK[6],  false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x02(K1), &m_EK[12], false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x04(K1), &m_EK[18], false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x08(K1), &m_EK[24], false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x10(K1), &m_EK[30], false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x20(K1), &m_EK[36], false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x40(K1), &m_EK[42], false);
        aes_192_key_expansion(&K0, &K1, _mm_aeskeygenassist_si128!0x80(K1), &m_EK[48], true);

        // Now generate decryption keys: the encryption round keys in reverse
        // order, with _mm_aesimc_si128 applied to all but the first and last.
        const(__m128i*) EK_mm = cast(const(__m128i*))(m_EK.ptr);

        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);
        _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
        _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
        _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
        _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
        _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
        _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
        _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
        _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
        _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
        _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
        _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
        _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
        _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
    }


    // Expanded encryption / decryption round keys (52 words each once set).
    SecureVector!uint m_EK, m_DK;
}

/**
* AES-256 using AES-NI
*
* The expanded key is held as 15 round keys of four 32-bit words each
* (60 words) in m_EK (encryption) and m_DK (decryption).
*/
final class AES256NI : BlockCipherFixedParams!(16, 32), BlockCipher, SymmetricAlgorithm
{
public:
    /// Matches the number of blocks handled per iteration of the main loop
    /// in encryptN()/decryptN().
    override @property size_t parallelism() const { return 4; }

    /*
    * AES-256 Encryption
    *
    * Encrypts `blocks` consecutive 128-bit blocks read from `input` and writes
    * them to `output`. Blocks are processed four at a time while at least
    * four remain; the remainder is processed one block at a time.
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        // The round keys are read through a raw pointer below, so make sure
        // keySchedule() has populated them (and clear() has not wiped them).
        assert(m_EK.length >= 60, "AES-256 encryption key schedule is not set");

        // Input is only ever read: keep it const instead of casting const away.
        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) key_mm = cast(const(__m128i*))(m_EK.ptr);

        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);
        __m128i K13 = _mm_loadu_si128(key_mm + 13);
        __m128i K14 = _mm_loadu_si128(key_mm + 14);

        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);

            // Initial round-key addition
            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);

            // Each mixin applies one round to B0..B3 with the named round key.
            mixin(AES_ENC_4_ROUNDS!(K1));
            mixin(AES_ENC_4_ROUNDS!(K2));
            mixin(AES_ENC_4_ROUNDS!(K3));
            mixin(AES_ENC_4_ROUNDS!(K4));
            mixin(AES_ENC_4_ROUNDS!(K5));
            mixin(AES_ENC_4_ROUNDS!(K6));
            mixin(AES_ENC_4_ROUNDS!(K7));
            mixin(AES_ENC_4_ROUNDS!(K8));
            mixin(AES_ENC_4_ROUNDS!(K9));
            mixin(AES_ENC_4_ROUNDS!(K10));
            mixin(AES_ENC_4_ROUNDS!(K11));
            mixin(AES_ENC_4_ROUNDS!(K12));
            mixin(AES_ENC_4_ROUNDS!(K13));
            mixin(AES_ENC_4_LAST_ROUNDS!(K14));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }

        // Remaining (fewer than four) blocks, one at a time.
        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);

            B = _mm_xor_si128(B, K0);

            B = _mm_aesenc_si128(B, K1);
            B = _mm_aesenc_si128(B, K2);
            B = _mm_aesenc_si128(B, K3);
            B = _mm_aesenc_si128(B, K4);
            B = _mm_aesenc_si128(B, K5);
            B = _mm_aesenc_si128(B, K6);
            B = _mm_aesenc_si128(B, K7);
            B = _mm_aesenc_si128(B, K8);
            B = _mm_aesenc_si128(B, K9);
            B = _mm_aesenc_si128(B, K10);
            B = _mm_aesenc_si128(B, K11);
            B = _mm_aesenc_si128(B, K12);
            B = _mm_aesenc_si128(B, K13);
            B = _mm_aesenclast_si128(B, K14);

            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * AES-256 Decryption
    *
    * Decrypts `blocks` consecutive 128-bit blocks read from `input` and writes
    * them to `output`, using the decryption round keys in m_DK. Same 4-block
    * main loop / single-block tail structure as encryptN().
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        assert(m_DK.length >= 60, "AES-256 decryption key schedule is not set");

        const(__m128i)* in_mm = cast(const(__m128i)*)(input);
        __m128i* out_mm = cast(__m128i*)(output);

        const(__m128i*) key_mm = cast(const(__m128i*))(m_DK.ptr);

        __m128i K0  = _mm_loadu_si128(key_mm);
        __m128i K1  = _mm_loadu_si128(key_mm + 1);
        __m128i K2  = _mm_loadu_si128(key_mm + 2);
        __m128i K3  = _mm_loadu_si128(key_mm + 3);
        __m128i K4  = _mm_loadu_si128(key_mm + 4);
        __m128i K5  = _mm_loadu_si128(key_mm + 5);
        __m128i K6  = _mm_loadu_si128(key_mm + 6);
        __m128i K7  = _mm_loadu_si128(key_mm + 7);
        __m128i K8  = _mm_loadu_si128(key_mm + 8);
        __m128i K9  = _mm_loadu_si128(key_mm + 9);
        __m128i K10 = _mm_loadu_si128(key_mm + 10);
        __m128i K11 = _mm_loadu_si128(key_mm + 11);
        __m128i K12 = _mm_loadu_si128(key_mm + 12);
        __m128i K13 = _mm_loadu_si128(key_mm + 13);
        __m128i K14 = _mm_loadu_si128(key_mm + 14);

        while (blocks >= 4)
        {
            __m128i B0 = _mm_loadu_si128(in_mm + 0);
            __m128i B1 = _mm_loadu_si128(in_mm + 1);
            __m128i B2 = _mm_loadu_si128(in_mm + 2);
            __m128i B3 = _mm_loadu_si128(in_mm + 3);

            B0 = _mm_xor_si128(B0, K0);
            B1 = _mm_xor_si128(B1, K0);
            B2 = _mm_xor_si128(B2, K0);
            B3 = _mm_xor_si128(B3, K0);

            mixin(AES_DEC_4_ROUNDS!(K1));
            mixin(AES_DEC_4_ROUNDS!(K2));
            mixin(AES_DEC_4_ROUNDS!(K3));
            mixin(AES_DEC_4_ROUNDS!(K4));
            mixin(AES_DEC_4_ROUNDS!(K5));
            mixin(AES_DEC_4_ROUNDS!(K6));
            mixin(AES_DEC_4_ROUNDS!(K7));
            mixin(AES_DEC_4_ROUNDS!(K8));
            mixin(AES_DEC_4_ROUNDS!(K9));
            mixin(AES_DEC_4_ROUNDS!(K10));
            mixin(AES_DEC_4_ROUNDS!(K11));
            mixin(AES_DEC_4_ROUNDS!(K12));
            mixin(AES_DEC_4_ROUNDS!(K13));
            mixin(AES_DEC_4_LAST_ROUNDS!(K14));

            _mm_storeu_si128(out_mm + 0, B0);
            _mm_storeu_si128(out_mm + 1, B1);
            _mm_storeu_si128(out_mm + 2, B2);
            _mm_storeu_si128(out_mm + 3, B3);

            blocks -= 4;
            in_mm += 4;
            out_mm += 4;
        }

        foreach (size_t i; 0 .. blocks)
        {
            __m128i B = _mm_loadu_si128(in_mm + i);

            B = _mm_xor_si128(B, K0);

            B = _mm_aesdec_si128(B, K1);
            B = _mm_aesdec_si128(B, K2);
            B = _mm_aesdec_si128(B, K3);
            B = _mm_aesdec_si128(B, K4);
            B = _mm_aesdec_si128(B, K5);
            B = _mm_aesdec_si128(B, K6);
            B = _mm_aesdec_si128(B, K7);
            B = _mm_aesdec_si128(B, K8);
            B = _mm_aesdec_si128(B, K9);
            B = _mm_aesdec_si128(B, K10);
            B = _mm_aesdec_si128(B, K11);
            B = _mm_aesdec_si128(B, K12);
            B = _mm_aesdec_si128(B, K13);
            B = _mm_aesdeclast_si128(B, K14);

            _mm_storeu_si128(out_mm + i, B);
        }
    }

    /*
    * Clear memory of sensitive data
    */
    override void clear()
    {
        zap(m_EK);
        zap(m_DK);
    }

    @property string name() const { return "AES-256"; }
    override BlockCipher clone() const { return new AES256NI; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
protected:
    /*
    * AES-256 Key Schedule
    *
    * Expands the 32-byte key at `key` into 15 encryption round keys (m_EK)
    * and derives the 15 decryption round keys (m_DK) from them.
    */
    override void keySchedule(const(ubyte)* key, size_t)
    {
        m_EK.resize(60);
        m_DK.resize(60);

        // The first two round keys are the two halves of the user key.
        __m128i K0 = _mm_loadu_si128(cast(const(__m128i*))(key));
        __m128i K1 = _mm_loadu_si128(cast(const(__m128i*))(key + 16));

        // Even-numbered keys use the AES-128 style step with a round constant;
        // odd-numbered keys use the AES-256 specific step.
        __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128!0x01(K1));
        __m128i K3 = aes_256_key_expansion(K1, K2);

        __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128!0x02(K3));
        __m128i K5 = aes_256_key_expansion(K3, K4);

        __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128!0x04(K5));
        __m128i K7 = aes_256_key_expansion(K5, K6);

        __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128!0x08(K7));
        __m128i K9 = aes_256_key_expansion(K7, K8);

        __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128!0x10(K9));
        __m128i K11 = aes_256_key_expansion(K9, K10);

        __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128!0x20(K11));
        __m128i K13 = aes_256_key_expansion(K11, K12);

        __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128!0x40(K13));

        __m128i* EK_mm = cast(__m128i*)(m_EK.ptr);
        _mm_storeu_si128(EK_mm     , K0);
        _mm_storeu_si128(EK_mm +  1, K1);
        _mm_storeu_si128(EK_mm +  2, K2);
        _mm_storeu_si128(EK_mm +  3, K3);
        _mm_storeu_si128(EK_mm +  4, K4);
        _mm_storeu_si128(EK_mm +  5, K5);
        _mm_storeu_si128(EK_mm +  6, K6);
        _mm_storeu_si128(EK_mm +  7, K7);
        _mm_storeu_si128(EK_mm +  8, K8);
        _mm_storeu_si128(EK_mm +  9, K9);
        _mm_storeu_si128(EK_mm + 10, K10);
        _mm_storeu_si128(EK_mm + 11, K11);
        _mm_storeu_si128(EK_mm + 12, K12);
        _mm_storeu_si128(EK_mm + 13, K13);
        _mm_storeu_si128(EK_mm + 14, K14);

        // Now generate decryption keys: the encryption round keys in reverse
        // order, with _mm_aesimc_si128 applied to all but the first and last.
        __m128i* DK_mm = cast(__m128i*)(m_DK.ptr);
        _mm_storeu_si128(DK_mm     , K14);
        _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
        _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
        _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
        _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
        _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
        _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
        _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
        _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
        _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
        _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
        _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
        _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
        _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
        _mm_storeu_si128(DK_mm + 14, K0);
    }


    // Expanded encryption / decryption round keys (60 words each once set).
    SecureVector!uint m_EK, m_DK;
}

/*
* One AES-128 key expansion step.
*
* Broadcasts the highest 32-bit lane of `key_with_rcon` (the output of
* _mm_aeskeygenassist_si128) to all lanes, folds `key` with itself shifted
* left by 4, 8 and 12 bytes, and XORs the two results.
*/
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
{
    key_with_rcon = _mm_shuffle_epi32!(_MM_SHUFFLE(3,3,3,3))(key_with_rcon);
    key = _mm_xor_si128(key, _mm_slli_si128!4(key));
    key = _mm_xor_si128(key, _mm_slli_si128!4(key));
    key = _mm_xor_si128(key, _mm_slli_si128!4(key));
    return _mm_xor_si128(key, key_with_rcon);
}

/*
* One AES-192 key expansion step.
*
* Updates *K1 in place and stores it as four words at `output`. Unless `last`
* is set, also updates *K2 in place and stores its two lowest words at
* output[4] and output[5], so a full step emits six words.
*/
void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           uint* output, bool last)
{
    __m128i key1 = *K1;
    __m128i key2 = *K2;

    key2_with_rcon = _mm_shuffle_epi32!(_MM_SHUFFLE(1,1,1,1))(key2_with_rcon);
    key1 = _mm_xor_si128(key1, _mm_slli_si128!4(key1));
    key1 = _mm_xor_si128(key1, _mm_slli_si128!4(key1));
    key1 = _mm_xor_si128(key1, _mm_slli_si128!4(key1));
    key1 = _mm_xor_si128(key1, key2_with_rcon);

    *K1 = key1;
    _mm_storeu_si128(cast(__m128i*)(output), key1);

    // The final step only produces four words; skip the K2 update.
    if (last)
        return;

    key2 = _mm_xor_si128(key2, _mm_slli_si128!4(key2));
    key2 = _mm_xor_si128(key2, _mm_shuffle_epi32!(_MM_SHUFFLE(3,3,3,3))(key1));

    *K2 = key2;
    output[4] = _mm_cvtsi128_si32(key2);
    output[5] = _mm_cvtsi128_si32(_mm_srli_si128!4(key2));
}

/*
* The second half of the AES-256 key expansion (other half same as AES-128)
*
* Uses _mm_aeskeygenassist_si128 with a zero round constant on `key2` and
* broadcasts lane 2 of the result instead of lane 3.
*/
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
{
    __m128i key_with_rcon = _mm_aeskeygenassist_si128!0x00(key2);
    key_with_rcon = _mm_shuffle_epi32!(_MM_SHUFFLE(2,2,2,2))(key_with_rcon);

    key = _mm_xor_si128(key, _mm_slli_si128!4(key));
    key = _mm_xor_si128(key, _mm_slli_si128!4(key));
    key = _mm_xor_si128(key, _mm_slli_si128!4(key));
    return _mm_xor_si128(key, key_with_rcon);
}

// The four templates below expand to source text that applies one AES round
// to the local variables B0..B3, using the round key whose identifier is
// passed as the alias parameter. They are meant to be used with mixin().

enum string AES_ENC_4_ROUNDS(alias K) = q{
    B0 = _mm_aesenc_si128(B0, %1$s);
    B1 = _mm_aesenc_si128(B1, %1$s);
    B2 = _mm_aesenc_si128(B2, %1$s);
    B3 = _mm_aesenc_si128(B3, %1$s);
}.format(__traits(identifier, K));

enum string AES_ENC_4_LAST_ROUNDS(alias K) = q{
    B0 = _mm_aesenclast_si128(B0, %1$s);
    B1 = _mm_aesenclast_si128(B1, %1$s);
    B2 = _mm_aesenclast_si128(B2, %1$s);
    B3 = _mm_aesenclast_si128(B3, %1$s);
}.format(__traits(identifier, K));

enum string AES_DEC_4_ROUNDS(alias K) = q{
    B0 = _mm_aesdec_si128(B0, %1$s);
    B1 = _mm_aesdec_si128(B1, %1$s);
    B2 = _mm_aesdec_si128(B2, %1$s);
    B3 = _mm_aesdec_si128(B3, %1$s);
}.format(__traits(identifier, K));

enum string AES_DEC_4_LAST_ROUNDS(alias K) = q{
    B0 = _mm_aesdeclast_si128(B0, %1$s);
    B1 = _mm_aesdeclast_si128(B1, %1$s);
    B2 = _mm_aesdeclast_si128(B2, %1$s);
    B3 = _mm_aesdeclast_si128(B3, %1$s);
}.format(__traits(identifier, K));

// Source-text builders for the key expansion calls. The key schedules in this
// module now make these calls directly; the templates are kept so that any
// existing mixin() users keep compiling.

enum string AES_128_key_exp(string K, ubyte RCON) =
    `aes_128_key_expansion(` ~ K ~ `, _mm_aeskeygenassist_si128!` ~ RCON.to!string ~ `(` ~ K ~ `));`;

enum string AES_192_key_exp(ubyte RCON, size_t EK_OFF) =
    `aes_192_key_expansion(&K0, &K1,
                           _mm_aeskeygenassist_si128! ` ~ RCON.to!string ~ `(K1),
                           &m_EK[` ~ EK_OFF.stringof ~ `], ` ~ EK_OFF.stringof ~ ` == 48);`;