/*
* emmintrin.h style functions
* (C) 2014-2015 Etienne Cimon
*
* License:
* Released under the MIT license
*/
module botan.utils.simd.emmintrin;

import botan.constants;
static if (BOTAN_HAS_SIMD_SSE2):
// The wrappers below only exist for GDC and for 64-bit DMD-style inline asm;
// a 32-bit inline-asm build gets a compile-time message pointing at the right
// dub configuration.
version(D_InlineAsm_X86) {
    version(Windows) { pragma(msg, "Error: Loaded SIMD SSE2 in a x86 build! Use --config=windows_x86 in dub"); }
    else { pragma(msg, "Error: Loaded SIMD SSE2 in a x86 build! Use --config=posix_x86 in dub"); }
}
import core.simd;
import std.conv : to;

pure:
nothrow:
@trusted:

/// 128-bit integer vector, viewed as 16 signed bytes.
alias __m128i = byte16;
/// 64-bit integer value.
alias __m64 = ulong;

/// Packs four 2-bit lane selectors into one shuffle immediate:
/// `z` lands in bits 7..6, `y` in 5..4, `x` in 3..2 and `w` in 1..0.
int _MM_SHUFFLE(int z, int y, int x, int w)
{
    return ( (z<<6) | (y<<4) | (x<<2) | w );
}

// _mm_set1_epi32 (compile-time value)
// Broadcasts the template argument `i` into all four 32-bit lanes.
__m128i _mm_set1_epi32 (int i)() {
    int4 vec = [i, i, i, i];
    return *cast(__m128i*) &vec;
}

// _mm_set1_epi32 (run-time value)
// Broadcasts `i` into all four 32-bit lanes through a local array and an
// unaligned load.
__m128i _mm_set1_epi32 (int i) {
    align(16) int[4] vec = [i, i, i, i];
    return _mm_loadu_si128(cast(__m128i*)&vec);
}

// _mm_set_epi32 (compile-time values)
// Arguments are given most-significant lane first, so they are stored in
// reverse order: `l` becomes lane 0 and `i` becomes lane 3.
immutable(__m128i) _mm_set_epi32 (int i, int j, int k, int l)() {
    int4 vec = [l, k, j, i];
    return *cast(immutable(__m128i)*) &vec;
}

// _mm_set_epi32 (run-time values)
// Same lane order as the compile-time overload: `l` is lane 0, `i` is lane 3.
immutable(__m128i) _mm_set_epi32 (int i, int j, int k, int l) {
    align(16) int[4] vec = [l, k, j, i];
    return _mm_loadu_si128(cast(__m128i*)&vec);
}

// _mm_set1_epi8 (compile-time value)
// Broadcasts the template argument `i` into all sixteen byte lanes.
immutable(__m128i) _mm_set1_epi8 (byte i)() {
    return byte16([i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i]);
}

// _mm_set1_epi8 (compile-time array)
// Despite the "set1" name this overload does not broadcast: it builds a vector
// from the first 16 elements of `arr` in reverse order (arr[15] becomes lane 0,
// arr[0] becomes lane 15), i.e. the argument order of _mm_set_epi8.
immutable(__m128i) _mm_set1_epi8(byte[] arr)() {
    // Fail with a readable message instead of an index error inside the mixin.
    static assert(arr.length >= 16, "_mm_set1_epi8 needs an array of at least 16 bytes");
    mixin(`byte16 arr_fix = [` ~ arr[15].to!string ~ `, ` ~ arr[14].to!string ~ `,
                            ` ~ arr[13].to!string ~ `, ` ~ arr[12].to!string ~ `,
                            ` ~ arr[11].to!string ~ `, ` ~ arr[10].to!string ~ `,
                            ` ~ arr[9].to!string ~ `, ` ~ arr[8].to!string ~ `,
                            ` ~ arr[7].to!string ~ `, ` ~ arr[6].to!string ~ `,
                            ` ~ arr[5].to!string ~ `, ` ~ arr[4].to!string ~ `,
                            ` ~ arr[3].to!string ~ `, ` ~ arr[2].to!string ~ `,
                            ` ~ arr[1].to!string ~ `, ` ~ arr[0].to!string ~ `];`);
    return cast(immutable __m128i)arr_fix;
}

// _mm_set1_epi16 (compile-time value)
// Broadcasts the template argument `w` into all eight 16-bit lanes.
__m128i _mm_set1_epi16(short w)() {
    short8 vec = short8([w,w,w,w,w,w,w,w]);
    return *cast(__m128i*) &vec;
}

version(GDC) {
    // GDC <--> emmintrin => gcc/gcc/config/i386/emmintrin.h
    // Every wrapper in this branch forwards to the matching GCC builtin and is
    // marked force-inline through the attribute label below.
    static import gcc.attribute;
    import gcc.builtins;
    enum inline = gcc.attribute.attribute("forceinline");
    @inline:

    // _mm_set1_epi16 (run-time value)
    // Broadcasts `w` into all eight 16-bit lanes. Uses the same local-array +
    // unaligned-load pattern as _mm_set1_epi32(int); the previous extended-asm
    // version used matching constraints ("0"/"1") without any output operand.
    __m128i _mm_set1_epi16(short w) {
        short[8] lanes = [w,w,w,w,w,w,w,w];
        return _mm_loadu_si128(cast(__m128i*) &lanes);
    }

    // bswap64 ; reverses the byte order of a 64-bit value.
    ulong bswap64(ulong val) {
        return cast(ulong) __builtin_bswap64(val);
    }

    // _mm_cvtsi128_si32 ; returns the lowest 32-bit lane.
    int _mm_cvtsi128_si32(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, 0);
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pminub128(a, b);
    }

    // _mm_shuffle_epi8 ; PSHUFB (an SSSE3 instruction)
    __m128i _mm_shuffle_epi8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pshufb128(a, b);
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8) a, cast(short8) b);
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmullw128(cast(short8) a, cast(short8) b);
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubw128(cast(short8) a, cast(short8) b);
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddw128(cast(short8) a, cast(short8) b);
    }

    // _mm_srli_epi16 ; PSRLW
    __m128i _mm_srli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8) a, imm);
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8) a, imm);
    }

    // _mm_srli_epi32 ; PSRLD
    // Provided so this branch offers the same API as the inline-asm branch.
    __m128i _mm_srli_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldi128(cast(int4) a, imm);
    }

    // _mm_slli_epi32 ; PSLLD
    // Provided so this branch offers the same API as the inline-asm branch.
    __m128i _mm_slli_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldi128(cast(int4) a, imm);
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    __m128i _mm_shufflehi_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8) a, imm);
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8) a, imm);
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddd128(cast(int4) a, cast(int4) b);
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubd128(cast(int4) a, cast(int4) b);
    }

    // _mm_cmplt_epi32 ; PCMPGTD with the operands swapped (a < b  <=>  b > a)
    __m128i _mm_cmplt_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpgtd128(cast(int4) b, cast(int4) a);
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufd(cast(int4) a, imm);
    }

    // _mm_extract_epi32 ; pextrd
    // Returns 32-bit lane `ndx` of `a`. The result is an int, so it is cast to
    // int (the previous cast to __m128i did not match the return type).
    int _mm_extract_epi32(__m128i a, in int ndx) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_extract_epi32 ; pextrd (compile-time lane index)
    // Same call shape as the inline-asm branch: _mm_extract_epi32!ndx(a).
    int _mm_extract_epi32(int ndx)(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhdq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckldq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_setzero_si128 ; PXOR
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    // NOTE(review): `p` is handed to the builtin unchanged; confirm the pointer
    // type GDC declares for __builtin_ia32_loaddqu.
    __m128i _mm_loadu_si128 (in __m128i* p) {
        return cast(__m128i) __builtin_ia32_loaddqu(p);
    }

    // _mm_storeu_si128 ; MOVDQU
    // Stores `a` to `*p`. The function is void, so the builtin is simply called
    // (the previous body tried to return its result cast to __m128i).
    void _mm_storeu_si128()(__m128i* p, auto const ref __m128i a) {
        __builtin_ia32_storedqu(p, a);
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_por128(cast(long2) a, cast(long2) b);
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pandn128(cast(long2) a, cast(long2) b);
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pand128(cast(long2) a, cast(long2) b);
    }

    // _mm_xor_si128 ; PXOR
    // Declared as a template (empty parameter list) like its siblings, because
    // `auto ref` parameters are only legal on template functions.
    __m128i _mm_xor_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pxor128(cast(long2) a, cast(long2) b);
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` is a byte count; it is multiplied by 8 before being handed to the
    // builtin.
    __m128i _mm_srli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldqi128(a, imm*8);
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` is a byte count; it is multiplied by 8 before being handed to the
    // builtin.
    __m128i _mm_slli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldqi128(a, imm*8);
    }
}

// Disabled LDC branch (version(none) is never compiled). Kept for reference.
version(none) {
    import ldc.gccbuiltins_x86;

    pragma(LDC_intrinsic, "llvm.bswap.i64")
        ulong bswap64(ulong i);

    __m128i _mm_set1_epi16(short w) {
        short[8] a = [w,w,w,w,w,w,w,w];
        __m128i b;
        short[8]* _a = &a;
        __m128i* _b = &b;
        mixin( q{
            __asm pure nothrow {
                "movdqu (%0), %xmm0\n"
                ~ "movdqu %xmm0, (%1)\n"
                : : "0" (_a), "1" (_b) : "xmm0"
            }
        });
        return b;
    }

    int _mm_cvtsi128_si32(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, 0);
    }

    // _mm_shuffle_epi8
    __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
        return cast(__m128i) __builtin_ia32_pshufb128(a, b);
    }

    // _mm_min_epu8
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pminub128(a, b);
    }

    // _mm_subs_epu16
    __m128i _mm_subs_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8) a, cast(short8) b);
    }

    // _mm_set1_epi16
    // NOTE(review): duplicates the _mm_set1_epi16(short) declared earlier in
    // this block; one of the two must go before the block is ever enabled.
    __m128i _mm_set1_epi16 (short w) {
        return cast(__m128i) short8([w,w,w,w,w,w,w,w]);
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmullw128(cast(short8) a, cast(short8) b);
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubw128(cast(short8) a, cast(short8) b);
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddw128(cast(short8) a, cast(short8) b);
    }

    // _mm_srli_epi16 ; PSRLW
    __m128i _mm_srli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8) a, imm);
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8) a, imm);
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    __m128i _mm_shufflehi_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8) a, imm);
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8) a, imm);
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddd128(cast(int4) a, cast(int4) b);
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubd128(cast(int4) a, cast(int4) b);
    }

    // _mm_cmplt_epi32 ; PCMPGTD with the operands swapped
    __m128i _mm_cmplt_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpgtd128(cast(int4) b, cast(int4) a);
    }

    // _mm_shuffle_epi32
    __m128i _mm_shuffle_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufd(cast(int4) a, imm);
    }

    // _mm_extract_epi32 ; pextrd
    int _mm_extract_epi32(__m128i a, in int ndx) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhdq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckldq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_setzero_si128 ; PXOR
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    __m128i _mm_loadu_si128 (in __m128i* p) {
        return cast(__m128i) __builtin_ia32_loaddqu(p);
    }

    // _mm_storeu_si128 ; MOVDQU
    void _mm_storeu_si128()(__m128i *p, auto const ref __m128i a) {
        __builtin_ia32_storedqu(p, a);
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_por128(cast(long2) a, cast(long2) b);
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pandn128(cast(long2) a, cast(long2) b);
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pand128(cast(long2) a, cast(long2) b);
    }

    // _mm_xor_si128 ; PXOR
    __m128i _mm_xor_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pxor128(cast(long2) a, cast(long2) b);
    }

    // _mm_srli_si128 ; PSRLDQ
    __m128i _mm_srli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldqi128(a, imm*8);
    }

    // _mm_slli_si128 ; PSLLDQ
    __m128i _mm_slli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldqi128(a, imm*8);
    }

    // bswap64

}

// DMD-style inline assembler branch (64-bit).
// Every wrapper follows the same shape: take the addresses of the operands,
// load them into XMM registers with MOVDQU, apply a single instruction, then
// store the result with MOVDQU into a local that is returned.
version(D_InlineAsm_X86_64) {
    // _mm_set1_epi16 (run-time value)
    // Broadcasts `w` into all eight 16-bit lanes via a local array.
    __m128i _mm_set1_epi16(short w) {
        short[8] a = [w,w,w,w,w,w,w,w];
        short[8]* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }

        return b;
    }

    // _mm_cvtsi128_si32 ; MOVD
    // Returns the lowest 32-bit lane of `a`.
    int _mm_cvtsi128_si32()(auto const ref __m128i a) {
        int ret;
        int* _ret = &ret;
        const(__m128i)* _a = &a;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _ret;
            movdqu XMM1, [RAX];
            movd [RBX], XMM1;
        }
        return ret;
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pminub XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_shuffle_epi8 ; PSHUFB (an SSSE3 instruction)
    __m128i _mm_shuffle_epi8()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pshufb XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            psubusw XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pmulhuw XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pcmpeqw XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pmullw XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            psubw XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            paddw XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_srli_epi16 ; PSRLW
    // The shift count is a template argument spliced into the asm text as an
    // immediate, hence the string mixin.
    __m128i _mm_srli_epi16(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            psrlw XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_srli_epi32 ; PSRLD
    __m128i _mm_srli_epi32(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            psrld XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_slli_epi32 ; PSLLD
    __m128i _mm_slli_epi32(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            pslld XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            psllw XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    // Takes `auto const ref` like the other wrappers so that temporaries
    // (rvalues) are accepted as well as lvalues.
    __m128i _mm_shufflehi_epi16(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM2, [RAX];
            pshufhw XMM1, XMM2, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM2, [RAX];
            pshuflw XMM1, XMM2, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            paddd XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            psubd XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_cmplt_epi32 ; PCMPGTD
    // a < b is computed as b > a: note that RAX receives `_b` and RBX `_a`.
    __m128i _mm_cmplt_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _b;
            mov RBX, _a;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pcmpgtd XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM2, [RAX];
            pshufd XMM1, XMM2, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_extract_epi32 ; pextrd (an SSE4.1 instruction)
    // Returns 32-bit lane `ndx` (template argument) of `a`.
    int _mm_extract_epi32(int ndx)(__m128i a) {
        __m128i* _a = &a;
        int b;
        int* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM2, [RAX];
            pextrd ECX, XMM2, ` ~ ndx.to!string ~ `;
            mov [RBX], ECX;
        }`);
        return b;
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            punpckhdq XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            punpckldq XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            punpckhqdq XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            punpcklqdq XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_setzero_si128 ; PXOR
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    // Loads 16 bytes from `p` with an unaligned move.
    __m128i _mm_loadu_si128 (in __m128i* p) {
        __m128i a;
        __m128i* _a = &a;

        asm pure nothrow {
            mov RAX, p;
            mov RBX, _a;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }

        return a;
    }

    // _mm_storeu_si128 ; MOVDQU
    // Stores the 16 bytes of `a` to `p` with an unaligned move.
    void _mm_storeu_si128()(__m128i* p, auto const ref __m128i a) {
        const(__m128i)* _a = &a;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, p;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            por XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_andnot_si128 ; PANDN
    // PANDN inverts its destination operand (here `a`) before the AND with `b`.
    __m128i _mm_andnot_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pandn XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pand XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_xor_si128 ; PXOR
    __m128i _mm_xor_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pxor XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` is passed to the instruction unchanged (PSRLDQ counts bytes).
    __m128i _mm_srli_si128(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            psrldq XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` is passed to the instruction unchanged (PSLLDQ counts bytes).
    __m128i _mm_slli_si128(int imm)(auto const ref __m128i a) {
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            pslldq XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }`);
        return b;
    }
}