/*
* emmintrin.h style functions
* (C) 2014-2015 Etienne Cimon
*
* License:
* Released under the MIT license
*/
8 module botan.utils.simd.emmintrin;
9 
10 import botan.constants;
11 static if (BOTAN_HAS_SIMD_SSE2):
// When compiled for 32-bit x86 inline asm, print a compile-time message that
// names the dub configuration to use. pragma(msg) only prints; it does not
// stop the build.
version(D_InlineAsm_X86)
{
    version(Windows)
        pragma(msg, "Error: Loaded SIMD SSE2 in a x86 build! Use --config=windows_x86 in dub");
    else
        pragma(msg, "Error: Loaded SIMD SSE2 in a x86 build! Use --config=posix_x86 in dub");
}
16 import core.simd;
17 import std.conv : to;
18 
// Every declaration that follows in this module is pure, nothrow and @trusted.
pure nothrow @trusted:

/// 128-bit integer vector type used by all intrinsics in this module
/// (an alias of core.simd's byte16).
alias __m128i = byte16;
/// 64-bit value type (an alias of ulong).
alias __m64 = ulong;
25 
/// Packs four selectors into one shuffle immediate: `w` is OR-ed in
/// unshifted, `x` shifted left by 2, `y` by 4 and `z` by 6.
int _MM_SHUFFLE(int z, int y, int x, int w)
{
    int selector = w;
    selector |= x << 2;
    selector |= y << 4;
    selector |= z << 6;
    return selector;
}
30 
/// _mm_set1_epi32 (compile-time variant): all four 32-bit elements hold the
/// template argument `i`.
__m128i _mm_set1_epi32 (int i)() {
    int4 lanes = [i, i, i, i];
    // Reinterpret the int4 storage as the module's byte-vector type.
    __m128i* bytes = cast(__m128i*) &lanes;
    return *bytes;
}
36 
/// _mm_set1_epi32 (run-time variant): writes `i` four times into a 16-byte
/// aligned array and loads that array through _mm_loadu_si128.
__m128i _mm_set1_epi32 (int i) {
    align(16) int[4] lanes = [i, i, i, i];
    __m128i* src = cast(__m128i*) &lanes;
    return _mm_loadu_si128(src);
}
42 
/// _mm_set_epi32 (compile-time variant): the arguments are stored in reverse
/// order, so element 0 holds `l` and element 3 holds `i`.
immutable(__m128i) _mm_set_epi32 (int i, int j, int k, int l)() {
    int4 lanes = [l, k, j, i];
    // Reinterpret the int4 storage as an immutable byte vector.
    immutable(__m128i)* view = cast(immutable(__m128i)*) &lanes;
    return *view;
}
48 
/// _mm_set_epi32 (run-time variant): stores the arguments in reverse order
/// (`l` first, `i` last) in a 16-byte aligned array and loads it through
/// _mm_loadu_si128.
immutable(__m128i) _mm_set_epi32 (int i, int j, int k, int l) {
    align(16) int[4] lanes = [l, k, j, i];
    __m128i* src = cast(__m128i*) &lanes;
    return _mm_loadu_si128(src);
}
55 
/// _mm_set1_epi8 (compile-time variant): all sixteen byte elements hold the
/// template argument `i`.
immutable(__m128i) _mm_set1_epi8 (byte i)() {
    immutable(__m128i) lanes = byte16([i, i, i, i,
                                       i, i, i, i,
                                       i, i, i, i,
                                       i, i, i, i]);
    return lanes;
}
60 
/// Compile-time 16-byte constructor: despite the `set1` name it takes sixteen
/// values and stores them in reverse order, so element 0 of the result holds
/// `arr[15]` and element 15 holds `arr[0]`.
immutable(__m128i) _mm_set1_epi8(byte[] arr)() {
    // The vector literal is generated as text so that it contains only
    // integer constants.
    mixin(`byte16 arr_fix = [`
        ~ to!string(arr[15]) ~ `, ` ~ to!string(arr[14]) ~ `, `
        ~ to!string(arr[13]) ~ `, ` ~ to!string(arr[12]) ~ `, `
        ~ to!string(arr[11]) ~ `, ` ~ to!string(arr[10]) ~ `, `
        ~ to!string(arr[9])  ~ `, ` ~ to!string(arr[8])  ~ `, `
        ~ to!string(arr[7])  ~ `, ` ~ to!string(arr[6])  ~ `, `
        ~ to!string(arr[5])  ~ `, ` ~ to!string(arr[4])  ~ `, `
        ~ to!string(arr[3])  ~ `, ` ~ to!string(arr[2])  ~ `, `
        ~ to!string(arr[1])  ~ `, ` ~ to!string(arr[0])  ~ `];`);
    return cast(immutable __m128i)arr_fix;
}
73 
/// _mm_set1_epi16 (compile-time variant): all eight 16-bit elements hold the
/// template argument `w`.
__m128i _mm_set1_epi16(short w)() {
    short8 lanes = short8([w, w, w, w, w, w, w, w]);
    // Reinterpret the short8 storage as the module's byte-vector type.
    __m128i* bytes = cast(__m128i*) &lanes;
    return *bytes;
}
79 
version(GDC) {
    // GDC <--> emmintrin => gcc/gcc/config/i386/emmintrin.h
    // Each wrapper forwards to the GCC builtin of the same instruction and
    // reinterprets operands/results between byte16 and the builtin's lane type.
    static import gcc.attribute;
    import gcc.builtins;
    enum inline = gcc.attribute.attribute("forceinline");
@inline:
    // _mm_set1_epi16 (run-time variant)
    // Uses the same approach as the run-time _mm_set1_epi32: fill an aligned
    // array, then load it. This replaces an extended-asm block whose operand
    // constraints ("0"/"1" with no outputs) referred to themselves.
    __m128i _mm_set1_epi16(short w) {
        align(16) short[8] lanes = [w, w, w, w, w, w, w, w];
        return _mm_loadu_si128(cast(__m128i*) &lanes);
    }

    // bswap64 ; forwards to __builtin_bswap64
    ulong bswap64(ulong val) {
        return cast(ulong) __builtin_bswap64(val);
    }

    // _mm_cvtsi128_si32 ; returns 32-bit element 0
    int _mm_cvtsi128_si32(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, 0);
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pminub128(a, b);
    }

    // _mm_shuffle_epi8 ; PSHUFB
    __m128i _mm_shuffle_epi8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pshufb128(a, b);
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8) a, cast(short8) b);
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmullw128(cast(short8) a, cast(short8) b);
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubw128(cast(short8) a, cast(short8) b);
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddw128(cast(short8) a, cast(short8) b);
    }

    // _mm_srli_epi16 ; PSRLW
    __m128i _mm_srli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8) a, imm);
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8) a, imm);
    }

    // _mm_srli_epi32 ; PSRLD
    // Added so this block offers the same names as the D_InlineAsm_X86_64 block.
    __m128i _mm_srli_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldi128(cast(int4) a, imm);
    }

    // _mm_slli_epi32 ; PSLLD
    // Added so this block offers the same names as the D_InlineAsm_X86_64 block.
    __m128i _mm_slli_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldi128(cast(int4) a, imm);
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    __m128i _mm_shufflehi_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8) a, imm);
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8) a, imm);
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddd128(cast(int4) a, cast(int4) b);
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubd128(cast(int4) a, cast(int4) b);
    }

    // _mm_cmplt_epi32 ; PCMPGTD with the operands swapped (a < b  <=>  b > a)
    __m128i _mm_cmplt_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpgtd128(cast(int4) b, cast(int4) a);
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufd(cast(int4) a, imm);
    }

    // _mm_extract_epi32 (run-time index)
    // Returns 32-bit element `ndx` of `a`. The result is an int; the previous
    // body cast it to __m128i, which does not match the return type.
    int _mm_extract_epi32(__m128i a, in int ndx) {
        int4 lanes = cast(int4) a;
        return lanes.array[ndx];
    }

    // _mm_extract_epi32 (compile-time index)
    // Same call shape as the D_InlineAsm_X86_64 implementation in this file.
    int _mm_extract_epi32(int ndx)(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhdq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckldq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_setzero_si128 ; all-zero vector
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    // The GCC builtin is declared with a `const char *` argument, so the
    // vector pointer is cast explicitly.
    __m128i _mm_loadu_si128 (in __m128i* p) {
        return cast(__m128i) __builtin_ia32_loaddqu(cast(const(char)*) p);
    }

    // _mm_storeu_si128 ; MOVDQU
    // The builtin returns nothing, so it is called as a plain statement; the
    // destination is cast because the builtin is declared with a `char *`.
    void _mm_storeu_si128()(__m128i* p, auto const ref __m128i a) {
        __builtin_ia32_storedqu(cast(char*) p, a);
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_por128(cast(long2) a, cast(long2) b);
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pandn128(cast(long2) a, cast(long2) b);
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pand128(cast(long2) a, cast(long2) b);
    }

    // _mm_xor_si128 ; PXOR
    // Declared as a template like its siblings: `auto ref` parameters are
    // only allowed on template functions.
    __m128i _mm_xor_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pxor128(cast(long2) a, cast(long2) b);
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` is a byte count; it is multiplied by 8 before being handed to the builtin.
    __m128i _mm_srli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldqi128(cast(long2) a, imm*8);
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` is a byte count; it is multiplied by 8 before being handed to the builtin.
    __m128i _mm_slli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldqi128(cast(long2) a, imm*8);
    }
}
260 
version(none) {
    // Disabled block (version(none)): wrappers written against
    // ldc.gccbuiltins_x86. It is parsed but never compiled.
    import ldc.gccbuiltins_x86;

    // bswap64 ; bound to the llvm.bswap.i64 intrinsic
    pragma(LDC_intrinsic, "llvm.bswap.i64")
        ulong bswap64(ulong i);

    // _mm_set1_epi16 (run-time variant)
    // This block used to define this function twice; only the vector-literal
    // form is kept.
    __m128i _mm_set1_epi16 (short w) {
        return cast(__m128i) short8([w,w,w,w,w,w,w,w]);
    }

    // _mm_cvtsi128_si32 ; returns 32-bit element 0
    int _mm_cvtsi128_si32(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, 0);
    }

    // _mm_shuffle_epi8 ; PSHUFB
    __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
        return cast(__m128i) __builtin_ia32_pshufb128(a, b);
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pminub128(a, b);
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8) a, cast(short8) b);
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmullw128(cast(short8) a, cast(short8) b);
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubw128(cast(short8) a, cast(short8) b);
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddw128(cast(short8) a, cast(short8) b);
    }

    // _mm_srli_epi16 ; PSRLW
    __m128i _mm_srli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8) a, imm);
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8) a, imm);
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    __m128i _mm_shufflehi_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8) a, imm);
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8) a, imm);
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddd128(cast(int4) a, cast(int4) b);
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubd128(cast(int4) a, cast(int4) b);
    }

    // _mm_cmplt_epi32 ; PCMPGTD with the operands swapped (a < b  <=>  b > a)
    __m128i _mm_cmplt_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpgtd128(cast(int4) b, cast(int4) a);
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufd(cast(int4) a, imm);
    }

    // _mm_extract_epi32 ; pextrd
    // The result is an int; the previous body cast it to __m128i, which does
    // not match the return type.
    int _mm_extract_epi32(__m128i a, in int ndx) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhdq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckldq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_setzero_si128 ; all-zero vector
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    __m128i _mm_loadu_si128 (in __m128i* p) {
        return cast(__m128i) __builtin_ia32_loaddqu(p);
    }

    // _mm_storeu_si128 ; MOVDQU
    // Called as a plain statement: a void function cannot return the cast of
    // a store builtin's result.
    void _mm_storeu_si128()(__m128i *p, auto const ref __m128i a) {
        __builtin_ia32_storedqu(p, a);
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_por128(cast(long2) a, cast(long2) b);
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pandn128(cast(long2) a, cast(long2) b);
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pand128(cast(long2) a, cast(long2) b);
    }

    // _mm_xor_si128 ; PXOR
    // Declared as a template like its siblings: `auto ref` parameters are
    // only allowed on template functions.
    __m128i _mm_xor_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pxor128(cast(long2) a, cast(long2) b);
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` is a byte count; it is multiplied by 8 before being handed to the builtin.
    __m128i _mm_srli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldqi128(a, imm*8);
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` is a byte count; it is multiplied by 8 before being handed to the builtin.
    __m128i _mm_slli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldqi128(a, imm*8);
    }
}
444 
version(D_InlineAsm_X86_64) {
    // Inline-asm implementations. Every wrapper copies its operands from
    // memory into XMM registers with MOVDQU, executes one instruction and
    // copies the result back to a local that is then returned.
    // The three helpers below only build D source text at compile time; the
    // wrappers mix that text in as their body.

    // Body text of a two-operand intrinsic: loads parameter `first` into XMM1
    // and parameter `second` into XMM2, runs `op XMM1, XMM2`, returns XMM1.
    private string binaryOpBody(string op, string first = "a", string second = "b") {
        return `
            const(__m128i)* _lhs = &` ~ first ~ `;
            const(__m128i)* _rhs = &` ~ second ~ `;
            __m128i result;
            __m128i* _dst = &result;

            asm pure nothrow {
                mov RAX, _lhs;
                mov RBX, _rhs;
                mov RCX, _dst;
                movdqu XMM1, [RAX];
                movdqu XMM2, [RBX];
                ` ~ op ~ ` XMM1, XMM2;
                movdqu [RCX], XMM1;
            }
            return result;`;
    }

    // Body text of a shift-by-immediate intrinsic: loads parameter `a` into
    // XMM1, runs `op XMM1, imm`, returns XMM1.
    private string immediateOpBody(string op, string imm) {
        return `
            const(__m128i)* _src = &a;
            __m128i result;
            __m128i* _dst = &result;

            asm pure nothrow {
                mov RAX, _src;
                mov RBX, _dst;
                movdqu XMM1, [RAX];
                ` ~ op ~ ` XMM1, ` ~ imm ~ `;
                movdqu [RBX], XMM1;
            }
            return result;`;
    }

    // Body text of a shuffle-by-immediate intrinsic: loads parameter `a` into
    // XMM2, runs `op XMM1, XMM2, imm`, returns XMM1.
    private string shuffleOpBody(string op, string imm) {
        return `
            const(__m128i)* _src = &a;
            __m128i result;
            __m128i* _dst = &result;

            asm pure nothrow {
                mov RAX, _src;
                mov RBX, _dst;
                movdqu XMM2, [RAX];
                ` ~ op ~ ` XMM1, XMM2, ` ~ imm ~ `;
                movdqu [RBX], XMM1;
            }
            return result;`;
    }

    // _mm_set1_epi16 (run-time variant) ; MOVDQU
    // Fills a local array with eight copies of `w` and moves it through XMM1.
    __m128i _mm_set1_epi16(short w) {
        short[8] lanes = [w,w,w,w,w,w,w,w];
        short[8]* _src = &lanes;
        __m128i result;
        __m128i* _dst = &result;

        asm pure nothrow {
            mov RAX, _src;
            mov RBX, _dst;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }

        return result;
    }

    // _mm_cvtsi128_si32 ; MOVD
    // Returns the low 32 bits of `a`.
    int _mm_cvtsi128_si32()(auto const ref __m128i a) {
        const(__m128i)* _src = &a;
        int lane0;
        int* _dst = &lane0;

        asm pure nothrow {
            mov RAX, _src;
            mov RBX, _dst;
            movdqu XMM1, [RAX];
            movd [RBX], XMM1;
        }
        return lane0;
    }

    // _mm_min_epu8 ; PMINUB
    // `a` is taken as `auto const ref` like the other wrappers, so const and
    // immutable operands are accepted.
    __m128i _mm_min_epu8()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pminub"));
    }

    // _mm_shuffle_epi8 ; PSHUFB
    // NOTE(review): PSHUFB is an SSSE3 instruction - confirm the target CPUs.
    __m128i _mm_shuffle_epi8()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pshufb"));
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("psubusw"));
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pmulhuw"));
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pcmpeqw"));
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pmullw"));
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("psubw"));
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("paddw"));
    }

    // _mm_srli_epi16 ; PSRLW
    __m128i _mm_srli_epi16(int imm)(auto const ref __m128i a) {
        mixin(immediateOpBody("psrlw", imm.to!string));
    }

    // _mm_srli_epi32 ; PSRLD
    __m128i _mm_srli_epi32(int imm)(auto const ref __m128i a) {
        mixin(immediateOpBody("psrld", imm.to!string));
    }

    // _mm_slli_epi32 ; PSLLD
    __m128i _mm_slli_epi32(int imm)(auto const ref __m128i a) {
        mixin(immediateOpBody("pslld", imm.to!string));
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(auto const ref __m128i a) {
        mixin(immediateOpBody("psllw", imm.to!string));
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    // Takes `auto const ref` (it used to be plain `const ref`) so that
    // temporaries are accepted, as in _mm_shufflelo_epi16.
    __m128i _mm_shufflehi_epi16(int imm)(auto const ref __m128i a) {
        mixin(shuffleOpBody("pshufhw", imm.to!string));
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(auto const ref __m128i a) {
        mixin(shuffleOpBody("pshuflw", imm.to!string));
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("paddd"));
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("psubd"));
    }

    // _mm_cmplt_epi32 ; PCMPGTD
    // a < b is computed as b > a, so `b` is loaded as the first operand.
    __m128i _mm_cmplt_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pcmpgtd", "b", "a"));
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(auto const ref __m128i a) {
        mixin(shuffleOpBody("pshufd", imm.to!string));
    }

    // _mm_extract_epi32 ; PSHUFD + MOVD
    // Returns 32-bit element `ndx` of `a`. PSHUFD with immediate `ndx` places
    // the element chosen by the low two bits of `ndx` in element 0, and MOVD
    // stores element 0. This replaces PEXTRD, which is an SSE4.1 instruction,
    // with SSE2 instructions only.
    int _mm_extract_epi32(int ndx)(__m128i a) {
        __m128i* _src = &a;
        int lane;
        int* _dst = &lane;
        mixin(`asm pure nothrow {
            mov RAX, _src;
            mov RBX, _dst;
            movdqu XMM2, [RAX];
            pshufd XMM1, XMM2, ` ~ ndx.to!string ~ `;
            movd [RBX], XMM1;
        }`);
        return lane;
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    // `a` is taken as `auto const ref` like the other wrappers, so const and
    // immutable operands are accepted.
    __m128i _mm_unpackhi_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("punpckhdq"));
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("punpckldq"));
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("punpckhqdq"));
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("punpcklqdq"));
    }

    // _mm_setzero_si128 ; all-zero vector
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    // Copies the 16 bytes at `p` into the returned vector.
    __m128i _mm_loadu_si128 (in __m128i* p) {
        __m128i result;
        __m128i* _dst = &result;

        asm pure nothrow {
            mov RAX, p;
            mov RBX, _dst;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }

        return result;
    }

    // _mm_storeu_si128 ; MOVDQU
    // Copies the 16 bytes of `a` to the memory at `p`.
    void _mm_storeu_si128()(__m128i* p, auto const ref __m128i a) {
        const(__m128i)* _src = &a;

        asm pure nothrow {
            mov RAX, _src;
            mov RBX, p;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("por"));
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pandn"));
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pand"));
    }

    // _mm_xor_si128 ; PXOR
    __m128i _mm_xor_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(binaryOpBody("pxor"));
    }

    // _mm_srli_si128 ; PSRLDQ
    __m128i _mm_srli_si128(int imm)(auto const ref __m128i a) {
        mixin(immediateOpBody("psrldq", imm.to!string));
    }

    // _mm_slli_si128 ; PSLLDQ
    __m128i _mm_slli_si128(int imm)(auto const ref __m128i a) {
        mixin(immediateOpBody("pslldq", imm.to!string));
    }
}