1 /*
2 * emmintrin.h style functions
3 * (C) 2014-2015 Etienne Cimon
4 *
5 * License:
6 * Released under the MIT license
7 */
8 module botan.utils.simd.emmintrin;
9 
10 import botan.constants;
11 static if (BOTAN_HAS_SIMD_SSE2):
// Build-time hint for 32-bit x86 inline-asm targets: prints a message naming
// the dub configuration to use instead (windows_x86 or posix_x86).
// pragma(msg) only emits text during compilation; it does not stop the build.
version(D_InlineAsm_X86) {
    version(Windows) { pragma(msg, "Error: Loaded SIMD SSE2 in a x86 build! Use --config=windows_x86 in dub"); }
    else { pragma(msg, "Error: Loaded SIMD SSE2 in a x86 build! Use --config=posix_x86 in dub"); }
}
16 import core.simd;
17 import std.conv : to;
18 
// Attribute labels: every declaration that follows in this module is
// declared pure, nothrow and @trusted.
pure:
nothrow:
@trusted:

// 128-bit integer vector, represented as sixteen signed bytes (core.simd).
alias __m128i = byte16;
// 64-bit quantity, represented as a plain unsigned 64-bit integer.
alias __m64 = ulong;
25 
/// Packs four lane selectors into one shuffle immediate:
/// `w` at bit 0, `x` at bit 2, `y` at bit 4 and `z` at bit 6.
/// The arguments are combined as given; nothing masks them to two bits.
int _MM_SHUFFLE(int z, int y, int x, int w)
{
    int imm = w;
    imm |= x << 2;
    imm |= y << 4;
    imm |= z << 6;
    return imm;
}
30 
// _mm_set1_epi32 (compile-time operand)
// Fills all four 32-bit lanes with the template argument `i` and returns the
// same 128 bits viewed as __m128i.
__m128i _mm_set1_epi32 (int i)() {
    int4 lanes = [i, i, i, i];
    // Reinterpret the int4 storage as the byte-lane vector type.
    __m128i* view = cast(__m128i*) &lanes;
    return *view;
}
36 
// _mm_set1_epi32 (run-time operand)
// Fills a 16-byte aligned int[4] with `i` and loads it through
// _mm_loadu_si128.
__m128i _mm_set1_epi32 (int i) {
    align(16) int[4] lanes = [i, i, i, i];
    auto src = cast(__m128i*) &lanes;
    return _mm_loadu_si128(src);
}
42 
// _mm_set_epi32 (compile-time operands)
// Builds a vector from four template arguments. They are stored in reverse
// order, so `l` lands in lane 0 and `i` in lane 3.
immutable(__m128i) _mm_set_epi32 (int i, int j, int k, int l)() {
    int4 lanes = [l, k, j, i];
    // Reinterpret the int4 storage as an immutable __m128i.
    auto view = cast(immutable(__m128i)*) &lanes;
    return *view;
}
48 
// _mm_set_epi32 (run-time operands)
// Stores the arguments in reverse order (`l` in lane 0, `i` in lane 3) into a
// 16-byte aligned int[4] and loads it through _mm_loadu_si128.
immutable(__m128i) _mm_set_epi32 (int i, int j, int k, int l) {
    align(16) int[4] lanes = [l, k, j, i];
    auto src = cast(__m128i*) &lanes;
    return _mm_loadu_si128(src);
}
55 
// _mm_set1_epi8 (compile-time operand)
// Returns a vector whose sixteen byte lanes all hold the template argument `i`.
immutable(__m128i) _mm_set1_epi8 (byte i)() {
    byte16 filled = [i, i, i, i,
                     i, i, i, i,
                     i, i, i, i,
                     i, i, i, i];
    return filled;
}
60 
// _mm_set1_epi8 (compile-time array operand)
// Despite the "set1" name this overload does not broadcast: it builds a vector
// from the first sixteen elements of the template argument `arr`, stored in
// reverse order (arr[15] in lane 0, arr[0] in lane 15).
immutable(__m128i) _mm_set1_epi8(byte[] arr)() {
    // Render "arr[15], arr[14], ..., arr[0]" as source text at compile time.
    enum string lanes = () {
        string text;
        foreach_reverse (idx; 0 .. 16) {
            text ~= arr[idx].to!string;
            if (idx != 0)
                text ~= `, `;
        }
        return text;
    }();
    mixin(`byte16 arr_fix = [` ~ lanes ~ `];`);
    return cast(immutable __m128i)arr_fix;
}
73 
// _mm_set1_epi16 (compile-time operand)
// Fills all eight 16-bit lanes with the template argument `w` and returns the
// same 128 bits viewed as __m128i.
__m128i _mm_set1_epi16(short w)() {
    short8 lanes = short8([w, w, w, w, w, w, w, w]);
    // Reinterpret the short8 storage as the byte-lane vector type.
    __m128i* view = cast(__m128i*) &lanes;
    return *view;
}
79 
version(GDC) {
    // GDC <--> emmintrin => gcc/gcc/config/i386/emmintrin.h
    //
    // Each wrapper forwards to the GCC builtin named in its body. The casts
    // only reinterpret the 128-bit value as the lane type the builtin works on.
    static import gcc.attribute;
    import gcc.builtins;
    enum inline = gcc.attribute.attribute("forceinline");
@inline:
    // _mm_set1_epi16
    // Returns a vector whose eight 16-bit lanes all hold `w`.
    __m128i _mm_set1_epi16(short w) {
        // Stage the lanes in memory and fetch them with an unaligned load,
        // the same pattern the run-time _mm_set1_epi32 overload uses.
        short[8] lanes = [w, w, w, w, w, w, w, w];
        return _mm_loadu_si128(cast(__m128i*) &lanes);
    }

    // Byte-swaps a 64-bit value through __builtin_bswap64.
    ulong bswap64(ulong val) {
        return cast(ulong) __builtin_bswap64(val);
    }

    // _mm_cvtsi128_si32
    // Returns 32-bit lane 0 of `a`.
    int _mm_cvtsi128_si32(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, 0);
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pminub128(a, b);
    }

    // _mm_shuffle_epi8 ; PSHUFB
    __m128i _mm_shuffle_epi8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pshufb128(a, b);
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8) a, cast(short8) b);
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmullw128(cast(short8) a, cast(short8) b);
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubw128(cast(short8) a, cast(short8) b);
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddw128(cast(short8) a, cast(short8) b);
    }

    // _mm_srli_epi16 ; PSRLW
    // `imm` is the shift count, supplied as a template argument.
    __m128i _mm_srli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8) a, imm);
    }

    // _mm_slli_epi16 ; PSLLW
    // `imm` is the shift count, supplied as a template argument.
    __m128i _mm_slli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8) a, imm);
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    __m128i _mm_shufflehi_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8) a, imm);
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8) a, imm);
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddd128(cast(int4) a, cast(int4) b);
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubd128(cast(int4) a, cast(int4) b);
    }

    // _mm_cmplt_epi32 ; PCMPGTD with the operands swapped
    __m128i _mm_cmplt_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        // a < b is evaluated as b > a.
        return cast(__m128i) __builtin_ia32_pcmpgtd128(cast(int4) b, cast(int4) a);
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufd(cast(int4) a, imm);
    }

    // _mm_extract_epi32 ; pextrd
    // Returns the 32-bit lane of `a` selected by `ndx`.
    // NOTE(review): the builtin presumably needs `ndx` to be a constant once
    // inlined -- confirm against the GDC builtin declaration.
    int _mm_extract_epi32(__m128i a, in int ndx) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhdq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckldq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_setzero_si128 ; PXOR
    // Returns a vector with every bit cleared.
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    // Loads 128 bits from `p`.
    __m128i _mm_loadu_si128 (in __m128i* p) {
        return cast(__m128i) __builtin_ia32_loaddqu(p);
    }

    // _mm_storeu_si128 ; MOVDQU
    // Stores the 128 bits of `a` to `p`. The builtin yields no value, so
    // there is nothing to return.
    // NOTE(review): confirm the pointer type the GDC builtin expects for `p`.
    void _mm_storeu_si128()(__m128i* p, auto const ref __m128i a) {
        __builtin_ia32_storedqu(p, a);
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_por128(cast(long2) a, cast(long2) b);
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pandn128(cast(long2) a, cast(long2) b);
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pand128(cast(long2) a, cast(long2) b);
    }

    // _mm_xor_si128 ; PXOR
    // Declared as a template (empty parameter list) because `auto ref`
    // parameters are only permitted on template functions.
    __m128i _mm_xor_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pxor128(cast(long2) a, cast(long2) b);
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` counts bytes; the builtin is passed imm*8.
    __m128i _mm_srli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldqi128(a, imm*8);
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` counts bytes; the builtin is passed imm*8.
    __m128i _mm_slli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldqi128(a, imm*8);
    }
}
260 
// Disabled (version(none)) builtin-based variant written against
// ldc.gccbuiltins_x86. It is parsed but never compiled; it is kept
// well-formed so that it can be re-enabled without further repair.
version(none) {
    import ldc.gccbuiltins_x86;

    // bswap64: bound to the LLVM intrinsic llvm.bswap.i64.
    pragma(LDC_intrinsic, "llvm.bswap.i64")
        ulong bswap64(ulong i);

    // _mm_set1_epi16
    // Returns a vector whose eight 16-bit lanes all hold `w`.
    __m128i _mm_set1_epi16 (short w) {
        return cast(__m128i) short8([w,w,w,w,w,w,w,w]);
    }

    // _mm_cvtsi128_si32
    // Returns 32-bit lane 0 of `a`.
    int _mm_cvtsi128_si32(__m128i a) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, 0);
    }

    // _mm_shuffle_epi8 ; PSHUFB
    __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
        return cast(__m128i) __builtin_ia32_pshufb128(a, b);
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pminub128(a, b);
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubusw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmulhuw128(cast(short8) a, cast(short8) b);
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pcmpeqw128(cast(short8) a, cast(short8) b);
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pmullw128(cast(short8) a, cast(short8) b);
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubw128(cast(short8) a, cast(short8) b);
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddw128(cast(short8) a, cast(short8) b);
    }

    // _mm_srli_epi16 ; PSRLW
    __m128i _mm_srli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrlwi128(cast(short8) a, imm);
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psllwi128(cast(short8) a, imm);
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    __m128i _mm_shufflehi_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufhw(cast(short8) a, imm);
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshuflw(cast(short8) a, imm);
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_paddd128(cast(int4) a, cast(int4) b);
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_psubd128(cast(int4) a, cast(int4) b);
    }

    // _mm_cmplt_epi32 ; PCMPGTD with the operands swapped
    __m128i _mm_cmplt_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        // a < b is evaluated as b > a.
        return cast(__m128i) __builtin_ia32_pcmpgtd128(cast(int4) b, cast(int4) a);
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pshufd(cast(int4) a, imm);
    }

    // _mm_extract_epi32 ; pextrd
    // Returns the 32-bit lane of `a` selected by `ndx`.
    int _mm_extract_epi32(__m128i a, in int ndx) {
        return cast(int) __builtin_ia32_vec_ext_v4si(cast(int4) a, ndx);
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhdq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckldq128(cast(int4) a, cast(int4) b);
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpckhqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_punpcklqdq128(cast(long2) a, cast(long2) b);
    }

    // _mm_setzero_si128 ; PXOR
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    __m128i _mm_loadu_si128 (in __m128i* p) {
        return cast(__m128i) __builtin_ia32_loaddqu(p);
    }

    // _mm_storeu_si128 ; MOVDQU
    // The store yields no value, so nothing is returned.
    void _mm_storeu_si128()(__m128i *p, auto const ref __m128i a) {
        __builtin_ia32_storedqu(p, a);
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_por128(cast(long2) a, cast(long2) b);
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pandn128(cast(long2) a, cast(long2) b);
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pand128(cast(long2) a, cast(long2) b);
    }

    // _mm_xor_si128 ; PXOR
    // Declared as a template (empty parameter list) because `auto ref`
    // parameters are only permitted on template functions.
    __m128i _mm_xor_si128()(auto ref __m128i a, auto const ref __m128i b) {
        return cast(__m128i) __builtin_ia32_pxor128(cast(long2) a, cast(long2) b);
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` counts bytes; the builtin is passed imm*8.
    __m128i _mm_srli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_psrldqi128(a, imm*8);
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` counts bytes; the builtin is passed imm*8.
    __m128i _mm_slli_si128(int imm)(__m128i a) {
        return cast(__m128i) __builtin_ia32_pslldqi128(a, imm*8);
    }
}
444 
version(D_InlineAsm_X86_64) {
    // Inline-assembler implementations. Every wrapper passes its operands
    // through memory: it takes the address of each operand and of a local
    // result, moves the data with MOVDQU, applies one instruction and stores
    // the result back.

    // Function body shared by the two-operand wrappers. Expects parameters
    // named `a` and `b` in scope, computes `a = a <mnemonic> b` in XMM1 and
    // returns the result.
    private enum string asmBinaryOpBody(string mnemonic) = `
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            ` ~ mnemonic ~ ` XMM1, XMM2;
            movdqu [RCX], XMM1;
        }
        return c;`;

    // Function body shared by the "register, immediate" wrappers (shifts).
    // Expects a parameter named `a`; emits `<mnemonic> XMM1, imm`.
    private enum string asmShiftImmBody(string mnemonic, int imm) = `
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            ` ~ mnemonic ~ ` XMM1, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }
        return b;`;

    // Function body shared by the "dest, source, immediate" wrappers
    // (shuffles). Expects a parameter named `a`; emits
    // `<mnemonic> XMM1, XMM2, imm` with `a` loaded into XMM2.
    private enum string asmShuffleImmBody(string mnemonic, int imm) = `
        const(__m128i)* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM2, [RAX];
            ` ~ mnemonic ~ ` XMM1, XMM2, ` ~ imm.to!string ~ `;
            movdqu [RBX], XMM1;
        }
        return b;`;

    // _mm_set1_epi16
    // Returns a vector whose eight 16-bit lanes all hold `w`.
    __m128i _mm_set1_epi16(short w) {
        short[8] a = [w,w,w,w,w,w,w,w];
        short[8]* _a = &a;
        __m128i b;
        __m128i* _b = &b;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }

        return b;
    }

    // _mm_cvtsi128_si32 ; MOVD
    // Returns the low 32 bits of `a`.
    int _mm_cvtsi128_si32()(auto const ref __m128i a) {
        int ret;
        int* _ret = &ret;
        const(__m128i)* _a = &a;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, _ret;
            movdqu XMM1, [RAX];
            movd [RBX], XMM1;
        }
        return ret;
    }

    // _mm_min_epu8 ; PMINUB
    __m128i _mm_min_epu8()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pminub");
    }

    // _mm_shuffle_epi8 ; PSHUFB
    __m128i _mm_shuffle_epi8()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pshufb");
    }

    // _mm_subs_epu16 ; PSUBUSW
    __m128i _mm_subs_epu16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"psubusw");
    }

    // _mm_mulhi_epu16 ; PMULHUW
    __m128i _mm_mulhi_epu16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pmulhuw");
    }

    // _mm_cmpeq_epi16 ; PCMPEQW
    __m128i _mm_cmpeq_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pcmpeqw");
    }

    // _mm_mullo_epi16 ; PMULLW
    __m128i _mm_mullo_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pmullw");
    }

    // _mm_sub_epi16 ; PSUBW
    __m128i _mm_sub_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"psubw");
    }

    // _mm_add_epi16 ; PADDW
    __m128i _mm_add_epi16()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"paddw");
    }

    // _mm_srli_epi16 ; PSRLW
    // `imm` is the shift count, supplied as a template argument.
    __m128i _mm_srli_epi16(int imm)(auto const ref __m128i a) {
        mixin(asmShiftImmBody!("psrlw", imm));
    }

    // _mm_srli_epi32 ; PSRLD
    __m128i _mm_srli_epi32(int imm)(auto const ref __m128i a) {
        mixin(asmShiftImmBody!("psrld", imm));
    }

    // _mm_slli_epi32 ; PSLLD
    __m128i _mm_slli_epi32(int imm)(auto const ref __m128i a) {
        mixin(asmShiftImmBody!("pslld", imm));
    }

    // _mm_slli_epi16 ; PSLLW
    __m128i _mm_slli_epi16(int imm)(auto const ref __m128i a) {
        mixin(asmShiftImmBody!("psllw", imm));
    }

    // _mm_shufflehi_epi16 ; PSHUFHW
    // Takes `auto const ref` like every sibling so that temporaries are
    // accepted as well as lvalues.
    __m128i _mm_shufflehi_epi16(int imm)(auto const ref __m128i a) {
        mixin(asmShuffleImmBody!("pshufhw", imm));
    }

    // _mm_shufflelo_epi16 ; PSHUFLW
    __m128i _mm_shufflelo_epi16(int imm)(auto const ref __m128i a) {
        mixin(asmShuffleImmBody!("pshuflw", imm));
    }

    // _mm_add_epi32 ; PADDD
    __m128i _mm_add_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"paddd");
    }

    // _mm_add_epi64 ; PADDQ
    __m128i _mm_add_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"paddq");
    }

    // _mm_sub_epi32 ; PSUBD
    __m128i _mm_sub_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"psubd");
    }

    // _mm_cmplt_epi32 ; PCMPGTD
    // Written out by hand because the operands are swapped: `b` is loaded
    // into XMM1 and `a` into XMM2, so PCMPGTD evaluates b > a, i.e. a < b.
    __m128i _mm_cmplt_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        const(__m128i)* _a = &a;
        const(__m128i)* _b = &b;
        __m128i c;
        __m128i* _c = &c;

        asm pure nothrow {
            mov RAX, _b;
            mov RBX, _a;
            mov RCX, _c;
            movdqu XMM1, [RAX];
            movdqu XMM2, [RBX];
            pcmpgtd XMM1, XMM2;
            movdqu [RCX], XMM1;
        }

        return c;
    }

    // _mm_shuffle_epi32 ; PSHUFD
    __m128i _mm_shuffle_epi32(int imm)(auto const ref __m128i a) {
        mixin(asmShuffleImmBody!("pshufd", imm));
    }

    // _mm_extract_epi32 ; pextrd
    // Returns the 32-bit lane of `a` selected by the template argument `ndx`.
    int _mm_extract_epi32(int ndx)(__m128i a) {
        __m128i* _a = &a;
        int b;
        int* _b = &b;
        mixin(`asm pure nothrow {
            mov RAX, _a;
            mov RBX, _b;
            movdqu XMM2, [RAX];
            pextrd ECX, XMM2, ` ~ ndx.to!string ~ `;
            mov [RBX], ECX;
        }`);
        return b;
    }

    // _mm_unpackhi_epi32 ; PUNPCKHDQ
    __m128i _mm_unpackhi_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"punpckhdq");
    }

    // _mm_unpacklo_epi32 ; PUNPCKLDQ
    __m128i _mm_unpacklo_epi32()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"punpckldq");
    }

    // _mm_unpackhi_epi64 ; PUNPCKHQDQ
    __m128i _mm_unpackhi_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"punpckhqdq");
    }

    // _mm_unpacklo_epi64 ; PUNPCKLQDQ
    __m128i _mm_unpacklo_epi64()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"punpcklqdq");
    }

    // _mm_setzero_si128 ; PXOR
    // Returns a vector with every bit cleared.
    __m128i _mm_setzero_si128 () {
        return cast(__m128i) int4([0, 0, 0, 0]);
    }

    // _mm_loadu_si128 ; MOVDQU
    // Loads 128 bits from `p`.
    __m128i _mm_loadu_si128 (in __m128i* p) {
        __m128i a;
        __m128i* _a = &a;

        asm pure nothrow {
            mov RAX, p;
            mov RBX, _a;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }

        return a;
    }

    // _mm_storeu_si128 ; MOVDQU
    // Stores the 128 bits of `a` to `p`.
    void _mm_storeu_si128()(__m128i* p, auto const ref __m128i a) {
        const(__m128i)* _a = &a;

        asm pure nothrow {
            mov RAX, _a;
            mov RBX, p;
            movdqu XMM1, [RAX];
            movdqu [RBX], XMM1;
        }
    }

    // _mm_or_si128 ; POR
    __m128i _mm_or_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"por");
    }

    // _mm_andnot_si128 ; PANDN
    __m128i _mm_andnot_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pandn");
    }

    // _mm_and_si128 ; PAND
    __m128i _mm_and_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pand");
    }

    // _mm_xor_si128 ; PXOR
    __m128i _mm_xor_si128()(auto const ref __m128i a, auto const ref __m128i b) {
        mixin(asmBinaryOpBody!"pxor");
    }

    // _mm_srli_si128 ; PSRLDQ
    // `imm` is passed straight through as the instruction's immediate.
    __m128i _mm_srli_si128(int imm)(auto const ref __m128i a) {
        mixin(asmShiftImmBody!("psrldq", imm));
    }

    // _mm_slli_si128 ; PSLLDQ
    // `imm` is passed straight through as the instruction's immediate.
    __m128i _mm_slli_si128(int imm)(auto const ref __m128i a) {
        mixin(asmShiftImmBody!("pslldq", imm));
    }
}