1 /**
2 * SHA-160
3 * 
4 * Copyright:
5 * (C) 1999-2007 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.hash.sha1_sse2;
12 
13 import botan.constants;
14 static if (BOTAN_HAS_SHA1 && BOTAN_HAS_SHA1_SSE2 && BOTAN_HAS_SIMD_SSE2):
15 
16 import botan.hash.sha160;
17 import botan.utils.rotate;
18 import botan.utils.simd.emmintrin;
19 import botan.hash.hash;
20 import std.format : format;
21 
22 /**
23 * SHA-160 using SSE2 for the message expansion
24 */
25 class SHA160SSE2 : SHA160
26 {
27 public:
28     override HashFunction clone() const { return new SHA160SSE2; }
29     this() 
30     {
31         super(0);
32     } // no W needed
33 
34 protected:
35     /*
36     * SHA-160 Compression Function using SSE for message expansion
37     */
38     override void compressN(const(ubyte)* input_bytes, size_t blocks)
39     {
40         
41         const(__m128i) K00_19 = _mm_set1_epi32!(0x5A827999)();
42         const(__m128i) K20_39 = _mm_set1_epi32!(0x6ED9EBA1)();
43         const(__m128i) K40_59 = _mm_set1_epi32!(0x8F1BBCDC)();
44         const(__m128i) K60_79 = _mm_set1_epi32!(0xCA62C1D6)();
45         
46         uint A = m_digest[0],
47             B = m_digest[1],
48             C = m_digest[2],
49             D = m_digest[3],
50             E = m_digest[4];
51         
52         __m128i* input = cast(__m128i*)(input_bytes);
53         
54         foreach (size_t i; 0 .. blocks)
55         {
56             union v4si {
57                 uint[4] u32;
58                 __m128i u128;
59             }
60             
61             v4si P0, P1, P2, P3;
62             
63             __m128i W0 = _mm_loadu_si128(input);
64             mixin(prep00_15!(P0, W0));
65             
66             __m128i W1 = _mm_loadu_si128(&input[1]);
67             mixin(prep00_15!(P1, W1));
68             
69             __m128i W2 = _mm_loadu_si128(&input[2]);
70             mixin(prep00_15!(P2, W2));
71             
72             __m128i W3 = _mm_loadu_si128(&input[3]);
73             mixin(prep00_15!(P3, W3));
74             
75             
76             mixin(`
77         F1(A, B, C, D, E, ` ~ GET_P_32!(P0, 0) ~ `);
78         F1(E, A, B, C, D, ` ~ GET_P_32!(P0, 1) ~ `);
79         F1(D, E, A, B, C, ` ~ GET_P_32!(P0, 2) ~ `);
80         F1(C, D, E, A, B, ` ~ GET_P_32!(P0, 3) ~ `);
81         ` ~ prep!(P0, W0, W1, W2, W3, K00_19) ~ `
82 
83         F1(B, C, D, E, A, ` ~ GET_P_32!(P1, 0) ~ `);
84         F1(A, B, C, D, E, ` ~ GET_P_32!(P1, 1) ~ `);
85         F1(E, A, B, C, D, ` ~ GET_P_32!(P1, 2) ~ `);
86         F1(D, E, A, B, C, ` ~ GET_P_32!(P1, 3) ~ `);
87         ` ~ prep!(P1, W1, W2, W3, W0, K20_39) ~ `
88 
89         F1(C, D, E, A, B, ` ~ GET_P_32!(P2, 0) ~ `);
90         F1(B, C, D, E, A, ` ~ GET_P_32!(P2, 1) ~ `);
91         F1(A, B, C, D, E, ` ~ GET_P_32!(P2, 2) ~ `);
92         F1(E, A, B, C, D, ` ~ GET_P_32!(P2, 3) ~ `);
93         ` ~ prep!(P2, W2, W3, W0, W1, K20_39) ~ `
94 
95         F1(D, E, A, B, C, ` ~ GET_P_32!(P3, 0) ~ `);
96         F1(C, D, E, A, B, ` ~ GET_P_32!(P3, 1) ~ `);
97         F1(B, C, D, E, A, ` ~ GET_P_32!(P3, 2) ~ `);
98         F1(A, B, C, D, E, ` ~ GET_P_32!(P3, 3) ~ `);
99         ` ~ prep!(P3, W3, W0, W1, W2, K20_39) ~ `
100 
101         F1(E, A, B, C, D, ` ~ GET_P_32!(P0, 0) ~ `);
102         F1(D, E, A, B, C, ` ~ GET_P_32!(P0, 1) ~ `);
103         F1(C, D, E, A, B, ` ~ GET_P_32!(P0, 2) ~ `);
104         F1(B, C, D, E, A, ` ~ GET_P_32!(P0, 3) ~ `);
105         ` ~ prep!(P0, W0, W1, W2, W3, K20_39) ~ `
106 
107         F2(A, B, C, D, E, ` ~ GET_P_32!(P1, 0) ~ `);
108         F2(E, A, B, C, D, ` ~ GET_P_32!(P1, 1) ~ `);
109         F2(D, E, A, B, C, ` ~ GET_P_32!(P1, 2) ~ `);
110         F2(C, D, E, A, B, ` ~ GET_P_32!(P1, 3) ~ `);
111         ` ~ prep!(P1, W1, W2, W3, W0, K20_39) ~ `
112 
113         F2(B, C, D, E, A, ` ~ GET_P_32!(P2, 0) ~ `);
114         F2(A, B, C, D, E, ` ~ GET_P_32!(P2, 1) ~ `);
115         F2(E, A, B, C, D, ` ~ GET_P_32!(P2, 2) ~ `);
116         F2(D, E, A, B, C, ` ~ GET_P_32!(P2, 3) ~ `);
117         ` ~ prep!(P2, W2, W3, W0, W1, K40_59) ~ `
118 
119         F2(C, D, E, A, B, ` ~ GET_P_32!(P3, 0) ~ `);
120         F2(B, C, D, E, A, ` ~ GET_P_32!(P3, 1) ~ `);
121         F2(A, B, C, D, E, ` ~ GET_P_32!(P3, 2) ~ `);
122         F2(E, A, B, C, D, ` ~ GET_P_32!(P3, 3) ~ `);
123         ` ~ prep!(P3, W3, W0, W1, W2, K40_59) ~ `
124 
125         F2(D, E, A, B, C, ` ~ GET_P_32!(P0, 0) ~ `);
126         F2(C, D, E, A, B, ` ~ GET_P_32!(P0, 1) ~ `);
127         F2(B, C, D, E, A, ` ~ GET_P_32!(P0, 2) ~ `);
128         F2(A, B, C, D, E, ` ~ GET_P_32!(P0, 3) ~ `);
129         ` ~ prep!(P0, W0, W1, W2, W3, K40_59) ~ `
130 
131         F2(E, A, B, C, D, ` ~ GET_P_32!(P1, 0) ~ `);
132         F2(D, E, A, B, C, ` ~ GET_P_32!(P1, 1) ~ `);
133         F2(C, D, E, A, B, ` ~ GET_P_32!(P1, 2) ~ `);
134         F2(B, C, D, E, A, ` ~ GET_P_32!(P1, 3) ~ `);
135         ` ~ prep!(P1, W1, W2, W3, W0, K40_59) ~ `
136 
137         F3(A, B, C, D, E, ` ~ GET_P_32!(P2, 0) ~ `);
138         F3(E, A, B, C, D, ` ~ GET_P_32!(P2, 1) ~ `);
139         F3(D, E, A, B, C, ` ~ GET_P_32!(P2, 2) ~ `);
140         F3(C, D, E, A, B, ` ~ GET_P_32!(P2, 3) ~ `);
141         ` ~ prep!(P2, W2, W3, W0, W1, K40_59) ~ `
142 
143         F3(B, C, D, E, A, ` ~ GET_P_32!(P3, 0) ~ `);
144         F3(A, B, C, D, E, ` ~ GET_P_32!(P3, 1) ~ `);
145         F3(E, A, B, C, D, ` ~ GET_P_32!(P3, 2) ~ `);
146         F3(D, E, A, B, C, ` ~ GET_P_32!(P3, 3) ~ `);
147         ` ~ prep!(P3, W3, W0, W1, W2, K60_79) ~ `
148 
149         F3(C, D, E, A, B, ` ~ GET_P_32!(P0, 0) ~ `);
150         F3(B, C, D, E, A, ` ~ GET_P_32!(P0, 1) ~ `);
151         F3(A, B, C, D, E, ` ~ GET_P_32!(P0, 2) ~ `);
152         F3(E, A, B, C, D, ` ~ GET_P_32!(P0, 3) ~ `);
153         ` ~ prep!(P0, W0, W1, W2, W3, K60_79) ~ `
154 
155         F3(D, E, A, B, C, ` ~ GET_P_32!(P1, 0) ~ `);
156         F3(C, D, E, A, B, ` ~ GET_P_32!(P1, 1) ~ `);
157         F3(B, C, D, E, A, ` ~ GET_P_32!(P1, 2) ~ `);
158         F3(A, B, C, D, E, ` ~ GET_P_32!(P1, 3) ~ `);
159         ` ~ prep!(P1, W1, W2, W3, W0, K60_79) ~ `
160 
161         F3(E, A, B, C, D, ` ~ GET_P_32!(P2, 0) ~ `);
162         F3(D, E, A, B, C, ` ~ GET_P_32!(P2, 1) ~ `);
163         F3(C, D, E, A, B, ` ~ GET_P_32!(P2, 2) ~ `);
164         F3(B, C, D, E, A, ` ~ GET_P_32!(P2, 3) ~ `);
165         ` ~ prep!(P2, W2, W3, W0, W1, K60_79) ~ `
166 
167         F4(A, B, C, D, E, ` ~ GET_P_32!(P3, 0) ~ `);
168         F4(E, A, B, C, D, ` ~ GET_P_32!(P3, 1) ~ `);
169         F4(D, E, A, B, C, ` ~ GET_P_32!(P3, 2) ~ `);
170         F4(C, D, E, A, B, ` ~ GET_P_32!(P3, 3) ~ `);
171         ` ~ prep!(P3, W3, W0, W1, W2, K60_79) ~ `
172 
173         F4(B, C, D, E, A, ` ~ GET_P_32!(P0, 0) ~ `);
174         F4(A, B, C, D, E, ` ~ GET_P_32!(P0, 1) ~ `);
175         F4(E, A, B, C, D, ` ~ GET_P_32!(P0, 2) ~ `);
176         F4(D, E, A, B, C, ` ~ GET_P_32!(P0, 3) ~ `);
177 
178         F4(C, D, E, A, B, ` ~ GET_P_32!(P1, 0) ~ `);
179         F4(B, C, D, E, A, ` ~ GET_P_32!(P1, 1) ~ `);
180         F4(A, B, C, D, E, ` ~ GET_P_32!(P1, 2) ~ `);
181         F4(E, A, B, C, D, ` ~ GET_P_32!(P1, 3) ~ `);
182 
183         F4(D, E, A, B, C, ` ~ GET_P_32!(P2, 0) ~ `);
184         F4(C, D, E, A, B, ` ~ GET_P_32!(P2, 1) ~ `);
185         F4(B, C, D, E, A, ` ~ GET_P_32!(P2, 2) ~ `);
186         F4(A, B, C, D, E, ` ~ GET_P_32!(P2, 3) ~ `);
187 
188         F4(E, A, B, C, D, ` ~ GET_P_32!(P3, 0) ~ `);
189         F4(D, E, A, B, C, ` ~ GET_P_32!(P3, 1) ~ `);
190         F4(C, D, E, A, B, ` ~ GET_P_32!(P3, 2) ~ `);
191         F4(B, C, D, E, A, ` ~ GET_P_32!(P3, 3) ~ `);`);
192             
193             A = (m_digest[0] += A);
194             B = (m_digest[1] += B);
195             C = (m_digest[2] += C);
196             D = (m_digest[3] += D);
197             E = (m_digest[4] += E);
198             
199             input += (hashBlockSize / 16);
200         }
201     }
202 
203 }
204 
205 
206 private:
207 
208 /*
209 * First 16 bytes just need ubyte swapping. Preparing just means
210 * adding in the round constants.
211 */
212 
213 /*
214     Using SSE4; slower on Core2 and Nehalem
215     #define GET_P_32(P, i) _mm_extract_epi32(P.u128, i)
216 
217     Much slower on all tested platforms
218     #define GET_P_32(P,i) _mm_cvtsi128_si32(_mm_srli_si128(P.u128, i*4))
219 */
/*
* Emits the mixin code that reads 32-bit word i of the prepared vector P.
* Fix: the BOTAN_FORCE_SSE4 branch previously used
* __traits(identifier, P).stringof, which yields the *quoted* literal
* (e.g. `"P0"`) and thus invalid mixin code such as
* _mm_extract_epi32("P0".u128, 0); use the identifier string directly,
* exactly as the (default) union-lane branch does.
*/
enum string GET_P_32(alias P, ubyte i) = 
    BOTAN_FORCE_SSE4
    ? `_mm_extract_epi32(` ~ __traits(identifier, P) ~ `.u128, ` ~ i.stringof ~ `)`
    : __traits(identifier, P) ~ `.u32[` ~ i.stringof ~ `]`;
224 
/*
* Code generator for rounds 0..15: byte-swap the four big-endian message
* words held in _W (in place) and store _W + K00_19 into P.u128.  The swap
* is built from 16-bit shuffles plus an 8-bit shift/or pair, since SSE2
* has no 32-bit byte-reverse instruction.
*/
enum string prep00_15(alias P, alias _W) =
    `{
        enum SHUF = _MM_SHUFFLE(2, 3, 0, 1);
        ` ~ __traits(identifier, _W) ~ ` = _mm_shufflehi_epi16!SHUF(` ~ __traits(identifier, _W) ~ `);
        ` ~ __traits(identifier, _W) ~ ` = _mm_shufflelo_epi16!SHUF(` ~ __traits(identifier, _W) ~ `);
        ` ~ __traits(identifier, _W) ~ ` = _mm_or_si128(_mm_slli_epi16!8(` ~ __traits(identifier, _W) ~ `), _mm_srli_epi16!8(` ~ __traits(identifier, _W) ~ `));
        ` ~ __traits(identifier, P) ~ `.u128 = _mm_add_epi32(` ~ __traits(identifier, _W) ~ `, K00_19);
    }`;
234 
235 /*
236 For each multiple of 4, t, we want to calculate this:
237 
238 W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
239 W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
240 W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
241 W[t+3] = rol(W[t]    ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
242 
243 we'll actually calculate this:
244 
245 W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
246 W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
247 W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
248 W[t+3] = rol(  0     ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
249 W[t+3] ^= rol(W[t+0], 1);
250 
251 the parameters are:
252 
253 W0 = &W[t-16];
254 W1 = &W[t-12];
255 W2 = &W[t- 8];
256 W3 = &W[t- 4];
257 
258 and on output:
259 prepared = W0 + K
260 W0 = W[t]..W[t+3]
261 */
262 
263 /* note that there is a step here where i want to do a rol by 1, which
264 * normally would look like this:
265 *
266 * r1 = psrld r0,$31
267 * r0 = pslld r0,$1
268 * r0 = por r0,r1
269 *
270 * but instead i do this:
271 *
272 * r1 = pcmpltd r0,zero
273 * r0 = paddd r0,r0
274 * r0 = psub r0,r1
275 *
* because pcmpltd and paddd are available in both MMX units on
* Efficeon, Pentium-M, and Opteron, but shifts are available in
* only one unit.
279 */
/*
* Code generator for one SSE2 message-expansion step.  The emitted block
* computes the next four expanded words from _XW0.._XW3 (W[t-16], W[t-12],
* W[t-8], W[t-4]), writes them back into _XW0, and stores the words plus
* the round constant _K into _prep.u128 for the scalar rounds to consume.
* The rol-by-1 is done as compare/add/sub rather than shift/shift/or; see
* the comment block above for the pipeline rationale.
*/
string prep(alias _prep, alias _XW0, alias _XW1, alias _XW2, alias _XW3, alias _K)()
{
    return q{
        {
            __m128i r0, r1, r2, r3;

            /* W[t-4], shifted down by one 32-bit word */
            r3 = _mm_srli_si128!4(%5$s);
            r0 = %2$s;
            /* move the high 64 bits of W[t-16] into the low 64 bits */
            r1 = _mm_shuffle_epi32!(_MM_SHUFFLE(1,0,3,2))(%2$s);
            /* pair them with the low 64 bits of W[t-12] */
            r1 = _mm_unpacklo_epi64(r1, %3$s);
            r2 = %4$s;
            r0 = _mm_xor_si128(r1, r0);
            r2 = _mm_xor_si128(r3, r2);
            r0 = _mm_xor_si128(r2, r0);
            /* r0: unrotated W[t]..W[t+2], plus a partial W[t+3] */

            r2 = _mm_slli_si128!12(r0);
            r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());
            r0 = _mm_add_epi32(r0, r0);    /* rol by 1, step 1: double */
            r0 = _mm_sub_epi32(r0, r1);    /* rol by 1, step 2: re-add sign bits */

            r3 = _mm_srli_epi32!30(r2);
            r2 = _mm_slli_epi32!2(r2);
            r0 = _mm_xor_si128(r0, r3);
            r0 = _mm_xor_si128(r0, r2);    /* fold rol(W[t],1) into W[t+3] */
            %2$s = r0;
            %1$s.u128 = _mm_add_epi32(r0, %6$s);
        }
    }.format(__traits(identifier, _prep),
             __traits(identifier, _XW0),
             __traits(identifier, _XW1),
             __traits(identifier, _XW2),
             __traits(identifier, _XW3),
             __traits(identifier, _K));
}
317 
318 pure:
319 
320 /*
321 * SHA-160 F1 Function
322 */
323 void F1(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
324 {
325     E += (D ^ (B & (C ^ D))) + msg + rotateLeft(A, 5);
326     B  = rotateLeft(B, 30);
327 }
328 
329 /*
330 * SHA-160 F2 Function
331 */
332 void F2(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
333 {
334     E += (B ^ C ^ D) + msg + rotateLeft(A, 5);
335     B  = rotateLeft(B, 30);
336 }
337 
338 /*
339 * SHA-160 F3 Function
340 */
341 void F3(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
342 {
343     E += ((B & C) | ((B | C) & D)) + msg + rotateLeft(A, 5);
344     B  = rotateLeft(B, 30);
345 }
346 
347 /*
348 * SHA-160 F4 Function
349 */
350 void F4(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
351 {
352     E += (B ^ C ^ D) + msg + rotateLeft(A, 5);
353     B  = rotateLeft(B, 30);
354 }