1 /**
2 * SHA-160
3 *
4 * Copyright:
5 * (C) 1999-2007 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *
8 * License:
9 * Botan is released under the Simplified BSD License (see LICENSE.md)
10 */
11 module botan.hash.sha1_sse2;
12
13 import botan.constants;
14 static if (BOTAN_HAS_SHA1 && BOTAN_HAS_SHA1_SSE2 && BOTAN_HAS_SIMD_SSE2):
15
16 import botan.hash.sha160;
17 import botan.utils.rotate;
18 import botan.utils.simd.emmintrin;
19 import botan.hash.hash;
20 import std.format : format;
21
22 /**
23 * SHA-160 using SSE2 for the message expansion
24 */
class SHA160SSE2 : SHA160
{
public:
    /// Returns: a fresh SHA160SSE2 in its initial (reset) state.
    override HashFunction clone() const { return new SHA160SSE2; }

    this()
    {
        super(0);
    } // no W needed: the message schedule is expanded in SSE2 registers
      // inside compressN(), so the base class's W buffer is sized to zero.

protected:
    /*
    * SHA-160 Compression Function using SSE for message expansion
    *
    * Processes `blocks` consecutive 64-byte message blocks starting at
    * input_bytes, folding each into the five 32-bit chaining words of
    * m_digest (inherited from SHA160) in place.
    */
    override void compressN(const(ubyte)* input_bytes, size_t blocks)
    {

        // Round constants for rounds 0-19, 20-39, 40-59, 60-79,
        // splatted across all four 32-bit lanes.
        const(__m128i) K00_19 = _mm_set1_epi32!(0x5A827999)();
        const(__m128i) K20_39 = _mm_set1_epi32!(0x6ED9EBA1)();
        const(__m128i) K40_59 = _mm_set1_epi32!(0x8F1BBCDC)();
        const(__m128i) K60_79 = _mm_set1_epi32!(0xCA62C1D6)();

        // Working copies of the chaining state.
        uint A = m_digest[0],
             B = m_digest[1],
             C = m_digest[2],
             D = m_digest[3],
             E = m_digest[4];

        // NOTE(review): all loads below go through _mm_loadu_si128, so
        // input_bytes does not need to be 16-byte aligned.
        __m128i* input = cast(__m128i*)(input_bytes);

        foreach (size_t i; 0 .. blocks)
        {
            // Lets the scalar round code read individual 32-bit words out of
            // a vector of four prepared (message word + K constant) values.
            union v4si {
                uint[4] u32;
                __m128i u128;
            }

            v4si P0, P1, P2, P3;

            // Load the 16 big-endian message words four at a time,
            // byte-swap them and pre-add K00_19 (see prep00_15).
            __m128i W0 = _mm_loadu_si128(input);
            mixin(prep00_15!(P0, W0));

            __m128i W1 = _mm_loadu_si128(&input[1]);
            mixin(prep00_15!(P1, W1));

            __m128i W2 = _mm_loadu_si128(&input[2]);
            mixin(prep00_15!(P2, W2));

            __m128i W3 = _mm_loadu_si128(&input[3]);
            mixin(prep00_15!(P3, W3));

            /*
            * 80 rounds, four at a time. Each group of four scalar rounds
            * consumes one prepared vector (P0..P3) while the interleaved
            * prep() expands the next four schedule words and pre-adds the
            * round constant for the rounds that will eventually consume
            * them — so the K argument of a prep() call belongs to rounds
            * ~16 ahead, not to the current group. The rotating A..E
            * argument order replaces the usual end-of-round variable
            * shuffle.
            */
            mixin(`
            F1(A, B, C, D, E, ` ~ GET_P_32!(P0, 0) ~ `);
            F1(E, A, B, C, D, ` ~ GET_P_32!(P0, 1) ~ `);
            F1(D, E, A, B, C, ` ~ GET_P_32!(P0, 2) ~ `);
            F1(C, D, E, A, B, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K00_19) ~ `

            F1(B, C, D, E, A, ` ~ GET_P_32!(P1, 0) ~ `);
            F1(A, B, C, D, E, ` ~ GET_P_32!(P1, 1) ~ `);
            F1(E, A, B, C, D, ` ~ GET_P_32!(P1, 2) ~ `);
            F1(D, E, A, B, C, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K20_39) ~ `

            F1(C, D, E, A, B, ` ~ GET_P_32!(P2, 0) ~ `);
            F1(B, C, D, E, A, ` ~ GET_P_32!(P2, 1) ~ `);
            F1(A, B, C, D, E, ` ~ GET_P_32!(P2, 2) ~ `);
            F1(E, A, B, C, D, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K20_39) ~ `

            F1(D, E, A, B, C, ` ~ GET_P_32!(P3, 0) ~ `);
            F1(C, D, E, A, B, ` ~ GET_P_32!(P3, 1) ~ `);
            F1(B, C, D, E, A, ` ~ GET_P_32!(P3, 2) ~ `);
            F1(A, B, C, D, E, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K20_39) ~ `

            F1(E, A, B, C, D, ` ~ GET_P_32!(P0, 0) ~ `);
            F1(D, E, A, B, C, ` ~ GET_P_32!(P0, 1) ~ `);
            F1(C, D, E, A, B, ` ~ GET_P_32!(P0, 2) ~ `);
            F1(B, C, D, E, A, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K20_39) ~ `

            F2(A, B, C, D, E, ` ~ GET_P_32!(P1, 0) ~ `);
            F2(E, A, B, C, D, ` ~ GET_P_32!(P1, 1) ~ `);
            F2(D, E, A, B, C, ` ~ GET_P_32!(P1, 2) ~ `);
            F2(C, D, E, A, B, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K20_39) ~ `

            F2(B, C, D, E, A, ` ~ GET_P_32!(P2, 0) ~ `);
            F2(A, B, C, D, E, ` ~ GET_P_32!(P2, 1) ~ `);
            F2(E, A, B, C, D, ` ~ GET_P_32!(P2, 2) ~ `);
            F2(D, E, A, B, C, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K40_59) ~ `

            F2(C, D, E, A, B, ` ~ GET_P_32!(P3, 0) ~ `);
            F2(B, C, D, E, A, ` ~ GET_P_32!(P3, 1) ~ `);
            F2(A, B, C, D, E, ` ~ GET_P_32!(P3, 2) ~ `);
            F2(E, A, B, C, D, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K40_59) ~ `

            F2(D, E, A, B, C, ` ~ GET_P_32!(P0, 0) ~ `);
            F2(C, D, E, A, B, ` ~ GET_P_32!(P0, 1) ~ `);
            F2(B, C, D, E, A, ` ~ GET_P_32!(P0, 2) ~ `);
            F2(A, B, C, D, E, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K40_59) ~ `

            F2(E, A, B, C, D, ` ~ GET_P_32!(P1, 0) ~ `);
            F2(D, E, A, B, C, ` ~ GET_P_32!(P1, 1) ~ `);
            F2(C, D, E, A, B, ` ~ GET_P_32!(P1, 2) ~ `);
            F2(B, C, D, E, A, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K40_59) ~ `

            F3(A, B, C, D, E, ` ~ GET_P_32!(P2, 0) ~ `);
            F3(E, A, B, C, D, ` ~ GET_P_32!(P2, 1) ~ `);
            F3(D, E, A, B, C, ` ~ GET_P_32!(P2, 2) ~ `);
            F3(C, D, E, A, B, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K40_59) ~ `

            F3(B, C, D, E, A, ` ~ GET_P_32!(P3, 0) ~ `);
            F3(A, B, C, D, E, ` ~ GET_P_32!(P3, 1) ~ `);
            F3(E, A, B, C, D, ` ~ GET_P_32!(P3, 2) ~ `);
            F3(D, E, A, B, C, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K60_79) ~ `

            F3(C, D, E, A, B, ` ~ GET_P_32!(P0, 0) ~ `);
            F3(B, C, D, E, A, ` ~ GET_P_32!(P0, 1) ~ `);
            F3(A, B, C, D, E, ` ~ GET_P_32!(P0, 2) ~ `);
            F3(E, A, B, C, D, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K60_79) ~ `

            F3(D, E, A, B, C, ` ~ GET_P_32!(P1, 0) ~ `);
            F3(C, D, E, A, B, ` ~ GET_P_32!(P1, 1) ~ `);
            F3(B, C, D, E, A, ` ~ GET_P_32!(P1, 2) ~ `);
            F3(A, B, C, D, E, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K60_79) ~ `

            F3(E, A, B, C, D, ` ~ GET_P_32!(P2, 0) ~ `);
            F3(D, E, A, B, C, ` ~ GET_P_32!(P2, 1) ~ `);
            F3(C, D, E, A, B, ` ~ GET_P_32!(P2, 2) ~ `);
            F3(B, C, D, E, A, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K60_79) ~ `

            F4(A, B, C, D, E, ` ~ GET_P_32!(P3, 0) ~ `);
            F4(E, A, B, C, D, ` ~ GET_P_32!(P3, 1) ~ `);
            F4(D, E, A, B, C, ` ~ GET_P_32!(P3, 2) ~ `);
            F4(C, D, E, A, B, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K60_79) ~ `

            F4(B, C, D, E, A, ` ~ GET_P_32!(P0, 0) ~ `);
            F4(A, B, C, D, E, ` ~ GET_P_32!(P0, 1) ~ `);
            F4(E, A, B, C, D, ` ~ GET_P_32!(P0, 2) ~ `);
            F4(D, E, A, B, C, ` ~ GET_P_32!(P0, 3) ~ `);

            F4(C, D, E, A, B, ` ~ GET_P_32!(P1, 0) ~ `);
            F4(B, C, D, E, A, ` ~ GET_P_32!(P1, 1) ~ `);
            F4(A, B, C, D, E, ` ~ GET_P_32!(P1, 2) ~ `);
            F4(E, A, B, C, D, ` ~ GET_P_32!(P1, 3) ~ `);

            F4(D, E, A, B, C, ` ~ GET_P_32!(P2, 0) ~ `);
            F4(C, D, E, A, B, ` ~ GET_P_32!(P2, 1) ~ `);
            F4(B, C, D, E, A, ` ~ GET_P_32!(P2, 2) ~ `);
            F4(A, B, C, D, E, ` ~ GET_P_32!(P2, 3) ~ `);

            F4(E, A, B, C, D, ` ~ GET_P_32!(P3, 0) ~ `);
            F4(D, E, A, B, C, ` ~ GET_P_32!(P3, 1) ~ `);
            F4(C, D, E, A, B, ` ~ GET_P_32!(P3, 2) ~ `);
            F4(B, C, D, E, A, ` ~ GET_P_32!(P3, 3) ~ `);`);

            // Fold this block's result back into the chaining state
            // (Davies-Meyer feed-forward).
            A = (m_digest[0] += A);
            B = (m_digest[1] += B);
            C = (m_digest[2] += C);
            D = (m_digest[3] += D);
            E = (m_digest[4] += E);

            // Advance one 64-byte block = hashBlockSize/16 __m128i vectors.
            input += (hashBlockSize / 16);
        }
    }

}
204
205
206 private:
207
208 /*
* First 16 bytes just need byte swapping. Preparing just means
210 * adding in the round constants.
211 */
212
213 /*
214 Using SSE4; slower on Core2 and Nehalem
215 #define GET_P_32(P, i) _mm_extract_epi32(P.u128, i)
216
217 Much slower on all tested platforms
218 #define GET_P_32(P,i) _mm_cvtsi128_si32(_mm_srli_si128(P.u128, i*4))
219 */
/*
* Mixin string that extracts 32-bit word i from a prepared v4si vector P.
*
* With BOTAN_FORCE_SSE4 set, emits an _mm_extract_epi32 call (SSE4.1);
* otherwise it reads through the union's u32 array, which the header
* comment above notes was faster on Core2/Nehalem.
*
* Fix: the SSE4 branch previously applied .stringof to the string returned
* by __traits(identifier, P), which yields a *quoted* literal (e.g. `"P0"`)
* and would mix in invalid code such as `_mm_extract_epi32("P0".u128, 0)`.
* Both branches now splice in the bare identifier, consistently.
*/
enum string GET_P_32(alias P, ubyte i) =
    BOTAN_FORCE_SSE4
    ? `_mm_extract_epi32(` ~ __traits(identifier, P) ~ `.u128, ` ~ i.stringof ~ `)`
    : __traits(identifier, P) ~ `.u32[` ~ i.stringof ~ `]`;
224
/*
* Build the mixin code that byte-swaps a freshly loaded message-word
* vector _W (big-endian wire order -> host order) and stores _W + K00_19
* into P.u128.
*
* The two 16-bit shuffles exchange the halves of every 32-bit lane, and
* the or/slli/srli trio swaps the bytes inside each 16-bit lane; together
* that is a per-lane 32-bit byte swap.
*/
string prep00_15(alias P, alias _W)()
{
    enum W  = __traits(identifier, _W);
    enum Pn = __traits(identifier, P);
    return `{
        enum SHUF = _MM_SHUFFLE(2, 3, 0, 1);
        ` ~ W ~ ` = _mm_shufflehi_epi16!SHUF(` ~ W ~ `);
        ` ~ W ~ ` = _mm_shufflelo_epi16!SHUF(` ~ W ~ `);
        ` ~ W ~ ` = _mm_or_si128(_mm_slli_epi16!8(` ~ W ~ `), _mm_srli_epi16!8(` ~ W ~ `));
        ` ~ Pn ~ `.u128 = _mm_add_epi32(` ~ W ~ `, K00_19);
    }`;
}
234
235 /*
236 For each multiple of 4, t, we want to calculate this:
237
238 W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
239 W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
240 W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
241 W[t+3] = rol(W[t] ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
242
243 we'll actually calculate this:
244
245 W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
246 W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
247 W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
248 W[t+3] = rol( 0 ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
249 W[t+3] ^= rol(W[t+0], 1);
250
251 the parameters are:
252
253 W0 = &W[t-16];
254 W1 = &W[t-12];
255 W2 = &W[t- 8];
256 W3 = &W[t- 4];
257
258 and on output:
259 prepared = W0 + K
260 W0 = W[t]..W[t+3]
261 */
262
263 /* note that there is a step here where i want to do a rol by 1, which
264 * normally would look like this:
265 *
266 * r1 = psrld r0,$31
267 * r0 = pslld r0,$1
268 * r0 = por r0,r1
269 *
270 * but instead i do this:
271 *
 * r1 = pcmpltd r0,zero
 * r0 = paddd r0,r0
 * r0 = psubd r0,r1
 *
 * because pcmpltd and paddd are available in both MMX units on
 * Efficeon, Pentium-M, and Opteron, but shifts are available in
 * only one unit.
279 */
/*
* Build the mixin code for one step of the vectorized message expansion
* (see the derivation in the comment block above).
*
* Placeholders: %1$s = destination prepared vector, %2$s..%5$s = the four
* schedule vectors W[t-16], W[t-12], W[t-8], W[t-4], %6$s = round constant.
* On output %2$s holds W[t]..W[t+3] and %1$s.u128 holds them plus K.
*/
string prep(alias _prep, alias _XW0, alias _XW1, alias _XW2, alias _XW3, alias _K)()
{
    return q{
    {
        __m128i r0, r1, r2, r3;

        /* W[t-3..t-1] with a zero shifted in (assumption W[t] := 0) */
        r3 = _mm_srli_si128!4(%5$s);
        r0 = %2$s;
        /* move the high 64 bits of XW0 into the low half ... */
        r1 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(%2$s);
        /* ... and pair them with the low 64 bits of XW1: W[t-14..t-11] */
        r1 = _mm_unpacklo_epi64(r1, %3$s);
        r2 = %4$s;
        r0 = _mm_xor_si128(r1, r0);
        r2 = _mm_xor_si128(r3, r2);
        r0 = _mm_xor_si128(r2, r0);
        /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */

        /* keep only W[t] (moved to the top lane) for the W[t+3] patch-up */
        r2 = _mm_slli_si128!12(r0);
        /* rol-by-1 without shifts: r1 = (r0 < 0) ? -1 : 0 per lane, */
        r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());
        /* then r0*2 ... */
        r0 = _mm_add_epi32(r0, r0);
        /* ... minus -1 re-inserts the carried-out sign bits */
        r0 = _mm_sub_epi32(r0, r1);

        /* W[t+3] ^= rol(W[t], 1); W[t] itself was doubled above, so
         * only a further rol by 1 (net rol 2 of the original) is left */
        r3 = _mm_srli_epi32!30(r2);
        r2 = _mm_slli_epi32!2(r2);
        r0 = _mm_xor_si128(r0, r3);
        r0 = _mm_xor_si128(r0, r2); /* r0 now has W[t]..W[t+3] */
        %2$s = r0;
        %1$s.u128 = _mm_add_epi32(r0, %6$s);
    }
    }.format(__traits(identifier, _prep),
             __traits(identifier, _XW0),
             __traits(identifier, _XW1),
             __traits(identifier, _XW2),
             __traits(identifier, _XW3),
             __traits(identifier, _K));
}
317
318 pure:
319
320 /*
321 * SHA-160 F1 Function
322 */
/*
* SHA-160 F1 function (rounds 0-19): "choose" mix Ch(B, C, D).
* Updates E in place and applies the standard rol-30 to B.
*/
void F1(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    immutable uint choose = D ^ (B & (C ^ D)); // Ch(B,C,D) with a single AND
    immutable uint rotA = rotateLeft(A, 5);
    E += choose + msg + rotA;
    B = rotateLeft(B, 30);
}
328
329 /*
330 * SHA-160 F2 Function
331 */
/*
* SHA-160 F2 function (rounds 20-39): three-way parity of B, C, D.
* Updates E in place and applies the standard rol-30 to B.
*/
void F2(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    immutable uint parity = B ^ C ^ D;
    E += parity + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}
337
338 /*
339 * SHA-160 F3 Function
340 */
/*
* SHA-160 F3 function (rounds 40-59): bitwise majority Maj(B, C, D).
* Updates E in place and applies the standard rol-30 to B.
*/
void F3(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    // Symmetric expansion; identical bit-for-bit to (B & C) | ((B | C) & D).
    immutable uint majority = (B & C) | (B & D) | (C & D);
    E += majority + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}
346
347 /*
348 * SHA-160 F4 Function
349 */
/*
* SHA-160 F4 function (rounds 60-79): same parity mix as F2, kept as a
* separate routine to match the round structure of the specification.
* Updates E in place and applies the standard rol-30 to B.
*/
void F4(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    immutable uint parity = B ^ C ^ D;
    E += parity + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}