/**
* SHA-160
*
* Copyright:
* (C) 1999-2007 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.hash.sha1_sse2;

import botan.constants;
static if (BOTAN_HAS_SHA1 && BOTAN_HAS_SHA1_SSE2 && BOTAN_HAS_SIMD_SSE2):

import botan.hash.sha160;
import botan.utils.rotate;
import botan.utils.simd.emmintrin;
import botan.hash.hash;
import std.format : format;

/**
* SHA-160 using SSE2 for the message expansion
*/
class SHA160SSE2 : SHA160
{
public:
    override HashFunction clone() const { return new SHA160SSE2; }

    this()
    {
        super(0);
    } // no W needed

protected:
    /*
    * SHA-160 Compression Function using SSE for message expansion.
    *
    * Processes `blocks` consecutive 64-byte message blocks starting at
    * input_bytes, updating m_digest in place. The four W0..W3 SSE registers
    * hold a sliding window of 16 message words; each prep!(...) mixin below
    * expands the next four schedule words and pre-adds the round constant,
    * while GET_P_32 feeds the already-prepared words into the scalar
    * F1..F4 round functions.
    */
    override void compressN(const(ubyte)* input_bytes, size_t blocks)
    {
        const(__m128i) K00_19 = _mm_set1_epi32!(0x5A827999)();
        const(__m128i) K20_39 = _mm_set1_epi32!(0x6ED9EBA1)();
        const(__m128i) K40_59 = _mm_set1_epi32!(0x8F1BBCDC)();
        const(__m128i) K60_79 = _mm_set1_epi32!(0xCA62C1D6)();

        uint A = m_digest[0],
             B = m_digest[1],
             C = m_digest[2],
             D = m_digest[3],
             E = m_digest[4];

        // Unaligned loads (_mm_loadu_si128) are used below, so this cast
        // does not require 16-byte alignment of the input.
        __m128i* input = cast(__m128i*)(input_bytes);

        foreach (size_t i; 0 .. blocks)
        {
            // Lets a prepared W+K vector be read back as four scalar words.
            union v4si {
                uint[4] u32;
                __m128i u128;
            }

            v4si P0, P1, P2, P3;

            __m128i W0 = _mm_loadu_si128(input);
            mixin(prep00_15!(P0, W0));

            __m128i W1 = _mm_loadu_si128(&input[1]);
            mixin(prep00_15!(P1, W1));

            __m128i W2 = _mm_loadu_si128(&input[2]);
            mixin(prep00_15!(P2, W2));

            __m128i W3 = _mm_loadu_si128(&input[3]);
            mixin(prep00_15!(P3, W3));

            /*
            Each group of four rounds consumes one prepared vector and then
            expands four schedule words ahead; the K constant passed to prep
            is the one for the rounds those FUTURE words will be used in
            (W16-19: K00_19, W20-39: K20_39, W40-59: K40_59, W60-79: K60_79).
            */
            mixin(`
            F1(A, B, C, D, E, ` ~ GET_P_32!(P0, 0) ~ `);
            F1(E, A, B, C, D, ` ~ GET_P_32!(P0, 1) ~ `);
            F1(D, E, A, B, C, ` ~ GET_P_32!(P0, 2) ~ `);
            F1(C, D, E, A, B, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K00_19) ~ `

            F1(B, C, D, E, A, ` ~ GET_P_32!(P1, 0) ~ `);
            F1(A, B, C, D, E, ` ~ GET_P_32!(P1, 1) ~ `);
            F1(E, A, B, C, D, ` ~ GET_P_32!(P1, 2) ~ `);
            F1(D, E, A, B, C, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K20_39) ~ `

            F1(C, D, E, A, B, ` ~ GET_P_32!(P2, 0) ~ `);
            F1(B, C, D, E, A, ` ~ GET_P_32!(P2, 1) ~ `);
            F1(A, B, C, D, E, ` ~ GET_P_32!(P2, 2) ~ `);
            F1(E, A, B, C, D, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K20_39) ~ `

            F1(D, E, A, B, C, ` ~ GET_P_32!(P3, 0) ~ `);
            F1(C, D, E, A, B, ` ~ GET_P_32!(P3, 1) ~ `);
            F1(B, C, D, E, A, ` ~ GET_P_32!(P3, 2) ~ `);
            F1(A, B, C, D, E, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K20_39) ~ `

            F1(E, A, B, C, D, ` ~ GET_P_32!(P0, 0) ~ `);
            F1(D, E, A, B, C, ` ~ GET_P_32!(P0, 1) ~ `);
            F1(C, D, E, A, B, ` ~ GET_P_32!(P0, 2) ~ `);
            F1(B, C, D, E, A, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K20_39) ~ `

            F2(A, B, C, D, E, ` ~ GET_P_32!(P1, 0) ~ `);
            F2(E, A, B, C, D, ` ~ GET_P_32!(P1, 1) ~ `);
            F2(D, E, A, B, C, ` ~ GET_P_32!(P1, 2) ~ `);
            F2(C, D, E, A, B, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K20_39) ~ `

            F2(B, C, D, E, A, ` ~ GET_P_32!(P2, 0) ~ `);
            F2(A, B, C, D, E, ` ~ GET_P_32!(P2, 1) ~ `);
            F2(E, A, B, C, D, ` ~ GET_P_32!(P2, 2) ~ `);
            F2(D, E, A, B, C, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K40_59) ~ `

            F2(C, D, E, A, B, ` ~ GET_P_32!(P3, 0) ~ `);
            F2(B, C, D, E, A, ` ~ GET_P_32!(P3, 1) ~ `);
            F2(A, B, C, D, E, ` ~ GET_P_32!(P3, 2) ~ `);
            F2(E, A, B, C, D, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K40_59) ~ `

            F2(D, E, A, B, C, ` ~ GET_P_32!(P0, 0) ~ `);
            F2(C, D, E, A, B, ` ~ GET_P_32!(P0, 1) ~ `);
            F2(B, C, D, E, A, ` ~ GET_P_32!(P0, 2) ~ `);
            F2(A, B, C, D, E, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K40_59) ~ `

            F2(E, A, B, C, D, ` ~ GET_P_32!(P1, 0) ~ `);
            F2(D, E, A, B, C, ` ~ GET_P_32!(P1, 1) ~ `);
            F2(C, D, E, A, B, ` ~ GET_P_32!(P1, 2) ~ `);
            F2(B, C, D, E, A, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K40_59) ~ `

            F3(A, B, C, D, E, ` ~ GET_P_32!(P2, 0) ~ `);
            F3(E, A, B, C, D, ` ~ GET_P_32!(P2, 1) ~ `);
            F3(D, E, A, B, C, ` ~ GET_P_32!(P2, 2) ~ `);
            F3(C, D, E, A, B, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K40_59) ~ `

            F3(B, C, D, E, A, ` ~ GET_P_32!(P3, 0) ~ `);
            F3(A, B, C, D, E, ` ~ GET_P_32!(P3, 1) ~ `);
            F3(E, A, B, C, D, ` ~ GET_P_32!(P3, 2) ~ `);
            F3(D, E, A, B, C, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K60_79) ~ `

            F3(C, D, E, A, B, ` ~ GET_P_32!(P0, 0) ~ `);
            F3(B, C, D, E, A, ` ~ GET_P_32!(P0, 1) ~ `);
            F3(A, B, C, D, E, ` ~ GET_P_32!(P0, 2) ~ `);
            F3(E, A, B, C, D, ` ~ GET_P_32!(P0, 3) ~ `);
            ` ~ prep!(P0, W0, W1, W2, W3, K60_79) ~ `

            F3(D, E, A, B, C, ` ~ GET_P_32!(P1, 0) ~ `);
            F3(C, D, E, A, B, ` ~ GET_P_32!(P1, 1) ~ `);
            F3(B, C, D, E, A, ` ~ GET_P_32!(P1, 2) ~ `);
            F3(A, B, C, D, E, ` ~ GET_P_32!(P1, 3) ~ `);
            ` ~ prep!(P1, W1, W2, W3, W0, K60_79) ~ `

            F3(E, A, B, C, D, ` ~ GET_P_32!(P2, 0) ~ `);
            F3(D, E, A, B, C, ` ~ GET_P_32!(P2, 1) ~ `);
            F3(C, D, E, A, B, ` ~ GET_P_32!(P2, 2) ~ `);
            F3(B, C, D, E, A, ` ~ GET_P_32!(P2, 3) ~ `);
            ` ~ prep!(P2, W2, W3, W0, W1, K60_79) ~ `

            F4(A, B, C, D, E, ` ~ GET_P_32!(P3, 0) ~ `);
            F4(E, A, B, C, D, ` ~ GET_P_32!(P3, 1) ~ `);
            F4(D, E, A, B, C, ` ~ GET_P_32!(P3, 2) ~ `);
            F4(C, D, E, A, B, ` ~ GET_P_32!(P3, 3) ~ `);
            ` ~ prep!(P3, W3, W0, W1, W2, K60_79) ~ `

            F4(B, C, D, E, A, ` ~ GET_P_32!(P0, 0) ~ `);
            F4(A, B, C, D, E, ` ~ GET_P_32!(P0, 1) ~ `);
            F4(E, A, B, C, D, ` ~ GET_P_32!(P0, 2) ~ `);
            F4(D, E, A, B, C, ` ~ GET_P_32!(P0, 3) ~ `);

            F4(C, D, E, A, B, ` ~ GET_P_32!(P1, 0) ~ `);
            F4(B, C, D, E, A, ` ~ GET_P_32!(P1, 1) ~ `);
            F4(A, B, C, D, E, ` ~ GET_P_32!(P1, 2) ~ `);
            F4(E, A, B, C, D, ` ~ GET_P_32!(P1, 3) ~ `);

            F4(D, E, A, B, C, ` ~ GET_P_32!(P2, 0) ~ `);
            F4(C, D, E, A, B, ` ~ GET_P_32!(P2, 1) ~ `);
            F4(B, C, D, E, A, ` ~ GET_P_32!(P2, 2) ~ `);
            F4(A, B, C, D, E, ` ~ GET_P_32!(P2, 3) ~ `);

            F4(E, A, B, C, D, ` ~ GET_P_32!(P3, 0) ~ `);
            F4(D, E, A, B, C, ` ~ GET_P_32!(P3, 1) ~ `);
            F4(C, D, E, A, B, ` ~ GET_P_32!(P3, 2) ~ `);
            F4(B, C, D, E, A, ` ~ GET_P_32!(P3, 3) ~ `);`);

            A = (m_digest[0] += A);
            B = (m_digest[1] += B);
            C = (m_digest[2] += C);
            D = (m_digest[3] += D);
            E = (m_digest[4] += E);

            // Advance by one 64-byte block (hashBlockSize bytes, 16 per __m128i).
            input += (hashBlockSize / 16);
        }
    }

}


private:

/*
* First 16 bytes just need ubyte swapping. Preparing just means
* adding in the round constants.
*/

/*
Using SSE4; slower on Core2 and Nehalem
#define GET_P_32(P, i) _mm_extract_epi32(P.u128, i)

Much slower on all tested platforms
#define GET_P_32(P,i) _mm_cvtsi128_si32(_mm_srli_si128(P.u128, i*4))
*/
// Expands to a string expression reading lane i of the prepared vector P.
// Note: __traits(identifier, P) already yields the bare identifier string
// (e.g. `P0`); appending .stringof to it would produce a QUOTED literal
// (`"P0"`) and generate invalid code like _mm_extract_epi32("P0".u128, ...).
enum string GET_P_32(alias P, ubyte i) =
    BOTAN_FORCE_SSE4
    ? `_mm_extract_epi32(` ~ __traits(identifier, P) ~ `.u128, ` ~ i.stringof ~ `)`
    : __traits(identifier, P) ~ `.u32[` ~ i.stringof ~ `]`;

// Byte-swap each 32-bit lane of W (big-endian message words -> native),
// then store W + K00_19 into P. The shufflehi/shufflelo pair swaps the two
// 16-bit halves of each lane; the slli/srli/or swaps the bytes within each
// 16-bit half — together a per-lane bswap.
enum string prep00_15(alias P, alias _W) = q{
    {
        enum SHUF = _MM_SHUFFLE(2, 3, 0, 1);
        %1$s = _mm_shufflehi_epi16!SHUF(%1$s);
        %1$s = _mm_shufflelo_epi16!SHUF(%1$s);
        %1$s = _mm_or_si128(_mm_slli_epi16!8(%1$s), _mm_srli_epi16!8(%1$s));
        %2$s.u128 = _mm_add_epi32(%1$s, K00_19);
    }
}.format(__traits(identifier, _W), __traits(identifier, P));

/*
For each multiple of 4, t, we want to calculate this:

W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
W[t+3] = rol(W[t]    ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);

we'll actually calculate this:

W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
W[t+3] = rol(  0     ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
W[t+3] ^= rol(W[t+0], 1);

the parameters are:

W0 = &W[t-16];
W1 = &W[t-12];
W2 = &W[t- 8];
W3 = &W[t- 4];

and on output:
prepared = W0 + K
W0 = W[t]..W[t+3]
*/

/* note that there is a step here where i want to do a rol by 1, which
* normally would look like this:
*
* r1 = psrld r0,$31
* r0 = pslld r0,$1
* r0 = por r0,r1
*
* but instead i do this:
*
* r1 = pcmpltd r0,zero
* r0 = paddd r0,r0
* r0 = psub r0,r1
*
* because pcmpltd and paddd are availabe in both MMX units on
* efficeon, pentium-m, and opteron but shifts are available in
* only one unit.
*/
// Returns the source text of a block that expands the next four schedule
// words into XW0 and stores XW0 + K into `prep` (all compile-time codegen).
string prep(alias _prep, alias _XW0, alias _XW1, alias _XW2, alias _XW3, alias _K)()
{
    enum prep = __traits(identifier, _prep);
    enum XW0 = __traits(identifier, _XW0);
    enum XW1 = __traits(identifier, _XW1);
    enum XW2 = __traits(identifier, _XW2);
    enum XW3 = __traits(identifier, _XW3);
    enum K = __traits(identifier, _K);
    return `{
        __m128i r0, r1, r2, r3;

        /* load W[t-4] 16-ubyte aligned, and shift */
        r3 = _mm_srli_si128!4(` ~ XW3 ~ `);
        r0 = ` ~ XW0 ~ `;
        /* get high 64-bits of XW0 into low 64-bits */
        r1 = _mm_shuffle_epi32!(_MM_SHUFFLE(1,0,3,2))(` ~ XW0 ~ `);
        /* load high 64-bits of r1 */
        r1 = _mm_unpacklo_epi64(r1, ` ~ XW1 ~ `);
        r2 = ` ~ XW2 ~ `;
        r0 = _mm_xor_si128(r1, r0);
        r2 = _mm_xor_si128(r3, r2);
        r0 = _mm_xor_si128(r2, r0);
        /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */

        r2 = _mm_slli_si128!12(r0);
        r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());
        r0 = _mm_add_epi32(r0, r0);    /* shift left by 1 */
        r0 = _mm_sub_epi32(r0, r1);    /* r0 has W[t]..W[t+2] */

        r3 = _mm_srli_epi32!30(r2);
        r2 = _mm_slli_epi32!2(r2);
        r0 = _mm_xor_si128(r0, r3);
        r0 = _mm_xor_si128(r0, r2);    /* r0 now has W[t+3] */
        ` ~ XW0 ~ ` = r0;
        ` ~ prep ~ `.u128 = _mm_add_epi32(r0, ` ~ K ~ `);
    }`;
}

pure:

/*
* SHA-160 F1 Function
*
* Rounds 0-19: E += Ch(B,C,D) + msg + rol(A,5); B = rol(B,30).
*/
void F1(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    E += (D ^ (B & (C ^ D))) + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}

/*
* SHA-160 F2 Function
*
* Rounds 20-39: E += Parity(B,C,D) + msg + rol(A,5); B = rol(B,30).
*/
void F2(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    E += (B ^ C ^ D) + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}

/*
* SHA-160 F3 Function
*
* Rounds 40-59: E += Maj(B,C,D) + msg + rol(A,5); B = rol(B,30).
*/
void F3(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    E += ((B & C) | ((B | C) & D)) + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}

/*
* SHA-160 F4 Function
*
* Rounds 60-79: same parity function as F2.
*/
void F4(uint A, ref uint B, uint C, uint D, ref uint E, uint msg)
{
    E += (B ^ C ^ D) + msg + rotateLeft(A, 5);
    B = rotateLeft(B, 30);
}