/**
* ChaCha20
*
* Copyright:
* (C) 2014 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.stream.chacha;

import botan.constants;
static if (BOTAN_HAS_CHACHA):

import botan.stream.stream_cipher;
import botan.utils.loadstor;
import botan.utils.rotate;
import botan.utils.xor_buf;
import botan.utils.types;
import botan.utils.mem_ops;
import botan.utils.cpuid;
import std.format : format;

/**
* DJB's ChaCha (http://cr.yp.to/chacha.html)
*/
final class ChaCha : StreamCipher, SymmetricAlgorithm
{
public:
    this(size_t rounds) {
        m_rounds = rounds;
        if (m_rounds != 8 && m_rounds != 12 && m_rounds != 20)
            throw new InvalidArgument("ChaCha only supports 8, 12 or 20 rounds");
    }

    /*
    * Combine cipher stream with message
    */
    override void cipher(const(ubyte)* input, ubyte* output, size_t length)
    {
        while (length >= m_buffer.length - m_position)
        {
            xorBuf(output, input, m_buffer.ptr + m_position, m_buffer.length - m_position);
            length -= (m_buffer.length - m_position);
            input += (m_buffer.length - m_position);
            output += (m_buffer.length - m_position);
            version(SIMD_SSE2) {
                if (CPUID.hasSse2())
                    chachaSSE2x4(*cast(ubyte[64*4]*) m_buffer.ptr, *cast(uint[16]*) m_state.ptr, m_rounds);
                else
                    chachax4(*cast(ubyte[64*4]*) m_buffer.ptr, *cast(uint[16]*) m_state.ptr, m_rounds);
            } else
                chachax4(*cast(ubyte[64*4]*) m_buffer.ptr, *cast(uint[16]*) m_state.ptr, m_rounds);
            m_position = 0;
        }

        xorBuf(output, input, &m_buffer[m_position], length);

        m_position += length;
    }

    /*
    * Set the IV, reset the block counter and regenerate the keystream buffer
    */
    override void setIv(const(ubyte)* iv, size_t length)
    {
        if (!validIvLength(length))
            throw new InvalidIVLength(name, length);

        m_state[12] = 0;
        m_state[13] = 0;

        if (length == 8) {
            m_state[14] = loadLittleEndian!uint(iv, 0);
            m_state[15] = loadLittleEndian!uint(iv, 1);
        } else if (length == 12) {
            m_state[13] = loadLittleEndian!uint(iv, 0);
            m_state[14] = loadLittleEndian!uint(iv, 1);
            m_state[15] = loadLittleEndian!uint(iv, 2);
        }
        version(SIMD_SSE2) {
            if (CPUID.hasSse2())
                chachaSSE2x4(*cast(ubyte[64*4]*) m_buffer.ptr, *cast(uint[16]*) m_state.ptr, m_rounds);
            else chachax4(*cast(ubyte[64*4]*) m_buffer.ptr, *cast(uint[16]*) m_state.ptr, m_rounds);
        } else chachax4(*cast(ubyte[64*4]*) m_buffer.ptr, *cast(uint[16]*) m_state.ptr, m_rounds);
        m_position = 0;
    }

    override bool validIvLength(size_t iv_len) const
    { return (iv_len == 8 || iv_len == 12); }

    KeyLengthSpecification keySpec() const
    {
        return KeyLengthSpecification(16, 32, 16);
    }

    /*
    * Clear memory of sensitive data
    */
    void clear()
    {
        zap(m_state);
        zap(m_buffer);
        m_position = 0;
    }

    /*
    * Return the name of this type
    */
    @property string name() const
    {
        return "ChaCha(" ~ m_rounds.to!string ~ ")";
    }

    override StreamCipher clone() const { return new ChaCha(m_rounds); }

protected:
    /*
    * ChaCha Key Schedule
    */
    override void keySchedule(const(ubyte)* key, size_t length)
    {
        __gshared immutable uint[] TAU   = [ 0x61707865, 0x3120646e, 0x79622d36, 0x6b206574 ];
        __gshared immutable uint[] SIGMA = [ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 ];

        const uint[] CONSTANTS = (length == 16) ? TAU : SIGMA;
        // Repeat the key if it is only 128 bits
        const(ubyte)* key2 = (length == 32) ? key + 16 : key;
        m_position = 0;
        m_state.resize(16);
        m_buffer.resize(4*64);

        m_state[0] = CONSTANTS[0];
        m_state[1] = CONSTANTS[1];
        m_state[2] = CONSTANTS[2];
        m_state[3] = CONSTANTS[3];

        m_state[4] = loadLittleEndian!uint(key, 0);
        m_state[5] = loadLittleEndian!uint(key, 1);
        m_state[6] = loadLittleEndian!uint(key, 2);
        m_state[7] = loadLittleEndian!uint(key, 3);

        m_state[8]  = loadLittleEndian!uint(key2, 0);
        m_state[9]  = loadLittleEndian!uint(key2, 1);
        m_state[10] = loadLittleEndian!uint(key2, 2);
        m_state[11] = loadLittleEndian!uint(key2, 3);

        // Default all-zero IV
        const ubyte[8] ZERO;
        setIv(ZERO.ptr, ZERO.length);
    }

    SecureVector!uint m_state;
    SecureVector!ubyte m_buffer;
    size_t m_position = 0;
    size_t m_rounds;
}

enum string CHACHA_QUARTER_ROUND(alias _a, alias _b, alias _c, alias _d) = q{
    %1$s += %2$s; %4$s ^= %1$s; %4$s = rotateLeft(%4$s, 16);
    %3$s += %4$s; %2$s ^= %3$s; %2$s = rotateLeft(%2$s, 12);
    %1$s += %2$s; %4$s ^= %1$s; %4$s = rotateLeft(%4$s, 8);
    %3$s += %4$s; %2$s ^= %3$s; %2$s = rotateLeft(%2$s, 7);
}.format(__traits(identifier, _a), __traits(identifier, _b), __traits(identifier, _c), __traits(identifier, _d));

private void chachax4(ref ubyte[64*4] output, ref uint[16] input, size_t rounds)
{
    assert(rounds % 2 == 0, "Valid rounds");
    for (int i = 0; i < 4; i++)
    {
        uint x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3],
             x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7],
             x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11],
             x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15];

        for (size_t j = 0; j < rounds / 2; j++)
        {
            mixin(CHACHA_QUARTER_ROUND!(x00, x04, x08, x12) ~
                  CHACHA_QUARTER_ROUND!(x01, x05, x09, x13) ~
                  CHACHA_QUARTER_ROUND!(x02, x06, x10, x14) ~
                  CHACHA_QUARTER_ROUND!(x03, x07, x11, x15) ~

                  CHACHA_QUARTER_ROUND!(x00, x05, x10, x15) ~
                  CHACHA_QUARTER_ROUND!(x01, x06, x11, x12) ~
                  CHACHA_QUARTER_ROUND!(x02, x07, x08, x13) ~
                  CHACHA_QUARTER_ROUND!(x03, x04, x09, x14)
                  );
        }

        storeLittleEndian(x00 + input[ 0], output.ptr + 64 * i + 4 *  0);
        storeLittleEndian(x01 + input[ 1], output.ptr + 64 * i + 4 *  1);
        storeLittleEndian(x02 + input[ 2], output.ptr + 64 * i + 4 *  2);
        storeLittleEndian(x03 + input[ 3], output.ptr + 64 * i + 4 *  3);
        storeLittleEndian(x04 + input[ 4], output.ptr + 64 * i + 4 *  4);
        storeLittleEndian(x05 + input[ 5], output.ptr + 64 * i + 4 *  5);
        storeLittleEndian(x06 + input[ 6], output.ptr + 64 * i + 4 *  6);
        storeLittleEndian(x07 + input[ 7], output.ptr + 64 * i + 4 *  7);
        storeLittleEndian(x08 + input[ 8], output.ptr + 64 * i + 4 *  8);
        storeLittleEndian(x09 + input[ 9], output.ptr + 64 * i + 4 *  9);
        storeLittleEndian(x10 + input[10], output.ptr + 64 * i + 4 * 10);
        storeLittleEndian(x11 + input[11], output.ptr + 64 * i + 4 * 11);
        storeLittleEndian(x12 + input[12], output.ptr + 64 * i + 4 * 12);
        storeLittleEndian(x13 + input[13], output.ptr + 64 * i + 4 * 13);
        storeLittleEndian(x14 + input[14], output.ptr + 64 * i + 4 * 14);
        storeLittleEndian(x15 + input[15], output.ptr + 64 * i + 4 * 15);

        // Advance the 64-bit block counter held in words 12/13,
        // carrying into the upper word only when the lower word wraps
        input[12]++;
        input[13] += (input[12] == 0) ? 1 : 0;
    }
}
/** SSE2 ChaCha
*  (C) 2016 Jack Lloyd
*/
version(SIMD_SSE2)
private void chachaSSE2x4(ref ubyte[64*4] output, ref uint[16] input, size_t rounds)
{
    import botan.utils.simd.emmintrin;
    assert(rounds % 2 == 0, "Valid rounds");

    const __m128i* input_mm = cast(const(__m128i*)) input.ptr;
    __m128i* output_mm = cast(__m128i*) output.ptr;

    __m128i input0 = _mm_loadu_si128(input_mm);
    __m128i input1 = _mm_loadu_si128(input_mm + 1);
    __m128i input2 = _mm_loadu_si128(input_mm + 2);
    __m128i input3 = _mm_loadu_si128(input_mm + 3);

    // TODO: try transposing, which would avoid the permutations each round

    __m128i r0_0 = input0;
    __m128i r0_1 = input1;
    __m128i r0_2 = input2;
    __m128i r0_3 = input3;

    __m128i r1_0 = input0;
    __m128i r1_1 = input1;
    __m128i r1_2 = input2;
    __m128i r1_3 = input3;
    r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));

    __m128i r2_0 = input0;
    __m128i r2_1 = input1;
    __m128i r2_2 = input2;
    __m128i r2_3 = input3;
    r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));

    __m128i r3_0 = input0;
    __m128i r3_1 = input1;
    __m128i r3_2 = input2;
    __m128i r3_3 = input3;
    r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));

    for (size_t r = 0; r != rounds / 2; ++r)
    {
        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = _mm_or_si128(_mm_slli_epi32!16(r0_3), _mm_srli_epi32!16(r0_3)); //mm_rotl(r0_3, 16);
        r1_3 = _mm_or_si128(_mm_slli_epi32!16(r1_3), _mm_srli_epi32!16(r1_3)); //mm_rotl(r1_3, 16);
        r2_3 = _mm_or_si128(_mm_slli_epi32!16(r2_3), _mm_srli_epi32!16(r2_3)); //mm_rotl(r2_3, 16);
        r3_3 = _mm_or_si128(_mm_slli_epi32!16(r3_3), _mm_srli_epi32!16(r3_3)); //mm_rotl(r3_3, 16);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = _mm_or_si128(_mm_slli_epi32!12(r0_1), _mm_srli_epi32!20(r0_1)); //mm_rotl(r0_1, 12);
        r1_1 = _mm_or_si128(_mm_slli_epi32!12(r1_1), _mm_srli_epi32!20(r1_1)); //mm_rotl(r1_1, 12);
        r2_1 = _mm_or_si128(_mm_slli_epi32!12(r2_1), _mm_srli_epi32!20(r2_1)); //mm_rotl(r2_1, 12);
        r3_1 = _mm_or_si128(_mm_slli_epi32!12(r3_1), _mm_srli_epi32!20(r3_1)); //mm_rotl(r3_1, 12);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = _mm_or_si128(_mm_slli_epi32!8(r0_3), _mm_srli_epi32!24(r0_3)); //mm_rotl(r0_3, 8);
        r1_3 = _mm_or_si128(_mm_slli_epi32!8(r1_3), _mm_srli_epi32!24(r1_3)); //mm_rotl(r1_3, 8);
        r2_3 = _mm_or_si128(_mm_slli_epi32!8(r2_3), _mm_srli_epi32!24(r2_3)); //mm_rotl(r2_3, 8);
        r3_3 = _mm_or_si128(_mm_slli_epi32!8(r3_3), _mm_srli_epi32!24(r3_3)); //mm_rotl(r3_3, 8);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = _mm_or_si128(_mm_slli_epi32!7(r0_1), _mm_srli_epi32!25(r0_1)); //mm_rotl(r0_1, 7);
        r1_1 = _mm_or_si128(_mm_slli_epi32!7(r1_1), _mm_srli_epi32!25(r1_1)); //mm_rotl(r1_1, 7);
        r2_1 = _mm_or_si128(_mm_slli_epi32!7(r2_1), _mm_srli_epi32!25(r2_1)); //mm_rotl(r2_1, 7);
        r3_1 = _mm_or_si128(_mm_slli_epi32!7(r3_1), _mm_srli_epi32!25(r3_1)); //mm_rotl(r3_1, 7);

        r0_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r0_1);
        r0_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r0_2);
        r0_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r0_3);

        r1_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r1_1);
        r1_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r1_2);
        r1_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r1_3);

        r2_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r2_1);
        r2_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r2_2);
        r2_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r2_3);

        r3_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r3_1);
        r3_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r3_2);
        r3_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r3_3);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = _mm_or_si128(_mm_slli_epi32!16(r0_3), _mm_srli_epi32!16(r0_3)); //mm_rotl(r0_3, 16);
        r1_3 = _mm_or_si128(_mm_slli_epi32!16(r1_3), _mm_srli_epi32!16(r1_3)); //mm_rotl(r1_3, 16);
        r2_3 = _mm_or_si128(_mm_slli_epi32!16(r2_3), _mm_srli_epi32!16(r2_3)); //mm_rotl(r2_3, 16);
        r3_3 = _mm_or_si128(_mm_slli_epi32!16(r3_3), _mm_srli_epi32!16(r3_3)); //mm_rotl(r3_3, 16);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = _mm_or_si128(_mm_slli_epi32!12(r0_1), _mm_srli_epi32!20(r0_1)); //mm_rotl(r0_1, 12);
        r1_1 = _mm_or_si128(_mm_slli_epi32!12(r1_1), _mm_srli_epi32!20(r1_1)); //mm_rotl(r1_1, 12);
        r2_1 = _mm_or_si128(_mm_slli_epi32!12(r2_1), _mm_srli_epi32!20(r2_1)); //mm_rotl(r2_1, 12);
        r3_1 = _mm_or_si128(_mm_slli_epi32!12(r3_1), _mm_srli_epi32!20(r3_1)); //mm_rotl(r3_1, 12);

        r0_0 = _mm_add_epi32(r0_0, r0_1);
        r1_0 = _mm_add_epi32(r1_0, r1_1);
        r2_0 = _mm_add_epi32(r2_0, r2_1);
        r3_0 = _mm_add_epi32(r3_0, r3_1);

        r0_3 = _mm_xor_si128(r0_3, r0_0);
        r1_3 = _mm_xor_si128(r1_3, r1_0);
        r2_3 = _mm_xor_si128(r2_3, r2_0);
        r3_3 = _mm_xor_si128(r3_3, r3_0);

        r0_3 = _mm_or_si128(_mm_slli_epi32!8(r0_3), _mm_srli_epi32!24(r0_3)); //mm_rotl(r0_3, 8);
        r1_3 = _mm_or_si128(_mm_slli_epi32!8(r1_3), _mm_srli_epi32!24(r1_3)); //mm_rotl(r1_3, 8);
        r2_3 = _mm_or_si128(_mm_slli_epi32!8(r2_3), _mm_srli_epi32!24(r2_3)); //mm_rotl(r2_3, 8);
        r3_3 = _mm_or_si128(_mm_slli_epi32!8(r3_3), _mm_srli_epi32!24(r3_3)); //mm_rotl(r3_3, 8);

        r0_2 = _mm_add_epi32(r0_2, r0_3);
        r1_2 = _mm_add_epi32(r1_2, r1_3);
        r2_2 = _mm_add_epi32(r2_2, r2_3);
        r3_2 = _mm_add_epi32(r3_2, r3_3);

        r0_1 = _mm_xor_si128(r0_1, r0_2);
        r1_1 = _mm_xor_si128(r1_1, r1_2);
        r2_1 = _mm_xor_si128(r2_1, r2_2);
        r3_1 = _mm_xor_si128(r3_1, r3_2);

        r0_1 = _mm_or_si128(_mm_slli_epi32!7(r0_1), _mm_srli_epi32!25(r0_1)); //mm_rotl(r0_1, 7);
        r1_1 = _mm_or_si128(_mm_slli_epi32!7(r1_1), _mm_srli_epi32!25(r1_1)); //mm_rotl(r1_1, 7);
        r2_1 = _mm_or_si128(_mm_slli_epi32!7(r2_1), _mm_srli_epi32!25(r2_1)); //mm_rotl(r2_1, 7);
        r3_1 = _mm_or_si128(_mm_slli_epi32!7(r3_1), _mm_srli_epi32!25(r3_1)); //mm_rotl(r3_1, 7);

        r0_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r0_1);
        r0_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r0_2);
        r0_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r0_3);

        r1_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r1_1);
        r1_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r1_2);
        r1_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r1_3);

        r2_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r2_1);
        r2_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r2_2);
        r2_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r2_3);

        r3_1 = _mm_shuffle_epi32!(_MM_SHUFFLE(2, 1, 0, 3))(r3_1);
        r3_2 = _mm_shuffle_epi32!(_MM_SHUFFLE(1, 0, 3, 2))(r3_2);
        r3_3 = _mm_shuffle_epi32!(_MM_SHUFFLE(0, 3, 2, 1))(r3_3);
    }

    r0_0 = _mm_add_epi32(r0_0, input0);
    r0_1 = _mm_add_epi32(r0_1, input1);
    r0_2 = _mm_add_epi32(r0_2, input2);
    r0_3 = _mm_add_epi32(r0_3, input3);

    r1_0 = _mm_add_epi32(r1_0, input0);
    r1_1 = _mm_add_epi32(r1_1, input1);
    r1_2 = _mm_add_epi32(r1_2, input2);
    r1_3 = _mm_add_epi32(r1_3, input3);
    r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));

    r2_0 = _mm_add_epi32(r2_0, input0);
    r2_1 = _mm_add_epi32(r2_1, input1);
    r2_2 = _mm_add_epi32(r2_2, input2);
    r2_3 = _mm_add_epi32(r2_3, input3);
    r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));

    r3_0 = _mm_add_epi32(r3_0, input0);
    r3_1 = _mm_add_epi32(r3_1, input1);
    r3_2 = _mm_add_epi32(r3_2, input2);
    r3_3 = _mm_add_epi32(r3_3, input3);
    r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));

    _mm_storeu_si128(output_mm + 0, r0_0);
    _mm_storeu_si128(output_mm + 1, r0_1);
    _mm_storeu_si128(output_mm + 2, r0_2);
    _mm_storeu_si128(output_mm + 3, r0_3);

    _mm_storeu_si128(output_mm + 4, r1_0);
    _mm_storeu_si128(output_mm + 5, r1_1);
    _mm_storeu_si128(output_mm + 6, r1_2);
    _mm_storeu_si128(output_mm + 7, r1_3);

    _mm_storeu_si128(output_mm + 8, r2_0);
    _mm_storeu_si128(output_mm + 9, r2_1);
    _mm_storeu_si128(output_mm + 10, r2_2);
    _mm_storeu_si128(output_mm + 11, r2_3);

    _mm_storeu_si128(output_mm + 12, r3_0);
    _mm_storeu_si128(output_mm + 13, r3_1);
    _mm_storeu_si128(output_mm + 14, r3_2);
    _mm_storeu_si128(output_mm + 15, r3_3);

    input[12] += 4;
    if (input[12] < 4)
        input[13]++;
}
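
// The block below is an illustrative sketch only (not part of the original
// module): a keystream round-trip check for the ChaCha class above. It is
// guarded with version(none) because it assumes a setKey(const(ubyte)*, size_t)
// overload is inherited from SymmetricAlgorithm; adjust the key-setting call
// if the base interface exposes key material differently.
version(none) unittest
{
    ubyte[32] key;                  // all-zero 256-bit key, for illustration only
    ubyte[8] iv;                    // all-zero 64-bit IV
    ubyte[128] message;             // two 64-byte blocks of recognizable plaintext
    foreach (i, ref b; message)
        b = cast(ubyte) i;
    ubyte[128] buffer = message;

    auto chacha = new ChaCha(20);
    chacha.setKey(key.ptr, key.length);                    // assumed base-class overload
    chacha.setIv(iv.ptr, iv.length);
    chacha.cipher(buffer.ptr, buffer.ptr, buffer.length);  // encrypt in place

    // Rewinding the keystream via setIv() and applying it again must undo the encryption
    chacha.setIv(iv.ptr, iv.length);
    chacha.cipher(buffer.ptr, buffer.ptr, buffer.length);
    assert(buffer == message);
}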