/**
* IDEA in SSE2
*
* Copyright:
* (C) 2009 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.block.idea_sse2;

import botan.constants;
static if (BOTAN_HAS_IDEA_SSE2 && BOTAN_HAS_SIMD_SSE2):

import botan.block.idea;
import botan.utils.simd.emmintrin;
import botan.block.block_cipher;
import botan.utils.mem_ops;

/**
* IDEA in SSE2
*/
final class IDEASSE2 : IDEA, SymmetricAlgorithm
{
public:
    override @property size_t parallelism() const { return 8; }

    override void clear()
    {
        super.clear();
    }

    /*
    * IDEA Encryption
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ushort* KS = super.getEK().ptr;

        while (blocks >= 8)
        {
            idea_op_8(*cast(ubyte[64]*) input, *cast(ubyte[64]*) output, *cast(ushort[52]*) KS);
            input += 8 * BLOCK_SIZE;
            output += 8 * BLOCK_SIZE;
            blocks -= 8;
        }

        if (blocks)
            super.encryptN(input, output, blocks);
    }

    /*
    * IDEA Decryption
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ushort* KS = super.getDK().ptr;

        while (blocks >= 8)
        {
            idea_op_8(*cast(ubyte[64]*) input, *cast(ubyte[64]*) output, *cast(ushort[52]*) KS);
            input += 8 * BLOCK_SIZE;
            output += 8 * BLOCK_SIZE;
            blocks -= 8;
        }

        if (blocks)
            super.decryptN(input, output, blocks);
    }

    override void keySchedule(const(ubyte)* key, size_t sz) { return super.keySchedule(key, sz); }
    override @property string name() const { return "IDEA"; }
    override BlockCipher clone() const { return new IDEASSE2; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
}

package:

/*
* Multiplies eight 16-bit lanes modulo 2^16+1, using the IDEA convention
* that an all-zero word represents 2^16.
*/
__m128i mul(__m128i X, ushort K_16) pure
{
    const(__m128i) zeros = _mm_set1_epi16!(0)();
    const(__m128i) ones = _mm_set1_epi16!(1)();

    const(__m128i) K = _mm_set1_epi16(K_16);

    const(__m128i) X_is_zero = _mm_cmpeq_epi16(X, zeros);
    const(__m128i) K_is_zero = _mm_cmpeq_epi16(K, zeros);

    const(__m128i) mul_lo = _mm_mullo_epi16(X, K);
    const(__m128i) mul_hi = _mm_mulhi_epu16(X, K);

    __m128i T = _mm_sub_epi16(mul_lo, mul_hi);

    // Unsigned compare; cmp = 1 if mul_lo < mul_hi, else 0
    const(__m128i) subs = _mm_subs_epu16(mul_hi, mul_lo);
    const(__m128i) cmp = _mm_min_epu8(_mm_or_si128(subs, _mm_srli_epi16!8(subs)), ones);

    T = _mm_add_epi16(T, cmp);

    /* Selection: if X[i] is zero then assign 1-K
                  if K is zero then assign 1-X[i]

       We could branch on the value of K_16 for the second case, but the
       branch-free version gives a constant-time implementation, which is
       a nice bonus.
    */

    T = _mm_or_si128(
        _mm_andnot_si128(X_is_zero, T),
        _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));

    T = _mm_or_si128(
        _mm_andnot_si128(K_is_zero, T),
        _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));

    return T;
}
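/*
* Hedged sanity sketch for mul() above: a scalar reference for IDEA
* multiplication modulo 2^16+1, compared lane-by-lane against the SIMD
* version. mulRef and the test values are illustrative additions, not
* part of the original module or official IDEA test vectors.
*/
ushort mulRef(ushort x, ushort k) pure
{
    // A zero word encodes 2^16; the product is in [1, 2^16], and the
    // cast back to ushort maps 2^16 to 0 again.
    const ulong xe = x ? x : 0x10000;
    const ulong ke = k ? k : 0x10000;
    return cast(ushort)((xe * ke) % 0x10001);
}

unittest
{
    // Check all eight lanes of mul() against mulRef for a few keys,
    // including the special cases K == 0 and X == 0.
    ushort[8] xs = [0, 1, 2, 0x8000, 0xFFFF, 3, 4, 5];
    __m128i X = _mm_loadu_si128(cast(const(__m128i)*) xs.ptr);

    static immutable ushort[3] ks = [0, 1, 0x789A];
    foreach (k; ks)
    {
        __m128i R = mul(X, k);
        ushort[8] got;
        _mm_storeu_si128(cast(__m128i*) got.ptr, R);
        foreach (i; 0 .. 8)
            assert(got[i] == mulRef(xs[i], k));
    }
}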
/*
* 4x8 matrix transpose
*
* FIXME: why do I need the extra set of unpack_epi32 here? The inverse
* in transpose_out doesn't need it. Something with the shuffle?
* Removing that extra unpack could easily save 3-4 cycles per block,
* and would also help a lot with register pressure on 32-bit x86.
*/
void transpose_in(__m128i* B0, __m128i* B1, __m128i* B2, __m128i* B3) pure
{
    const SHUF = _MM_SHUFFLE(1, 3, 0, 2);
    const SHUF2 = _MM_SHUFFLE(3, 1, 2, 0);

    __m128i T0;
    __m128i T1;
    __m128i T2;
    __m128i T3;
    {
        __m128i B0_ = *B0;
        __m128i B1_ = *B1;
        __m128i B2_ = *B2;
        __m128i B3_ = *B3;
        T0 = _mm_unpackhi_epi32(B0_, B1_);
        T1 = _mm_unpacklo_epi32(B0_, B1_);
        T2 = _mm_unpackhi_epi32(B2_, B3_);
        T3 = _mm_unpacklo_epi32(B2_, B3_);
    }

    {
        __m128i T4 = _mm_unpacklo_epi32(T0, T1);
        __m128i T5 = _mm_unpackhi_epi32(T0, T1);
        __m128i T6 = _mm_unpacklo_epi32(T2, T3);
        __m128i T7 = _mm_unpackhi_epi32(T2, T3);

        T0 = _mm_shufflehi_epi16!SHUF(T4);
        T1 = _mm_shufflehi_epi16!SHUF(T5);
        T2 = _mm_shufflehi_epi16!SHUF(T6);
        T3 = _mm_shufflehi_epi16!SHUF(T7);
    }

    T0 = _mm_shufflelo_epi16!SHUF(T0);
    T1 = _mm_shufflelo_epi16!SHUF(T1);
    T2 = _mm_shufflelo_epi16!SHUF(T2);
    T3 = _mm_shufflelo_epi16!SHUF(T3);

    T0 = _mm_shuffle_epi32!SHUF2(T0);
    T1 = _mm_shuffle_epi32!SHUF2(T1);
    T2 = _mm_shuffle_epi32!SHUF2(T2);
    T3 = _mm_shuffle_epi32!SHUF2(T3);

    *B0 = _mm_unpacklo_epi64(T0, T2);
    *B1 = _mm_unpackhi_epi64(T0, T2);
    *B2 = _mm_unpacklo_epi64(T1, T3);
    *B3 = _mm_unpackhi_epi64(T1, T3);
}

/*
* 4x8 matrix transpose (inverse)
*/
void transpose_out(__m128i* B0, __m128i* B1, __m128i* B2, __m128i* B3) pure
{
    __m128i T0;
    __m128i T1;
    __m128i T2;
    __m128i T3;

    {
        __m128i B0_ = *B0;
        __m128i B1_ = *B1;
        __m128i B2_ = *B2;
        __m128i B3_ = *B3;
        T0 = _mm_unpacklo_epi64(B0_, B1_);
        T1 = _mm_unpacklo_epi64(B2_, B3_);
        T2 = _mm_unpackhi_epi64(B0_, B1_);
        T3 = _mm_unpackhi_epi64(B2_, B3_);
    }

    const SHUF = _MM_SHUFFLE(3, 1, 2, 0);

    T0 = _mm_shuffle_epi32!SHUF(T0);
    T1 = _mm_shuffle_epi32!SHUF(T1);
    T2 = _mm_shuffle_epi32!SHUF(T2);
    T3 = _mm_shuffle_epi32!SHUF(T3);

    T0 = _mm_shufflehi_epi16!SHUF(T0);
    T1 = _mm_shufflehi_epi16!SHUF(T1);
    T2 = _mm_shufflehi_epi16!SHUF(T2);
    T3 = _mm_shufflehi_epi16!SHUF(T3);

    T0 = _mm_shufflelo_epi16!SHUF(T0);
    T1 = _mm_shufflelo_epi16!SHUF(T1);
    T2 = _mm_shufflelo_epi16!SHUF(T2);
    T3 = _mm_shufflelo_epi16!SHUF(T3);

    *B0 = _mm_unpacklo_epi32(T0, T1);
    *B1 = _mm_unpackhi_epi32(T0, T1);
    *B2 = _mm_unpacklo_epi32(T2, T3);
    *B3 = _mm_unpackhi_epi32(T2, T3);
}
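/*
* Hedged scalar sketch of one IDEA round, mirroring the per-lane work of
* the loop inside idea_op_8 below (one block per 16-bit lane, eight
* blocks per call). Reuses the illustrative mulRef helper defined above;
* ideaRoundRef is likewise an illustrative addition, not part of the
* original module.
*/
void ideaRoundRef(ref ushort[4] x, in ushort[6] rk) pure
{
    x[0] = mulRef(x[0], rk[0]);
    x[1] = cast(ushort)(x[1] + rk[1]);
    x[2] = cast(ushort)(x[2] + rk[2]);
    x[3] = mulRef(x[3], rk[3]);

    const ushort t0 = x[2];
    x[2] = mulRef(cast(ushort)(x[2] ^ x[0]), rk[4]);

    const ushort t1 = x[1];
    x[1] = mulRef(cast(ushort)((x[1] ^ x[3]) + x[2]), rk[5]);
    x[2] = cast(ushort)(x[2] + x[1]);

    // The final XORs swap the middle words between rounds
    x[0] ^= x[1];
    x[1] ^= t0;
    x[3] ^= x[2];
    x[2] ^= t1;
}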
/*
* IDEA encryption/decryption in SSE2
*/
void idea_op_8(in ubyte[64] input, ref ubyte[64] output, in ushort[52] EK) pure
{
    const(__m128i*) in_mm = cast(const(__m128i*))(input.ptr);

    __m128i B0 = _mm_loadu_si128(in_mm + 0);
    __m128i B1 = _mm_loadu_si128(in_mm + 1);
    __m128i B2 = _mm_loadu_si128(in_mm + 2);
    __m128i B3 = _mm_loadu_si128(in_mm + 3);

    transpose_in(&B0, &B1, &B2, &B3);

    // byteswap each 16-bit word (IDEA words are big-endian)
    B0 = _mm_or_si128(_mm_slli_epi16!8(B0), _mm_srli_epi16!8(B0));
    B1 = _mm_or_si128(_mm_slli_epi16!8(B1), _mm_srli_epi16!8(B1));
    B2 = _mm_or_si128(_mm_slli_epi16!8(B2), _mm_srli_epi16!8(B2));
    B3 = _mm_or_si128(_mm_slli_epi16!8(B3), _mm_srli_epi16!8(B3));

    foreach (size_t i; 0 .. 8)
    {
        B0 = mul(B0, EK[6*i+0]);
        B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
        B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
        B3 = mul(B3, EK[6*i+3]);

        __m128i T0 = B2;

        B2 = _mm_xor_si128(B2, B0);
        B2 = mul(B2, EK[6*i+4]);

        __m128i T1 = B1;

        B1 = _mm_xor_si128(B1, B3);
        B1 = _mm_add_epi16(B1, B2);
        B1 = mul(B1, EK[6*i+5]);

        B2 = _mm_add_epi16(B2, B1);

        B0 = _mm_xor_si128(B0, B1);
        B1 = _mm_xor_si128(B1, T0);
        B3 = _mm_xor_si128(B3, B2);
        B2 = _mm_xor_si128(B2, T1);
    }

    // Output transform; EK[49] and EK[50] are applied crosswise because
    // B1 and B2 still hold the swapped middle words from the last round
    B0 = mul(B0, EK[48]);
    B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
    B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
    B3 = mul(B3, EK[51]);

    // byteswap each 16-bit word back to big-endian
    B0 = _mm_or_si128(_mm_slli_epi16!8(B0), _mm_srli_epi16!8(B0));
    B1 = _mm_or_si128(_mm_slli_epi16!8(B1), _mm_srli_epi16!8(B1));
    B2 = _mm_or_si128(_mm_slli_epi16!8(B2), _mm_srli_epi16!8(B2));
    B3 = _mm_or_si128(_mm_slli_epi16!8(B3), _mm_srli_epi16!8(B3));

    // The B1/B2 swap here and in the stores below undoes the middle-word
    // swap left over from the final round
    transpose_out(&B0, &B2, &B1, &B3);

    __m128i* out_mm = cast(__m128i*)(output.ptr);

    _mm_storeu_si128(out_mm + 0, B0);
    _mm_storeu_si128(out_mm + 1, B2);
    _mm_storeu_si128(out_mm + 2, B1);
    _mm_storeu_si128(out_mm + 3, B3);
}
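/*
* Hedged usage sketch: drives the eight-block SSE2 path through the
* public BlockCipher interface and checks that decryption inverts
* encryption. The key and plaintext bytes are arbitrary placeholders,
* not official IDEA test vectors, and this assumes the
* setKey(const(ubyte)*, size_t) overload provided by SymmetricAlgorithm
* in this library.
*/
unittest
{
    auto cipher = new IDEASSE2;

    ubyte[16] key; // IDEA uses a 128-bit key
    foreach (i, ref b; key) b = cast(ubyte) i; // arbitrary placeholder key
    cipher.setKey(key.ptr, key.length);

    ubyte[8 * 8] pt; // exactly 8 blocks, so encryptN takes the idea_op_8 path
    foreach (i, ref b; pt) b = cast(ubyte)(255 - i);

    ubyte[8 * 8] ct, rt;
    cipher.encryptN(pt.ptr, ct.ptr, 8);
    cipher.decryptN(ct.ptr, rt.ptr, 8);
    assert(rt == pt);
}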