/**
* IDEA in SSE2
*
* Copyright:
* (C) 2009 Jack Lloyd
* (C) 2014-2015 Etienne Cimon
*
* License:
* Botan is released under the Simplified BSD License (see LICENSE.md)
*/
module botan.block.idea_sse2;

import botan.constants;
static if (BOTAN_HAS_IDEA_SSE2 && BOTAN_HAS_SIMD_SSE2):

import botan.block.idea;
import botan.utils.simd.emmintrin;
import botan.block.block_cipher;
import botan.utils.mem_ops;
/**
* IDEA in SSE2: processes 8 blocks in parallel, falling back to the
* scalar IDEA implementation for any remaining blocks.
*/
final class IDEASSE2 : IDEA, SymmetricAlgorithm
{
public:
    override @property size_t parallelism() const { return 8; }
    override void clear()
    {
        super.clear();
    }

    /*
    * IDEA Encryption: 8 blocks at a time via SSE2, scalar fallback for the tail
    */
    override void encryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ushort* KS = super.getEK().ptr;

        while (blocks >= 8)
        {
            idea_op_8(*cast(ubyte[64]*) input, *cast(ubyte[64]*) output, *cast(ushort[52]*) KS);
            input += 8 * BLOCK_SIZE;
            output += 8 * BLOCK_SIZE;
            blocks -= 8;
        }

        if (blocks)
            super.encryptN(input, output, blocks);
    }

    /*
    * IDEA Decryption: same batching, using the inverted (decryption) schedule
    */
    override void decryptN(const(ubyte)* input, ubyte* output, size_t blocks)
    {
        const ushort* KS = this.getDK().ptr;

        while (blocks >= 8)
        {
            idea_op_8(*cast(ubyte[64]*) input, *cast(ubyte[64]*) output, *cast(ushort[52]*) KS);
            input += 8 * BLOCK_SIZE;
            output += 8 * BLOCK_SIZE;
            blocks -= 8;
        }

        if (blocks)
            super.decryptN(input, output, blocks);
    }
    override void keySchedule(const(ubyte)* key, size_t sz) { return super.keySchedule(key, sz); }
    override @property string name() const { return "IDEA"; }
    override BlockCipher clone() const { return new IDEASSE2; }
    override size_t blockSize() const { return super.blockSize(); }
    override KeyLengthSpecification keySpec() const { return super.keySpec(); }
}

package:

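/*
* For reference, IDEA "multiplication" is multiplication modulo 2^16+1,
* where an all-zero word encodes the value 2^16. A minimal scalar model
* of what mul() below computes per 16-bit lane; mulRef is an
* illustrative name, not part of this module's API, and is unused by
* the SIMD path.
*/
ushort mulRef(ushort x, ushort k) pure
{
    if (x == 0) return cast(ushort)(1 - k); // x encodes 2^16 == -1, and -k == 1-k (mod 2^16+1)
    if (k == 0) return cast(ushort)(1 - x);
    const uint p = cast(uint)x * k;
    const ushort lo = cast(ushort)p;
    const ushort hi = cast(ushort)(p >> 16);
    // p == hi*2^16 + lo == lo - hi (mod 2^16+1); add 1 back on borrow
    return cast(ushort)(lo - hi + (lo < hi ? 1 : 0));
}

unittest
{
    // Sanity checks for the scalar model: 1 is the multiplicative
    // identity, and 0 encodes 2^16 == -1, so 0*0 == 1 and 0*1 == 0.
    assert(mulRef(1, 12345) == 12345);
    assert(mulRef(0, 0) == 1);
    assert(mulRef(0, 1) == 0);
}
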
__m128i mul(__m128i X, ushort K_16) pure
{
    const(__m128i) zeros = _mm_set1_epi16!(0)();
    const(__m128i) ones = _mm_set1_epi16!(1)();

    const(__m128i) K = _mm_set1_epi16(K_16);

    const(__m128i) X_is_zero = _mm_cmpeq_epi16(X, zeros);
    const(__m128i) K_is_zero = _mm_cmpeq_epi16(K, zeros);

    const(__m128i) mul_lo = _mm_mullo_epi16(X, K);
    const(__m128i) mul_hi = _mm_mulhi_epu16(X, K);

    __m128i T = _mm_sub_epi16(mul_lo, mul_hi);

    // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
    const(__m128i) subs = _mm_subs_epu16(mul_hi, mul_lo);
    const(__m128i) cmp = _mm_min_epu8(_mm_or_si128(subs, _mm_srli_epi16!8(subs)), ones);

    T = _mm_add_epi16(T, cmp);

    /* Selection: if X[i] is zero then assign 1-K
                  if K is zero then assign 1-X[i]

       We could branch on the value of K_16 for the second case, but doing
       it branchlessly gives a constant-time implementation, which is a
       nice bonus.
    */

    T = _mm_or_si128(
        _mm_andnot_si128(X_is_zero, T),
        _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));

    T = _mm_or_si128(
        _mm_andnot_si128(K_is_zero, T),
        _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));

    return T;
}

/*
* 4x8 matrix transpose
*
* FIXME: why do I need the extra set of unpack_epi32 here? The inverse
* in transpose_out doesn't need it. Something with the shuffle? Removing
* that extra unpack could easily save 3-4 cycles per block, and would
* also help a lot with register pressure on 32-bit x86
*/
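/*
* Layout note (following the loads in idea_op_8): the eight 8-byte input
* blocks arrive as four __m128i values, each holding two whole blocks.
* After transpose_in, B0 holds word 0 of all eight blocks, B1 word 1,
* B2 word 2 and B3 word 3, so each mul/add/xor in the round function
* operates on the same word position of all eight blocks at once.
*/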
void transpose_in(__m128i* B0, __m128i* B1, __m128i* B2, __m128i* B3) pure
{
    const SHUF = _MM_SHUFFLE(1, 3, 0, 2);
    const SHUF2 = _MM_SHUFFLE(3, 1, 2, 0);

    __m128i T0;
    __m128i T1;
    __m128i T2;
    __m128i T3;
    {
        __m128i B0_ = *B0;
        __m128i B1_ = *B1;
        __m128i B2_ = *B2;
        __m128i B3_ = *B3;
        T0 = _mm_unpackhi_epi32(B0_, B1_);
        T1 = _mm_unpacklo_epi32(B0_, B1_);
        T2 = _mm_unpackhi_epi32(B2_, B3_);
        T3 = _mm_unpacklo_epi32(B2_, B3_);
    }

    {
        __m128i T4 = _mm_unpacklo_epi32(T0, T1);
        __m128i T5 = _mm_unpackhi_epi32(T0, T1);
        __m128i T6 = _mm_unpacklo_epi32(T2, T3);
        __m128i T7 = _mm_unpackhi_epi32(T2, T3);

        T0 = _mm_shufflehi_epi16!SHUF(T4);
        T1 = _mm_shufflehi_epi16!SHUF(T5);
        T2 = _mm_shufflehi_epi16!SHUF(T6);
        T3 = _mm_shufflehi_epi16!SHUF(T7);
    }

    T0 = _mm_shufflelo_epi16!SHUF(T0);
    T1 = _mm_shufflelo_epi16!SHUF(T1);
    T2 = _mm_shufflelo_epi16!SHUF(T2);
    T3 = _mm_shufflelo_epi16!SHUF(T3);

    T0 = _mm_shuffle_epi32!SHUF2(T0);
    T1 = _mm_shuffle_epi32!SHUF2(T1);
    T2 = _mm_shuffle_epi32!SHUF2(T2);
    T3 = _mm_shuffle_epi32!SHUF2(T3);

    *B0 = _mm_unpacklo_epi64(T0, T2);
    *B1 = _mm_unpackhi_epi64(T0, T2);
    *B2 = _mm_unpacklo_epi64(T1, T3);
    *B3 = _mm_unpackhi_epi64(T1, T3);
}

/*
* 4x8 matrix transpose (inverse of transpose_in)
*/
void transpose_out(__m128i* B0, __m128i* B1, __m128i* B2, __m128i* B3) pure
{
    __m128i T0;
    __m128i T1;
    __m128i T2;
    __m128i T3;

    {
        __m128i B0_ = *B0;
        __m128i B1_ = *B1;
        __m128i B2_ = *B2;
        __m128i B3_ = *B3;
        T0 = _mm_unpacklo_epi64(B0_, B1_);
        T1 = _mm_unpacklo_epi64(B2_, B3_);
        T2 = _mm_unpackhi_epi64(B0_, B1_);
        T3 = _mm_unpackhi_epi64(B2_, B3_);
    }

    const SHUF = _MM_SHUFFLE(3, 1, 2, 0);

    T0 = _mm_shuffle_epi32!SHUF(T0);
    T1 = _mm_shuffle_epi32!SHUF(T1);
    T2 = _mm_shuffle_epi32!SHUF(T2);
    T3 = _mm_shuffle_epi32!SHUF(T3);

    T0 = _mm_shufflehi_epi16!SHUF(T0);
    T1 = _mm_shufflehi_epi16!SHUF(T1);
    T2 = _mm_shufflehi_epi16!SHUF(T2);
    T3 = _mm_shufflehi_epi16!SHUF(T3);

    T0 = _mm_shufflelo_epi16!SHUF(T0);
    T1 = _mm_shufflelo_epi16!SHUF(T1);
    T2 = _mm_shufflelo_epi16!SHUF(T2);
    T3 = _mm_shufflelo_epi16!SHUF(T3);

    *B0 = _mm_unpacklo_epi32(T0, T1);
    *B1 = _mm_unpackhi_epi32(T0, T1);
    *B2 = _mm_unpacklo_epi32(T2, T3);
    *B3 = _mm_unpackhi_epi32(T2, T3);
}

/*
* IDEA encryption/decryption of 8 blocks in SSE2: transpose the blocks
* into word-sliced form, run the rounds on all eight blocks at once,
* then transpose back. Encryption passes the encryption schedule (EK),
* decryption the inverted schedule (DK).
*/
void idea_op_8(in ubyte[64] input, ref ubyte[64] output, in ushort[52] EK) pure
{
    const(__m128i*) in_mm = cast(const(__m128i*))(input.ptr);

    __m128i B0 = _mm_loadu_si128(in_mm + 0);
    __m128i B1 = _mm_loadu_si128(in_mm + 1);
    __m128i B2 = _mm_loadu_si128(in_mm + 2);
    __m128i B3 = _mm_loadu_si128(in_mm + 3);

    transpose_in(&B0, &B1, &B2, &B3);

    // byte swap: IDEA words are big-endian, so flip each 16-bit lane
    B0 = _mm_or_si128(_mm_slli_epi16!8(B0), _mm_srli_epi16!8(B0));
    B1 = _mm_or_si128(_mm_slli_epi16!8(B1), _mm_srli_epi16!8(B1));
    B2 = _mm_or_si128(_mm_slli_epi16!8(B2), _mm_srli_epi16!8(B2));
    B3 = _mm_or_si128(_mm_slli_epi16!8(B3), _mm_srli_epi16!8(B3));

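    // Eight full IDEA rounds; round i consumes the six subkeys
    // EK[6*i+0 .. 6*i+5]. The xor/mul/add mix in the middle is IDEA's
    // MA (multiply-add) half, applied to all eight blocks in parallel.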
    foreach (size_t i; 0 .. 8)
    {
        B0 = mul(B0, EK[6*i+0]);
        B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
        B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
        B3 = mul(B3, EK[6*i+3]);

        __m128i T0 = B2;

        B2 = _mm_xor_si128(B2, B0);
        B2 = mul(B2, EK[6*i+4]);

        __m128i T1 = B1;

        B1 = _mm_xor_si128(B1, B3);
        B1 = _mm_add_epi16(B1, B2);
        B1 = mul(B1, EK[6*i+5]);

        B2 = _mm_add_epi16(B2, B1);

        B0 = _mm_xor_si128(B0, B1);
        B1 = _mm_xor_si128(B1, T0);
        B3 = _mm_xor_si128(B3, B2);
        B2 = _mm_xor_si128(B2, T1);
    }

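    // Output transform (half-round) with the last four subkeys. EK[49]
    // and EK[50] are applied swapped because the rounds leave B1 and B2
    // swapped; the transpose_out call below takes B2 before B1 for the
    // same reason.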
    B0 = mul(B0, EK[48]);
    B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
    B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
    B3 = mul(B3, EK[51]);

    // byte swap back to big-endian order for output
    B0 = _mm_or_si128(_mm_slli_epi16!8(B0), _mm_srli_epi16!8(B0));
    B1 = _mm_or_si128(_mm_slli_epi16!8(B1), _mm_srli_epi16!8(B1));
    B2 = _mm_or_si128(_mm_slli_epi16!8(B2), _mm_srli_epi16!8(B2));
    B3 = _mm_or_si128(_mm_slli_epi16!8(B3), _mm_srli_epi16!8(B3));

    transpose_out(&B0, &B2, &B1, &B3);

    __m128i* out_mm = cast(__m128i*)(output.ptr);

    _mm_storeu_si128(out_mm + 0, B0);
    _mm_storeu_si128(out_mm + 1, B2);
    _mm_storeu_si128(out_mm + 2, B1);
    _mm_storeu_si128(out_mm + 3, B3);
}