28
35
if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
29
36
throw InvalidRounds(StaticAlgorithmName(), m_rounds);
31
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+1, 4, key, 16);
32
GetUserKey(LITTLE_ENDIAN_ORDER, m_state+11, 4, key + length - 16, 16);
38
// m_state is reordered for SSE2
39
GetBlock<word32, LittleEndian, false> get1(key);
40
get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
41
GetBlock<word32, LittleEndian, false> get2(key + length - 16);
42
get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
34
// m_state[0,5,10,15] forms "expand 16-byte k" or "expand 32-byte k"
44
// "expand 16-byte k" or "expand 32-byte k"
35
45
m_state[0] = 0x61707865;
36
m_state[5] = (length == 16) ? 0x3120646e : 0x3320646e;
37
m_state[10] = (length == 16) ? 0x79622d36 : 0x79622d32;
38
m_state[15] = 0x6b206574;
46
m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
47
m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
48
m_state[3] = 0x6b206574;
41
51
void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
{
	// keystreamBuffer is unused here; keystream is produced on demand in
	// OperateKeystream.
	//
	// Load the 8-byte IV into the nonce words of the state. m_state is
	// reordered for SSE2 (see the key-setup comment above), so the two nonce
	// words live at non-adjacent indices [14] and [11].
	GetBlock<word32, LittleEndian, false> get(IV);
	get(m_state[14])(m_state[11]);

	// Reset the 64-bit block counter: low word in m_state[8], high word in
	// m_state[5] (same slots SeekToIteration and OperateKeystream use).
	m_state[8] = m_state[5] = 0;
}
46
58
void Salsa20_Policy::SeekToIteration(lword iterationCount)
{
	// Position the 64-bit block counter. Because the state is reordered for
	// SSE2, the counter's low 32 bits live in m_state[8] and the high 32 bits
	// in m_state[5] — NOT in adjacent words. SafeRightShift avoids undefined
	// behavior when lword is only 32 bits wide.
	m_state[8] = (word32)iterationCount;
	m_state[5] = (word32)SafeRightShift<32>(iterationCount);
}
64
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
65
unsigned int Salsa20_Policy::GetAlignment() const
67
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
75
unsigned int Salsa20_Policy::GetOptimalBlockSize() const
77
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
79
return 4*BYTES_PER_ITERATION;
82
return BYTES_PER_ITERATION;
52
86
void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
54
KeystreamOutput<LittleEndian> keystreamOutput(operation, output, input);
89
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
92
__m128i *s = (__m128i *)m_state.data();
94
if (iterationCount >= 4)
97
ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0));
98
ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1));
99
ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2));
100
ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3));
101
ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0));
102
ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2));
103
ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3));
104
ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
105
ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
106
ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
107
ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
108
ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
109
ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
110
ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));
114
word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
117
countersLo[i] = m_state[8];
118
countersHi[i] = m_state[5];
119
if (++m_state[8] == 0)
133
__m128i x10 = ss[10];
134
__m128i x11 = ss[11];
135
__m128i x12 = ss[12];
136
__m128i x13 = ss[13];
137
__m128i x14 = ss[14];
138
__m128i x15 = ss[15];
140
for (i=m_rounds; i>0; i-=2)
142
#define SSE2_QUARTER_ROUND(a, b, d, i) {\
143
__m128i t = _mm_add_epi32(a, d); \
144
b = _mm_xor_si128(b, _mm_slli_epi32(t, i)); \
145
b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}
147
#define QUARTER_ROUND(a, b, c, d) \
148
SSE2_QUARTER_ROUND(a, b, d, 7) \
149
SSE2_QUARTER_ROUND(b, c, a, 9) \
150
SSE2_QUARTER_ROUND(c, d, b, 13) \
151
SSE2_QUARTER_ROUND(d, a, c, 18)
153
QUARTER_ROUND(x0, x4, x8, x12)
154
QUARTER_ROUND(x1, x5, x9, x13)
155
QUARTER_ROUND(x2, x6, x10, x14)
156
QUARTER_ROUND(x3, x7, x11, x15)
158
QUARTER_ROUND(x0, x13, x10, x7)
159
QUARTER_ROUND(x1, x14, x11, x4)
160
QUARTER_ROUND(x2, x15, x8, x5)
161
QUARTER_ROUND(x3, x12, x9, x6)
166
x0 = _mm_add_epi32(x0, ss[0]);
167
x1 = _mm_add_epi32(x1, ss[1]);
168
x2 = _mm_add_epi32(x2, ss[2]);
169
x3 = _mm_add_epi32(x3, ss[3]);
170
x4 = _mm_add_epi32(x4, ss[4]);
171
x5 = _mm_add_epi32(x5, ss[5]);
172
x6 = _mm_add_epi32(x6, ss[6]);
173
x7 = _mm_add_epi32(x7, ss[7]);
174
x8 = _mm_add_epi32(x8, ss[8]);
175
x9 = _mm_add_epi32(x9, ss[9]);
176
x10 = _mm_add_epi32(x10, ss[10]);
177
x11 = _mm_add_epi32(x11, ss[11]);
178
x12 = _mm_add_epi32(x12, ss[12]);
179
x13 = _mm_add_epi32(x13, ss[13]);
180
x14 = _mm_add_epi32(x14, ss[14]);
181
x15 = _mm_add_epi32(x15, ss[15]);
183
#define OUTPUT_4(x, a, b, c, d, e, f, g, h) {\
184
__m128i t0 = _mm_unpacklo_epi32(a, b);\
185
__m128i t1 = _mm_unpacklo_epi32(c, d);\
186
__m128i t2 = _mm_unpacklo_epi64(t0, t1);\
187
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
188
t2 = _mm_unpackhi_epi64(t0, t1);\
189
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
190
t0 = _mm_unpackhi_epi32(a, b);\
191
t1 = _mm_unpackhi_epi32(c, d);\
192
t2 = _mm_unpacklo_epi64(t0, t1);\
193
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
194
t2 = _mm_unpackhi_epi64(t0, t1);\
195
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}
197
#define SALSA_OUTPUT(x) \
198
OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
199
OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
200
OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
201
OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)
203
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)
206
} while ((iterationCount-=4) >= 4);
209
if (!IsP4()) while (iterationCount)
217
for (i=m_rounds; i>0; i-=2)
219
SSE2_QUARTER_ROUND(x0, x1, x3, 7)
220
SSE2_QUARTER_ROUND(x1, x2, x0, 9)
221
SSE2_QUARTER_ROUND(x2, x3, x1, 13)
222
SSE2_QUARTER_ROUND(x3, x0, x2, 18)
224
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
225
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
226
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
228
SSE2_QUARTER_ROUND(x0, x3, x1, 7)
229
SSE2_QUARTER_ROUND(x3, x2, x0, 9)
230
SSE2_QUARTER_ROUND(x2, x1, x3, 13)
231
SSE2_QUARTER_ROUND(x1, x0, x2, 18)
233
x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
234
x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
235
x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
238
x0 = _mm_add_epi32(x0, s[0]);
239
x1 = _mm_add_epi32(x1, s[1]);
240
x2 = _mm_add_epi32(x2, s[2]);
241
x3 = _mm_add_epi32(x3, s[3]);
243
if (++m_state[8] == 0)
246
CRYPTOPP_ALIGN_DATA(16) static const word32 masks[8] CRYPTOPP_SECTION_ALIGN16 =
247
{0, 0xffffffff, 0, 0xffffffff, 0xffffffff, 0, 0xffffffff, 0};
249
__m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
250
k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
251
__m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
252
k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
253
__m128i maskLo32 = ((__m128i*)masks)[1], maskHi32 = ((__m128i*)masks)[0];
254
__m128i k20 = _mm_or_si128(_mm_and_si128(x2, maskLo32), _mm_and_si128(x1, maskHi32));
255
__m128i k31 = _mm_or_si128(_mm_and_si128(x3, maskLo32), _mm_and_si128(x2, maskHi32));
257
__m128i k0 = _mm_unpackhi_epi64(k02, k20);
258
__m128i k1 = _mm_unpackhi_epi64(k13, k31);
259
__m128i k2 = _mm_unpacklo_epi64(k20, k02);
260
__m128i k3 = _mm_unpacklo_epi64(k31, k13);
262
#define SSE2_OUTPUT(x) {\
263
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
264
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
265
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
266
CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}
268
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
56
273
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
57
word32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
76
for (size_t iteration = 0; iteration < iterationCount; ++iteration)
275
while (iterationCount--)
95
for (int i=m_rounds; i>0; i-=2)
294
for (i=m_rounds; i>0; i-=2)
97
#define QUARTER_ROUND(a, b, c, d) \
98
b = b ^ rotlFixed(a + d, 7); \
99
c = c ^ rotlFixed(b + a, 9); \
100
d = d ^ rotlFixed(c + b, 13); \
101
a = a ^ rotlFixed(d + c, 18);
296
#define QUARTER_ROUND(a, b, c, d) \
297
b = b ^ rotlFixed(a + d, 7); \
298
c = c ^ rotlFixed(b + a, 9); \
299
d = d ^ rotlFixed(c + b, 13); \
300
a = a ^ rotlFixed(d + c, 18);
103
302
QUARTER_ROUND(x0, x4, x8, x12)
104
QUARTER_ROUND(x5, x9, x13, x1)
105
QUARTER_ROUND(x10, x14, x2, x6)
106
QUARTER_ROUND(x15, x3, x7, x11)
303
QUARTER_ROUND(x1, x5, x9, x13)
304
QUARTER_ROUND(x2, x6, x10, x14)
305
QUARTER_ROUND(x3, x7, x11, x15)
108
QUARTER_ROUND(x0, x1, x2, x3)
109
QUARTER_ROUND(x5, x6, x7, x4)
110
QUARTER_ROUND(x10, x11, x8, x9)
111
QUARTER_ROUND(x15, x12, x13, x14)
307
QUARTER_ROUND(x0, x13, x10, x7)
308
QUARTER_ROUND(x1, x14, x11, x4)
309
QUARTER_ROUND(x2, x15, x8, x5)
310
QUARTER_ROUND(x3, x12, x9, x6)
114
keystreamOutput (x0 + j0)
313
#define SALSA_OUTPUT(x) {\
314
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
315
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
316
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
317
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
318
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
319
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
320
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
321
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
322
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
323
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
324
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
325
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
326
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
327
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
328
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
329
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
331
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
333
if (++m_state[8] == 0)