1
// sosemanuk.cpp - written and placed in the public domain by Wei Dai
10
NAMESPACE_BEGIN(CryptoPP)
12
// Key setup: expands the user key into m_key by calling Serpent_KeySchedule.
//   params  - not referenced by the statement visible in this excerpt
//   userKey - key material, forwarded unchanged to the key schedule
//   keylen  - forwarded unchanged to the key schedule together with userKey
// The literal 24 is also forwarded as-is (presumably the number of rounds to
// derive subkeys for -- confirm against the declaration of Serpent_KeySchedule).
// Fix: the first parameter had been mangled into a stray pilcrow followed by
// "ms" (the "&para" prefix of "&params" decoded as an HTML entity); it is
// restored here to a reference parameter named params.
void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
14
Serpent_KeySchedule(m_key, 24, userKey, keylen);
17
// IV setup (resynchronization).
//   keystreamBuffer - not referenced by the lines visible in this excerpt
//   iv              - read as four little-endian 32-bit words
// Only part of the body is visible here: the declarations of a, b, c, d, the
// construct that defines i, and whatever happens inside the two "if" branches
// are among the missing lines.
void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
21
typedef BlockGetAndPut<word32, LittleEndian> Block;
22
// Load a, b, c, d from the IV as little-endian word32 values.
Block::Get(iv)(a)(b)(c)(d);
24
// k points at the expanded key that CipherSetKey stored in m_key.
const word32 *k = m_key;
29
// One round per line: KX, then S-box Sn, then LT. KX and LT are presumably the
// key-mixing and linear-transform helpers shared with the Serpent code, and the
// beforeSn/afterSn wrappers presumably select the variable ordering around each
// S-box -- confirm against their definitions, which are not in this excerpt.
beforeS0(KX); beforeS0(S0); afterS0(LT);
30
afterS0(KX); afterS0(S1); afterS1(LT);
31
if (i == 3) // after 18th round
38
afterS1(KX); afterS1(S2); afterS2(LT);
39
afterS2(KX); afterS2(S3); afterS3(LT);
40
if (i == 2) // after 12th round
47
afterS3(KX); afterS3(S4); afterS4(LT);
48
afterS4(KX); afterS4(S5); afterS5(LT);
49
afterS5(KX); afterS5(S6); afterS6(LT);
50
afterS6(KX); afterS6(S7); afterS7(LT);
72
// XMUX(c, x, y): branch-free select. (0 - (c & 1)) is zero when the low bit of
// c is clear and all-ones (for an unsigned operand) when it is set, so the
// result is x or x ^ y respectively.
// NOTE: the macro arguments are not individually parenthesized; pass simple
// operands only.
#define XMUX(c, x, y) (x ^ (y & (0 - (c & 1))))
73
// Update the two extra state words: m_state[11] accumulates the XMUX result,
// then m_state[10] is multiplied by 0x54655307 and rotated left by 7.
m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
74
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
77
// Lookup table shared by the MUL_A and DIV_A macros: MUL_A indexes entries
// 0..255 and DIV_A indexes entries 256..511 (s_mulTables[256 + byte(x)]).
// The assembly path reaches the second half through a +1024 byte offset
// (256 entries * 4 bytes each).
// The conditional below selects an x86/x64-specific set of values for the
// leading entries; its matching #else/#endif are not visible in this excerpt.
// This presumably pairs with the x86/x64-only MUL_A variant, which rotates its
// argument before the lookup -- confirm before editing either one.
static word32 s_mulTables[512] = {
78
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
79
0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
80
0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
81
0x05A7DC90, 0xE4381382, 0x6E30EBB4, 0x8FAF24A6,
82
0xD320B2D8, 0x32BF7DCA, 0xB8B785FC, 0x59284AEE,
83
0x0AE71189, 0xEB78DE9B, 0x617026AD, 0x80EFE9BF,
84
0xDC607FC1, 0x3DFFB0D3, 0xB7F748E5, 0x566887F7,
85
0x0F40CD19, 0xEEDF020B, 0x64D7FA3D, 0x8548352F,
86
0xD9C7A351, 0x38586C43, 0xB2509475, 0x53CF5B67,
87
0x146722BB, 0xF5F8EDA9, 0x7FF0159F, 0x9E6FDA8D,
88
0xC2E04CF3, 0x237F83E1, 0xA9777BD7, 0x48E8B4C5,
89
0x11C0FE2B, 0xF05F3139, 0x7A57C90F, 0x9BC8061D,
90
0xC7479063, 0x26D85F71, 0xACD0A747, 0x4D4F6855,
91
0x1E803332, 0xFF1FFC20, 0x75170416, 0x9488CB04,
92
0xC8075D7A, 0x29989268, 0xA3906A5E, 0x420FA54C,
93
0x1B27EFA2, 0xFAB820B0, 0x70B0D886, 0x912F1794,
94
0xCDA081EA, 0x2C3F4EF8, 0xA637B6CE, 0x47A879DC,
95
0x28CE44DF, 0xC9518BCD, 0x435973FB, 0xA2C6BCE9,
96
0xFE492A97, 0x1FD6E585, 0x95DE1DB3, 0x7441D2A1,
97
0x2D69984F, 0xCCF6575D, 0x46FEAF6B, 0xA7616079,
98
0xFBEEF607, 0x1A713915, 0x9079C123, 0x71E60E31,
99
0x22295556, 0xC3B69A44, 0x49BE6272, 0xA821AD60,
100
0xF4AE3B1E, 0x1531F40C, 0x9F390C3A, 0x7EA6C328,
101
0x278E89C6, 0xC61146D4, 0x4C19BEE2, 0xAD8671F0,
102
0xF109E78E, 0x1096289C, 0x9A9ED0AA, 0x7B011FB8,
103
0x3CA96664, 0xDD36A976, 0x573E5140, 0xB6A19E52,
104
0xEA2E082C, 0x0BB1C73E, 0x81B93F08, 0x6026F01A,
105
0x390EBAF4, 0xD89175E6, 0x52998DD0, 0xB30642C2,
106
0xEF89D4BC, 0x0E161BAE, 0x841EE398, 0x65812C8A,
107
0x364E77ED, 0xD7D1B8FF, 0x5DD940C9, 0xBC468FDB,
108
0xE0C919A5, 0x0156D6B7, 0x8B5E2E81, 0x6AC1E193,
109
0x33E9AB7D, 0xD276646F, 0x587E9C59, 0xB9E1534B,
110
0xE56EC535, 0x04F10A27, 0x8EF9F211, 0x6F663D03,
111
0x50358817, 0xB1AA4705, 0x3BA2BF33, 0xDA3D7021,
112
0x86B2E65F, 0x672D294D, 0xED25D17B, 0x0CBA1E69,
113
0x55925487, 0xB40D9B95, 0x3E0563A3, 0xDF9AACB1,
114
0x83153ACF, 0x628AF5DD, 0xE8820DEB, 0x091DC2F9,
115
0x5AD2999E, 0xBB4D568C, 0x3145AEBA, 0xD0DA61A8,
116
0x8C55F7D6, 0x6DCA38C4, 0xE7C2C0F2, 0x065D0FE0,
117
0x5F75450E, 0xBEEA8A1C, 0x34E2722A, 0xD57DBD38,
118
0x89F22B46, 0x686DE454, 0xE2651C62, 0x03FAD370,
119
0x4452AAAC, 0xA5CD65BE, 0x2FC59D88, 0xCE5A529A,
120
0x92D5C4E4, 0x734A0BF6, 0xF942F3C0, 0x18DD3CD2,
121
0x41F5763C, 0xA06AB92E, 0x2A624118, 0xCBFD8E0A,
122
0x97721874, 0x76EDD766, 0xFCE52F50, 0x1D7AE042,
123
0x4EB5BB25, 0xAF2A7437, 0x25228C01, 0xC4BD4313,
124
0x9832D56D, 0x79AD1A7F, 0xF3A5E249, 0x123A2D5B,
125
0x4B1267B5, 0xAA8DA8A7, 0x20855091, 0xC11A9F83,
126
0x9D9509FD, 0x7C0AC6EF, 0xF6023ED9, 0x179DF1CB,
127
0x78FBCCC8, 0x996403DA, 0x136CFBEC, 0xF2F334FE,
128
0xAE7CA280, 0x4FE36D92, 0xC5EB95A4, 0x24745AB6,
129
0x7D5C1058, 0x9CC3DF4A, 0x16CB277C, 0xF754E86E,
130
0xABDB7E10, 0x4A44B102, 0xC04C4934, 0x21D38626,
131
0x721CDD41, 0x93831253, 0x198BEA65, 0xF8142577,
132
0xA49BB309, 0x45047C1B, 0xCF0C842D, 0x2E934B3F,
133
0x77BB01D1, 0x9624CEC3, 0x1C2C36F5, 0xFDB3F9E7,
134
0xA13C6F99, 0x40A3A08B, 0xCAAB58BD, 0x2B3497AF,
135
0x6C9CEE73, 0x8D032161, 0x070BD957, 0xE6941645,
136
0xBA1B803B, 0x5B844F29, 0xD18CB71F, 0x3013780D,
137
0x693B32E3, 0x88A4FDF1, 0x02AC05C7, 0xE333CAD5,
138
0xBFBC5CAB, 0x5E2393B9, 0xD42B6B8F, 0x35B4A49D,
139
0x667BFFFA, 0x87E430E8, 0x0DECC8DE, 0xEC7307CC,
140
0xB0FC91B2, 0x51635EA0, 0xDB6BA696, 0x3AF46984,
141
0x63DC236A, 0x8243EC78, 0x084B144E, 0xE9D4DB5C,
142
0xB55B4D22, 0x54C48230, 0xDECC7A06, 0x3F53B514,
144
0x00000000, 0xE19FCF13, 0x6B973726, 0x8A08F835,
145
0xD6876E4C, 0x3718A15F, 0xBD10596A, 0x5C8F9679,
146
0x05A7DC98, 0xE438138B, 0x6E30EBBE, 0x8FAF24AD,
147
0xD320B2D4, 0x32BF7DC7, 0xB8B785F2, 0x59284AE1,
148
0x0AE71199, 0xEB78DE8A, 0x617026BF, 0x80EFE9AC,
149
0xDC607FD5, 0x3DFFB0C6, 0xB7F748F3, 0x566887E0,
150
0x0F40CD01, 0xEEDF0212, 0x64D7FA27, 0x85483534,
151
0xD9C7A34D, 0x38586C5E, 0xB250946B, 0x53CF5B78,
152
0x1467229B, 0xF5F8ED88, 0x7FF015BD, 0x9E6FDAAE,
153
0xC2E04CD7, 0x237F83C4, 0xA9777BF1, 0x48E8B4E2,
154
0x11C0FE03, 0xF05F3110, 0x7A57C925, 0x9BC80636,
155
0xC747904F, 0x26D85F5C, 0xACD0A769, 0x4D4F687A,
156
0x1E803302, 0xFF1FFC11, 0x75170424, 0x9488CB37,
157
0xC8075D4E, 0x2998925D, 0xA3906A68, 0x420FA57B,
158
0x1B27EF9A, 0xFAB82089, 0x70B0D8BC, 0x912F17AF,
159
0xCDA081D6, 0x2C3F4EC5, 0xA637B6F0, 0x47A879E3,
160
0x28CE449F, 0xC9518B8C, 0x435973B9, 0xA2C6BCAA,
161
0xFE492AD3, 0x1FD6E5C0, 0x95DE1DF5, 0x7441D2E6,
162
0x2D699807, 0xCCF65714, 0x46FEAF21, 0xA7616032,
163
0xFBEEF64B, 0x1A713958, 0x9079C16D, 0x71E60E7E,
164
0x22295506, 0xC3B69A15, 0x49BE6220, 0xA821AD33,
165
0xF4AE3B4A, 0x1531F459, 0x9F390C6C, 0x7EA6C37F,
166
0x278E899E, 0xC611468D, 0x4C19BEB8, 0xAD8671AB,
167
0xF109E7D2, 0x109628C1, 0x9A9ED0F4, 0x7B011FE7,
168
0x3CA96604, 0xDD36A917, 0x573E5122, 0xB6A19E31,
169
0xEA2E0848, 0x0BB1C75B, 0x81B93F6E, 0x6026F07D,
170
0x390EBA9C, 0xD891758F, 0x52998DBA, 0xB30642A9,
171
0xEF89D4D0, 0x0E161BC3, 0x841EE3F6, 0x65812CE5,
172
0x364E779D, 0xD7D1B88E, 0x5DD940BB, 0xBC468FA8,
173
0xE0C919D1, 0x0156D6C2, 0x8B5E2EF7, 0x6AC1E1E4,
174
0x33E9AB05, 0xD2766416, 0x587E9C23, 0xB9E15330,
175
0xE56EC549, 0x04F10A5A, 0x8EF9F26F, 0x6F663D7C,
176
0x50358897, 0xB1AA4784, 0x3BA2BFB1, 0xDA3D70A2,
177
0x86B2E6DB, 0x672D29C8, 0xED25D1FD, 0x0CBA1EEE,
178
0x5592540F, 0xB40D9B1C, 0x3E056329, 0xDF9AAC3A,
179
0x83153A43, 0x628AF550, 0xE8820D65, 0x091DC276,
180
0x5AD2990E, 0xBB4D561D, 0x3145AE28, 0xD0DA613B,
181
0x8C55F742, 0x6DCA3851, 0xE7C2C064, 0x065D0F77,
182
0x5F754596, 0xBEEA8A85, 0x34E272B0, 0xD57DBDA3,
183
0x89F22BDA, 0x686DE4C9, 0xE2651CFC, 0x03FAD3EF,
184
0x4452AA0C, 0xA5CD651F, 0x2FC59D2A, 0xCE5A5239,
185
0x92D5C440, 0x734A0B53, 0xF942F366, 0x18DD3C75,
186
0x41F57694, 0xA06AB987, 0x2A6241B2, 0xCBFD8EA1,
187
0x977218D8, 0x76EDD7CB, 0xFCE52FFE, 0x1D7AE0ED,
188
0x4EB5BB95, 0xAF2A7486, 0x25228CB3, 0xC4BD43A0,
189
0x9832D5D9, 0x79AD1ACA, 0xF3A5E2FF, 0x123A2DEC,
190
0x4B12670D, 0xAA8DA81E, 0x2085502B, 0xC11A9F38,
191
0x9D950941, 0x7C0AC652, 0xF6023E67, 0x179DF174,
192
0x78FBCC08, 0x9964031B, 0x136CFB2E, 0xF2F3343D,
193
0xAE7CA244, 0x4FE36D57, 0xC5EB9562, 0x24745A71,
194
0x7D5C1090, 0x9CC3DF83, 0x16CB27B6, 0xF754E8A5,
195
0xABDB7EDC, 0x4A44B1CF, 0xC04C49FA, 0x21D386E9,
196
0x721CDD91, 0x93831282, 0x198BEAB7, 0xF81425A4,
197
0xA49BB3DD, 0x45047CCE, 0xCF0C84FB, 0x2E934BE8,
198
0x77BB0109, 0x9624CE1A, 0x1C2C362F, 0xFDB3F93C,
199
0xA13C6F45, 0x40A3A056, 0xCAAB5863, 0x2B349770,
200
0x6C9CEE93, 0x8D032180, 0x070BD9B5, 0xE69416A6,
201
0xBA1B80DF, 0x5B844FCC, 0xD18CB7F9, 0x301378EA,
202
0x693B320B, 0x88A4FD18, 0x02AC052D, 0xE333CA3E,
203
0xBFBC5C47, 0x5E239354, 0xD42B6B61, 0x35B4A472,
204
0x667BFF0A, 0x87E43019, 0x0DECC82C, 0xEC73073F,
205
0xB0FC9146, 0x51635E55, 0xDB6BA660, 0x3AF46973,
206
0x63DC2392, 0x8243EC81, 0x084B14B4, 0xE9D4DBA7,
207
0xB55B4DDE, 0x54C482CD, 0xDECC7AF8, 0x3F53B5EB,
209
0x00000000, 0x180F40CD, 0x301E8033, 0x2811C0FE,
210
0x603CA966, 0x7833E9AB, 0x50222955, 0x482D6998,
211
0xC078FBCC, 0xD877BB01, 0xF0667BFF, 0xE8693B32,
212
0xA04452AA, 0xB84B1267, 0x905AD299, 0x88559254,
213
0x29F05F31, 0x31FF1FFC, 0x19EEDF02, 0x01E19FCF,
214
0x49CCF657, 0x51C3B69A, 0x79D27664, 0x61DD36A9,
215
0xE988A4FD, 0xF187E430, 0xD99624CE, 0xC1996403,
216
0x89B40D9B, 0x91BB4D56, 0xB9AA8DA8, 0xA1A5CD65,
217
0x5249BE62, 0x4A46FEAF, 0x62573E51, 0x7A587E9C,
218
0x32751704, 0x2A7A57C9, 0x026B9737, 0x1A64D7FA,
219
0x923145AE, 0x8A3E0563, 0xA22FC59D, 0xBA208550,
220
0xF20DECC8, 0xEA02AC05, 0xC2136CFB, 0xDA1C2C36,
221
0x7BB9E153, 0x63B6A19E, 0x4BA76160, 0x53A821AD,
222
0x1B854835, 0x038A08F8, 0x2B9BC806, 0x339488CB,
223
0xBBC11A9F, 0xA3CE5A52, 0x8BDF9AAC, 0x93D0DA61,
224
0xDBFDB3F9, 0xC3F2F334, 0xEBE333CA, 0xF3EC7307,
225
0xA492D5C4, 0xBC9D9509, 0x948C55F7, 0x8C83153A,
226
0xC4AE7CA2, 0xDCA13C6F, 0xF4B0FC91, 0xECBFBC5C,
227
0x64EA2E08, 0x7CE56EC5, 0x54F4AE3B, 0x4CFBEEF6,
228
0x04D6876E, 0x1CD9C7A3, 0x34C8075D, 0x2CC74790,
229
0x8D628AF5, 0x956DCA38, 0xBD7C0AC6, 0xA5734A0B,
230
0xED5E2393, 0xF551635E, 0xDD40A3A0, 0xC54FE36D,
231
0x4D1A7139, 0x551531F4, 0x7D04F10A, 0x650BB1C7,
232
0x2D26D85F, 0x35299892, 0x1D38586C, 0x053718A1,
233
0xF6DB6BA6, 0xEED42B6B, 0xC6C5EB95, 0xDECAAB58,
234
0x96E7C2C0, 0x8EE8820D, 0xA6F942F3, 0xBEF6023E,
235
0x36A3906A, 0x2EACD0A7, 0x06BD1059, 0x1EB25094,
236
0x569F390C, 0x4E9079C1, 0x6681B93F, 0x7E8EF9F2,
237
0xDF2B3497, 0xC724745A, 0xEF35B4A4, 0xF73AF469,
238
0xBF179DF1, 0xA718DD3C, 0x8F091DC2, 0x97065D0F,
239
0x1F53CF5B, 0x075C8F96, 0x2F4D4F68, 0x37420FA5,
240
0x7F6F663D, 0x676026F0, 0x4F71E60E, 0x577EA6C3,
241
0xE18D0321, 0xF98243EC, 0xD1938312, 0xC99CC3DF,
242
0x81B1AA47, 0x99BEEA8A, 0xB1AF2A74, 0xA9A06AB9,
243
0x21F5F8ED, 0x39FAB820, 0x11EB78DE, 0x09E43813,
244
0x41C9518B, 0x59C61146, 0x71D7D1B8, 0x69D89175,
245
0xC87D5C10, 0xD0721CDD, 0xF863DC23, 0xE06C9CEE,
246
0xA841F576, 0xB04EB5BB, 0x985F7545, 0x80503588,
247
0x0805A7DC, 0x100AE711, 0x381B27EF, 0x20146722,
248
0x68390EBA, 0x70364E77, 0x58278E89, 0x4028CE44,
249
0xB3C4BD43, 0xABCBFD8E, 0x83DA3D70, 0x9BD57DBD,
250
0xD3F81425, 0xCBF754E8, 0xE3E69416, 0xFBE9D4DB,
251
0x73BC468F, 0x6BB30642, 0x43A2C6BC, 0x5BAD8671,
252
0x1380EFE9, 0x0B8FAF24, 0x239E6FDA, 0x3B912F17,
253
0x9A34E272, 0x823BA2BF, 0xAA2A6241, 0xB225228C,
254
0xFA084B14, 0xE2070BD9, 0xCA16CB27, 0xD2198BEA,
255
0x5A4C19BE, 0x42435973, 0x6A52998D, 0x725DD940,
256
0x3A70B0D8, 0x227FF015, 0x0A6E30EB, 0x12617026,
257
0x451FD6E5, 0x5D109628, 0x750156D6, 0x6D0E161B,
258
0x25237F83, 0x3D2C3F4E, 0x153DFFB0, 0x0D32BF7D,
259
0x85672D29, 0x9D686DE4, 0xB579AD1A, 0xAD76EDD7,
260
0xE55B844F, 0xFD54C482, 0xD545047C, 0xCD4A44B1,
261
0x6CEF89D4, 0x74E0C919, 0x5CF109E7, 0x44FE492A,
262
0x0CD320B2, 0x14DC607F, 0x3CCDA081, 0x24C2E04C,
263
0xAC977218, 0xB49832D5, 0x9C89F22B, 0x8486B2E6,
264
0xCCABDB7E, 0xD4A49BB3, 0xFCB55B4D, 0xE4BA1B80,
265
0x17566887, 0x0F59284A, 0x2748E8B4, 0x3F47A879,
266
0x776AC1E1, 0x6F65812C, 0x477441D2, 0x5F7B011F,
267
0xD72E934B, 0xCF21D386, 0xE7301378, 0xFF3F53B5,
268
0xB7123A2D, 0xAF1D7AE0, 0x870CBA1E, 0x9F03FAD3,
269
0x3EA637B6, 0x26A9777B, 0x0EB8B785, 0x16B7F748,
270
0x5E9A9ED0, 0x4695DE1D, 0x6E841EE3, 0x768B5E2E,
271
0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
272
0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
276
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
277
// Alignment query (the return statements are not visible in this excerpt, so
// the actual values are not documented here).
// When the SSE2 assembly is compiled in, the result is chosen by a run-time
// CPU check. Under the Intel compiler that check is HasSSE2() && !IsP4(), i.e.
// Pentium 4 CPUs are kept off the SSE2 branch; the condition used by other
// compilers is in lines missing from this excerpt.
unsigned int SosemanukPolicy::GetAlignment() const
279
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
280
#ifdef __INTEL_COMPILER
281
if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
291
// Block-size query. Two results are visible: 4*BYTES_PER_ITERATION and
// BYTES_PER_ITERATION.
// The larger value sits inside the SSE2-assembly conditional region and is
// presumably returned when the run-time CPU check selects that path (the
// assembly outer loop handles 80 words, i.e. four 20-word iterations, per
// pass); BYTES_PER_ITERATION is the remaining result.
// Under the Intel compiler the CPU check is HasSSE2() && !IsP4(); the
// condition for other compilers is not visible in this excerpt.
unsigned int SosemanukPolicy::GetOptimalBlockSize() const
293
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
294
#ifdef __INTEL_COMPILER
295
if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
299
return 4*BYTES_PER_ITERATION;
302
return BYTES_PER_ITERATION;
306
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
308
// Keystream generation for iterationCount iterations of 20 words each (the
// assembly path computes wordsLeft = iterationCount * 20; the portable path
// runs 20 STEPs per loop pass).
//   operation      - not referenced directly in the visible lines; presumably
//                    consumed inside the CRYPTOPP_KEYSTREAM_OUTPUT_* macros
//   output         - destination pointer
//   input          - XORed into the keystream; the assembly path tests this
//                    pointer against zero first, so it is presumably optional
//   iterationCount - number of 20-word iterations
// Two implementations follow: an SSE2 inline-assembly path chosen by a
// run-time CPU check, and a portable C++ path. Several lines of both paths
// (braces, jumps, #else/#endif) are missing from this excerpt.
void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
310
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
311
#ifdef __INTEL_COMPILER
312
if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
320
".intel_syntax noprefix;"
323
word32 *state = m_state;
// Place the arguments in fixed registers:
// ax = state, di = output, dx = input, cx = iterationCount.
324
AS2( mov WORD_REG(ax), state)
325
AS2( mov WORD_REG(di), output)
326
AS2( mov WORD_REG(dx), input)
327
AS2( mov WORD_REG(cx), iterationCount)
330
// Stack frame used by the assembly path, addressed relative to the realigned
// stack pointer. Slot 0 ([sp]) receives the saved original stack pointer in
// the prologue; slots 1..7 are the named locals below.
#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
331
#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
332
#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
333
#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
334
#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
335
#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
336
#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
// The copy of the state starts after the 8 pointer-sized slots. These two
// macros expand to bare address expressions (no brackets), so callers wrap
// them in [] themselves.
337
#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ
// The u/v scratch area starts 12 words (12*4 bytes) after the state copy.
338
#define SSE2_uvStart SSE2_stateCopy + 12*4
341
// Prologue: remember the caller's stack pointer in bx, round sp down to a
// 16-byte boundary, reserve the frame, and store the old sp in slot 0.
AS2( mov WORD_REG(bx), WORD_REG(sp))
342
AS2( and WORD_REG(sp), -16)
343
AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
344
AS2( mov [WORD_REG(sp)], WORD_REG(bx))
// Spill the pointer arguments into their frame slots.
345
AS2( mov SSE2_output, WORD_REG(di))
346
AS2( mov SSE2_input, WORD_REG(dx))
347
AS2( mov SSE2_state, WORD_REG(ax))
// Saves si as the table pointer; presumably only meaningful in the variant
// whose operand list preloads si with s_mulTables -- the surrounding
// conditional is not visible in this excerpt.
349
AS2( mov SSE2_pMulTables, WORD_REG(si))
// cx = iterationCount * 5, then si = cx * 4 = iterationCount * 20 words.
351
AS2( lea WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
352
AS2( lea WORD_REG(si), [4*WORD_REG(cx)])
353
AS2( mov SSE2_wordsLeft, WORD_REG(si))
// State words 0..9 go to the stack copy (two 16-byte moves plus one 8-byte
// move); words 10 and 11 are kept in ecx and edx instead.
354
AS2( movdqa xmm0, [WORD_REG(ax)+0*16]) // copy state to stack to save a register
355
AS2( movdqa [SSE2_stateCopy+0*16], xmm0)
356
AS2( movdqa xmm0, [WORD_REG(ax)+1*16])
357
AS2( movdqa [SSE2_stateCopy+1*16], xmm0)
358
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
359
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
361
AS2( movd ebx, xmm0) // s(9)
362
AS2( mov ecx, [WORD_REG(ax)+10*4])
363
AS2( mov edx, [WORD_REG(ax)+11*4])
// xmm7 stays all-ones; the second inner loop XORs it into xmm4 (bitwise NOT).
364
AS2( pcmpeqb xmm7, xmm7) // all ones
366
// s(i): address of state word (i mod 10) inside the stack copy.
#define s(i) SSE2_stateCopy + ASM_MOD(i,10)*4
// u(j) / v(j): address of scratch word j relative to di. Words are stored
// transposed: row (j mod 4) of 20 words, column (j / 4). The v area sits
// 80 words (80*4 bytes) after the u area.
367
#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
368
#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
// SSE2_STEP(i, j): assembly form of one generator step. Several of its
// instructions are missing from this excerpt; the visible ones do the
// following:
//   - v(i) <- s(i)
//   - u(i) <- (ebx + r2) ^ r1
//   - ebp is XORed with s(i+2) and then added to r1
//   - r2 is multiplied by 0x54655307
//   - ebx is XORed with a first-half table entry ([si + ax*4]) and a
//     second-half entry ([si + 1024 + bp*4]), then written back to s(i)
// r1##j and r2##j paste j onto register-name macros that are defined outside
// this excerpt. ebx appears to play the role of s##x9 in the portable STEP
// macro (u = (s9 + r2) ^ r1) -- confirm against the full source.
375
#define SSE2_STEP(i, j) \
376
AS2( mov eax, [s(i+0)])\
377
AS2( mov [v(i)], eax)\
379
AS2( lea ebp, [ebx + r2##j])\
380
AS2( xor ebp, r1##j)\
381
AS2( mov [u(i)], ebp)\
383
AS2( and ebp, r2##j)\
388
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
389
AS2( mov eax, [s(i+3)])\
390
AS2( xor ebp, [s(i+2)])\
391
AS2( add r1##j, ebp)\
394
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
396
AS2( imul r2##j, 0x54655307)\
398
AS2( mov [s(i+0)], ebx)\
400
// On entry si holds the number of words still to produce.
ASL(2) // outer loop, each iteration of this processes 80 words
401
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
// si = min(si, 80): cmovg replaces si with 80 when it is greater.
402
AS2( mov WORD_REG(ax), 80)
403
AS2( cmp WORD_REG(si), 80)
404
AS2( cmovg WORD_REG(si), WORD_REG(ax))
405
AS2( mov SSE2_wordsLeft2, WORD_REG(si))
// The word count is added to di as a byte offset. That lines up because the
// first inner loop advances di by 5*4 = 20 bytes per pass while producing
// 20 words.
406
AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
407
AS2( mov SSE2_diEnd, WORD_REG(si))
// Two alternative ways of pointing si at s_mulTables follow; presumably only
// one is compiled in (the selecting conditional is not visible here).
409
AS2( lea WORD_REG(si), s_mulTables)
411
AS2( mov WORD_REG(si), SSE2_pMulTables)
// The SSE2_STEP invocations that form the loop body are not visible here.
414
ASL(0) // first inner loop, 20 words each, 4 iterations
436
AS2( add WORD_REG(di), 5*4)
437
AS2( cmp WORD_REG(di), SSE2_diEnd)
// Set up the second inner loop: ax = input, bp = output, di back at the start
// of the scratch area, si = number of words to emit in this outer pass.
440
AS2( mov WORD_REG(ax), SSE2_input)
441
AS2( mov WORD_REG(bp), SSE2_output)
442
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
443
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
445
// Each pass combines four u words from each of the four scratch rows with the
// matching v words and emits 16 output words.
ASL(1) // second inner loop, 16 words each, 5 iterations
// Load one 16-byte vector from each u row (rows are 20 words apart):
// xmm0 <- row 0, xmm1 <- row 1, xmm2 <- row 2, xmm3 <- row 3.
446
AS2( movdqa xmm0, [WORD_REG(di)+0*20*4])
447
AS2( movdqa xmm2, [WORD_REG(di)+2*20*4])
448
AS2( movdqa xmm3, [WORD_REG(di)+3*20*4])
449
AS2( movdqa xmm1, [WORD_REG(di)+1*20*4])
// Boolean network over the four vectors, four lanes at a time. The portable
// path applies S2 at the same point, so this is presumably the vector form of
// S2; some of its instructions are missing from this excerpt.
451
AS2( movdqa xmm4, xmm0)
452
AS2( pand xmm0, xmm2)
453
AS2( pxor xmm0, xmm3)
454
AS2( pxor xmm2, xmm1)
455
AS2( pxor xmm2, xmm0)
457
AS2( pxor xmm3, xmm1)
458
AS2( pxor xmm4, xmm2)
459
AS2( movdqa xmm1, xmm3)
461
AS2( pxor xmm3, xmm0)
462
AS2( pand xmm0, xmm1)
463
AS2( pxor xmm4, xmm0)
464
AS2( pxor xmm1, xmm3)
465
AS2( pxor xmm1, xmm4)
// xmm7 is all-ones, so this complements xmm4.
466
AS2( pxor xmm4, xmm7)
// XOR in the v rows (the v area starts at di + 80*4, rows are 80 bytes apart).
// Register order xmm2, xmm3, xmm1, xmm4 against v rows 0..3 mirrors the
// portable output u2^v0, u3^v1, u1^v2, u4^v3.
468
AS2( pxor xmm2, [WORD_REG(di)+80*4])
469
AS2( pxor xmm3, [WORD_REG(di)+80*5])
470
AS2( pxor xmm1, [WORD_REG(di)+80*6])
471
AS2( pxor xmm4, [WORD_REG(di)+80*7])
472
// exit loop early if less than 16 words left to output
473
// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
474
AS2( cmp WORD_REG(si), 16)
// Interleave the four result registers (a 4x4 transpose of 32-bit words); the
// stores below write the results from xmm2, xmm0, xmm6, xmm3 in that order.
477
AS2( movdqa xmm6, xmm2)
478
AS2( punpckldq xmm2, xmm3)
479
AS2( movdqa xmm5, xmm1)
480
AS2( punpckldq xmm1, xmm4)
481
AS2( movdqa xmm0, xmm2)
482
AS2( punpcklqdq xmm2, xmm1)
483
AS2( punpckhqdq xmm0, xmm1)
484
AS2( punpckhdq xmm6, xmm3)
485
AS2( punpckhdq xmm5, xmm4)
486
AS2( movdqa xmm3, xmm6)
487
AS2( punpcklqdq xmm6, xmm5)
488
AS2( punpckhqdq xmm3, xmm5)
// Check the input pointer for zero (the conditional jumps that consume this
// result are not visible in this excerpt).
490
AS2( test WORD_REG(ax), WORD_REG(ax))
// Input XOR using memory operands directly (pxor with a memory operand needs
// 16-byte-aligned data).
494
AS2( pxor xmm2, [WORD_REG(ax)+0*16])
495
AS2( pxor xmm0, [WORD_REG(ax)+1*16])
496
AS2( pxor xmm6, [WORD_REG(ax)+2*16])
497
AS2( pxor xmm3, [WORD_REG(ax)+3*16])
498
AS2( add WORD_REG(ax), 4*16)
// Same input XOR, but loading through movdqu (no alignment requirement).
501
AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
502
AS2( pxor xmm2, xmm1)
503
AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
504
AS2( pxor xmm0, xmm1)
505
AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
506
AS2( pxor xmm6, xmm1)
507
AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
508
AS2( pxor xmm3, xmm1)
509
AS2( add WORD_REG(ax), 4*16)
// Store 64 bytes with aligned stores ...
513
AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
514
AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
515
AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
516
AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
// ... or with unaligned stores (the branch selecting between the two is not
// visible in this excerpt).
519
AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
520
AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
521
AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
522
AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
// Advance: 4 columns (16 bytes) in the scratch rows, 64 bytes of output,
// 16 fewer words left in this outer pass.
525
AS2( add WORD_REG(di), 4*4)
526
AS2( add WORD_REG(bp), 4*16)
527
AS2( sub WORD_REG(si), 16)
// End of the outer pass: take 80 words off the running total and save the
// advanced input/output pointers for the next pass.
531
AS2( mov WORD_REG(si), SSE2_wordsLeft)
532
AS2( sub WORD_REG(si), 80)
534
AS2( mov SSE2_wordsLeft, WORD_REG(si))
535
AS2( mov SSE2_input, WORD_REG(ax))
536
AS2( mov SSE2_output, WORD_REG(bp))
539
// Tail path: works on the pre-transpose registers xmm2, xmm3, xmm1, xmm4 and
// handles 4 words per pass, using only the low 32 bits of each register.
ASL(4) // final output of less than 16 words
// Input pointer zero check (the jump that uses it is not visible here).
540
AS2( test WORD_REG(ax), WORD_REG(ax))
// XOR four input words into the low lanes.
542
AS2( movd xmm0, [WORD_REG(ax)+0*4])
543
AS2( pxor xmm2, xmm0)
544
AS2( movd xmm0, [WORD_REG(ax)+1*4])
545
AS2( pxor xmm3, xmm0)
546
AS2( movd xmm0, [WORD_REG(ax)+2*4])
547
AS2( pxor xmm1, xmm0)
548
AS2( movd xmm0, [WORD_REG(ax)+3*4])
549
AS2( pxor xmm4, xmm0)
550
AS2( add WORD_REG(ax), 16)
// Store four output words and account for them.
552
AS2( movd [WORD_REG(bp)+0*4], xmm2)
553
AS2( movd [WORD_REG(bp)+1*4], xmm3)
554
AS2( movd [WORD_REG(bp)+2*4], xmm1)
555
AS2( movd [WORD_REG(bp)+3*4], xmm4)
556
AS2( sub WORD_REG(si), 4)
558
AS2( add WORD_REG(bp), 16)
// Epilogue: copy state words 0..9 from the stack copy back to the state
// array, then words 10 and 11 from ecx and edx.
566
AS2( mov WORD_REG(bx), SSE2_state)
567
AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
568
AS2( movdqa [WORD_REG(bx)+0*16], xmm0)
569
AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
570
AS2( movdqa [WORD_REG(bx)+1*16], xmm0)
571
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
572
AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0)
573
AS2( mov [WORD_REG(bx)+10*4], ecx)
574
AS2( mov [WORD_REG(bx)+11*4], edx)
// End of the GCC-style asm text. The operand list that follows binds
// a = state pointer, c = iterationCount, S = s_mulTables, D = output,
// d = input, matching the register loads done by the other variant.
581
".att_syntax prefix;"
583
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
591
// Helper macros for the portable keystream loop.
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
// x86/x64 variant of MUL_A: rotates x left by 8 IN PLACE (x is modified as a
// side effect), then XORs it with the first-half table entry selected by its
// low byte. The argument must therefore be a modifiable lvalue.
592
#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)])
// Alternative MUL_A: shift left by 8 and XOR with the first-half table entry
// selected by the top byte; x itself is left unchanged. (The #else/#endif that
// separate the two variants are not visible in this excerpt.)
594
#define MUL_A(x) (((x) << 8) ^ s_mulTables[(x) >> 24])
// DIV_A: shift right by 8 and XOR with the second-half table entry selected by
// the low byte.
597
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)])
// r1/r2 name the two extra registers; their roles swap on odd step indices, so
// no explicit register exchange is needed between steps.
599
#define r1(i) ((i%2) ? reg2 : reg1)
600
#define r2(i) ((i%2) ? reg1 : reg2)
// STEP: one generator step on the locals s0..s9, reg1, reg2. Visible effects:
//   u      = (s[x9] + r2) ^ r1
//   s[x0]  = MUL_A(s[x0]) ^ DIV_A(s[x3]) ^ s[x9]
//   r1    += XMUX(r2, s[x2], s[x9])
//   r2     = rotl(r2 * 0x54655307, 7)
// The line that assigns v is missing from this excerpt. Only x0, x2, x3 and x9
// are used by the visible lines.
602
#define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u) \
603
u = (s##x9 + r2(x0)) ^ r1(x0);\
605
s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
606
r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
607
r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
609
// SOSEMANUK_OUTPUT(x): emits four little-endian keystream words, u2^v0, u3^v1,
// u1^v2 and u4^v3, through CRYPTOPP_KEYSTREAM_OUTPUT_WORD. The two lines after
// it (the S2 call and the CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH invocation) belong
// to a further macro whose #define line is missing from this excerpt.
#define SOSEMANUK_OUTPUT(x) \
610
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
611
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, u3 ^ v1);\
612
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, u1 ^ v2);\
613
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, u4 ^ v3);
616
S2(0, u0, u1, u2, u3, u4);\
617
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SOSEMANUK_OUTPUT, 4*4);
619
// Portable path: work on local copies of the twelve state words.
word32 s0 = m_state[0];
620
word32 s1 = m_state[1];
621
word32 s2 = m_state[2];
622
word32 s3 = m_state[3];
623
word32 s4 = m_state[4];
624
word32 s5 = m_state[5];
625
word32 s6 = m_state[6];
626
word32 s7 = m_state[7];
627
word32 s8 = m_state[8];
628
word32 s9 = m_state[9];
629
word32 reg1 = m_state[10];
630
word32 reg2 = m_state[11];
// u0..u3 and v0..v3 are filled by STEP; u4 is only referenced by the S2 call
// and the output macro.
631
word32 u0, u1, u2, u3, u4, v0, v1, v2, v3;
// Twenty steps per pass, in five groups of four. The first argument cycles
// 0..9 twice, so each state local is rewritten exactly twice per pass. The
// output stage that follows each group is not visible in this excerpt.
635
STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v0, u0)
636
STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v1, u1)
637
STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v2, u2)
638
STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v3, u3)
640
STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v0, u0)
641
STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v1, u1)
642
STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v2, u2)
643
STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v3, u3)
645
STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v0, u0)
646
STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v1, u1)
647
STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v2, u2)
648
STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v3, u3)
650
STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v0, u0)
651
STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v1, u1)
652
STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v2, u2)
653
STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v3, u3)
655
STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v0, u0)
656
STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v1, u1)
657
STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v2, u2)
658
STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v3, u3)
// Presumably the tail of a do { ... } loop whose opening line is not visible:
// the body would run at least once, and because the count is pre-decremented,
// a size_t iterationCount of 0 on entry would wrap instead of stopping.
661
while (--iterationCount);