14
19
Serpent_KeySchedule(m_key, 24, userKey, keylen);
17
void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
22
void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv, size_t length)
19
26
word32 a, b, c, d, e;
21
28
typedef BlockGetAndPut<word32, LittleEndian> Block;
74
81
m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
77
static word32 s_mulTables[512] = {
85
word32 s_sosemanukMulTables[512] = {
78
86
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
79
87
0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
80
88
0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
271
279
0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
272
280
0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
276
284
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
277
285
unsigned int SosemanukPolicy::GetAlignment() const
314
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
316
void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
306
320
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
308
322
void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
324
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
326
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
327
Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
310
331
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
332
#ifdef CRYPTOPP_GENERATE_X64_MASM
334
Sosemanuk_OperateKeystream PROC FRAME
337
alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
338
save_xmm128 xmm6, 02f0h
339
save_xmm128 xmm7, 0300h
311
344
#ifdef __INTEL_COMPILER
312
345
if (HasSSE2() && !IsP4()) // Intel compiler produces faster code for this algorithm on the P4
326
362
AS2( mov WORD_REG(dx), input)
327
363
AS2( mov WORD_REG(cx), iterationCount)
330
#define SSE2_output WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
331
#define SSE2_input WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
332
#define SSE2_wordsLeft WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
333
#define SSE2_diEnd WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
334
#define SSE2_pMulTables WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
335
#define SSE2_state WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
336
#define SSE2_wordsLeft2 WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
337
#define SSE2_stateCopy WORD_REG(sp) + 8*WORD_SZ
365
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
367
#if defined(__GNUC__) && CRYPTOPP_BOOL_X64
368
#define SSE2_workspace %5
370
#define SSE2_workspace WORD_REG(sp)
373
#define SSE2_output WORD_PTR [SSE2_workspace+1*WORD_SZ]
374
#define SSE2_input WORD_PTR [SSE2_workspace+2*WORD_SZ]
375
#define SSE2_wordsLeft WORD_PTR [SSE2_workspace+3*WORD_SZ]
376
#define SSE2_diEnd WORD_PTR [SSE2_workspace+4*WORD_SZ]
377
#define SSE2_pMulTables WORD_PTR [SSE2_workspace+5*WORD_SZ]
378
#define SSE2_state WORD_PTR [SSE2_workspace+6*WORD_SZ]
379
#define SSE2_wordsLeft2 WORD_PTR [SSE2_workspace+7*WORD_SZ]
380
#define SSE2_stateCopy SSE2_workspace + 8*WORD_SZ
338
381
#define SSE2_uvStart SSE2_stateCopy + 12*4
341
AS2( mov WORD_REG(bx), WORD_REG(sp))
342
AS2( and WORD_REG(sp), -16)
343
AS2( sub WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
344
AS2( mov [WORD_REG(sp)], WORD_REG(bx))
383
#if CRYPTOPP_BOOL_X86
385
AS2( mov AS_REG_6, esp)
387
AS2( sub esp, 80*4*2+12*4+8*WORD_SZ) // 80 v's, 80 u's, 12 state, 8 locals
388
AS2( mov [esp], AS_REG_6)
345
390
AS2( mov SSE2_output, WORD_REG(di))
346
391
AS2( mov SSE2_input, WORD_REG(dx))
347
392
AS2( mov SSE2_state, WORD_REG(ax))
358
403
AS2( movq xmm0, QWORD PTR [WORD_REG(ax)+2*16])
359
404
AS2( movq QWORD PTR [SSE2_stateCopy+2*16], xmm0)
360
405
AS2( psrlq xmm0, 32)
361
AS2( movd ebx, xmm0) // s(9)
406
AS2( movd AS_REG_6d, xmm0) // s(9)
362
407
AS2( mov ecx, [WORD_REG(ax)+10*4])
363
408
AS2( mov edx, [WORD_REG(ax)+11*4])
364
409
AS2( pcmpeqb xmm7, xmm7) // all ones
367
412
#define u(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
368
413
#define v(j) WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
419
// workaround bug in GAS 2.15
420
#define R20r WORD_REG(dx)
421
#define R21r WORD_REG(cx)
375
423
#define SSE2_STEP(i, j) \
376
424
AS2( mov eax, [s(i+0)])\
377
425
AS2( mov [v(i)], eax)\
378
426
AS2( rol eax, 8)\
379
AS2( lea ebp, [ebx + r2##j])\
380
AS2( xor ebp, r1##j)\
381
AS2( mov [u(i)], ebp)\
383
AS2( and ebp, r2##j)\
427
AS2( lea AS_REG_7, [AS_REG_6 + R2##j##r])\
428
AS2( xor AS_REG_7d, R1##j)\
429
AS2( mov [u(i)], AS_REG_7d)\
430
AS2( mov AS_REG_7d, 1)\
431
AS2( and AS_REG_7d, R2##j)\
433
AS2( and AS_REG_7d, AS_REG_6d)\
434
AS2( xor AS_REG_6d, eax)\
387
435
AS2( movzx eax, al)\
388
AS2( xor ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
436
AS2( xor AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
389
437
AS2( mov eax, [s(i+3)])\
390
AS2( xor ebp, [s(i+2)])\
391
AS2( add r1##j, ebp)\
438
AS2( xor AS_REG_7d, [s(i+2)])\
439
AS2( add R1##j, AS_REG_7d)\
440
AS2( movzx AS_REG_7d, al)\
393
441
AS2( shr eax, 8)\
394
AS2( xor ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
396
AS2( imul r2##j, 0x54655307)\
398
AS2( mov [s(i+0)], ebx)\
442
AS2( xor AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
443
AS2( xor AS_REG_6d, eax)\
444
AS2( imul R2##j, AS_HEX(54655307))\
446
AS2( mov [s(i+0)], AS_REG_6d)\
400
448
ASL(2) // outer loop, each iteration of this processes 80 words
401
449
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
406
454
AS2( lea WORD_REG(si), [WORD_REG(di)+WORD_REG(si)]) // use to end first inner loop
407
455
AS2( mov SSE2_diEnd, WORD_REG(si))
409
AS2( lea WORD_REG(si), s_mulTables)
457
AS2( lea WORD_REG(si), s_sosemanukMulTables)
411
459
AS2( mov WORD_REG(si), SSE2_pMulTables)
440
488
AS2( mov WORD_REG(ax), SSE2_input)
441
AS2( mov WORD_REG(bp), SSE2_output)
489
AS2( mov AS_REG_7, SSE2_output)
442
490
AS2( lea WORD_REG(di), [SSE2_uvStart]) // start of v and u
443
491
AS2( mov WORD_REG(si), SSE2_wordsLeft2)
487
535
AS2( punpcklqdq xmm6, xmm5)
488
536
AS2( punpckhqdq xmm3, xmm5)
489
537
// output keystream
490
AS2( test WORD_REG(ax), WORD_REG(ax))
494
AS2( pxor xmm2, [WORD_REG(ax)+0*16])
495
AS2( pxor xmm0, [WORD_REG(ax)+1*16])
496
AS2( pxor xmm6, [WORD_REG(ax)+2*16])
497
AS2( pxor xmm3, [WORD_REG(ax)+3*16])
498
AS2( add WORD_REG(ax), 4*16)
501
AS2( movdqu xmm1, [WORD_REG(ax)+0*16])
502
AS2( pxor xmm2, xmm1)
503
AS2( movdqu xmm1, [WORD_REG(ax)+1*16])
504
AS2( pxor xmm0, xmm1)
505
AS2( movdqu xmm1, [WORD_REG(ax)+2*16])
506
AS2( pxor xmm6, xmm1)
507
AS2( movdqu xmm1, [WORD_REG(ax)+3*16])
508
AS2( pxor xmm3, xmm1)
509
AS2( add WORD_REG(ax), 4*16)
513
AS2( movdqa [WORD_REG(bp)+0*16], xmm2)
514
AS2( movdqa [WORD_REG(bp)+1*16], xmm0)
515
AS2( movdqa [WORD_REG(bp)+2*16], xmm6)
516
AS2( movdqa [WORD_REG(bp)+3*16], xmm3)
519
AS2( movdqu [WORD_REG(bp)+0*16], xmm2)
520
AS2( movdqu [WORD_REG(bp)+1*16], xmm0)
521
AS2( movdqu [WORD_REG(bp)+2*16], xmm6)
522
AS2( movdqu [WORD_REG(bp)+3*16], xmm3)
538
AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
525
541
AS2( add WORD_REG(di), 4*4)
526
AS2( add WORD_REG(bp), 4*16)
527
542
AS2( sub WORD_REG(si), 16)
534
549
AS2( mov SSE2_wordsLeft, WORD_REG(si))
535
550
AS2( mov SSE2_input, WORD_REG(ax))
536
AS2( mov SSE2_output, WORD_REG(bp))
551
AS2( mov SSE2_output, AS_REG_7)
539
554
ASL(4) // final output of less than 16 words
540
555
AS2( test WORD_REG(ax), WORD_REG(ax))
542
AS2( movd xmm0, [WORD_REG(ax)+0*4])
557
AS2( movd xmm0, dword ptr [WORD_REG(ax)+0*4])
543
558
AS2( pxor xmm2, xmm0)
544
AS2( movd xmm0, [WORD_REG(ax)+1*4])
559
AS2( movd xmm0, dword ptr [WORD_REG(ax)+1*4])
545
560
AS2( pxor xmm3, xmm0)
546
AS2( movd xmm0, [WORD_REG(ax)+2*4])
561
AS2( movd xmm0, dword ptr [WORD_REG(ax)+2*4])
547
562
AS2( pxor xmm1, xmm0)
548
AS2( movd xmm0, [WORD_REG(ax)+3*4])
563
AS2( movd xmm0, dword ptr [WORD_REG(ax)+3*4])
549
564
AS2( pxor xmm4, xmm0)
550
565
AS2( add WORD_REG(ax), 16)
552
AS2( movd [WORD_REG(bp)+0*4], xmm2)
553
AS2( movd [WORD_REG(bp)+1*4], xmm3)
554
AS2( movd [WORD_REG(bp)+2*4], xmm1)
555
AS2( movd [WORD_REG(bp)+3*4], xmm4)
567
AS2( movd dword ptr [AS_REG_7+0*4], xmm2)
568
AS2( movd dword ptr [AS_REG_7+1*4], xmm3)
569
AS2( movd dword ptr [AS_REG_7+2*4], xmm1)
570
AS2( movd dword ptr [AS_REG_7+3*4], xmm4)
556
571
AS2( sub WORD_REG(si), 4)
558
AS2( add WORD_REG(bp), 16)
573
AS2( add AS_REG_7, 16)
559
574
AS2( psrldq xmm2, 4)
560
575
AS2( psrldq xmm3, 4)
561
576
AS2( psrldq xmm1, 4)
565
580
ASL(6) // save state
566
AS2( mov WORD_REG(bx), SSE2_state)
581
AS2( mov AS_REG_6, SSE2_state)
567
582
AS2( movdqa xmm0, [SSE2_stateCopy+0*16])
568
AS2( movdqa [WORD_REG(bx)+0*16], xmm0)
583
AS2( movdqa [AS_REG_6+0*16], xmm0)
569
584
AS2( movdqa xmm0, [SSE2_stateCopy+1*16])
570
AS2( movdqa [WORD_REG(bx)+1*16], xmm0)
585
AS2( movdqa [AS_REG_6+1*16], xmm0)
571
586
AS2( movq xmm0, QWORD PTR [SSE2_stateCopy+2*16])
572
AS2( movq QWORD PTR [WORD_REG(bx)+2*16], xmm0)
573
AS2( mov [WORD_REG(bx)+10*4], ecx)
574
AS2( mov [WORD_REG(bx)+11*4], edx)
587
AS2( movq QWORD PTR [AS_REG_6+2*16], xmm0)
588
AS2( mov [AS_REG_6+10*4], ecx)
589
AS2( mov [AS_REG_6+11*4], edx)
581
596
".att_syntax prefix;"
583
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
598
: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
599
#if CRYPTOPP_BOOL_X64
600
, "r" (workspace.m_ptr)
601
: "memory", "cc", "%r9", "%r10", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
607
#ifdef CRYPTOPP_GENERATE_X64_MASM
608
movdqa xmm6, [rsp + 02f0h]
609
movdqa xmm7, [rsp + 0300h]
610
add rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
614
Sosemanuk_OperateKeystream ENDP
620
#ifndef CRYPTOPP_GENERATE_X64_MASM
591
622
#if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
592
#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)])
623
#define MUL_A(x) (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)])
594
#define MUL_A(x) (((x) << 8) ^ s_mulTables[(x) >> 24])
625
#define MUL_A(x) (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
597
#define DIV_A(x) (((x) >> 8) ^ s_mulTables[256 + byte(x)])
628
#define DIV_A(x) (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
599
630
#define r1(i) ((i%2) ? reg2 : reg1)
600
631
#define r2(i) ((i%2) ? reg1 : reg2)