2
EXTERNDEF s_sosemanukMulTables:FAR
6
;-----------------------------------------------------------------------
; Salsa20_OperateKeystream (MASM x64, PROC FRAME)
;
; NOTE(review): this listing is partial. The bare integer lines look like
; line-number residue from whatever produced this text, and the jumps in
; that numbering suggest instructions are missing from this view (no
; .endprolog, no flag-setting test/cmp before the jz/jnz branches, no ret).
; The comments below describe only what the visible lines do.
;
; Registers as used by the visible code:
;   r10 - base of a 64-byte block read as four xmmwords; dwords 8 and 5
;         of that block are updated with add/adc near the end
;   rdx - base of the data that is XORed into the result
;   rcx - base of the destination the result is stored to
;
; Stack frame (10*16 + 32*16 + 8 bytes):
;   [rsp + n*16]        n = 0..15  working xmmword slots
;   [rsp + n*16 + 256]  n = 0..15  xmmword slots filled from [r10]
;   [rsp + 0200h .. 0290h]         save slots for xmm6-xmm15
;-----------------------------------------------------------------------
Salsa20_OperateKeystream PROC FRAME
8
; Reserve the frame, then save xmm6-xmm15 at 0200h..0290h. The matching
; reloads from the same offsets are at the end of the procedure.
alloc_stack(10*16 + 32*16 + 8)
9
save_xmm128 xmm6, 0200h
10
save_xmm128 xmm7, 0210h
11
save_xmm128 xmm8, 0220h
12
save_xmm128 xmm9, 0230h
13
save_xmm128 xmm10, 0240h
14
save_xmm128 xmm11, 0250h
15
save_xmm128 xmm12, 0260h
16
save_xmm128 xmm13, 0270h
17
save_xmm128 xmm14, 0280h
18
save_xmm128 xmm15, 0290h
22
; Load the 64 bytes at [r10] into xmm0-xmm3.
movdqa xmm0, [r10 + 0*16]
23
movdqa xmm1, [r10 + 1*16]
24
movdqa xmm2, [r10 + 2*16]
25
movdqa xmm3, [r10 + 3*16]
26
; An immediate of k*64+k*16+k*4+k makes pshufd copy source dword k into all
; four lanes. Each broadcast is stored to slot (row*4+k) of the area at
; rsp+256. Slots 5 (1*4+1) and 8 (2*4+0) are not written by this sequence;
; they are filled by the dword stores that follow it.
pshufd xmm4, xmm0, 0*64+0*16+0*4+0
27
movdqa [rsp + (0*4+0)*16 + 256], xmm4
28
pshufd xmm4, xmm0, 1*64+1*16+1*4+1
29
movdqa [rsp + (0*4+1)*16 + 256], xmm4
30
pshufd xmm4, xmm0, 2*64+2*16+2*4+2
31
movdqa [rsp + (0*4+2)*16 + 256], xmm4
32
pshufd xmm4, xmm0, 3*64+3*16+3*4+3
33
movdqa [rsp + (0*4+3)*16 + 256], xmm4
34
pshufd xmm4, xmm1, 0*64+0*16+0*4+0
35
movdqa [rsp + (1*4+0)*16 + 256], xmm4
36
pshufd xmm4, xmm1, 2*64+2*16+2*4+2
37
movdqa [rsp + (1*4+2)*16 + 256], xmm4
38
pshufd xmm4, xmm1, 3*64+3*16+3*4+3
39
movdqa [rsp + (1*4+3)*16 + 256], xmm4
40
pshufd xmm4, xmm2, 1*64+1*16+1*4+1
41
movdqa [rsp + (2*4+1)*16 + 256], xmm4
42
pshufd xmm4, xmm2, 2*64+2*16+2*4+2
43
movdqa [rsp + (2*4+2)*16 + 256], xmm4
44
pshufd xmm4, xmm2, 3*64+3*16+3*4+3
45
movdqa [rsp + (2*4+3)*16 + 256], xmm4
46
pshufd xmm4, xmm3, 0*64+0*16+0*4+0
47
movdqa [rsp + (3*4+0)*16 + 256], xmm4
48
pshufd xmm4, xmm3, 1*64+1*16+1*4+1
49
movdqa [rsp + (3*4+1)*16 + 256], xmm4
50
pshufd xmm4, xmm3, 2*64+2*16+2*4+2
51
movdqa [rsp + (3*4+2)*16 + 256], xmm4
52
pshufd xmm4, xmm3, 3*64+3*16+3*4+3
53
movdqa [rsp + (3*4+3)*16 + 256], xmm4
55
; Slots 8 and 5 are filled one dword lane at a time from dword 8 (eax) and
; dword 5 (r11d) of the block at r10.
; NOTE(review): in this listing every lane receives the same value; the
; numbering gaps between the store pairs suggest a per-lane update of
; eax/r11d that is not visible here - confirm against the full source.
mov eax, dword ptr [r10 + 8*4]
56
mov r11d, dword ptr [r10 + 5*4]
57
mov dword ptr [rsp + 8*16 + 0*4 + 256], eax
58
mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d
61
mov dword ptr [rsp + 8*16 + 1*4 + 256], eax
62
mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d
65
mov dword ptr [rsp + 8*16 + 2*4 + 256], eax
66
mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d
69
mov dword ptr [rsp + 8*16 + 3*4 + 256], eax
70
mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d
73
; Write eax / r11d back to dwords 8 and 5 of the block at r10.
mov dword ptr [r10 + 8*4], eax
74
mov dword ptr [r10 + 5*4], r11d
75
; Load working registers from the rsp+256 area (offset 1*256), XOR them with
; further slots of that area and store the results to the low working slots.
; NOTE(review): only the load / pxor / store lines of this sequence are
; visible; any arithmetic between them is in the missing lines.
movdqa xmm0, [rsp + 12*16 + 1*256]
76
movdqa xmm4, [rsp + 13*16 + 1*256]
77
movdqa xmm8, [rsp + 14*16 + 1*256]
78
movdqa xmm12, [rsp + 15*16 + 1*256]
79
movdqa xmm2, [rsp + 0*16 + 1*256]
80
movdqa xmm6, [rsp + 1*16 + 1*256]
81
movdqa xmm10, [rsp + 2*16 + 1*256]
82
movdqa xmm14, [rsp + 3*16 + 1*256]
99
pxor xmm0, [rsp + 4*16 + 1*256]
100
pxor xmm4, [rsp + 5*16 + 1*256]
101
pxor xmm8, [rsp + 6*16 + 1*256]
102
pxor xmm12, [rsp + 7*16 + 1*256]
107
movdqa [rsp + 4*16], xmm0
108
movdqa [rsp + 5*16], xmm4
109
movdqa [rsp + 6*16], xmm8
110
movdqa [rsp + 7*16], xmm12
131
pxor xmm0, [rsp + 8*16 + 1*256]
132
pxor xmm4, [rsp + 9*16 + 1*256]
133
pxor xmm8, [rsp + 10*16 + 1*256]
134
pxor xmm12, [rsp + 11*16 + 1*256]
139
movdqa [rsp + 8*16], xmm0
140
movdqa [rsp + 9*16], xmm4
141
movdqa [rsp + 10*16], xmm8
142
movdqa [rsp + 11*16], xmm12
163
pxor xmm0, [rsp + 12*16 + 1*256]
164
pxor xmm4, [rsp + 13*16 + 1*256]
165
pxor xmm8, [rsp + 14*16 + 1*256]
166
pxor xmm12, [rsp + 15*16 + 1*256]
171
movdqa [rsp + 12*16], xmm0
172
movdqa [rsp + 13*16], xmm4
173
movdqa [rsp + 14*16], xmm8
174
movdqa [rsp + 15*16], xmm12
199
movdqa [rsp + 0*16], xmm0
200
movdqa [rsp + 1*16], xmm4
201
movdqa [rsp + 2*16], xmm8
202
movdqa [rsp + 3*16], xmm12
205
; Output stage (target of the `call labelSSE2_Salsa_Output` lines further
; down). Interleaves qwords of register pairs, then either XORs the result
; with memory operands at [rdx] (pxor with a memory operand requires 16-byte
; alignment) or takes the A7 path that uses movdqu loads, and finally stores
; to [rcx] with movdqa or, on the A8 path, movdqu. The four stores are 64
; bytes apart (offsets 0, 4, 8 and 12 times 16).
; NOTE(review): the instructions that set the flags for the jz/jnz branches,
; the pxor lines of the A7 path and the return from this stage are not
; visible in this listing.
labelSSE2_Salsa_Output:
211
punpcklqdq xmm4, xmm6
212
punpckhqdq xmm2, xmm6
216
punpcklqdq xmm0, xmm1
217
punpckhqdq xmm6, xmm1
219
jz labelSSE2_Salsa_Output_A3
221
jnz labelSSE2_Salsa_Output_A7
222
pxor xmm4, [rdx+0*16]
223
pxor xmm2, [rdx+4*16]
224
pxor xmm0, [rdx+8*16]
225
pxor xmm6, [rdx+12*16]
227
jmp labelSSE2_Salsa_Output_A3
228
labelSSE2_Salsa_Output_A7:
229
movdqu xmm1, [rdx+0*16]
231
movdqu xmm1, [rdx+4*16]
233
movdqu xmm1, [rdx+8*16]
235
movdqu xmm1, [rdx+12*16]
238
labelSSE2_Salsa_Output_A3:
240
jnz labelSSE2_Salsa_Output_A8
241
movdqa [rcx+0*16], xmm4
242
movdqa [rcx+4*16], xmm2
243
movdqa [rcx+8*16], xmm0
244
movdqa [rcx+12*16], xmm6
245
jmp labelSSE2_Salsa_Output_A9
246
labelSSE2_Salsa_Output_A8:
247
movdqu [rcx+0*16], xmm4
248
movdqu [rcx+4*16], xmm2
249
movdqu [rcx+8*16], xmm0
250
movdqu [rcx+12*16], xmm6
251
labelSSE2_Salsa_Output_A9:
255
; Same load / pxor / store pattern as above, now reading from the low
; working slots (offset 0*256) instead of the rsp+256 area.
movdqa xmm0, [rsp + 12*16 + 0*256]
256
movdqa xmm4, [rsp + 13*16 + 0*256]
257
movdqa xmm8, [rsp + 14*16 + 0*256]
258
movdqa xmm12, [rsp + 15*16 + 0*256]
259
movdqa xmm2, [rsp + 0*16 + 0*256]
260
movdqa xmm6, [rsp + 1*16 + 0*256]
261
movdqa xmm10, [rsp + 2*16 + 0*256]
262
movdqa xmm14, [rsp + 3*16 + 0*256]
279
pxor xmm0, [rsp + 4*16 + 0*256]
280
pxor xmm4, [rsp + 5*16 + 0*256]
281
pxor xmm8, [rsp + 6*16 + 0*256]
282
pxor xmm12, [rsp + 7*16 + 0*256]
287
movdqa [rsp + 4*16], xmm0
288
movdqa [rsp + 5*16], xmm4
289
movdqa [rsp + 6*16], xmm8
290
movdqa [rsp + 7*16], xmm12
311
pxor xmm0, [rsp + 8*16 + 0*256]
312
pxor xmm4, [rsp + 9*16 + 0*256]
313
pxor xmm8, [rsp + 10*16 + 0*256]
314
pxor xmm12, [rsp + 11*16 + 0*256]
319
movdqa [rsp + 8*16], xmm0
320
movdqa [rsp + 9*16], xmm4
321
movdqa [rsp + 10*16], xmm8
322
movdqa [rsp + 11*16], xmm12
343
pxor xmm0, [rsp + 12*16 + 0*256]
344
pxor xmm4, [rsp + 13*16 + 0*256]
345
pxor xmm8, [rsp + 14*16 + 0*256]
346
pxor xmm12, [rsp + 15*16 + 0*256]
351
movdqa [rsp + 12*16], xmm0
352
movdqa [rsp + 13*16], xmm4
353
movdqa [rsp + 14*16], xmm8
354
movdqa [rsp + 15*16], xmm12
379
movdqa [rsp + 0*16], xmm0
380
movdqa [rsp + 1*16], xmm4
381
movdqa [rsp + 2*16], xmm8
382
movdqa [rsp + 3*16], xmm12
384
; Same pattern again on the low working slots, with the slot indices
; permuted: loads from 7,4,5,6, then pxor/store on 13,14,15,12, then
; 10,11,8,9, then 7,4,5,6, and a final store to 0..3.
movdqa xmm0, [rsp + 7*16 + 0*256]
385
movdqa xmm4, [rsp + 4*16 + 0*256]
386
movdqa xmm8, [rsp + 5*16 + 0*256]
387
movdqa xmm12, [rsp + 6*16 + 0*256]
388
movdqa xmm2, [rsp + 0*16 + 0*256]
389
movdqa xmm6, [rsp + 1*16 + 0*256]
390
movdqa xmm10, [rsp + 2*16 + 0*256]
391
movdqa xmm14, [rsp + 3*16 + 0*256]
408
pxor xmm0, [rsp + 13*16 + 0*256]
409
pxor xmm4, [rsp + 14*16 + 0*256]
410
pxor xmm8, [rsp + 15*16 + 0*256]
411
pxor xmm12, [rsp + 12*16 + 0*256]
416
movdqa [rsp + 13*16], xmm0
417
movdqa [rsp + 14*16], xmm4
418
movdqa [rsp + 15*16], xmm8
419
movdqa [rsp + 12*16], xmm12
440
pxor xmm0, [rsp + 10*16 + 0*256]
441
pxor xmm4, [rsp + 11*16 + 0*256]
442
pxor xmm8, [rsp + 8*16 + 0*256]
443
pxor xmm12, [rsp + 9*16 + 0*256]
448
movdqa [rsp + 10*16], xmm0
449
movdqa [rsp + 11*16], xmm4
450
movdqa [rsp + 8*16], xmm8
451
movdqa [rsp + 9*16], xmm12
472
pxor xmm0, [rsp + 7*16 + 0*256]
473
pxor xmm4, [rsp + 4*16 + 0*256]
474
pxor xmm8, [rsp + 5*16 + 0*256]
475
pxor xmm12, [rsp + 6*16 + 0*256]
480
movdqa [rsp + 7*16], xmm0
481
movdqa [rsp + 4*16], xmm4
482
movdqa [rsp + 5*16], xmm8
483
movdqa [rsp + 6*16], xmm12
508
movdqa [rsp + 0*16], xmm0
509
movdqa [rsp + 1*16], xmm4
510
movdqa [rsp + 2*16], xmm8
511
movdqa [rsp + 3*16], xmm12
514
; For each group of four slots, load the copy kept at rsp+256, add the
; matching working slot (paddd) and call the output stage. The slot groups
; are 0,13,10,7 / 4,1,14,11 / 8,5,2,15 / 12,9,6,3.
movdqa xmm4, [rsp + 0*16 + 256]
515
paddd xmm4, [rsp + 0*16]
516
movdqa xmm5, [rsp + 13*16 + 256]
517
paddd xmm5, [rsp + 13*16]
518
movdqa xmm6, [rsp + 10*16 + 256]
519
paddd xmm6, [rsp + 10*16]
520
movdqa xmm7, [rsp + 7*16 + 256]
521
paddd xmm7, [rsp + 7*16]
522
call labelSSE2_Salsa_Output
523
movdqa xmm4, [rsp + 4*16 + 256]
524
paddd xmm4, [rsp + 4*16]
525
movdqa xmm5, [rsp + 1*16 + 256]
526
paddd xmm5, [rsp + 1*16]
527
movdqa xmm6, [rsp + 14*16 + 256]
528
paddd xmm6, [rsp + 14*16]
529
movdqa xmm7, [rsp + 11*16 + 256]
530
paddd xmm7, [rsp + 11*16]
531
call labelSSE2_Salsa_Output
532
movdqa xmm4, [rsp + 8*16 + 256]
533
paddd xmm4, [rsp + 8*16]
534
movdqa xmm5, [rsp + 5*16 + 256]
535
paddd xmm5, [rsp + 5*16]
536
movdqa xmm6, [rsp + 2*16 + 256]
537
paddd xmm6, [rsp + 2*16]
538
movdqa xmm7, [rsp + 15*16 + 256]
539
paddd xmm7, [rsp + 15*16]
540
call labelSSE2_Salsa_Output
541
movdqa xmm4, [rsp + 12*16 + 256]
542
paddd xmm4, [rsp + 12*16]
543
movdqa xmm5, [rsp + 9*16 + 256]
544
paddd xmm5, [rsp + 9*16]
545
movdqa xmm6, [rsp + 6*16 + 256]
546
paddd xmm6, [rsp + 6*16]
547
movdqa xmm7, [rsp + 3*16 + 256]
548
paddd xmm7, [rsp + 3*16]
549
call labelSSE2_Salsa_Output
561
; Second path: the 64-byte block is loaded straight from [r10] into
; xmm0-xmm3 instead of going through the broadcast slots.
movdqa xmm0, [r10 + 0*16]
562
movdqa xmm1, [r10 + 1*16]
563
movdqa xmm2, [r10 + 2*16]
564
movdqa xmm3, [r10 + 3*16]
595
; Rotate the dword lanes of xmm1, xmm2 and xmm3 by one, two and three
; positions respectively; the second triple of shuffles applies the inverse
; rotations. Whatever runs between the two triples is not visible here.
pshufd xmm1, xmm1, 2*64+1*16+0*4+3
596
pshufd xmm2, xmm2, 1*64+0*16+3*4+2
597
pshufd xmm3, xmm3, 0*64+3*16+2*4+1
626
pshufd xmm1, xmm1, 0*64+3*16+2*4+1
627
pshufd xmm2, xmm2, 1*64+0*16+3*4+2
628
pshufd xmm3, xmm3, 2*64+1*16+0*4+3
631
; Add the block at [r10] to xmm0-xmm3.
paddd xmm0, [r10 + 0*16]
632
paddd xmm1, [r10 + 1*16]
633
paddd xmm2, [r10 + 2*16]
634
paddd xmm3, [r10 + 3*16]
635
; Increment the two-dword counter kept in the block: dword 8 is the low
; half and the carry propagates into dword 5.
add dword ptr [r10 + 8*4], 1
636
adc dword ptr [r10 + 5*4], 0
639
pshufd xmm7, xmm6, 0*64+1*16+2*4+3
662
; Output for the second path: same XOR-with-[rdx] / store-to-[rcx] structure
; as the A labels above, but with consecutive 16-byte offsets (0..3).
; NOTE(review): the flag-setting instructions for these branches and the
; pxor lines of the B7 path are not visible in this listing.
jz labelSSE2_Salsa_Output_B3
664
jnz labelSSE2_Salsa_Output_B7
665
pxor xmm4, [rdx+0*16]
666
pxor xmm0, [rdx+1*16]
667
pxor xmm1, [rdx+2*16]
668
pxor xmm2, [rdx+3*16]
670
jmp labelSSE2_Salsa_Output_B3
671
labelSSE2_Salsa_Output_B7:
672
movdqu xmm3, [rdx+0*16]
674
movdqu xmm3, [rdx+1*16]
676
movdqu xmm3, [rdx+2*16]
678
movdqu xmm3, [rdx+3*16]
681
labelSSE2_Salsa_Output_B3:
683
jnz labelSSE2_Salsa_Output_B8
684
movdqa [rcx+0*16], xmm4
685
movdqa [rcx+1*16], xmm0
686
movdqa [rcx+2*16], xmm1
687
movdqa [rcx+3*16], xmm2
688
jmp labelSSE2_Salsa_Output_B9
689
labelSSE2_Salsa_Output_B8:
690
movdqu [rcx+0*16], xmm4
691
movdqu [rcx+1*16], xmm0
692
movdqu [rcx+2*16], xmm1
693
movdqu [rcx+3*16], xmm2
694
labelSSE2_Salsa_Output_B9:
698
; Epilogue: reload xmm6-xmm15 from their save slots and release the frame.
; NOTE(review): no ret is visible between here and ENDP in this listing.
movdqa xmm6, [rsp + 0200h]
699
movdqa xmm7, [rsp + 0210h]
700
movdqa xmm8, [rsp + 0220h]
701
movdqa xmm9, [rsp + 0230h]
702
movdqa xmm10, [rsp + 0240h]
703
movdqa xmm11, [rsp + 0250h]
704
movdqa xmm12, [rsp + 0260h]
705
movdqa xmm13, [rsp + 0270h]
706
movdqa xmm14, [rsp + 0280h]
707
movdqa xmm15, [rsp + 0290h]
708
add rsp, 10*16 + 32*16 + 8
710
Salsa20_OperateKeystream ENDP
713
;-----------------------------------------------------------------------
; Sosemanuk_OperateKeystream (MASM x64, PROC FRAME)
;
; NOTE(review): this listing is partial. The bare integer lines look like
; line-number residue, and the gaps in that numbering suggest instructions
; are missing from this view (no .endprolog, no loads of r10d/edx/ecx, no
; flag-setting test before the jz/jnz branches, no loop branch, no ret).
; The comments below describe only what the visible lines do.
;
; Stack frame (80*4*2 + 12*4 + 8*8 + 2*16 + 8 bytes), as used below:
;   [rsp + 1*8 .. 7*8]      qword spill slots
;   [rsp + 8*8]             40-byte copy of the data at [rax], addressed as
;                           a ring of 10 dwords
;   [rsp + 8*8 + 12*4]      work buffer (rdi): 80 dwords, followed by a
;                           second 80-dword half at +80*4
;   [rsp + 02f0h], [0300h]  save slots for xmm6 / xmm7
;-----------------------------------------------------------------------
Sosemanuk_OperateKeystream PROC FRAME
716
alloc_stack(80*4*2+12*4+8*8 + 2*16+8)
717
save_xmm128 xmm6, 02f0h
718
save_xmm128 xmm7, 0300h
722
; Spill rdi, rdx, rax and rsi to the qword slots. Slot 6 (rax) is read back
; at the end as the address the 40-byte copy is written back to.
mov QWORD PTR [rsp+1*8], rdi
723
mov QWORD PTR [rsp+2*8], rdx
724
mov QWORD PTR [rsp+6*8], rax
727
mov QWORD PTR [rsp+3*8], rsi
728
; Copy 16 + 16 + 8 bytes from [rax] to the local copy at rsp + 8*8.
movdqa xmm0, [rax+0*16]
729
movdqa [rsp + 8*8+0*16], xmm0
730
movdqa xmm0, [rax+1*16]
731
movdqa [rsp + 8*8+1*16], xmm0
732
movq xmm0, QWORD PTR [rax+2*16]
733
movq QWORD PTR [rsp + 8*8+2*16], xmm0
740
; rdi = base of the work buffer.
lea rdi, [rsp + 8*8 + 12*4]
744
mov QWORD PTR [rsp+7*8], rsi
746
mov QWORD PTR [rsp+4*8], rsi
747
; rsi = base of the external lookup tables; the steps below index it as
; [rsi + rax*4] and [rsi + 1024 + r11*4].
lea rsi, s_sosemanukMulTables
749
; Twenty unrolled steps, i = 0..19. The visible lines of step i:
;   - the index expression (x)-((x)/(10))*(10) is x mod 10, so the local
;     copy at rsp + 8*8 is addressed as a ring of 10 dwords
;   - eax = ring[i mod 10] is stored to the second buffer half
;     (rdi + 80*4) at dword index (i mod 4)*20 + i/4
;   - r11d = r10d + edx (even i) or r10d + ecx (odd i) is stored to the
;     same index of the first half
;   - r10d is XORed with two table entries and written to ring[i mod 10]
; NOTE(review): the instructions that load r10d, edx and ecx and that
; derive the table indices in rax / r11 are not visible in this listing.
mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4]
750
mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax
752
lea r11d, [r10d + edx]
754
mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d
761
xor r10d, [rsi+rax*4]
762
mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4]
763
xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4]
767
xor r10d, [rsi+1024+r11*4]
771
mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d
772
mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4]
773
mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax
775
lea r11d, [r10d + ecx]
777
mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d
784
xor r10d, [rsi+rax*4]
785
mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4]
786
xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4]
790
xor r10d, [rsi+1024+r11*4]
794
mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d
795
mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4]
796
mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax
798
lea r11d, [r10d + edx]
800
mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d
807
xor r10d, [rsi+rax*4]
808
mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4]
809
xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4]
813
xor r10d, [rsi+1024+r11*4]
817
mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d
818
mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4]
819
mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax
821
lea r11d, [r10d + ecx]
823
mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d
830
xor r10d, [rsi+rax*4]
831
mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4]
832
xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4]
836
xor r10d, [rsi+1024+r11*4]
840
mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d
841
mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4]
842
mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax
844
lea r11d, [r10d + edx]
846
mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d
853
xor r10d, [rsi+rax*4]
854
mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4]
855
xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4]
859
xor r10d, [rsi+1024+r11*4]
863
mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d
864
mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4]
865
mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax
867
lea r11d, [r10d + ecx]
869
mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d
876
xor r10d, [rsi+rax*4]
877
mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4]
878
xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4]
882
xor r10d, [rsi+1024+r11*4]
886
mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d
887
mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4]
888
mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax
890
lea r11d, [r10d + edx]
892
mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d
899
xor r10d, [rsi+rax*4]
900
mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4]
901
xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4]
905
xor r10d, [rsi+1024+r11*4]
909
mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d
910
mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4]
911
mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax
913
lea r11d, [r10d + ecx]
915
mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d
922
xor r10d, [rsi+rax*4]
923
mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4]
924
xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4]
928
xor r10d, [rsi+1024+r11*4]
932
mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d
933
mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4]
934
mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax
936
lea r11d, [r10d + edx]
938
mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d
945
xor r10d, [rsi+rax*4]
946
mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4]
947
xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4]
951
xor r10d, [rsi+1024+r11*4]
955
mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d
956
mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4]
957
mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax
959
lea r11d, [r10d + ecx]
961
mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d
968
xor r10d, [rsi+rax*4]
969
mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4]
970
xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4]
974
xor r10d, [rsi+1024+r11*4]
978
mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d
979
mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4]
980
mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax
982
lea r11d, [r10d + edx]
984
mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d
991
xor r10d, [rsi+rax*4]
992
mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4]
993
xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4]
997
xor r10d, [rsi+1024+r11*4]
1001
mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d
1002
mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4]
1003
mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax
1005
lea r11d, [r10d + ecx]
1007
mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4], r11d
1014
xor r10d, [rsi+rax*4]
1015
mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4]
1016
xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4]
1020
xor r10d, [rsi+1024+r11*4]
1024
mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d
1025
mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4]
1026
mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax
1028
lea r11d, [r10d + edx]
1030
mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d
1037
xor r10d, [rsi+rax*4]
1038
mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4]
1039
xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4]
1043
xor r10d, [rsi+1024+r11*4]
1047
mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d
1048
mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4]
1049
mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax
1051
lea r11d, [r10d + ecx]
1053
mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d
1060
xor r10d, [rsi+rax*4]
1061
mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4]
1062
xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4]
1066
xor r10d, [rsi+1024+r11*4]
1070
mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d
1071
mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4]
1072
mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax
1074
lea r11d, [r10d + edx]
1076
mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d
1083
xor r10d, [rsi+rax*4]
1084
mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4]
1085
xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4]
1089
xor r10d, [rsi+1024+r11*4]
1093
mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d
1094
mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4]
1095
mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax
1097
lea r11d, [r10d + ecx]
1099
mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d
1106
xor r10d, [rsi+rax*4]
1107
mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4]
1108
xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4]
1112
xor r10d, [rsi+1024+r11*4]
1116
mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d
1117
mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4]
1118
mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax
1120
lea r11d, [r10d + edx]
1122
mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d
1129
xor r10d, [rsi+rax*4]
1130
mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4]
1131
xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4]
1135
xor r10d, [rsi+1024+r11*4]
1139
mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d
1140
mov eax, [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4]
1141
mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax
1143
lea r11d, [r10d + ecx]
1145
mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d
1152
xor r10d, [rsi+rax*4]
1153
mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4]
1154
xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4]
1158
xor r10d, [rsi+1024+r11*4]
1162
mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d
1163
mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4]
1164
mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax
1166
lea r11d, [r10d + edx]
1168
mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d
1175
xor r10d, [rsi+rax*4]
1176
mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4]
1177
xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4]
1181
xor r10d, [rsi+1024+r11*4]
1185
mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d
1186
mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4]
1187
mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax
1189
lea r11d, [r10d + ecx]
1191
mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d
1198
xor r10d, [rsi+rax*4]
1199
mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4]
1200
xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4]
1204
xor r10d, [rsi+1024+r11*4]
1208
mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d
1210
; Compare rdi with the value kept in spill slot 4. The branch that uses
; this comparison is not visible in this listing.
cmp rdi, QWORD PTR [rsp+4*8]
1212
; Reload the spilled values: rax <- slot 2, r11 <- slot 1, rsi <- slot 7,
; and reset rdi to the base of the work buffer.
mov rax, QWORD PTR [rsp+2*8]
1213
mov r11, QWORD PTR [rsp+1*8]
1214
lea rdi, [rsp + 8*8 + 12*4]
1215
mov rsi, QWORD PTR [rsp+7*8]
1217
; Load four xmmwords from the first buffer half, 20 dwords apart.
movdqa xmm0, [rdi+0*20*4]
1218
movdqa xmm2, [rdi+2*20*4]
1219
movdqa xmm3, [rdi+3*20*4]
1220
movdqa xmm1, [rdi+1*20*4]
1237
; XOR with the second buffer half. The byte offsets 80*4, 80*5, 80*6 and
; 80*7 equal 80*4 + k*20*4 for k = 0..3.
pxor xmm2, [rdi+80*4]
1238
pxor xmm3, [rdi+80*5]
1239
pxor xmm1, [rdi+80*6]
1240
pxor xmm4, [rdi+80*7]
1244
; Interleave dwords, then qwords, of the four registers.
punpckldq xmm2, xmm3
1246
punpckldq xmm1, xmm4
1248
punpcklqdq xmm2, xmm1
1249
punpckhqdq xmm0, xmm1
1250
punpckhdq xmm6, xmm3
1251
punpckhdq xmm5, xmm4
1253
punpcklqdq xmm6, xmm5
1254
punpckhqdq xmm3, xmm5
1256
; Either XOR the four result registers with memory operands at [rax] (pxor
; with a memory operand requires 16-byte alignment) or take the Output7
; path that uses movdqu loads; then store 64 bytes to [r11] with movdqa or,
; on the Output8 path, movdqu.
; NOTE(review): the flag-setting instructions for these branches and the
; pxor lines of the Output7 path are not visible in this listing.
jz labelSSE2_Sosemanuk_Output3
1258
jnz labelSSE2_Sosemanuk_Output7
1259
pxor xmm2, [rax+0*16]
1260
pxor xmm0, [rax+1*16]
1261
pxor xmm6, [rax+2*16]
1262
pxor xmm3, [rax+3*16]
1264
jmp labelSSE2_Sosemanuk_Output3
1265
labelSSE2_Sosemanuk_Output7:
1266
movdqu xmm1, [rax+0*16]
1268
movdqu xmm1, [rax+1*16]
1270
movdqu xmm1, [rax+2*16]
1272
movdqu xmm1, [rax+3*16]
1275
labelSSE2_Sosemanuk_Output3:
1277
jnz labelSSE2_Sosemanuk_Output8
1278
movdqa [r11+0*16], xmm2
1279
movdqa [r11+1*16], xmm0
1280
movdqa [r11+2*16], xmm6
1281
movdqa [r11+3*16], xmm3
1282
jmp labelSSE2_Sosemanuk_Output9
1283
labelSSE2_Sosemanuk_Output8:
1284
movdqu [r11+0*16], xmm2
1285
movdqu [r11+1*16], xmm0
1286
movdqu [r11+2*16], xmm6
1287
movdqu [r11+3*16], xmm3
1288
labelSSE2_Sosemanuk_Output9:
1293
; Reload rsi from slot 3, then write rsi, rax and r11 back to slots 3, 2
; and 1. The instructions that change them in between are not visible here.
mov rsi, QWORD PTR [rsp+3*8]
1296
mov QWORD PTR [rsp+3*8], rsi
1297
mov QWORD PTR [rsp+2*8], rax
1298
mov QWORD PTR [rsp+1*8], r11
1303
; Dword-granular variant of the output: single dwords are loaded from [rax]
; and single dwords are stored to [r11].
; NOTE(review): the lines that combine the loaded xmm0 with xmm2, xmm3,
; xmm1 and xmm4 are not visible in this listing.
movd xmm0, dword ptr [rax+0*4]
1305
movd xmm0, dword ptr [rax+1*4]
1307
movd xmm0, dword ptr [rax+2*4]
1309
movd xmm0, dword ptr [rax+3*4]
1313
movd dword ptr [r11+0*4], xmm2
1314
movd dword ptr [r11+1*4], xmm3
1315
movd dword ptr [r11+2*4], xmm1
1316
movd dword ptr [r11+3*4], xmm4
1326
; Copy the 40-byte local copy back to the address saved in spill slot 6.
mov r10, QWORD PTR [rsp+6*8]
1327
movdqa xmm0, [rsp + 8*8+0*16]
1328
movdqa [r10+0*16], xmm0
1329
movdqa xmm0, [rsp + 8*8+1*16]
1330
movdqa [r10+1*16], xmm0
1331
movq xmm0, QWORD PTR [rsp + 8*8+2*16]
1332
movq QWORD PTR [r10+2*16], xmm0
1335
; Epilogue: reload xmm6 / xmm7 and release the frame.
; NOTE(review): no ret is visible between here and ENDP in this listing.
movdqa xmm6, [rsp + 02f0h]
1336
movdqa xmm7, [rsp + 0300h]
1337
add rsp, 80*4*2+12*4+8*8 + 2*16+8
1341
Sosemanuk_OperateKeystream ENDP
1343
;-----------------------------------------------------------------------
; Panama_SSE2_Pull (MASM x64, PROC FRAME)
;
; NOTE(review): this listing is partial. The bare integer lines look like
; line-number residue, and the gaps in that numbering suggest instructions
; are missing from this view (no alloc_stack, .endprolog, loop control,
; loads of ecx/rcx, or ret is visible). The comments below describe only
; what the visible lines do.
;
; Registers as used by the visible code:
;   rdx - base of the state: four xmmwords at offset 0, dwords at 4*16 and
;         4*17, and a buffer addressed from rdx + 20*4
;   r8  - destination of two xmmword stores (movdqa and movdqu variants)
;   r9  - source of an unaligned xmmword load at offset 16
;   r10 - loaded from the dword at [rdx + 4*17]; used as a byte offset into
;         the buffer at rdx + 20*4
;-----------------------------------------------------------------------
Panama_SSE2_Pull PROC FRAME
1346
; Save xmm6 / xmm7 at [rsp + 0h] and [rsp + 10h]; they are reloaded from
; the same offsets at the end of the procedure.
save_xmm128 xmm6, 0h
1347
save_xmm128 xmm7, 10h
1351
mov r10d, [rdx+4*17]
1354
; Load the first 64 bytes of the state into xmm0-xmm3 and the dword at
; offset 4*16 into eax.
movdqa xmm0, xmmword ptr [rdx+0*16]
1355
movdqa xmm1, xmmword ptr [rdx+1*16]
1356
movdqa xmm2, xmmword ptr [rdx+2*16]
1357
movdqa xmm3, xmmword ptr [rdx+3*16]
1358
mov eax, dword ptr [rdx+4*16]
1362
pshufd xmm5, xmm6, 0*64+3*16+2*4+1
1366
pshufd xmm6, xmm7, 0*64+3*16+2*4+1
1377
; Sixteen rotate-and-store steps, for i = 1,5,9,13, 2,6,10,14, 3,7,11,15,
; 4,8,12,16. With j = (5*i) MOD 17, ecx is rotated left by
; (j*(j+1)/2) MOD 32 and stored to state dword (j*13+16) MOD 17.
; The pshuflw / punpckhqdq lines in between permute xmm7.
; NOTE(review): the instructions that load ecx before each rol are not
; visible in this listing.
rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32))
1378
mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx
1379
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1381
rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32))
1382
mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx
1383
punpckhqdq xmm7, xmm7
1385
rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32))
1386
mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx
1387
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1389
rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32))
1390
mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx
1396
rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32))
1397
mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx
1398
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1400
rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32))
1401
mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx
1402
punpckhqdq xmm7, xmm7
1404
rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32))
1405
mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx
1406
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1408
rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32))
1409
mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx
1415
rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32))
1416
mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx
1417
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1419
rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32))
1420
mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx
1421
punpckhqdq xmm7, xmm7
1423
rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32))
1424
mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx
1425
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1427
rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32))
1428
mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx
1434
rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32))
1435
mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx
1436
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1438
rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32))
1439
mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx
1440
punpckhqdq xmm7, xmm7
1442
rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32))
1443
mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx
1444
pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
1446
rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32))
1447
mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx
1449
; Interleave the state registers into xmm3/xmm4/xmm1/xmm2, then combine
; into xmm4 and xmm6, the two registers stored to [r8] below.
punpcklqdq xmm3, xmm2
1450
punpckhdq xmm4, xmm2
1452
punpcklqdq xmm1, xmm0
1453
punpckhdq xmm2, xmm0
1457
punpcklqdq xmm4, xmm2
1458
punpckhqdq xmm6, xmm2
1469
movdqu xmm2, [r9+16]
1476
; Store xmm4 / xmm6 to [r8], once with aligned and once with unaligned
; moves. NOTE(review): the branch that presumably selects between the two
; variants is not visible in this listing.
movdqa xmmword ptr [r8], xmm4
1477
movdqa xmmword ptr [r8+16], xmm6
1481
movdqu xmmword ptr [r8], xmm4
1482
movdqu xmmword ptr [r8+16], xmm6
1487
; Buffer update: r11 = r10 + (32-24)*32. Entries at rdx + 20*4 + rcx are
; read and then overwritten with xmm3 / xmm1; the values read are XORed
; into the entries at rdx + 20*4 + r11.
; NOTE(review): the instruction that sets rcx for this sequence is not
; visible in this listing.
lea r11, [r10 + (32-24)*32]
1489
movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8]
1491
pshufd xmm0, xmm0, 2*64+3*16+0*4+1
1492
movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3
1493
pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8]
1494
movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0
1495
movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8]
1497
movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1
1498
pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8]
1499
movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4
1500
; Reload the first 64 bytes of the state into xmm3..xmm0.
movdqa xmm3, xmmword ptr [rdx+3*16]
1501
movdqa xmm2, xmmword ptr [rdx+2*16]
1502
movdqa xmm1, xmmword ptr [rdx+1*16]
1503
movdqa xmm0, xmmword ptr [rdx+0*16]
1513
; Rotate the dword lanes of xmm7..xmm4 by one position
; (lane k takes source dword (k+1) MOD 4).
pshufd xmm7, xmm7, 0*64+3*16+2*4+1
1514
pshufd xmm6, xmm6, 0*64+3*16+2*4+1
1515
pshufd xmm5, xmm5, 0*64+3*16+2*4+1
1516
pshufd xmm4, xmm4, 0*64+3*16+2*4+1
1530
; Read two buffer entries, at byte offsets r10 + (32-4)*32 and
; r10 + 16*32 from rdx + 20*4, and interleave their qwords.
lea rcx, [r10 + (32-4)*32]
1532
lea r11, [r10 + 16*32]
1534
movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16]
1535
movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16]
1537
punpcklqdq xmm4, xmm5
1538
punpckhqdq xmm6, xmm5
1541
movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16]
1542
movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16]
1544
punpcklqdq xmm4, xmm5
1545
punpckhqdq xmm6, xmm5
1552
; Write xmm3..xmm0 back to the first 64 bytes of the state.
movdqa xmmword ptr [rdx+3*16], xmm3
1553
movdqa xmmword ptr [rdx+2*16], xmm2
1554
movdqa xmmword ptr [rdx+1*16], xmm1
1555
movdqa xmmword ptr [rdx+0*16], xmm0
1557
; Epilogue: reload xmm6 / xmm7 from their save slots.
; NOTE(review): no stack release or ret is visible before ENDP here.
movdqa xmm6, [rsp + 0h]
1558
movdqa xmm7, [rsp + 10h]
1562
Panama_SSE2_Pull ENDP