44
51
#elif defined(HAVE_SHORT_32)
45
52
/* 32-bit unsigned word; the HAVE_SHORT_32 branch assumes 'unsigned short'
 * is 32 bits wide on this platform -- TODO confirm against the configure check. */
typedef unsigned short sh_word32;
47
#error No 32 byte type found !
54
#error No 32 bit type found !
50
57
/* 8-bit unsigned byte; used below to truncate words to a table index
 * via casts like (sh_byte)(c) in the 32-bit round variant. */
typedef unsigned char sh_byte;
53
#if !(defined(__alpha)||defined(__i386__)||defined(__vax__))
58
/* The following macro denotes that an optimization */
59
/* for Alpha is required. It is used only for */
60
/* optimization of time. Otherwise it does nothing. */
62
#define OPTIMIZE_FOR_ALPHA
59
#if defined(TIGER_OPT_ASM)
60
#define TIGER_ASM64_2 1
65
/* NOTE that this code is NOT FULLY OPTIMIZED for any */
66
/* machine. Assembly code might be much faster on some */
67
/* machines, especially if the code is compiled with */
70
/* The number of passes of the hash function. */
71
/* Three passes are recommended. */
72
/* Use four passes when you need extra security. */
73
/* Must be at least three. */
65
/* The number of passes of the hash function. */
66
/* Three passes are recommended. */
67
/* Use four passes when you need extra security. */
68
/* Must be at least three. */
76
71
/* Tiger S-box table: four 256-entry sub-tables stored contiguously
 * (aliased as t1..t4 below); defined in another translation unit. */
extern word64 tiger_table[4*256];
73
/* Volatile can help if compiler is smart enough to use memory operand */
74
static /*volatile*/ const word64 XOR_CONST1=0xA5A5A5A5A5A5A5A5LL; /* used in the first key_schedule step: x0 -= x7 ^ XOR_CONST1 */
75
static /*volatile*/ const word64 XOR_CONST2=0x0123456789ABCDEFLL; /* used in the last key_schedule step: x7 -= x6 ^ XOR_CONST2 */
78
77
/* The four Tiger S-boxes are 256-entry slices of the single contiguous
 * tiger_table declared above. */
#define t1 (tiger_table)
79
78
#define t2 (tiger_table+256)
80
79
#define t3 (tiger_table+256*2)
81
80
#define t4 (tiger_table+256*3)
88
#ifdef OPTIMIZE_FOR_ALPHA
89
/* This is the official definition of round */
90
#define round(a,b,c,x,mul) \
92
a -= t1[((c)>>(0*8))&0xFF] ^ t2[((c)>>(2*8))&0xFF] ^ \
93
t3[((c)>>(4*8))&0xFF] ^ t4[((c)>>(6*8))&0xFF] ; \
94
b += t4[((c)>>(1*8))&0xFF] ^ t3[((c)>>(3*8))&0xFF] ^ \
95
t2[((c)>>(5*8))&0xFF] ^ t1[((c)>>(7*8))&0xFF] ; \
98
/* This code works faster when compiled on 32-bit machines */
99
/* (but works slower on Alpha) */
100
#define round(a,b,c,x,mul) \
102
a -= t1[(sh_byte)(c)] ^ \
103
t2[(sh_byte)(((sh_word32)(c))>>(2*8))] ^ \
104
t3[(sh_byte)((c)>>(4*8))] ^ \
105
t4[(sh_byte)(((sh_word32)((c)>>(4*8)))>>(2*8))] ; \
106
b += t4[(sh_byte)(((sh_word32)(c))>>(1*8))] ^ \
107
t3[(sh_byte)(((sh_word32)(c))>>(3*8))] ^ \
108
t2[(sh_byte)(((sh_word32)((c)>>(4*8)))>>(1*8))] ^ \
109
t1[(sh_byte)(((sh_word32)((c)>>(4*8)))>>(3*8))]; \
113
#define pass(a,b,c,mul) \
114
round(a,b,c,x0,mul) \
115
round(b,c,a,x1,mul) \
116
round(c,a,b,x2,mul) \
117
round(a,b,c,x3,mul) \
118
round(b,c,a,x4,mul) \
119
round(c,a,b,x5,mul) \
120
round(a,b,c,x6,mul) \
94
/* BN(x,n): extract byte n (0 = least significant) of word x,
 * yielding a value in [0,255] suitable as an S-box index. */
#define BN(x,n) (((x) >> (8*(n))) & 0xFF)
97
/* Depending on outer code one of these two can be better*/
98
#define roundX(a,b,c,x) \
100
a -= t1[BN(c,0)] ^ t2[BN(c,2)] ^ \
101
t3[BN(c,4)] ^ t4[BN(c,6)] ; \
102
b += t4[BN(c,1)] ^ t3[BN(c,3)] ^ \
103
t2[BN(c,5)] ^ t1[BN(c,7)] ;
105
/* Rounds specialized by Tiger's per-pass multiplier (5, 7, 9): the final
 * b *= mul step is strength-reduced to shift/add form (b+4b, 8b-b, b+8b),
 * matching the leaq-based forms used by the asm variants below. */
#define round5(a,b,c,x) roundX(a,b,c,x) b = b+b*4;
106
#define round7(a,b,c,x) roundX(a,b,c,x) b = b*8-b;
107
#define round9(a,b,c,x) roundX(a,b,c,x) b = b+b*8;
115
/* MASK<N>: mask isolating the byte at bit offset N (bits N..N+7).
 * Passed as immediates/registers into the x86-64 inline-asm round,
 * which ands them against the state word before shifting. */
#define MASK8 0xFF00L
116
#define MASK16 0xFF0000L
117
#define MASK32 0xFF00000000LL
118
#define MASK40 0xFF0000000000LL
119
#define MASK48 0xFF000000000000LL
121
#define roundstart __asm__ (
123
/* a will be moved into different reg each round
124
* using register substitution feature of GCC asm
125
* b will be moved in 2-nd pass rounds only
129
#define roundend(a,b,c,x) \
130
: "+r" (a), "+r" (b), "+r" (c) \
131
: "r" (a), "r" (b), "r" (c), "m" (x), "r" (&tiger_table),\
132
"i" (MASK0), "i" (MASK8), "i" (MASK16), "r" (MASK32), "r" (MASK40), "r" (MASK48) \
133
: "3", "%rax","%rbx","%rcx","%rdx","%rsi", "%edi", "%r8" );
137
a -= t1[BN(c,0)] ^ t2[BN(c,2)] ^
138
t3[BN(c,4)] ^ t4[BN(c,6)] ;
139
b += t4[BN(c,1)] ^ t3[BN(c,3)] ^
140
t2[BN(c,5)] ^ t1[BN(c,7)] ; */
142
#define roundX(a,b,c,x) \
143
" movl %10, %%ebx \n"\
144
" movq %11, %%rcx \n"\
145
" movq %13, %%rdx \n"\
147
" xorq %%r8, %2 \n" \
148
" andq %2, %%rbx \n"\
149
" andq %2, %%rcx \n"\
150
" andq %2, %%rdx \n"\
151
" shrl $(16-3), %%ebx \n"\
152
" shrq $(32-3), %%rcx \n"\
153
" shrq $(48-3), %%rdx \n"\
154
" movzbl %2b, %%eax \n"\
155
" movzwl %2w, %%edi \n"\
156
" movq (%7,%%rax,8), %%rsi \n"\
157
" shrl $(8), %%edi \n" \
158
" movq %2, %%rax \n" \
159
" xorq (2048*1)(%7,%%rbx), %%rsi \n"\
160
" movq %2, %%rbx \n"\
161
" shrl $24, %%eax \n"\
162
" andq %12, %%rbx \n"\
163
" xorq (2048*2)(%7,%%rcx), %%rsi \n"\
164
" shrq $(40-3), %%rbx \n"\
165
" movq %2, %%rcx \n"\
166
" xorq (2048*3)(%7,%%rdx), %%rsi \n"\
167
" movq (2048*3)(%7,%%rdi,8), %%rdx \n"\
168
" shrq $56, %%rcx \n"\
169
" xorq (2048*2)(%7,%%rax,8), %%rdx \n"\
170
" xorq (2048*1)(%7,%%rbx), %%rdx \n" \
171
" subq %%rsi, %0 \n"\
172
" xorq (%7,%%rcx,8), %%rdx \n"\
175
#define round5(a,b,c,x) \
179
"leaq (%1,%1,4), %1\n" \
183
#define round7(a,b,c,x) \
189
"leaq (%1,%1,8), %0\n" \
192
:"=&r" (b): "r"(b): "1" );
194
#define round9(a,b,c,x) \
197
"leaq (%1,%1,8), %1\n" \
205
/* ============== Common macros ================== */
123
207
#define key_schedule \
124
x0 -= x7 ^ 0xA5A5A5A5A5A5A5A5LL; \
127
x3 -= x2 ^ ((~x1)<<19); \
130
x6 -= x5 ^ ((~x4)>>23); \
133
x1 -= x0 ^ ((~x7)<<19); \
136
x4 -= x3 ^ ((~x2)>>23); \
139
x7 -= x6 ^ 0x0123456789ABCDEFLL;
208
x0 -= x7 ^ XOR_CONST1; \
211
x3 -= x2 ^ ((~x1)<<19);\
214
x6 -= x5 ^ ((~x4)>>23); \
217
x1 -= x0 ^ ((~x7)<<19); \
220
x4 -= x3 ^ ((~x2)>>23); \
223
x7 -= x6 ^ XOR_CONST2;
225
#define pass5n(a,b,c) \
227
x0 -= x7 ^ XOR_CONST1; \
233
x3 -= x2 ^ ((~x1)<<19); \
239
x6 -= x5 ^ ((~x4)>>23); \
243
x1 -= x0 ^ ((~x7)<<19); \
246
x4 -= x3 ^ ((~x2)>>23); \
249
x7 -= x6 ^ XOR_CONST2;
251
#define pass7n(a,b,c) \
253
x0 -= x7 ^ XOR_CONST1; \
259
x3 -= x2 ^ ((~x1)<<19); \
265
x6 -= x5 ^ ((~x4)>>23); \
269
x1 -= x0 ^ ((~x7)<<19); \
272
x4 -= x3 ^ ((~x2)>>23); \
275
x7 -= x6 ^ XOR_CONST2;
277
#define pass5(a,b,c) \
289
#define pass7(a,b,c) \
302
#define pass9(a,b,c) \
141
314
#define feedforward \
146
#ifdef OPTIMIZE_FOR_ALPHA
147
/* The loop is unrolled: works better on Alpha */
155
for(pass_no=3; pass_no<PASSES; pass_no++) { \
158
tmpa=a; a=c; c=b; b=tmpa;} \
161
/* loop: works better on PC and Sun (smaller cache?) */
164
for(pass_no=0; pass_no<PASSES; pass_no++) { \
165
if(pass_no != 0) {key_schedule} \
166
pass(a,b,c,(pass_no==0?5:pass_no==1?7:9)); \
167
tmpa=a; a=c; c=b; b=tmpa;} \
320
/* This version works ok with C variant and also with new asm version
321
* that just wastes a register r8
322
* reason? who knows, write forwarding is faster than keeping value
330
for(pass_no=3; pass_no<PASSES; pass_no++) { \
333
tmpa=a; a=c; c=b; b=tmpa; \
337
#define compress_old \
344
for(pass_no=3; pass_no<PASSES; pass_no++) { \
347
tmpa=a; a=c; c=b; b=tmpa; \
171
351
#define tiger_compress_macro(str, state) \
173
register word64 a, b, c, tmpa; \
353
register word64 a, b, c; \
354
register word64 tmpa; \
174
355
word64 aa, bb, cc; \
175
register word64 x0, x1, x2, x3, x4, x5, x6, x7; \
356
word64 x0, x1, x2, x3, x4, x5, x6, x7; \