2
* Blowfish Cipher Algorithm (x86_64)
4
* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6
* This program is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; either version 2 of the License, or
9
* (at your option) any later version.
11
* This program is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU General Public License for more details.
16
* You should have received a copy of the GNU General Public License
17
* along with this program; if not, write to the Free Software
18
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23
.file "blowfish-x86_64-asm.S"
26
/* structure of crypto context */
28
/*
 * Byte displacements, relative to the context base register, of the four
 * lookup regions s0..s3.  Each is used as the displacement of a scale-4
 * indexed 32-bit memory operand, e.g. s0(CTX,RT0,4), which is why every
 * term is multiplied by 4.  s0 begins after 16 + 2 four-byte slots and each
 * following region begins 256 four-byte slots after the previous one.
 * NOTE(review): the leading 16 + 2 slots are presumably the round-key array
 * addressed through `p` -- confirm against the context struct definition.
 */
#define s0 ((16 + 2) * 4)
29
#define s1 ((16 + 2 + (1 * 256)) * 4)
30
#define s2 ((16 + 2 + (2 * 256)) * 4)
31
#define s3 ((16 + 2 + (3 * 256)) * 4)
69
/***********************************************************************
71
***********************************************************************/
77
movl s0(CTX,RT0,4), RT0d; \
78
addl s1(CTX,RT1,4), RT0d; \
82
xorl s2(CTX,RT1,4), RT0d; \
83
addl s3(CTX,RT2,4), RT0d; \
86
/*
 * add_roundkey_enc(n): XOR the 64-bit quantity stored at byte offset
 * p + 4*n from CTX into RX0.  The index n is scaled by 4 but the access is
 * 8 bytes wide, so one invocation covers the two adjacent 4-byte slots
 * n and n + 1.
 */
#define add_roundkey_enc(n) \
87
xorq p+4*(n)(CTX), RX0;
89
#define round_enc(n) \
90
add_roundkey_enc(n); \
95
#define add_roundkey_dec(n) \
96
movq p+4*(n-1)(CTX), RT0; \
100
#define round_dec(n) \
101
add_roundkey_dec(n); \
106
#define read_block() \
111
#define write_block() \
115
#define xor_block() \
120
.global __blowfish_enc_blk
121
.type __blowfish_enc_blk,@function;
128
* %rcx: bool, if true: xor output
145
add_roundkey_enc(16);
160
.global blowfish_dec_blk
161
.type blowfish_dec_blk,@function;
193
/**********************************************************************
194
4-way blowfish, four blocks in parallel
195
**********************************************************************/
197
/* F() for 4-way. Slower when used alone/1-way, but faster when used
198
* in parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
201
movzbl x ## bh, RT1d; \
202
movzbl x ## bl, RT3d; \
204
movzbl x ## bh, RT0d; \
205
movzbl x ## bl, RT2d; \
207
movl s0(CTX,RT0,4), RT0d; \
208
addl s1(CTX,RT2,4), RT0d; \
209
xorl s2(CTX,RT1,4), RT0d; \
210
addl s3(CTX,RT3,4), RT0d; \
213
#define add_preloaded_roundkey4() \
219
/*
 * preload_roundkey_enc(n): load the 64-bit quantity at byte offset
 * p + 4*n from CTX into RKEY.  Nothing is combined with the data registers
 * here; the macro only fills RKEY.
 */
#define preload_roundkey_enc(n) \
220
movq p+4*(n)(CTX), RKEY;
222
/*
 * add_roundkey_enc4(n): expand add_preloaded_roundkey4() first, then
 * preload_roundkey_enc(n + 2), which reloads RKEY from byte offset
 * p + 4*(n + 2) of CTX.  The order matters: RKEY is overwritten only after
 * add_preloaded_roundkey4() has been expanded.
 * NOTE(review): presumably add_preloaded_roundkey4() consumes the value
 * currently held in RKEY -- confirm against its definition.
 */
#define add_roundkey_enc4(n) \
223
add_preloaded_roundkey4(); \
224
preload_roundkey_enc(n + 2);
226
#define round_enc4(n) \
227
add_roundkey_enc4(n); \
239
#define preload_roundkey_dec(n) \
240
movq p+4*((n)-1)(CTX), RKEY; \
243
/*
 * add_roundkey_dec4(n): decryption counterpart of the 4-way round-key
 * step.  Expands add_preloaded_roundkey4() first, then
 * preload_roundkey_dec(n - 2), whose load reads from byte offset
 * p + 4*((n - 2) - 1) of CTX into RKEY.  Passing the unparenthesized
 * expression "n - 2" is safe because preload_roundkey_dec wraps its
 * argument in parentheses before subtracting 1.
 */
#define add_roundkey_dec4(n) \
244
add_preloaded_roundkey4(); \
245
preload_roundkey_dec(n - 2);
247
#define round_dec4(n) \
248
add_roundkey_dec4(n); \
260
#define read_block4() \
277
#define write_block4() \
290
#define xor_block4() \
304
.global __blowfish_enc_blk_4way
305
.type __blowfish_enc_blk_4way,@function;
307
__blowfish_enc_blk_4way:
312
* %rcx: bool, if true: xor output
318
preload_roundkey_enc(0);
333
add_preloaded_roundkey4();
355
.global blowfish_dec_blk_4way
356
.type blowfish_dec_blk_4way,@function;
358
blowfish_dec_blk_4way:
366
preload_roundkey_dec(17);
381
add_preloaded_roundkey4();