1
// vmac.cpp - written and placed in the public domain by Wei Dai
2
// based on Ted Krovetz's public domain vmac.c and draft-krovetz-vmac-01.txt
9
// NOTE(review): this view is an extract; the bare numbers between code lines appear to be the
// original file's line numbers, and lines not listed (including #else/#endif) are not visible.
NAMESPACE_BEGIN(CryptoPP)
11
#if defined(_MSC_VER) && !defined(CRYPTOPP_SLOW_WORD64)
15
// Selects the word128-based arithmetic further down (tested there with `#if VMAC_BOOL_WORD128`).
// NOTE(review): the expansion contains `defined`; a `defined` token produced by macro replacement
// inside an #if condition is undefined behaviour per the C++ standard. Consider defining this
// macro to 0 or 1 under an explicit #if/#else instead - confirm against supported compilers.
#define VMAC_BOOL_WORD128 (defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE))
17
#define const // Turbo C++ 2006 workaround
19
// File-scope constants referenced by the key setup, the polynomial step and L3Hash below.
static const word64 p64 = W64LIT(0xfffffffffffffeff); /* 2^64 - 257 prime */
20
static const word64 m62 = W64LIT(0x3fffffffffffffff); /* 62-bit mask */
21
static const word64 m63 = W64LIT(0x7fffffffffffffff); /* 63-bit mask */
22
static const word64 m64 = W64LIT(0xffffffffffffffff); /* 64-bit mask */
23
static const word64 mpoly = W64LIT(0x1fffffff1fffffff); /* Poly key mask */
28
// High 64 bits are m62 and low 64 bits are m64, giving the 126-bit mask used by first_poly_step.
static const word128 m126 = (word128(m62)<<64)|m64; /* 126-bit mask */
31
// Keys the MAC: validates the DigestSize and L1KeyLength parameters, keys the underlying block
// cipher with userKey, then fills the NH key, the polynomial key (inside m_polyState) and the L3
// key from cipher output, and finally resynchronizes with the IV taken from params.
// Throws InvalidArgument if DigestSize is not 8 or 16, or if L1KeyLength is not a positive
// multiple of 128.
// NOTE(review): `¶ms` in the signature looks like a mis-decoded `&params` (the body uses
// `params`); restore the reference declarator in the real file.
// NOTE(review): the statements that declare `in`/`i` and set the contents of `in` before each
// ProcessBlock call are on lines not visible in this view.
void VMAC_Base::UncheckedSetKey(const byte *userKey, unsigned int keylength, const NameValuePairs ¶ms)
33
int digestLength = params.GetIntValueWithDefault(Name::DigestSize(), DefaultDigestSize());
34
if (digestLength != 8 && digestLength != 16)
35
throw InvalidArgument("VMAC: DigestSize must be 8 or 16");
36
// m_is128 drives the `i <= (size_t)m_is128` loops below: one pass for 8-byte tags, two for 16.
m_is128 = digestLength == 16;
38
m_L1KeyLength = params.GetIntValueWithDefault(Name::L1KeyLength(), 128);
39
if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
40
throw InvalidArgument("VMAC: L1KeyLength must be a positive multiple of 128");
44
BlockCipher &cipher = AccessCipher();
45
cipher.SetKey(userKey, keylength, params);
46
unsigned int blockSize = cipher.BlockSize();
47
unsigned int blockSizeInWords = blockSize / sizeof(word64);
48
// `out` holds exactly one cipher block, viewed as word64 values.
SecBlock<word64> out(blockSizeInWords);
50
in.CleanNew(blockSize);
55
// NH key: one cipher block per iteration, stored into m_nhKey() via ConditionalByteReverse
// with BIG_ENDIAN_ORDER.
for (i = 0; i < m_nhKeySize()*sizeof(word64); i += blockSize)
57
cipher.ProcessBlock(in, out.BytePtr());
58
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_nhKey()+i/sizeof(word64), out.begin(), blockSize);
65
// Polynomial key: two big-endian words per tag, masked with mpoly, stored at word offsets
// 2 and 3 of each 4-word group in m_polyState().
for (i = 0; i <= (size_t)m_is128; i++)
67
cipher.ProcessBlock(in, out.BytePtr());
68
m_polyState()[i*4+2] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()) & mpoly;
69
m_polyState()[i*4+3] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8) & mpoly;
76
// L3 key: two words per tag; the trailing `while` repeats the generation for as long as
// either word is >= p64, so both accepted words are below p64.
word64 *l3Key = m_l3Key();
77
for (i = 0; i <= (size_t)m_is128; i++)
80
cipher.ProcessBlock(in, out.BytePtr());
81
l3Key[i*2+0] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr());
82
l3Key[i*2+1] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8);
84
} while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));
87
// Validate and install the IV supplied through params.
Resynchronize(GetIVAndThrowIfInvalid(params));
90
// Produces the next IV into `IV` by delegating to SimpleKeyingInterface::GetNextIV.
// NOTE(review): only the delegation is visible in this view; any adjustment of the IV made on
// the elided lines is not shown here.
void VMAC_Base::GetNextIV(RandomNumberGenerator &rng, byte *IV)
92
SimpleKeyingInterface::GetNextIV(rng, IV);
96
// Installs a new nonce (IV) and derives the pad by encrypting the nonce with the block cipher.
// Two sequences are visible: a direct copy-and-encrypt, and a cached variant that only
// re-encrypts when the nonce differs in more than the lowest bit of its last byte.
// NOTE(review): the declaration of `s` and the branches that choose between the two sequences
// are on lines not visible in this view - presumably `s` is the IV size and the choice depends
// on the tag size; confirm against the full file.
void VMAC_Base::Resynchronize(const byte *IV)
101
// Direct variant: store the nonce unchanged and encrypt it into the pad.
memcpy(m_nonce(), IV, s);
102
AccessCipher().ProcessBlock(m_nonce(), m_pad());
106
// Cached variant: the pad stays valid only if it was already cached and the new IV matches the
// stored nonce in every byte, ignoring the lowest bit of the last byte.
m_padCached = m_padCached && (m_nonce()[s-1] | 1) == (IV[s-1] | 1) && memcmp(m_nonce(), IV, s-1) == 0;
109
memcpy(m_nonce(), IV, s);
110
// Clear the lowest bit before encrypting, so IVs differing only in that bit share one pad.
m_nonce()[s-1] &= 0xfe;
111
AccessCipher().ProcessBlock(m_nonce(), m_pad());
114
// Restore the real last byte; TruncatedFinal reads its lowest bit to pick a pad half.
m_nonce()[s-1] = IV[s-1];
116
m_isFirstBlock = true;
120
// NOTE(review): only the signature is visible in this view; the body is on elided lines.
void VMAC_Base::HashEndianCorrectedBlock(const word64 *data)
125
// Inline-assembly implementation of the hash update for 32-bit x86, using MMX registers with
// pmuludq/paddq. Handles one tag part per call (tagPart selects which key/state slice is used).
// NOTE(review): this view omits many instructions and all #if/#else/#endif lines inside the
// function, so the comments below describe only what the visible lines show.
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
126
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
129
__attribute__ ((noinline)) // Intel Compiler 9.1 workaround
131
VMAC_Base::VHASH_Update_SSE2(const word64 *data, size_t blocksRemainingInWord64, int tagPart)
133
const word64 *nhK = m_nhKey();
134
word64 *polyS = m_polyState();
142
".intel_syntax noprefix;"
145
// Two ways of loading L1KeyLength into ebx and isFirstBlock into dl follow: via local copies,
// and via [ecx+member]. The preprocessor conditions choosing between them are not visible here.
word32 L1KeyLength = m_L1KeyLength;
146
char isFirstBlock = m_isFirstBlock;
147
AS2( mov ebx, [L1KeyLength])
148
AS2( mov dl, [isFirstBlock])
151
AS2( mov ebx, [ecx+m_L1KeyLength])
152
AS2( mov dl, [ecx+m_isFirstBlock])
154
AS2( mov eax, tagPart)
162
AS2( mov ecx, blocksRemainingInWord64)
173
AS2( lea ebp, [edi+8*ebp]) // end of nhK
// Load two 64-bit words from [esi] and add the two 64-bit words at [edi] to them (per the
// operand list at the end, esi is `data` and edi is `nhK+tagPart*2`).
174
AS2( movq mm6, [esi])
175
AS2( paddq mm6, [edi])
176
AS2( movq mm5, [esi+8])
177
AS2( paddq mm5, [edi+8])
181
// pmuludq multiplies the low 32 bits of its operands into a 64-bit result, so the wide product
// is assembled from 32x32 partial products; pshufw prepares rearranged copies of the operands.
ASS( pshufw mm2, mm6, 1, 0, 3, 2)
182
AS2( pmuludq mm6, mm5)
183
ASS( pshufw mm3, mm5, 1, 0, 3, 2)
184
AS2( pmuludq mm5, mm2)
185
AS2( pmuludq mm2, mm3)
186
AS2( pmuludq mm3, mm4)
188
// [esp], [esp+4] and [esp+8] are used as scratch slots for 32-bit pieces of intermediate values.
AS2( movd [esp], mm6)
190
AS2( movd [esp+4], mm5)
195
// Same load/add/multiply pattern for the next pair of words, this time in mm0/mm1.
AS2( movq mm0, [esi])
196
AS2( paddq mm0, [edi])
197
AS2( movq mm1, [esi+8])
198
AS2( paddq mm1, [edi+8])
203
ASS( pshufw mm2, mm0, 1, 0, 3, 2)
204
AS2( pmuludq mm0, mm1)
205
AS2( movd [esp+8], mm3)
208
ASS( pshufw mm3, mm1, 1, 0, 3, 2)
209
AS2( pmuludq mm1, mm2)
210
AS2( pmuludq mm2, mm3)
211
AS2( pmuludq mm3, mm4)
212
AS2( movd mm4, [esp])
214
AS2( movd mm4, [esp+4])
216
AS2( movd mm4, [esp+8])
218
AS2( movd [esp], mm0)
221
AS2( movd [esp+4], mm1)
228
AS2( movd [esp+8], mm3)
231
AS2( movd mm4, [esp])
233
AS2( movd mm4, [esp+4])
235
AS2( movd mm4, [esp+8])
237
AS2( lea ebp, [8*ebx])
238
AS2( sub edi, ebp) // reset edi to start of nhK
240
AS2( movd [esp], mm7)
243
AS2( movd [esp+4], mm6)
253
// k0..k3 name the four 32-bit pieces of the two 64-bit words at eax+16. With eax = polyS+tagPart*4
// (see the operand list below) these are words 2 and 3 of the tag's state group, where
// UncheckedSetKey stores the polynomial key.
#define k0 [eax+2*8+2*4]
254
#define k1 [eax+2*8+3*4]
255
#define k2 [eax+2*8+0*4]
256
#define k3 [eax+2*8+1*4]
260
// Multiplication of the accumulator pieces (a0..a3) by the key pieces (k0..k3); each visible
// pmuludq is labelled with the partial product it forms.
AS2( movd mm0, [esp])
265
AS2( movd mm2, [esp+4])
278
AS2( pmuludq mm0, k3) // a3*k3
280
AS2( pmuludq mm1, k2) // a0*k2
283
AS2( pmuludq mm2, mm6) // a1*k1
289
AS2( pmuludq mm3, mm7) // a2*k0
290
AS2( pmuludq mm4, mm7) // a3*k0
291
AS2( pmuludq mm5, mm6) // a2*k1
296
AS2( pmuludq mm1, k2) // a1*k2
301
AS2( pmuludq mm2, k3) // a0*k3
302
AS2( pmuludq mm3, mm7) // a0*k0
303
AS2( movd [esp+8], mm0)
305
AS2( pmuludq mm7, mm5) // a1*k0
306
AS2( pmuludq mm5, k3) // a1*k3
309
AS2( pmuludq mm1, k2) // a2*k2
314
AS2( pmuludq mm2, mm6) // a3*k1
315
AS2( pmuludq mm6, a0) // a0*k1
318
AS2( movd mm3, [esp])
321
AS2( pmuludq mm3, k3) // a2*k3
324
AS2( pmuludq mm1, k2) // a3*k2
326
AS2( movd mm2, [esp+4])
333
AS2( movd mm7, [esp+8])
362
".att_syntax prefix;"
365
// Input operands of the extended-asm form: L1KeyLength from memory, ecx = word count,
// esi = data, edi = NH key offset by two words per tag part, edx = first-block flag,
// eax = polynomial state offset by four words per tag part.
: "m" (m_L1KeyLength), "c" (blocksRemainingInWord64), "S" (data), "D" (nhK+tagPart*2), "d" (m_isFirstBlock), "a" (polyS+tagPart*4)
372
// Arithmetic helper macros used by the portable hash update and by L3Hash. Several flavours are
// defined for different compilers/word sizes.
// NOTE(review): the #else/#endif lines separating the flavours, and some macro body lines, are
// not visible in this view.
#if VMAC_BOOL_WORD128
373
// word128 flavour: the NH accumulator is a single 128-bit integer.
#define DeclareNH(a) word128 a=0
374
#define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
375
#define AccumulateNH(a, b, c) a += word128(b)*(c)
376
#define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
378
// MUL32: 32x32 -> 64-bit multiply. Two definitions are visible: one using the __emulu
// intrinsic (guarded by the MSVC version test below) and one using a cast and multiply.
#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
379
#define MUL32(a, b) __emulu(word32(a), word32(b))
381
#define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
383
// x64 inline-asm flavour: the NH accumulator is two word64 limbs (a##0 low, a##1 high),
// updated with mulq followed by addq/adcq.
#if defined(CRYPTOPP_X64_ASM_AVAILABLE)
384
#define DeclareNH(a) word64 a##0=0, a##1=0
385
#define MUL64(rh,rl,i1,i2) asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
386
#define AccumulateNH(a, b, c) asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
387
#define ADD128(rh,rl,ih,il) asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
388
// MSVC flavour: same two-limb accumulator, with the 64x64 product taken from _umul128.
#elif defined(_MSC_VER) && !defined(CRYPTOPP_SLOW_WORD64)
389
#define DeclareNH(a) word64 a##0=0, a##1=0
390
#define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
391
#define AccumulateNH(a, b, c) {\
393
pl = _umul128(b,c,&ph);\
395
a##1 += ph + (a##0 < pl);}
397
// 32-bit flavour: sets VMAC_BOOL_32BIT, keeps the NH accumulator in three word64 limbs and
// builds every wide product from MUL32 partial products.
#define VMAC_BOOL_32BIT 1
398
#define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
399
#define MUL64(rh,rl,i1,i2) \
400
{ word64 _i1 = (i1), _i2 = (i2); \
401
word64 m1= MUL32(_i1,_i2>>32); \
402
word64 m2= MUL32(_i1>>32,_i2); \
403
rh = MUL32(_i1>>32,_i2>>32); \
404
rl = MUL32(_i1,_i2); \
405
ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
406
ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
408
#define AccumulateNH(a, b, c) {\
409
word64 p = MUL32(b, c);\
410
a##1 += word32((p)>>32);\
412
p = MUL32((b)>>32, c);\
413
a##2 += word32((p)>>32);\
415
p = MUL32((b)>>32, (c)>>32);\
417
p = MUL32(b, (c)>>32);\
419
a##2 += word32(p>>32);}
422
// Make VMAC_BOOL_32BIT usable in #if tests when the 32-bit flavour was not selected.
#ifndef VMAC_BOOL_32BIT
423
#define VMAC_BOOL_32BIT 0
426
// Generic ADD128: the visible line adds the high words plus a carry taken from (rl) < (_il);
// the statement that updates rl is on a line not visible in this view.
#define ADD128(rh,rl,ih,il) \
427
{ word64 _il = (il); \
429
(rh) += (ih) + ((rl) < (_il)); \
433
// Portable hash update. Consumes `blocksRemainingInWord64` 64-bit words from `data`: NH sums of
// (data word + key word) products are accumulated into nhA (and nhB, which uses the key shifted
// by two words), and each NH result is then folded into the polynomial state in m_polyState().
// T_128BitTag is a template parameter except on MSVC older than 1300, where it is a local copy
// of m_is128.
// NOTE(review): braces, several declarations and all #else/#endif lines of this function are on
// lines not visible in this view; comments describe only the visible statements.
#if !(defined(_MSC_VER) && _MSC_VER < 1300)
434
template <bool T_128BitTag>
436
void VMAC_Base::VHASH_Update_Template(const word64 *data, size_t blocksRemainingInWord64)
438
// One NH step on the word pair at data[i+2*j], data[i+2*j+1]; both words pass through
// ConditionalByteReverse with LITTLE_ENDIAN_ORDER before the key words are added.
#define INNER_LOOP_ITERATION(j) {\
439
word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
440
word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
441
AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
443
AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
446
#if (defined(_MSC_VER) && _MSC_VER < 1300)
447
bool T_128BitTag = m_is128;
449
// m_L1KeyLength / 8 is the L1 key length counted in word64 units (presumably the member is a
// byte count - confirm).
size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
450
size_t innerLoopEnd = L1KeyLengthInWord64;
451
const word64 *nhK = m_nhKey();
452
word64 *polyS = m_polyState();
453
bool isFirstBlock = true;
457
#if VMAC_BOOL_WORD128
460
// Polynomial accumulators (ah/al) and keys (kh/kl) as high/low word64 pairs; the keys come
// from words 2 and 3 of each 4-word group of the polynomial state.
word64 ah1, al1, ah2, al2;
462
word64 kh1, kl1, kh2, kl2;
463
kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
466
kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
475
// Short chunk (fewer words than a full L1 block): the part that is not a multiple of 8 words is
// processed two words at a time, then the rest goes through the unrolled loop below.
if (blocksRemainingInWord64 < L1KeyLengthInWord64)
477
if (blocksRemainingInWord64 % 8)
479
innerLoopEnd = blocksRemainingInWord64 % 8;
480
for (i=0; i<innerLoopEnd; i+=2)
481
INNER_LOOP_ITERATION(0);
482
blocksRemainingInWord64 -= innerLoopEnd;
483
data += innerLoopEnd;
485
innerLoopEnd = blocksRemainingInWord64;
488
// Main NH loop, unrolled four times: eight words per pass.
for (i=0; i<innerLoopEnd; i+=8)
490
INNER_LOOP_ITERATION(0);
491
INNER_LOOP_ITERATION(1);
492
INNER_LOOP_ITERATION(2);
493
INNER_LOOP_ITERATION(3);
495
blocksRemainingInWord64 -= innerLoopEnd;
496
data += innerLoopEnd;
499
// 32-bit path: split the three-limb NH accumulators into 32-bit pieces nh0/nh1 and a top part
// nh2 masked with m62, propagating carries upward between limbs.
word32 nh0[2], nh1[2];
502
nh0[0] = word32(nhA0);
503
nhA1 += (nhA0 >> 32);
504
nh1[0] = word32(nhA1);
505
nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;
509
nh0[1] = word32(nhB0);
510
nhB1 += (nhB0 >> 32);
511
nh1[1] = word32(nhB1);
512
nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
515
// 32-bit views of tag i's polynomial accumulator (a0..a3, aHi) and key (k0..k3, kHi); the
// NativeByteOrder::ToEnum() term adjusts the word32 index for the host byte order.
#define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
516
#define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum())) // workaround for GCC 3.2
517
#define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
518
#define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
519
#define aHi ((polyS+i*4)[0])
520
#define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
521
#define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
522
#define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
523
#define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
524
#define kHi ((polyS+i*4+2)[0])
528
isFirstBlock = false;
531
m_isFirstBlock = false;
// First-block step (32-bit path): the accumulator is formed as NH result plus key, added
// 32 bits at a time with the carry passed through `t`.
532
for (i=0; i<=(size_t)T_128BitTag; i++)
534
word64 t = (word64)nh0[i] + k0;
536
t = (t >> 32) + nh1[i] + k1;
538
aHi = (t >> 32) + nh2[i] + kHi;
543
// General step (32-bit path): accumulator times key, built from MUL32 partial products.
for (i=0; i<=(size_t)T_128BitTag; i++)
559
t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
563
p += MUL32(a1, 2*k3);
564
p += MUL32(a2, 2*k2);
565
p += MUL32(a3, 2*k1);
571
p += MUL32(a2, 2*k3);
572
p += MUL32(a3, 2*k2);
588
#else // #if VMAC_BOOL_32BIT
591
isFirstBlock = false;
594
m_isFirstBlock = false;
595
// First-block step (64-bit paths): accumulator = NH result plus key; the word128 form masks
// the NH result with m126 first.
#if VMAC_BOOL_WORD128
596
#define first_poly_step(a, kh, kl, m) a = (m & m126) + ((word128(kh) << 64) | kl)
598
first_poly_step(a1, kh1, kl1, nhA);
600
first_poly_step(a2, kh2, kl2, nhB);
602
#define first_poly_step(ah, al, kh, kl, mh, ml) {\
604
ADD128(mh, ml, kh, kl); \
607
first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
609
first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
615
// Not the first block: reload the running accumulators from words 0 and 1 of each state group.
#if VMAC_BOOL_WORD128
616
a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
618
ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
622
#if VMAC_BOOL_WORD128
623
a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
625
ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
631
#if VMAC_BOOL_WORD128
632
#define poly_step(a, kh, kl, m) \
633
{ word128 t1, t2, t3, t4;\
634
Multiply128(t2, a>>64, kl);\
635
Multiply128(t3, a, kh);\
636
Multiply128(t1, a, kl);\
637
Multiply128(t4, a>>64, 2*kh);\
641
a = (word128(word64(t2)&m63) << 64) | word64(t4);\
646
poly_step(a1, kh1, kl1, nhA);
648
poly_step(a2, kh2, kl2, nhB);
650
#define poly_step(ah, al, kh, kl, mh, ml) \
651
{ word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
652
/* compute ab*cd, put bd into result registers */ \
653
MUL64(t2h,t2l,ah,kl); \
654
MUL64(t3h,t3l,al,kh); \
655
MUL64(t1h,t1l,ah,2*kh); \
656
MUL64(ah,al,al,kl); \
657
/* add together ad + bc */ \
658
ADD128(t2h,t2l,t3h,t3l); \
659
/* add 2 * ac to result */ \
660
ADD128(ah,al,t1h,t1l); \
661
/* now (ah,al), (t2l,2*t2h) need summing */ \
662
/* first add the high registers, carrying into t2h */ \
663
ADD128(t2h,ah,z,t2l); \
664
/* double t2h and add top bit of ah */ \
665
t2h += t2h + (ah >> 63); \
667
/* now add the low registers */ \
669
ADD128(ah,al,mh,ml); \
670
ADD128(ah,al,z,t2h); \
673
poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
675
poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
677
#endif // #if VMAC_BOOL_32BIT
678
// Keep going until every input word has been consumed.
} while (blocksRemainingInWord64);
680
// Store the running accumulators back into words 0 and 1 of each state group (the 32-bit path
// is excluded by the #elif because its a0..a3/aHi macros refer to polyS directly).
#if VMAC_BOOL_WORD128
681
(polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
684
(polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
686
#elif !VMAC_BOOL_32BIT
687
(polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
690
(polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
695
// Dispatches a hash update over `blocksRemainingInWord64` words of `data` to one of the
// implementations: the SSE2 assembly routine (called per tag part, after which m_isFirstBlock is
// cleared here) or VHASH_Update_Template.
// NOTE(review): the conditions choosing between the calls (presumably a runtime SSE2 check and
// m_is128) are on lines not visible in this view - confirm against the full file.
inline void VMAC_Base::VHASH_Update(const word64 *data, size_t blocksRemainingInWord64)
697
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
700
// Tag part 0 and tag part 1 are processed by separate calls over the same data.
VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
702
VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
703
m_isFirstBlock = false;
708
// On MSVC older than 1300 the function is called without a template argument list; otherwise
// the tag width is passed as the template argument.
#if defined(_MSC_VER) && _MSC_VER < 1300
709
VHASH_Update_Template(data, blocksRemainingInWord64);
712
VHASH_Update_Template<true>(data, blocksRemainingInWord64);
714
VHASH_Update_Template<false>(data, blocksRemainingInWord64);
719
// Hashes the largest prefix of `data` whose length is a multiple of m_L1KeyLength. `length` is
// divided by 8 to obtain the word64 count passed on, so it is presumably a byte count.
// The return statement is not visible in this view; presumably `remaining` is returned.
// NOTE(review): ModPowerOf2 suggests the modulus must be a power of two, while UncheckedSetKey
// only enforces a positive multiple of 128 (384 would pass). Confirm ModPowerOf2's contract or
// tighten the key-length check.
size_t VMAC_Base::HashMultipleBlocks(const word64 *data, size_t length)
721
size_t remaining = ModPowerOf2(length, m_L1KeyLength);
722
VHASH_Update(data, (length-remaining)/8);
726
// Final hashing stage: takes the 128-bit polynomial result (input[0] high, input[1] low), mixes
// in `len`, and combines the result with the two-word key l3Key using arithmetic modulo
// p64 = 2^64 - 257.
// NOTE(review): several statements (including the return) are on lines not visible in this
// view; presumably the reduced `rl` is returned - confirm against the full file.
static word64 L3Hash(const word64 *input, const word64 *l3Key, size_t len)
728
word64 rh, rl, t, z=0;
729
word64 p1 = input[0], p2 = input[1];
730
word64 k1 = l3Key[0], k2 = l3Key[1];
732
/* fully reduce (p1,p2)+(len,0) mod p127 */
735
ADD128(p1, p2, len, t);
736
/* At this point, (p1,p2) is at most 2^127+(len<<64) */
737
// t becomes nonzero when (p1,p2) is at or above 2^127 - 1, and is then added back in.
t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
738
ADD128(p1, p2, z, t);
741
/* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
744
t += (word32)t > 0xfffffffeU;
748
/* compute (p1+k1)%p64 and (p2+k2)%p64 */
750
// (0 - cond) is an all-ones mask when cond is true, so 257 is added only when p1 < k1
// (presumably detecting wrap-around of an addition on an elided line; 2^64 = 257 mod p64).
p1 += (0 - (p1 < k1)) & 257;
752
p2 += (0 - (p2 < k2)) & 257;
754
/* compute (p1+k1)*(p2+k2)%p64 */
755
MUL64(rh, rl, p1, p2);
757
ADD128(t, rl, z, rh);
759
ADD128(t, rl, z, rh);
762
// Final conditional corrections: add 257 when rl < t, and again when rl > p64-1.
rl += (0 - (rl < t)) & 257;
763
rl += (0 - (rl > p64-1)) & 257;
767
// Finishes the MAC and writes `size` bytes of tag to `mac`: hashes any buffered partial block
// (zero-padded to a multiple of 16 bytes), runs L3Hash on the polynomial state, and adds the
// pad words derived from the nonce.
// NOTE(review): the if/else lines that choose between the 128-bit and 64-bit paths, and between
// PutWord and memcpy output, are on lines not visible in this view.
void VMAC_Base::TruncatedFinal(byte *mac, size_t size)
769
// Bytes left over after the last full L1 block.
size_t len = ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);
773
// (0-len)%16 is the number of bytes needed to reach the next multiple of 16.
memset(m_data()+len, 0, (0-len)%16);
774
VHASH_Update(DataBuf(), ((len+15)/16)*2);
775
len *= 8; // convert to bits
777
else if (m_isFirstBlock)
779
// special case for empty string
780
// Copy each tag's polynomial key (words 2,3 / 6,7) into its accumulator (words 0,1 / 4,5).
m_polyState()[0] = m_polyState()[2];
781
m_polyState()[1] = m_polyState()[3];
784
m_polyState()[4] = m_polyState()[6];
785
m_polyState()[5] = m_polyState()[7];
792
// Two-word tag: each half is L3Hash of one state group plus one big-endian word of the pad.
t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad());
793
t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad()+8);
796
// Output variant 1: write both words big-endian straight into mac.
PutWord(false, BIG_ENDIAN_ORDER, mac, t[0]);
797
PutWord(false, BIG_ENDIAN_ORDER, mac+8, t[1]);
801
// Output variant 2: convert in place to big-endian and copy only `size` bytes.
t[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[0]);
802
t[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[1]);
803
memcpy(mac, t, size);
808
// One-word tag: the lowest bit of the last nonce byte selects which 8-byte half of the pad is
// added (Resynchronize shows the pad is computed with that bit cleared).
word64 t = L3Hash(m_polyState(), m_l3Key(), len);
809
t += GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad() + (m_nonce()[IVSize()-1]&1) * 8);
811
PutWord(false, BIG_ENDIAN_ORDER, mac, t);
814
t = ConditionalByteReverse(BIG_ENDIAN_ORDER, t);
815
memcpy(mac, &t, size);