/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 *
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Routines for 64-bit Galois fields
 */
#define GF_FIELD_WIDTH (64)
#define GF_FIRST_BIT (1ULL << 63)

#define GF_BASE_FIELD_WIDTH (32)
#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
#define GF_BASE_FIELD_GROUP_SIZE (GF_BASE_FIELD_SIZE-1)
struct gf_w64_group_data {
  uint64_t *reduce;   /* reduction table, built once in gf_w64_group_init() */
  uint64_t *shift;    /* shift table, rebuilt for each multiplier */
  uint64_t *memory;   /* start of the table storage that follows this struct */
};

struct gf_split_4_64_lazy_data {
  uint64_t tables[16][16];
  uint64_t last_value;
};

struct gf_split_8_64_lazy_data {
  uint64_t tables[8][(1<<8)];
  uint64_t last_value;
};

struct gf_split_16_64_lazy_data {
  uint64_t tables[4][(1<<16)];
  uint64_t last_value;
};

struct gf_split_8_8_data {
  uint64_t tables[15][256][256];
};
static
gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
{
  return gf->divide.w64(gf, 1, a);
}

/* Debugging aid: dump the 16 bytes of an SSE register, high byte first,
   grouped in fours. */
#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }

static
gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  b = gf->inverse.w64(gf, b);
  return gf->multiply.w64(gf, a, b);
}
static
void
gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  size_t i;
  gf_val_64_t *s64;
  gf_val_64_t *d64;

  s64 = (gf_val_64_t *) src;
  d64 = (gf_val_64_t *) dest;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  if (xor) {
    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
      d64[i] ^= gf->multiply.w64(gf, val, s64[i]);
    }
  } else {
    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
      d64[i] = gf->multiply.w64(gf, val, s64[i]);
    }
  }
}
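/* Illustrative usage (a sketch, not part of the library): with a gf_t
 * initialized for w=64 via gf_init_easy() from gf_complete.h, a region op
 * applies one scalar multiplier across a whole buffer:
 *
 *     gf_t gf;
 *     gf_init_easy(&gf, 64);
 *     gf.multiply_region.w64(&gf, src, dest, multiplier, size_in_bytes, 0);
 *
 * Passing xor=1 instead of 0 XORs the products into dest, which is the
 * mode erasure-coding loops typically use. */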
#if defined(INTEL_SSE4_PCLMUL)

static
void
gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_region_data rd;
  gf_val_64_t *s64, *d64, *top;
  __m128i a, b, prim_poly, w, result, r1;
  __m128i m1, m2, m3, m4;
  gf_internal_t * h = gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
  /* m1..m4 select successively higher 32-bit lanes of the 128-bit product */
  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
  m2 = _mm_slli_si128(m1, 4);
  m2 = _mm_or_si128(m1, m2);
  m3 = _mm_slli_si128(m1, 8);
  m4 = _mm_slli_si128(m3, 4);

  s64 = (gf_val_64_t *) rd.s_start;
  d64 = (gf_val_64_t *) rd.d_start;
  top = (gf_val_64_t *) rd.d_top;

  if (xor) {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);

      /* Multiply the high 64-bit word by val, then fold the overflow
         words back down with the primitive polynomial. */
      result = _mm_clmulepi64_si128 (a, b, 1);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      /* Same for the low 64-bit word. */
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      r1 = _mm_load_si128((__m128i *) d64);
      result = _mm_xor_si128(r1, result);
      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  } else {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);

      result = _mm_clmulepi64_si128 (a, b, 1);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
#if defined(INTEL_SSE4_PCLMUL)

static
void
gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_region_data rd;
  gf_val_64_t *s64, *d64, *top;
  __m128i a, b, prim_poly, w, result, r1;
  __m128i m1, m3, m4;
  gf_internal_t * h = gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
  m3 = _mm_slli_si128(m1, 8);
  m4 = _mm_slli_si128(m3, 4);

  s64 = (gf_val_64_t *) rd.s_start;
  d64 = (gf_val_64_t *) rd.d_start;
  top = (gf_val_64_t *) rd.d_top;

  if (xor) {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);

      result = _mm_clmulepi64_si128 (a, b, 1);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      r1 = _mm_load_si128((__m128i *) d64);
      result = _mm_xor_si128(r1, result);
      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  } else {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);

      result = _mm_clmulepi64_si128 (a, b, 1);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
static
gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
{
  gf_val_64_t e_i, e_im1, e_ip1;
  gf_val_64_t d_i, d_im1, d_ip1;
  gf_val_64_t y_i, y_im1, y_ip1;
  gf_val_64_t c_i, one = 1;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i = b;
  d_im1 = 64;
  for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ;   /* degree of e_i */
  y_im1 = 0;
  y_i = 1;

  while (e_i != 1) {
    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    /* Divide e_im1 by e_i: c_i accumulates the quotient, e_ip1 the remainder */
    while (d_ip1 >= d_i) {
      c_i ^= (one << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
      d_ip1--;
      if (e_ip1 == 0) return 0;              /* b is not invertible */
      while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--;
    }

    y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i);   /* Bezout update */
    y_im1 = y_i;  y_i = y_ip1;
    e_im1 = e_i;  e_i = e_ip1;
    d_im1 = d_i;  d_i = d_ip1;
  }
  return y_i;
}
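/* A worked instance of the recurrence in the small field GF(2^4) with
 * p(x) = x^4 + x + 1 (0x13), for intuition (not library code): to invert
 * b = x (0x2), divide p by b.  The quotient is c = x^3 + 1 (0x9) and the
 * remainder is 1, so the loop exits after one step with
 * y = 0 ^ c*1 = 0x9.  Indeed x * (x^3 + 1) = x^4 + x = (x + 1) + x = 1
 * (mod p).  The code above runs the same recurrence with p = prim_poly. */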
/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
   include it for completeness.  It does have the feature that it requires no
   extra memory. */

static
gf_val_64_t
gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
  uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  a = a64;
  bl = 0;
  br = b64;
  one = 1;
  lbit = (one << 63);     /* Allen: set leading one of primitive polynomial */

  pl = 0; /* Allen: left side of product */
  pr = 0; /* Allen: right side of product */

  /* Allen: unlike the corresponding functions for smaller word sizes,
   * this loop carries out the initial carryless multiply by
   * shifting b itself rather than simply looking at successively
   * higher shifts of b */

  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (one << i)) { pl ^= bl; pr ^= br; }
    bl <<= 1;
    if (br & lbit) bl ^= 1;
    br <<= 1;
  }

  /* Allen: the name of the variable "one" is no longer descriptive at this point */

  one = lbit >> 1;
  ppl = (h->prim_poly >> 2) | one;
  ppr = (h->prim_poly << (GF_FIELD_WIDTH-2));
  while (one != 0) {
    if (pl & one) { pl ^= ppl; pr ^= ppr; }
    one >>= 1;
    ppr >>= 1;
    if (ppl & 1) ppr ^= lbit;
    ppl >>= 1;
  }
  return pr;
}
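/* For intuition, the same two-phase idea (carryless multiply, then fold the
   high half back down with the polynomial) in GF(2^8) with the AES polynomial
   0x11b -- an illustrative sketch, not used by the library: */

static inline uint8_t
gf_w8_shift_multiply_sketch (uint8_t a, uint8_t b)
{
  uint16_t prod = 0;
  int i;

  for (i = 0; i < 8; i++)             /* carryless (XOR) multiply */
    if (a & (1 << i)) prod ^= ((uint16_t) b) << i;
  for (i = 14; i >= 8; i--)           /* reduce mod x^8+x^4+x^3+x+1 */
    if (prod & (1 << i)) prod ^= 0x11b << (i - 8);
  return (uint8_t) prod;
}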
/*
 * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
 */

static
inline
gf_val_64_t
gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
  gf_val_64_t rv = 0;

#if defined(INTEL_SSE4_PCLMUL)
  __m128i a, b, v, w, result, prim_poly;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
  b = _mm_insert_epi64 (a, b64, 0);
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Mask off the high order 32 bits using subtraction of the polynomial.
   * NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
   */

  /* Adam: We can't include the leading one in the 64-bit pclmul,
     so we need to split up the high 8 bytes of the result into two
     parts before we multiply them with the prim_poly. */

  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);
  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);

  rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
#endif
  return rv;
}
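/* What the PCLMUL sequence above computes, as a plain-C reference sketch
   (assumes a compiler with unsigned __int128, e.g. GCC/Clang; illustrative
   only, not used by the library): */

#if defined(__SIZEOF_INT128__)
static inline uint64_t
gf_w64_clm_reference_sketch (uint64_t a, uint64_t b, uint64_t pp)
{
  unsigned __int128 prod = 0, poly;
  int i;

  for (i = 0; i < 64; i++)                      /* carryless multiply */
    if ((b >> i) & 1) prod ^= ((unsigned __int128) a) << i;

  poly = (((unsigned __int128) 1) << 64) | pp;  /* x^64 + pp, leading one included */
  for (i = 127; i >= 64; i--)                   /* reduce mod the field polynomial */
    if ((prod >> i) & 1) prod ^= poly << (i - 64);
  return (uint64_t) prod;
}
#endif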
static
inline
gf_val_64_t
gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
  gf_val_64_t rv = 0;

#if defined(INTEL_SSE4_PCLMUL)
  __m128i a, b, v, w, result, prim_poly;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
  b = _mm_insert_epi64 (a, b64, 0);
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Two passes of the two-step fold, for polynomials with fewer leading
     zero bits than gf_w64_clm_multiply_2 requires. */

  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);
  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);

  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);
  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);

  rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
#endif
  return rv;
}
static
void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
#if defined(INTEL_SSE4_PCLMUL)
  gf_internal_t *h;
  uint8_t *s8, *d8, *dtop;
  gf_region_data rd;
  __m128i v, b, m, prim_poly, c, fr, w, result;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;
  dtop = (uint8_t *) rd.d_top;

  v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
  m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));

  if (xor) {
    while (d8 != dtop) {
      /* low 64-bit word: multiply, then fold the overflow twice */
      b = _mm_load_si128((__m128i *) s8);
      result = _mm_clmulepi64_si128 (b, v, 0);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      fr = _mm_xor_si128 (result, w);
      fr = _mm_and_si128 (fr, m);

      /* high 64-bit word */
      result = _mm_clmulepi64_si128 (b, v, 1);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      result = _mm_slli_si128 (result, 8);
      fr = _mm_xor_si128 (result, fr);
      result = _mm_load_si128((__m128i *) d8);
      fr = _mm_xor_si128 (result, fr);

      _mm_store_si128((__m128i *) d8, fr);
      d8 += 16;
      s8 += 16;
    }
  } else {
    while (d8 != dtop) {
      b = _mm_load_si128((__m128i *) s8);
      result = _mm_clmulepi64_si128 (b, v, 0);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      fr = _mm_xor_si128 (result, w);
      fr = _mm_and_si128 (fr, m);

      result = _mm_clmulepi64_si128 (b, v, 1);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      result = _mm_slli_si128 (result, 8);
      fr = _mm_xor_si128 (result, fr);

      _mm_store_si128((__m128i *) d8, fr);
      d8 += 16;
      s8 += 16;
    }
  }
  gf_do_final_region_alignment(&rd);
#endif
}
static
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_4_64_lazy_data *ld;
  int i, j, k;
  uint64_t pp, v, s, *s64, *d64, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  ld = (struct gf_split_4_64_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  /* Lazily (re)build the 16 nibble tables: tables[i][x] = val * (x << 4i) */
  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 16; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
    ld->last_value = val;
  }

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  while (d64 != top) {
    v = (xor) ? *d64 : 0;
    s = *s64;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xf];
      s >>= 4;
      i++;
    }
    *d64 = v;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}
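/* A scalar sketch of why the nibble tables suffice (illustrative only): by
   linearity of GF multiplication over XOR, val*s is the XOR over the sixteen
   nibbles of s of val*(nibble << 4i), and each position has only 16 possible
   nibble values -- hence 16 tables of 16 entries. */
static uint64_t
gf_w64_split_4_64_sketch (gf_t *gf, uint64_t val, uint64_t s)
{
  uint64_t tables[16][16], p;
  int i, x;

  for (i = 0; i < 16; i++)
    for (x = 0; x < 16; x++)
      tables[i][x] = gf->multiply.w64(gf, val, ((uint64_t) x) << (4*i));

  p = 0;
  for (i = 0; i < 16; i++) {
    p ^= tables[i][s & 0xf];
    s >>= 4;
  }
  return p;
}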
static
uint64_t
gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64)
{
  uint64_t product, i, j, mask, tb;
  gf_internal_t *h;
  struct gf_split_8_8_data *d8;

  h = (gf_internal_t *) gf->scratch;
  d8 = (struct gf_split_8_8_data *) h->private;
  product = 0;
  mask = 0xff;

  for (i = 0; a64 != 0; i++) {
    tb = b64;
    for (j = 0; tb != 0; j++) {
      product ^= d8->tables[i+j][a64&mask][tb&mask];
      tb >>= 8;
    }
    a64 >>= 8;
  }
  return product;
}
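/* The identity behind the 8,8 split: write a = sum_i a_i x^{8i} and
 * b = sum_j b_j x^{8j} with byte coefficients.  Then
 * a*b = sum_{i,j} (a_i * b_j) x^{8(i+j)}, so fifteen 256x256 tables,
 * one per byte-shift k = i+j in 0..14, with tables[k][x][y] holding the
 * field product (x*y) shifted up by 8k bits and reduced, cover every term. */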
static
void
gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_8_64_lazy_data *ld;
  int i, j, k;
  uint64_t pp, v, s, *s64, *d64, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  ld = (struct gf_split_8_64_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  /* Eight lazy byte tables: tables[i][x] = val * (x << 8i) */
  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 8; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < 256; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
    ld->last_value = val;
  }

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  while (d64 != top) {
    v = (xor) ? *d64 : 0;
    s = *s64;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xff];
      s >>= 8;
      i++;
    }
    *d64 = v;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_16_64_lazy_data *ld;
  int i, j, k;
  uint64_t pp, v, s, *s64, *d64, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  ld = (struct gf_split_16_64_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  /* Four lazy 16-bit tables: tables[i][x] = val * (x << 16i) */
  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 4; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < (1<<16); j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
    ld->last_value = val;
  }

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  while (d64 != top) {
    v = (xor) ? *d64 : 0;
    s = *s64;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xffff];
      s >>= 16;
      i++;
    }
    *d64 = v;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}
static
int gf_w64_shift_init(gf_t *gf)
{
  gf->multiply.w64 = gf_w64_shift_multiply;
  gf->inverse.w64 = gf_w64_euclid;
  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
  return 1;
}
static
int gf_w64_cfm_init(gf_t *gf)
{
  gf->inverse.w64 = gf_w64_euclid;
  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;

#if defined(INTEL_SSE4_PCLMUL)
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  /* Pick the reduction depth from the polynomial: the more leading zero
     bits prim_poly has, the fewer folding steps the multiply needs. */
  if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
    gf->multiply.w64 = gf_w64_clm_multiply_2;
    gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
  }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
    gf->multiply.w64 = gf_w64_clm_multiply_4;
    gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
  } else {
    return 0;
  }
  return 1;
#else
  return 0;
#endif
}
static
void
gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
{
  uint64_t i, j;
  uint64_t one = 1;
  int g_s;

  g_s = h->arg1;
  shift[0] = 0;

  /* shift[c] = val * c for every g_s-bit chunk c, doubling val with
     reduction as each new bit comes in. */
  for (i = 1; i < ((uint64_t)1 << g_s); i <<= 1) {
    for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
    if (val & (one << 63)) {
      val <<= 1;
      val ^= h->prim_poly;
    } else {
      val <<= 1;
    }
  }
}
static
gf_val_64_t
gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  uint64_t top, bot, mask, tp;
  int g_s, g_r, lshift, rshift;
  struct gf_w64_group_data *gd;

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;
  g_r = h->arg2;
  gd = (struct gf_w64_group_data *) h->private;
  gf_w64_group_set_shift_tables(gd->shift, b, h);

  mask = ((1 << g_s) - 1);
  top = 0;
  bot = gd->shift[a&mask];
  a >>= g_s;

  if (a == 0) return bot;

  lshift = 0;
  rshift = 64;

  do { /* Shifting out is straightforward */
    lshift += g_s;
    rshift -= g_s;
    tp = gd->shift[a&mask];
    top ^= (tp >> rshift);
    bot ^= (tp << lshift);
    a >>= g_s;
  } while (a != 0);

  /* Reducing is a bit gross, because I don't zero out the index bits of top.
     The reason is that we throw top away.  Even better, that last (tp >> rshift)
     is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */

  lshift = ((lshift-1) / g_r) * g_r;
  rshift = 64 - lshift;
  mask = (1 << g_r) - 1;
  while (lshift >= 0) {
    tp = gd->reduce[(top >> lshift) & mask];
    top ^= (tp >> rshift);
    bot ^= (tp << lshift);
    lshift -= g_r;
    rshift += g_r;
  }
  return bot;
}
static
void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  int i, fzb;
  uint64_t a64, smask, rmask, top, bot, tp;
  int lshift, rshift, g_s, g_r;
  gf_region_data rd;
  uint64_t *s64, *d64, *dtop;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gd = (struct gf_w64_group_data *) h->private;
  g_s = h->arg1;
  g_r = h->arg2;
  gf_w64_group_set_shift_tables(gd->shift, val, h);

  for (i = 63; !(val & (1ULL << i)); i--) ;
  i += g_s;

  /* i is the bit position of the first zero bit in any element of
     gd->shift[] */

  if (i > 64) i = 64;
  fzb = i;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  dtop = (uint64_t *) rd.d_top;

  smask = (1 << g_s) - 1;
  rmask = (1 << g_r) - 1;

  while (d64 < dtop) {
    a64 = *s64;

    top = 0;
    bot = gd->shift[a64&smask];
    a64 >>= g_s;

    if (a64 != 0) {
      i = fzb;           /* running bound on the degree of (top:bot) */
      lshift = 0;
      rshift = 64;
      while (a64 != 0) {
        lshift += g_s;
        rshift -= g_s;
        i += g_s;
        tp = gd->shift[a64&smask];
        top ^= (tp >> rshift);
        bot ^= (tp << lshift);
        a64 >>= g_s;
      }

      /* Fold top back into bot, g_r bits at a time */
      lshift = ((i-64-1) / g_r) * g_r;
      rshift = 64 - lshift;
      while (lshift >= 0) {
        tp = gd->reduce[(top >> lshift) & rmask];
        top ^= (tp >> rshift);
        bot ^= (tp << lshift);
        lshift -= g_r;
        rshift += g_r;
      }
    }

    if (xor) bot ^= *d64;
    *d64 = bot;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}
static
gf_val_64_t
gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  int leftover, rs;
  uint64_t p, l, ind, a64;
  int bits_left;
  int g_s;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;

  gd = (struct gf_w64_group_data *) h->private;
  gf_w64_group_set_shift_tables(gd->shift, b, h);

  leftover = 64 % g_s;
  if (leftover == 0) leftover = g_s;

  rs = 64 - leftover;
  a64 = a;
  ind = a64 >> rs;
  a64 <<= leftover;
  p = gd->shift[ind];

  bits_left = rs;
  rs = 64 - g_s;

  while (bits_left > 0) {
    bits_left -= g_s;
    ind = a64 >> rs;
    a64 <<= g_s;
    l = p >> rs;
    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
  }
  return p;
}
static
void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  int leftover, rs;
  uint64_t p, l, ind, a64;
  int bits_left;
  int g_s;
  gf_region_data rd;
  uint64_t *s64, *d64, *top;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  g_s = h->arg1;
  gd = (struct gf_w64_group_data *) h->private;
  gf_w64_group_set_shift_tables(gd->shift, val, h);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  leftover = 64 % g_s;
  if (leftover == 0) leftover = g_s;

  while (d64 < top) {
    rs = 64 - leftover;
    a64 = *s64;
    ind = a64 >> rs;
    a64 <<= leftover;
    p = gd->shift[ind];

    bits_left = rs;
    rs = 64 - g_s;

    while (bits_left > 0) {
      bits_left -= g_s;
      ind = a64 >> rs;
      a64 <<= g_s;
      l = p >> rs;
      p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
    }
    if (xor) p ^= *d64;
    *d64 = p;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}
static
int gf_w64_group_init(gf_t *gf)
{
  uint64_t i, j, p, index;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  uint64_t g_s, g_r;

  g_s = h->arg1;
  g_r = h->arg2;

  gd = (struct gf_w64_group_data *) h->private;
  gd->shift = (uint64_t *) (&(gd->memory));
  gd->reduce = gd->shift + (1 << g_s);

  gd->reduce[0] = 0;
  for (i = 0; i < ((uint64_t)1 << g_r); i++) {
    p = 0;
    index = 0;
    for (j = 0; j < g_r; j++) {
      if (i & (1ULL << j)) {
        p ^= (h->prim_poly << j);
        index ^= (1ULL << j);
        if (j > 0) index ^= (h->prim_poly >> (64-j));
      }
    }
    index &= (((uint64_t)1 << g_r) - 1);
    gd->reduce[index] = p;
  }

  if (g_s == g_r) {
    gf->multiply.w64 = gf_w64_group_s_equals_r_multiply;
    gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region;
  } else {
    gf->multiply.w64 = gf_w64_group_multiply;
    gf->multiply_region.w64 = gf_w64_group_multiply_region;
  }
  gf->divide.w64 = NULL;
  gf->inverse.w64 = gf_w64_euclid;

  return 1;
}
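/* Note on the tables built above: gd->shift[c] holds the field product
   val*c for every g_s-bit chunk c, and gd->reduce[] conceptually maps each
   g_r-bit pattern v to (v * x^64) mod prim_poly (modulo the index quirk
   noted in gf_w64_group_multiply), so both the shift-out and the fold-back
   phases of the GROUP multiplies are pure table lookups. */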
static
gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  uint64_t *r64, rv;

  r64 = (uint64_t *) start;
  rv = r64[index];
  return rv;
}
static
gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
  uint64_t a, b, *r64;
  gf_region_data rd;

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r64 = (uint64_t *) start;
  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
  index -= (((uint64_t *) rd.d_start) - r64);
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
  return (a | ((uint64_t)b << 32));
}
static
gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int i;
  uint64_t *r64, rv;
  uint8_t *r8;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128);
  r64 = (uint64_t *) start;
  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
  index -= (((uint64_t *) rd.d_start) - r64);
  r8 = (uint8_t *) rd.d_start;
  r8 += ((index & 0xfffffff0)*8);   /* 128-byte chunk holding this word */
  r8 += (index & 0xf);              /* byte slot within each 16-byte plane */
  r8 += 112;                        /* start at the most significant plane */
  rv = 0;
  for (i = 0; i < 8; i++) {
    rv <<= 8;
    rv |= *r8;
    r8 -= 16;
  }
  return rv;
}
static
inline
gf_val_64_t
gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  uint64_t prod, pp, bmask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  bmask = 0x8000000000000000ULL;

  while (1) {
    if (a & 1) prod ^= b;
    a >>= 1;
    if (a == 0) return prod;
    if (b & bmask) {
      b = ((b << 1) ^ pp);
    } else {
      b <<= 1;
    }
  }
}
static
inline
gf_val_64_t
gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  uint64_t prod, pp, pmask, amask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;

  /* changed from declare then shift to just declare.*/

  pmask = 0x8000000000000000ULL;
  amask = 0x8000000000000000ULL;

  while (amask != 0) {
    if (prod & pmask) {
      prod = ((prod << 1) ^ pp);
    } else {
      prod <<= 1;
    }
    if (a & amask) prod ^= b;
    amask >>= 1;
  }
  return prod;
}
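/* The two BYTWO variants differ in what they double: BYTWO_p doubles the
   accumulated product and scans the bits of a from the top (Horner's rule),
   so it always runs 64 steps; BYTWO_b above doubles b and scans a from the
   bottom, exiting as soon as a runs out of bits, which makes it cheaper for
   small multipliers. */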
static
void
gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, ta, prod, amask, pmask, pp;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  pmask = 0x8000000000000000ULL;
  pp = h->prim_poly;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = pmask;
      ta = *s64;
      while (amask != 0) {
        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = pmask;
      ta = *s64;
      while (amask != 0) {
        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, ta, tb, prod, bmask, pp;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  bmask = 0x8000000000000000ULL;
  pp = h->prim_poly;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      tb = val;
      ta = *s64;
      while (1) {
        if (tb & 1) prod ^= ta;
        tb >>= 1;
        if (tb == 0) break;
        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      tb = val;
      ta = *s64;
      while (1) {
        if (tb & 1) prod ^= ta;
        tb >>= 1;
        if (tb == 0) break;
        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
/* Multiply both 64-bit lanes of va by x ("times two" in the field):
   t1 = (va << 1); t2 becomes an all-ones mask in each lane whose top bit
   was set (0 - 1 = ~0), selecting pp for the reduction. */
#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
          t2 = _mm_and_si128(va, m2); \
          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
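/* Scalar equivalent of one SSE_AB2 lane, for reference (illustrative only): */
static inline uint64_t
gf_w64_ab2_sketch (uint64_t pp, uint64_t va)
{
  uint64_t mask = (va & GF_FIRST_BIT) ? ~(uint64_t)0 : 0;  /* overflow? */
  return ((va << 1) ^ (mask & pp));                        /* reduce if so */
}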
/* One step of the BYTWO_p loop: prod = prod*2 (SSE_AB2), then XOR in ta.
   (v&1)-1 is 0 when the low bit of v is 1 and all-ones when it is 0; the
   complemented bit order set up in vrev below makes that select ta exactly
   when the corresponding bit of val is 1. */
#define BYTWO_P_ONESTEP {\
      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
      t1 = _mm_and_si128(v, one); \
      t1 = _mm_sub_epi64(t1, one); \
      t1 = _mm_and_si128(t1, ta); \
      prod = _mm_xor_si128(prod, t1); \
      v = _mm_srli_epi64(v, 1); }
static
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  uint64_t vrev, one64, amask;
  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;
  one64 = 1;

  /* Reverse and complement the bits of val (see BYTWO_P_ONESTEP above) */
  vrev = 0;
  for (i = 0; i < 64; i++) {
    vrev <<= 1;
    if (!(val & (one64 << i))) vrev |= 1;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  amask = ~one64;                    /* all ones except bit 0 */
  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);
  one = _mm_set1_epi64x(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi64x(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
    d8 += 16;
    s8 += 16;
  }
  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
{
  uint64_t one64, amask;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  gf_internal_t *h;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  h = (gf_internal_t *) rd->gf->scratch;
  one64 = 1;
  amask = ~one64;

  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
static
void
gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
{
  uint64_t one64, amask;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va;
  gf_internal_t *h;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  h = (gf_internal_t *) rd->gf->scratch;
  one64 = 1;
  amask = ~one64;

  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
static
void
gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint64_t itb, amask, one64;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  /* Multiplying by 2 is a single AB2 pass, so special-case it: */
  if (val == 2) {
    if (xor) {
      gf_w64_bytwo_b_sse_region_2_xor(&rd);
    } else {
      gf_w64_bytwo_b_sse_region_2_noxor(&rd);
    }
    gf_do_final_region_alignment(&rd);
    return;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;
  h = (gf_internal_t *) gf->scratch;

  one64 = 1;
  amask = ~one64;
  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);

  while (d8 < (uint8_t *) rd.d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
    itb = val;
    while (1) {
      if (itb & 1) vb = _mm_xor_si128(vb, va);
      itb >>= 1;
      if (itb == 0) break;
      SSE_AB2(pp, m1, m2, va, t1, t2);
    }
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }

  gf_do_final_region_alignment(&rd);
}
static
int gf_w64_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_BYTWO_p) {
    gf->multiply.w64 = gf_w64_bytwo_p_multiply;
#ifdef INTEL_SSE2
    if (h->region_type & GF_REGION_NOSSE)
      gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
    else
      gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region;
#else
    gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
    if(h->region_type & GF_REGION_SSE)
      return 0;
#endif
  } else {
    gf->multiply.w64 = gf_w64_bytwo_b_multiply;
#ifdef INTEL_SSE2
    if (h->region_type & GF_REGION_NOSSE)
      gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
    else
      gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region;
#else
    gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
    if(h->region_type & GF_REGION_SSE)
      return 0;
#endif
  }
  gf->inverse.w64 = gf_w64_euclid;
  return 1;
}
static
gf_val_64_t
gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t b0 = b & 0x00000000ffffffff;
  uint32_t b1 = (b & 0xffffffff00000000ULL) >> 32;
  uint32_t a0 = a & 0x00000000ffffffff;
  uint32_t a1 = (a & 0xffffffff00000000ULL) >> 32;
  uint32_t a1b1;

  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
          ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
}
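/* Why this works: treat a = a1*X + a0 and b = b1*X + b0 as degree-1
   polynomials over GF(2^32), with X^2 reduced by p(X) = X^2 + sX + 1
   (s = h->prim_poly here).  Then
     a*b = a1b1*X^2 + (a1b0 + a0b1)*X + a0b0
         = (a1b0 + a0b1 + s*a1b1)*X + (a0b0 + a1b1),
   which is exactly the pair of 32-bit halves assembled above. */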
/*
 * Composite field division trick (explained in 2007 tech report)
 *
 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
 *
 * let c = b^-1
 *
 * c*b = (s*b1c1 + b1c0 + b0c1)x + (b1c1 + b0c0)
 *
 * want (s*b1c1 + b1c0 + b0c1) = 0 and (b1c1 + b0c0) = 1
 *
 * let d = b1c1 and d+1 = b0c0
 *
 * solve s*b1c1 + b1c0 + b0c1 = 0
 *
 * solution: d = (b1b0^-1)(b1b0^-1 + b0b1^-1 + s)^-1
 *
 * c0 = (d+1)b0^-1
 * c1 = d*b1^-1
 */
static
gf_val_64_t
gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t a0 = a & 0x00000000ffffffff;
  uint32_t a1 = (a & 0xffffffff00000000ULL) >> 32;
  uint32_t c0, c1, d, tmp;
  uint64_t c;
  uint32_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w32(base_gf, a1);
    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w32(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w32(base_gf, a1);
    a0inv = base_gf->inverse.w32(base_gf, a0);

    d = base_gf->multiply.w32(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w32(base_gf, tmp);

    d = base_gf->multiply.w32(base_gf, d, tmp);

    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
  }

  c = c0 | ((uint64_t)c1 << 32);

  return c;
}
static
void
gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t b0 = val & 0x00000000ffffffff;
  uint32_t b1 = (val & 0xffffffff00000000ULL) >> 32;
  uint64_t *s64, *d64, *top;
  gf_region_data rd;
  uint64_t a0, a1, a1b1;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (xor) {
    while (d64 < top) {
      a0 = *s64 & 0x00000000ffffffff;
      a1 = (*s64 & 0xffffffff00000000ULL) >> 32;
      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

      *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
               ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
      s64++;
      d64++;
    }
  } else {
    while (d64 < top) {
      a0 = *s64 & 0x00000000ffffffff;
      a1 = (*s64 & 0xffffffff00000000ULL) >> 32;
      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

      *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
              ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
      s64++;
      d64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  gf_val_32_t val0 = val & 0x00000000ffffffff;
  gf_val_32_t val1 = (val & 0xffffffff00000000ULL) >> 32;
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;
  int sub_reg_size;
  gf_region_data rd;

  if (!xor) memset(dest, 0, bytes);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;

  /* In ALTMAP layout the low words live in the first half of the region and
     the high words in the second, so each term of the composite product is
     one base-field region op: */
  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_do_final_region_alignment(&rd);
}
static
int gf_w64_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP) {
    gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt;
  } else {
    gf->multiply_region.w64 = gf_w64_composite_multiply_region;
  }

  gf->multiply.w64 = gf_w64_composite_multiply;
  gf->divide.w64 = NULL;
  gf->inverse.w64 = gf_w64_composite_inverse;

  return 1;
}
#ifdef INTEL_SSSE3
static
void
gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v, *s64, *d64, *top;
  __m128i si, tables[16][8], p[8], v0, mask1;
  struct gf_split_4_64_lazy_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  ld = (struct gf_split_4_64_lazy_data *) h->private;

  v = val;
  for (i = 0; i < 16; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
    /* Re-slice each 16-entry table into 8 byte-plane tables for PSHUFB */
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
    }
  }

  mask1 = _mm_set1_epi8(0xf);

  while (d64 != top) {

    if (xor) {
      for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
    } else {
      for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
    }

    i = 0;
    for (k = 0; k < 8; k++) {
      v0 = _mm_load_si128((__m128i *) s64);
      /* MM_PRINT8("v", v0); */
      s64 += 2;

      si = _mm_and_si128(v0, mask1);

      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
    }

    for (i = 0; i < 8; i++) {
      /* MM_PRINT8("v", p[i]); */
      _mm_store_si128((__m128i *) d64, p[i]);
      d64 += 2;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
#ifdef INTEL_SSE4
static
void
gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v, *s64, *d64, *top;
  __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1;
  struct gf_split_4_64_lazy_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  ld = (struct gf_split_4_64_lazy_data *) h->private;

  v = val;
  for (i = 0; i < 16; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
    }
  }

  mask1 = _mm_set1_epi8(0xf);
  mask8 = _mm_set1_epi16(0xff);
  mask16 = _mm_set1_epi32(0xffff);

  while (d64 != top) {

    for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();

    for (k = 0; k < 8; k++) {
      st[k] = _mm_load_si128((__m128i *) s64);
      s64 += 2;
    }

    /* Pack pass 1: separate even and odd 32-bit words */
    for (k = 0; k < 4; k ++) {
      st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0));
      st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1));
      t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0);
      st[k] = _mm_srli_si128(st[k], 8);
      st[k+4] = _mm_slli_si128(st[k+4], 8);
      st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0);
      st[k] = t1;
    }

/*
    printf("After pack pass 1\n");
    for (k = 0; k < 8; k++) {
      MM_PRINT8("v", st[k]);
    }
 */

    /* Pack pass 2: separate even and odd 16-bit words */
    t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16));
    st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16));
    st[0] = t1;

    t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16));
    st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16));
    st[1] = t1;

    t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16));
    st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16));
    st[4] = t1;

    t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16));
    st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16));
    st[5] = t1;

/*
    printf("After pack pass 2\n");
    for (k = 0; k < 8; k++) {
      MM_PRINT8("v", st[k]);
    }
 */

    /* Pack pass 3: separate even and odd bytes, leaving byte plane b of all
       sixteen source words in st[b] */
    t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8));
    st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8));
    st[0] = t1;

    t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8));
    st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8));
    st[2] = t1;

    t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8));
    st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8));
    st[4] = t1;

    t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8));
    st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8));
    st[6] = t1;

/*
    printf("After final pack pass\n");
    for (k = 0; k < 8; k++) {
      MM_PRINT8("v", st[k]);
    }
 */

    /* Now do the 4-bit split multiply on each byte plane */
    i = 0;
    for (k = 0; k < 8; k++) {
      si = _mm_and_si128(st[k], mask1);

      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
      st[k] = _mm_srli_epi32(st[k], 4);
      si = _mm_and_si128(st[k], mask1);
      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
    }

    /* Unpack pass 1 */
    t1 = _mm_unpacklo_epi8(p[0], p[1]);
    p[1] = _mm_unpackhi_epi8(p[0], p[1]);
    p[0] = t1;
    t1 = _mm_unpacklo_epi8(p[2], p[3]);
    p[3] = _mm_unpackhi_epi8(p[2], p[3]);
    p[2] = t1;
    t1 = _mm_unpacklo_epi8(p[4], p[5]);
    p[5] = _mm_unpackhi_epi8(p[4], p[5]);
    p[4] = t1;
    t1 = _mm_unpacklo_epi8(p[6], p[7]);
    p[7] = _mm_unpackhi_epi8(p[6], p[7]);
    p[6] = t1;

/*
    printf("After unpack pass 1:\n");
    for (i = 0; i < 8; i++) {
      MM_PRINT8("v", p[i]);
    }
 */

    /* Unpack pass 2 */
    t1 = _mm_unpacklo_epi16(p[0], p[2]);
    p[2] = _mm_unpackhi_epi16(p[0], p[2]);
    p[0] = t1;
    t1 = _mm_unpacklo_epi16(p[1], p[3]);
    p[3] = _mm_unpackhi_epi16(p[1], p[3]);
    p[1] = t1;
    t1 = _mm_unpacklo_epi16(p[4], p[6]);
    p[6] = _mm_unpackhi_epi16(p[4], p[6]);
    p[4] = t1;
    t1 = _mm_unpacklo_epi16(p[5], p[7]);
    p[7] = _mm_unpackhi_epi16(p[5], p[7]);
    p[5] = t1;

/*
    printf("After unpack pass 2:\n");
    for (i = 0; i < 8; i++) {
      MM_PRINT8("v", p[i]);
    }
 */

    /* Unpack pass 3: p[i] now holds consecutive output words in order */
    t1 = _mm_unpacklo_epi32(p[0], p[4]);
    p[4] = _mm_unpackhi_epi32(p[0], p[4]);
    p[0] = t1;
    t1 = _mm_unpacklo_epi32(p[1], p[5]);
    p[5] = _mm_unpackhi_epi32(p[1], p[5]);
    p[1] = t1;
    t1 = _mm_unpacklo_epi32(p[2], p[6]);
    p[6] = _mm_unpackhi_epi32(p[2], p[6]);
    p[2] = t1;
    t1 = _mm_unpacklo_epi32(p[3], p[7]);
    p[7] = _mm_unpackhi_epi32(p[3], p[7]);
    p[3] = t1;

    if (xor) {
      for (i = 0; i < 8; i++) {
        t1 = _mm_load_si128((__m128i *) d64);
        _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1));
        d64 += 2;
      }
    } else {
      for (i = 0; i < 8; i++) {
        _mm_store_si128((__m128i *) d64, p[i]);
        d64 += 2;
      }
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
/* Doubles p in the field; note that it expects a gf_internal_t *h in scope. */
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
static
int gf_w64_split_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_split_4_64_lazy_data *d4;
  struct gf_split_8_64_lazy_data *d8;
  struct gf_split_8_8_data *d88;
  struct gf_split_16_64_lazy_data *d16;
  uint64_t p, basep;
  int exp, i, j;

  h = (gf_internal_t *) gf->scratch;

  /* Defaults */

  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;

  gf->multiply.w64 = gf_w64_bytwo_p_multiply;

#if defined(INTEL_SSE4_PCLMUL)
  if ((!(h->region_type & GF_REGION_NOSSE) &&
     (h->arg1 == 64 || h->arg2 == 64)) ||
     h->mult_type == GF_MULT_DEFAULT){

    if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
      gf->multiply.w64 = gf_w64_clm_multiply_2;
      gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2;
    }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
      gf->multiply.w64 = gf_w64_clm_multiply_4;
      gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
    }
  }
#endif

  gf->inverse.w64 = gf_w64_euclid;

  /* Allen: set region pointers for default mult type. Single pointers are
   * taken care of above (explicitly for sse, implicitly for no sse). */

#if defined(INTEL_SSE4)
  if (h->mult_type == GF_MULT_DEFAULT) {
    d4 = (struct gf_split_4_64_lazy_data *) h->private;
    d4->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
  }
#else
  if (h->mult_type == GF_MULT_DEFAULT) {
    d8 = (struct gf_split_8_64_lazy_data *) h->private;
    d8->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
  }
#endif

  if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
    d4 = (struct gf_split_4_64_lazy_data *) h->private;
    d4->last_value = 0;

    if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0;
    if(h->region_type & GF_REGION_ALTMAP)
    {
#ifdef INTEL_SSSE3
      gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region;
#else
      return 0;
#endif
    }
    else /* no altmap */
    {
#ifdef INTEL_SSE4
      if(h->region_type & GF_REGION_NOSSE)
        gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
      else
        gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
#else
      gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
      if(h->region_type & GF_REGION_SSE)
        return 0;
#endif
    }
  }
  if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
    d8 = (struct gf_split_8_64_lazy_data *) h->private;
    d8->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
  }
  if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) {
    d16 = (struct gf_split_16_64_lazy_data *) h->private;
    d16->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region;
  }
  if ((h->arg1 == 8 && h->arg2 == 8)) {
    d88 = (struct gf_split_8_8_data *) h->private;
    gf->multiply.w64 = gf_w64_split_8_8_multiply;

    /* The performance of this guy sucks, so don't bother with a region op */

    gf->multiply_region.w64 = gf_w64_multiply_region_from_single;

    /* Build the tables: tables[exp][i][j] = (i*j) * 2^(8*exp) in the field */

    basep = 1;
    for (exp = 0; exp < 15; exp++) {
      for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0;
      for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0;
      d88->tables[exp][1][1] = basep;
      for (i = 2; i < 256; i++) {
        if (i&1) {
          p = d88->tables[exp][i^1][1];
          d88->tables[exp][i][1] = p ^ basep;
        } else {
          p = d88->tables[exp][i>>1][1];
          d88->tables[exp][i][1] = GF_MULTBY_TWO(p);
        }
      }
      for (i = 1; i < 256; i++) {
        p = d88->tables[exp][i][1];
        for (j = 1; j < 256; j++) {
          if (j&1) {
            d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p;
          } else {
            d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]);
          }
        }
      }
      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
    }
  }
  return 1;
}
int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  switch(mult_type)
  {
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t);
      break;

    case GF_MULT_DEFAULT:

      /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
       * then fall through to split table scratch size code. */

#if defined(INTEL_SSE4)
      arg1 = 64;
      arg2 = 4;
#else
      arg1 = 64;
      arg2 = 8;
#endif
      /* fall through */

    case GF_MULT_SPLIT_TABLE:
      if (arg1 == 8 && arg2 == 8) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
      }
      if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64;
      }
      if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64;
      }
      if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
      }
      return 0;

    case GF_MULT_GROUP:
      return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) +
             sizeof(uint64_t) * (1 << arg1) +
             sizeof(uint64_t) * (1 << arg2) + 64;
      break;
    case GF_MULT_COMPOSITE:
      if (arg1 == 2) return sizeof(gf_internal_t) + 64;
      return 0;
      break;
    default:
      return 0;
  }
}
int gf_w64_init(gf_t *gf)
{
  gf_internal_t *h;
  int no_default_flag = 0;

  h = (gf_internal_t *) gf->scratch;

  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  /* Omitting the leftmost 1 as in w=32 */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
    } else {
      h->prim_poly = 0x1b;
    }
    if (no_default_flag == 1) {
      fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n");
      return 0;
    }
  }

  gf->multiply.w64 = NULL;
  gf->divide.w64 = NULL;
  gf->inverse.w64 = NULL;
  gf->multiply_region.w64 = NULL;

  switch(h->mult_type) {
    case GF_MULT_CARRY_FREE:  if (gf_w64_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:       if (gf_w64_shift_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:   if (gf_w64_composite_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break;
    case GF_MULT_GROUP:       if (gf_w64_group_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:     if (gf_w64_bytwo_init(gf) == 0) return 0; break;
    default: return 0;
  }

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    gf->divide.w64 = gf_w64_divide_from_inverse;
    gf->inverse.w64 = gf_w64_euclid;
  }

  if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
    gf->divide.w64 = gf_w64_divide_from_inverse;
  }
  if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) {
    gf->inverse.w64 = gf_w64_inverse_from_divide;
  }

  if (h->region_type == GF_REGION_CAUCHY) return 0;

  if (h->region_type & GF_REGION_ALTMAP) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      gf->extract_word.w64 = gf_w64_composite_extract_word;
    } else if (h->mult_type == GF_MULT_SPLIT_TABLE) {
      gf->extract_word.w64 = gf_w64_split_extract_word;
    }
  } else {
    gf->extract_word.w64 = gf_w64_extract_word;
  }

  return 1;
}
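/* Typical entry points, for reference (see gf_complete.h; illustrative):
 *
 *     gf_t gf;
 *     gf_init_easy(&gf, 64);                // reaches gf_w64_init() with
 *                                           // GF_MULT_DEFAULT
 *     uint64_t c = gf.multiply.w64(&gf, a, b);
 *
 * or, to pick a specific method, e.g. GROUP with g_s = g_r = 4:
 *
 *     gf_init_hard(&gf, 64, GF_MULT_GROUP, GF_REGION_DEFAULT,
 *                  GF_DIVIDE_DEFAULT, 0, 4, 4, NULL, NULL);
 */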