1
/* gcc linpack.c cpuidc64.o cpuida64.o -m64 -lrt -lc -lm -o linpack
3
* Linpack 100x100 Benchmark In C/C++ For PCs
5
* Different compilers can produce different floating point numeric
6
* results, probably due to compiling instructions in a different
7
* sequence. As the program checks these, they may need to be changed.
8
* The log file indicates non-standard results and these values can
9
* be copied and pasted into this program. See // Values near the
12
* Different compilers do not optimise the code in the same way.
13
* This can lead to wide variations in benchmark speeds. See results
14
* with MS6 compiler ID and compare with those from same CPUs from
15
* the Watcom compiler generated code.
17
***************************************************************************
20
#define _CRT_SECURE_NO_WARNINGS 1
45
#define ROLLING "Rolled"
48
#define ROLLING "Unrolled"
54
#define options "Non-optimised"
57
// #define options "Optimised"
58
#define options "Opt 3 64 Bit"
70
/* this is truly rank, but it's minimally invasive, and lifted in part from the STREAM scores */
82
i = gettimeofday(&tp,&tzp);
83
return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
89
static LARGE_INTEGER freq = {0};
90
LARGE_INTEGER count = {0};
91
if(freq.QuadPart == 0LL) {
92
QueryPerformanceFrequency(&freq);
94
QueryPerformanceCounter(&count);
95
return (double)count.QuadPart / (double)freq.QuadPart;
107
secs = mysecond() - secs;
110
void print_time (int row);
111
void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
112
void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
113
void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
114
void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
115
void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
116
REAL epslon (REAL x);
117
int idamax (int n, REAL dx[], int incx);
118
void dscal (int n, REAL da, REAL dx[], int incx);
119
REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);
121
static REAL atime[9][15];
125
int main (int argc, char *argv[])
127
static REAL aa[200*200],a[200*201],b[200],x[200];
128
REAL cray,ops,total,norma,normx;
129
REAL resid,residn,eps,tm2,epsn,x1,x2;
131
static int ipvt[200],n,i,j,ntimes,info,lda,ldaa;
132
int endit, pass, loop;
133
REAL overhead1, overhead2, time2;
143
printf("##########################################\n");
152
fprintf(stdout, "%s ", ROLLING);
153
fprintf(stdout, "%s ", PREC);
154
fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n");
156
fprintf(stdout,"Optimisation %s\n\n",options);
158
ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
160
matgen(a,lda,n,b,&norma);
162
dgefa(a,lda,n,ipvt,&info);
166
dgesl(a,lda,n,ipvt,b,0);
169
total = atime[0][0] + atime[1][0];
171
/* compute a residual to verify results. */
173
for (i = 0; i < n; i++) {
176
matgen(a,lda,n,b,&norma);
177
for (i = 0; i < n; i++) {
180
dmxpy(n,b,n,lda,x,a);
183
for (i = 0; i < n; i++) {
184
resid = (resid > fabs((double)b[i]))
185
? resid : fabs((double)b[i]);
186
normx = (normx > fabs((double)x[i]))
187
? normx : fabs((double)x[i]);
190
residn = resid/( n*norma*normx*eps );
195
printf("norm resid resid machep");
196
printf(" x[0]-1 x[n-1]-1\n");
197
printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
198
(double)residn, (double)resid, (double)epsn,
199
(double)x1, (double)x2);
201
printf("Times are reported for matrices of order %5d\n",n);
202
printf("1 pass times for array with leading dimension of%5d\n\n",lda);
203
printf(" dgefa dgesl total Mflops unit");
209
atime[3][0] = ops/(1.0e6*total);
210
atime[4][0] = 2.0/atime[3][0];
217
atime[5][0] = total/cray;
221
/************************************************************************
222
* Calculate overhead of executing matgen procedure *
223
************************************************************************/
225
printf("\nCalculating matgen overhead\n");
232
for ( i = 0 ; i < loop ; i++)
234
matgen(a,lda,n,b,&norma);
238
printf("%10d times %6.2f seconds\n", loop, overhead1);
239
if (overhead1 > runSecs)
257
overhead1 = overhead1 / (double)loop;
259
printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead1);
261
/************************************************************************
262
* Calculate matgen/dgefa passes for runSecs seconds *
263
************************************************************************/
265
printf("Calculating matgen/dgefa passes for %d seconds\n", (int)runSecs);
272
for ( i = 0 ; i < ntimes ; i++)
274
matgen(a,lda,n,b,&norma);
275
dgefa(a,lda,n,ipvt,&info );
279
printf("%10d times %6.2f seconds\n", ntimes, time2);
288
ntimes = ntimes * 10;
298
ntimes = (int)(runSecs * (double)ntimes / time2);
299
if (ntimes == 0) ntimes = 1;
301
printf("Passes used %10d \n\n", ntimes);
302
printf("Times for array with leading dimension of%4d\n\n",lda);
303
printf(" dgefa dgesl total Mflops unit");
306
/************************************************************************
308
************************************************************************/
310
tm2 = ntimes * overhead1;
313
for (j=1 ; j<6 ; j++)
316
for (i = 0; i < ntimes; i++)
318
matgen(a,lda,n,b,&norma);
319
dgefa(a,lda,n,ipvt,&info );
322
atime[0][j] = (secs - tm2)/ntimes;
325
for (i = 0; i < ntimes; i++)
327
dgesl(a,lda,n,ipvt,b,0);
331
atime[1][j] = secs/ntimes;
332
total = atime[0][j] + atime[1][j];
334
atime[3][j] = ops/(1.0e6*total);
335
atime[4][j] = 2.0/atime[3][j];
336
atime[5][j] = total/cray;
337
atime[3][6] = atime[3][6] + atime[3][j];
341
atime[3][6] = atime[3][6] / 5.0;
342
printf("Average %11.2f\n",
343
(double)atime[3][6]);
345
printf("\nCalculating matgen2 overhead\n");
347
/************************************************************************
348
* Calculate overhead of executing matgen procedure *
349
************************************************************************/
352
for ( i = 0 ; i < loop ; i++)
354
matgen(aa,ldaa,n,b,&norma);
358
overhead2 = overhead2 / (double)loop;
360
printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
361
printf("Times for array with leading dimension of%4d\n\n",ldaa);
362
printf(" dgefa dgesl total Mflops unit");
365
/************************************************************************
367
************************************************************************/
369
tm2 = ntimes * overhead2;
372
for (j=7 ; j<12 ; j++)
375
for (i = 0; i < ntimes; i++)
377
matgen(aa,ldaa,n,b,&norma);
378
dgefa(aa,ldaa,n,ipvt,&info );
381
atime[0][j] = (secs - tm2)/ntimes;
384
for (i = 0; i < ntimes; i++)
386
dgesl(aa,ldaa,n,ipvt,b,0);
389
atime[1][j] = secs/ntimes;
390
total = atime[0][j] + atime[1][j];
392
atime[3][j] = ops/(1.0e6*total);
393
atime[4][j] = 2.0/atime[3][j];
394
atime[5][j] = total/cray;
395
atime[3][12] = atime[3][12] + atime[3][j];
399
atime[3][12] = atime[3][12] / 5.0;
400
printf("Average %11.2f\n",
401
(double)atime[3][12]);
403
/************************************************************************
404
* Use minimum average as overall Mflops rating *
405
************************************************************************/
407
mflops = atime[3][6];
408
if (atime[3][12] < mflops) mflops = atime[3][12];
411
printf( "%s ", ROLLING);
412
printf( "%s ", PREC);
413
printf(" Precision %11.2f Mflops \n\n",mflops);
417
for (i=1 ; i<6 ; i++)
419
if (atime[3][i] > max1) max1 = atime[3][i];
423
for (i=7 ; i<12 ; i++)
425
if (atime[3][i] > max2) max2 = atime[3][i];
427
if (max1 < max2) max2 = max1;
429
sprintf(was[0], "%16.1f",(double)residn);
430
sprintf(was[1], "%16.8e",(double)resid);
431
sprintf(was[2], "%16.8e",(double)epsn);
432
sprintf(was[3], "%16.8e",(double)x1);
433
sprintf(was[4], "%16.8e",(double)x2);
438
sprintf(expect[0], " 0.4");
439
sprintf(expect[1], " 7.41628980e-014");
440
sprintf(expect[2], " 1.00000000e-015");
441
sprintf(expect[3], "-1.49880108e-014");
442
sprintf(expect[4], "-1.89848137e-014");
443
// Values for Visual C++
445
sprintf(expect[0], " 1.7");
446
sprintf(expect[1], " 7.41628980e-014");
447
sprintf(expect[2], " 2.22044605e-016");
448
sprintf(expect[3], "-1.49880108e-014");
449
sprintf(expect[4], "-1.89848137e-014");
451
// Values for Ubuntu GCC 32 Bit
453
sprintf(expect[0], " 1.9");
454
sprintf(expect[1], " 8.39915160e-14");
455
sprintf(expect[2], " 2.22044605e-16");
456
sprintf(expect[3], " -6.22835117e-14");
457
sprintf(expect[4], " -4.16333634e-14");
460
// Values for Ubuntu GCC 32 Bit
462
sprintf(expect[0], " 1.7");
463
sprintf(expect[1], " 7.41628980e-14");
464
sprintf(expect[2], " 2.22044605e-16");
465
sprintf(expect[3], " -1.49880108e-14");
466
sprintf(expect[4], " -1.89848137e-14");
468
sprintf(title[0], "norm. resid");
469
sprintf(title[1], "resid ");
470
sprintf(title[2], "machep ");
471
sprintf(title[3], "x[0]-1 ");
472
sprintf(title[4], "x[n-1]-1 ");
474
if (strtol(opt, NULL, 10) == 0)
476
sprintf(expect[2], " 8.88178420e-016");
483
/*----------------------*/
484
void print_time (int row)
487
printf("%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row],
488
(double)atime[1][row], (double)atime[2][row], (double)atime[3][row],
489
(double)atime[4][row], (double)atime[5][row]);
493
/*----------------------*/
495
void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
498
/* We would like to declare a[][lda], but c does not allow it. In this
499
function, references to a[i][j] are written a[lda*i+j]. */
506
for (j = 0; j < n; j++) {
507
for (i = 0; i < n; i++) {
508
init = 3125*init % 65536;
509
a[lda*j+i] = (init - 32768.0)/16384.0;
510
*norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma;
512
/* alternative for some compilers
513
if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]);
517
for (i = 0; i < n; i++) {
520
for (j = 0; j < n; j++) {
521
for (i = 0; i < n; i++) {
522
b[i] = b[i] + a[lda*j+i];
528
/*----------------------*/
529
void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
532
/* We would like to declare a[][lda], but c does not allow it. In this
533
function, references to a[i][j] are written a[lda*i+j]. */
535
dgefa factors a double precision matrix by gaussian elimination.
537
dgefa is usually called by dgeco, but it can be called
538
directly with a saving in time if rcond is not needed.
539
(time for dgeco) = (1 + 9/n)*(time for dgefa) .
543
a REAL precision[n][lda]
544
the matrix to be factored.
547
the leading dimension of the array a .
550
the order of the matrix a .
554
a an upper triangular matrix and the multipliers
555
which were used to obtain it.
556
the factorization can be written a = l*u where
557
l is a product of permutation and unit lower
558
triangular matrices and u is upper triangular.
561
an integer vector of pivot indices.
565
= k if u[k][k] .eq. 0.0 . this is not an error
566
condition for this subroutine, but it does
567
indicate that dgesl or dgedi will divide by zero
568
if called. use rcond in dgeco for a reliable
569
indication of singularity.
571
linpack. this version dated 08/14/78 .
572
cleve moler, university of new mexico, argonne national lab.
576
blas daxpy,dscal,idamax
580
/* internal variables */
586
/* gaussian elimination with partial pivoting */
591
for (k = 0; k < nm1; k++) {
594
/* find l = pivot index */
596
l = idamax(n-k,&a[lda*k+k],1) + k;
599
/* zero pivot implies this column already
602
if (a[lda*k+l] != ZERO) {
604
/* interchange if necessary */
608
a[lda*k+l] = a[lda*k+k];
612
/* compute multipliers */
615
dscal(n-(k+1),t,&a[lda*k+k+1],1);
617
/* row elimination with column indexing */
619
for (j = kp1; j < n; j++) {
622
a[lda*j+l] = a[lda*j+k];
625
daxpy(n-(k+1),t,&a[lda*k+k+1],1,
635
if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
639
/*----------------------*/
641
void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
644
/* We would like to declare a[][lda], but c does not allow it. In this
645
function, references to a[i][j] are written a[lda*i+j]. */
648
dgesl solves the double precision system
649
a * x = b or trans(a) * x = b
650
using the factors computed by dgeco or dgefa.
654
a double precision[n][lda]
655
the output from dgeco or dgefa.
658
the leading dimension of the array a .
661
the order of the matrix a .
664
the pivot vector from dgeco or dgefa.
666
b double precision[n]
667
the right hand side vector.
670
= 0 to solve a*x = b ,
671
= nonzero to solve trans(a)*x = b where
672
trans(a) is the transpose.
676
b the solution vector x .
680
a division by zero will occur if the input factor contains a
681
zero on the diagonal. technically this indicates singularity
682
but it is often caused by improper arguments or improper
683
setting of lda . it will not occur if the subroutines are
684
called correctly and if dgeco has set rcond .gt. 0.0
685
or dgefa has set info .eq. 0 .
687
to compute inverse(a) * c where c is a matrix
689
dgeco(a,lda,n,ipvt,rcond,z)
690
if (!rcond is too small){
692
dgesl(a,lda,n,ipvt,c[j][0],0);
695
linpack. this version dated 08/14/78 .
696
cleve moler, university of new mexico, argonne national lab.
703
/* internal variables */
711
/* job = 0 , solve a * x = b
712
first solve l*y = b */
715
for (k = 0; k < nm1; k++) {
722
daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
726
/* now solve u*x = y */
728
for (kb = 0; kb < n; kb++) {
730
b[k] = b[k]/a[lda*k+k];
732
daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
737
/* job = nonzero, solve trans(a) * x = b
738
first solve trans(u)*y = b */
740
for (k = 0; k < n; k++) {
741
t = ddot(k,&a[lda*k+0],1,&b[0],1);
742
b[k] = (b[k] - t)/a[lda*k+k];
745
/* now solve trans(l)*x = y */
748
for (kb = 1; kb < nm1; kb++) {
750
b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
763
/*----------------------*/
765
void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
767
constant times a vector plus a vector.
768
jack dongarra, linpack, 3/11/78.
778
if (da == ZERO) return;
780
if(incx != 1 || incy != 1) {
782
/* code for unequal increments or equal increments
787
if(incx < 0) ix = (-n+1)*incx;
788
if(incy < 0)iy = (-n+1)*incy;
789
for (i = 0;i < n; i++) {
790
dy[iy] = dy[iy] + da*dx[ix];
798
/* code for both increments equal to 1 */
803
for (i = 0;i < n; i++) {
804
dy[i] = dy[i] + da*dx[i];
814
for (i = 0; i < m; i++)
815
dy[i] = dy[i] + da*dx[i];
819
for (i = m; i < n; i = i + 4) {
820
dy[i] = dy[i] + da*dx[i];
821
dy[i+1] = dy[i+1] + da*dx[i+1];
822
dy[i+2] = dy[i+2] + da*dx[i+2];
823
dy[i+3] = dy[i+3] + da*dx[i+3];
831
/*----------------------*/
833
REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
835
forms the dot product of two vectors.
836
jack dongarra, linpack, 3/11/78.
848
if(n <= 0) return(ZERO);
850
if(incx != 1 || incy != 1) {
852
/* code for unequal increments or equal increments
857
if (incx < 0) ix = (-n+1)*incx;
858
if (incy < 0) iy = (-n+1)*incy;
859
for (i = 0;i < n; i++) {
860
dtemp = dtemp + dx[ix]*dy[iy];
868
/* code for both increments equal to 1 */
874
dtemp = dtemp + dx[i]*dy[i];
885
for (i = 0; i < m; i++)
886
dtemp = dtemp + dx[i]*dy[i];
887
if (n < 5) return(dtemp);
889
for (i = m; i < n; i = i + 5) {
890
dtemp = dtemp + dx[i]*dy[i] +
891
dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
892
dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
900
/*----------------------*/
901
void dscal(int n, REAL da, REAL dx[], int incx)
903
/* scales a vector by a constant.
904
jack dongarra, linpack, 3/11/78.
916
/* code for increment not equal to 1 */
919
for (i = 0; i < nincx; i = i + incx)
925
/* code for increment equal to 1 */
930
for (i = 0; i < n; i++)
941
for (i = 0; i < m; i++)
945
for (i = m; i < n; i = i + 5){
947
dx[i+1] = da*dx[i+1];
948
dx[i+2] = da*dx[i+2];
949
dx[i+3] = da*dx[i+3];
950
dx[i+4] = da*dx[i+4];
957
/*----------------------*/
958
int idamax(int n, REAL dx[], int incx)
961
finds the index of element having max. absolute value.
962
jack dongarra, linpack, 3/11/78.
970
if( n < 1 ) return(-1);
971
if(n ==1 ) return(0);
974
/* code for increment not equal to 1 */
977
dmax = fabs((double)dx[0]);
979
for (i = 1; i < n; i++) {
980
if(fabs((double)dx[ix]) > dmax) {
982
dmax = fabs((double)dx[ix]);
989
/* code for increment equal to 1 */
992
dmax = fabs((double)dx[0]);
993
for (i = 1; i < n; i++) {
994
if(fabs((double)dx[i]) > dmax) {
996
dmax = fabs((double)dx[i]);
1003
/*----------------------*/
1004
REAL epslon (REAL x)
1007
estimate unit roundoff in quantities of size x.
1013
this program should function properly on all systems
1014
satisfying the following two assumptions,
1015
1. the base used in representing dfloating point
1016
numbers is not a power of three.
1017
2. the quantity a in statement 10 is represented to
1018
the accuracy used in dfloating point variables
1019
that are stored in memory.
1020
the statement number 10 and the go to 10 are intended to
1021
force optimizing compilers to generate code satisfying
1023
under these assumptions, it should be true that,
1024
a is not exactly equal to four-thirds,
1025
b has a zero for its last bit or digit,
1026
c is not exactly equal to one,
1027
eps measures the separation of 1.0 from
1028
the next larger dfloating point number.
1029
the developers of eispack would appreciate being informed
1030
about any systems where these assumptions do not hold.
1032
*****************************************************************
1033
this routine is one of the auxiliary routines used by eispack iii
1034
to avoid machine dependencies.
1035
*****************************************************************
1037
this version dated 4/6/83.
1042
while (eps == ZERO) {
1045
eps = fabs((double)(c-ONE));
1047
return(eps*fabs((double)x));
1050
/*----------------------*/
1051
void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
1054
/* We would like to declare m[][ldm], but c does not allow it. In this
1055
function, references to m[i][j] are written m[ldm*i+j]. */
1059
multiply matrix m times vector x and add the result to vector y.
1063
n1 integer, number of elements in vector y, and number of rows in
1066
y double [n1], vector of length n1 to which is added
1069
n2 integer, number of elements in vector x, and number of columns
1072
ldm integer, leading dimension of array m
1074
x double [n2], vector of length n2
1076
m double [ldm][n2], matrix of n1 rows and n2 columns
1078
----------------------------------------------------------------------
1082
/* cleanup odd vector */
1087
for (i = 0; i < n1; i++)
1088
y[i] = (y[i]) + x[j]*m[ldm*j+i];
1091
/* cleanup odd group of two vectors */
1096
for (i = 0; i < n1; i++)
1098
+ x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1101
/* cleanup odd group of four vectors */
1106
for (i = 0; i < n1; i++)
1108
+ x[j-3]*m[ldm*(j-3)+i])
1109
+ x[j-2]*m[ldm*(j-2)+i])
1110
+ x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1113
/* cleanup odd group of eight vectors */
1118
for (i = 0; i < n1; i++)
1119
y[i] = ((((((( (y[i])
1120
+ x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
1121
+ x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
1122
+ x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
1123
+ x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i];
1126
/* main loop - groups of sixteen vectors */
1129
for (j = jmin-1; j < n2; j = j + 16) {
1130
for (i = 0; i < n1; i++)
1131
y[i] = ((((((((((((((( (y[i])
1132
+ x[j-15]*m[ldm*(j-15)+i])
1133
+ x[j-14]*m[ldm*(j-14)+i])
1134
+ x[j-13]*m[ldm*(j-13)+i])
1135
+ x[j-12]*m[ldm*(j-12)+i])
1136
+ x[j-11]*m[ldm*(j-11)+i])
1137
+ x[j-10]*m[ldm*(j-10)+i])
1138
+ x[j- 9]*m[ldm*(j- 9)+i])
1139
+ x[j- 8]*m[ldm*(j- 8)+i])
1140
+ x[j- 7]*m[ldm*(j- 7)+i])
1141
+ x[j- 6]*m[ldm*(j- 6)+i])
1142
+ x[j- 5]*m[ldm*(j- 5)+i])
1143
+ x[j- 4]*m[ldm*(j- 4)+i])
1144
+ x[j- 3]*m[ldm*(j- 3)+i])
1145
+ x[j- 2]*m[ldm*(j- 2)+i])
1146
+ x[j- 1]*m[ldm*(j- 1)+i])