1
/*============================================================================
3
* This file is part of the Code_Saturne Kernel, element of the
4
* Code_Saturne CFD tool.
6
* Copyright (C) 1998-2009 EDF S.A., France
8
* contact: saturne-support@edf.fr
10
* The Code_Saturne Kernel is free software; you can redistribute it
11
* and/or modify it under the terms of the GNU General Public License
12
* as published by the Free Software Foundation; either version 2 of
13
* the License, or (at your option) any later version.
15
* The Code_Saturne Kernel is distributed in the hope that it will be
16
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
17
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
* GNU General Public License for more details.
20
* You should have received a copy of the GNU General Public License
21
* along with the Code_Saturne Kernel; if not, write to the
22
* Free Software Foundation, Inc.,
23
* 51 Franklin St, Fifth Floor,
24
* Boston, MA 02110-1301 USA
26
*============================================================================*/
28
/*============================================================================
29
* Low-level operator benchmarking
30
*============================================================================*/
32
#if defined(HAVE_CONFIG_H)
33
#include "cs_config.h"
36
/*----------------------------------------------------------------------------
37
* Standard C library headers
38
*----------------------------------------------------------------------------*/
47
#if defined(__STDC_VERSION__) /* size_t */
48
#if (__STDC_VERSION__ == 199901L)
61
/*----------------------------------------------------------------------------
63
*----------------------------------------------------------------------------*/
66
#include <bft_error.h>
67
#include <bft_printf.h>
68
#include <bft_timer.h>
70
/*----------------------------------------------------------------------------
72
*----------------------------------------------------------------------------*/
74
/*----------------------------------------------------------------------------
76
*----------------------------------------------------------------------------*/
82
#include "cs_mesh_quantities.h"
83
#include "cs_matrix.h"
86
/*----------------------------------------------------------------------------
87
* Header for the current file
88
*----------------------------------------------------------------------------*/
90
#include "cs_benchmark.h"
92
/*----------------------------------------------------------------------------*/
96
/*=============================================================================
97
* Local Macro Definitions
98
*============================================================================*/
100
/*=============================================================================
101
* Local Structure Definitions
102
*============================================================================*/
104
/*============================================================================
106
*============================================================================*/
108
/*============================================================================
109
* Private function definitions
110
*============================================================================*/
112
/*----------------------------------------------------------------------------
116
* wt --> wall-clock time (start in, stop - start out)
117
* cpu --> CPU time (start in, stop - start out)
118
*----------------------------------------------------------------------------*/
121
_timer_start(double *wt,
124
*wt = bft_timer_wtime();
125
*cpu = bft_timer_cpu_time();
128
/*----------------------------------------------------------------------------
132
* n_runs <-- Number of timing runs
133
* wt <-> wall-clock time (start in, stop - start out)
134
* cpu <-> CPU time (start in, stop - start out)
135
*----------------------------------------------------------------------------*/
138
_timer_stop(int n_runs,
144
wt_s = bft_timer_wtime();
145
cpu_s = bft_timer_cpu_time();
147
*wt = (wt_s - *wt) / (double)n_runs;
148
*cpu = (cpu_s - *cpu) / (double)n_runs;
151
/*----------------------------------------------------------------------------
155
* wt <-- wall-clock time
157
*----------------------------------------------------------------------------*/
160
_print_overhead(double wt,
163
if (cs_glob_n_ranks == 1)
164
bft_printf(_(" Wall clock : %12.5e\n"
170
double loc_count[2], glob_min[2], glob_max[2], cpu_tot;
175
#if defined(HAVE_MPI)
176
MPI_Allreduce(loc_count, glob_min, 2, MPI_DOUBLE, MPI_MIN,
178
MPI_Allreduce(loc_count, glob_max, 2, MPI_DOUBLE, MPI_MAX,
180
MPI_Allreduce(&cpu, &cpu_tot, 1, MPI_DOUBLE, MPI_SUM,
183
{ /* We should never enter here unless we have an alternative to MPI */
185
for (i = 0; i < 3; i++) {
186
glob_min[i] = loc_count[i];
187
glob_max[i] = loc_count[i];
193
bft_printf(_(" Min Max Total\n"
194
" Wall clock : %12.5e %12.5e\n"
195
" CPU : %12.5e %12.5e %12.5e\n"),
196
glob_min[0], glob_max[0],
197
glob_min[1], glob_max[1], cpu_tot);
201
/*----------------------------------------------------------------------------
202
* Count number of operations.
205
* n_ops <-- Local number of operations
206
* n_ops_single <-- Single-processor equivalent number of operations
207
* (without ghosts); ignored if 0
208
* wt <-- wall-clock time
210
*----------------------------------------------------------------------------*/
213
_print_stats(long n_ops,
218
double fm = 1.0 / (1.e9 * wt);
220
if (cs_glob_n_ranks == 1)
221
bft_printf(_(" N ops : %12ld\n"
222
" Wall clock : %12.5e\n"
224
" GFLOPS : %12.5e\n"),
225
n_ops, wt, cpu, n_ops*fm);
229
long n_ops_min, n_ops_max, n_ops_tot;
230
double loc_count[3], glob_min[3], glob_max[3], cpu_tot, fmg;
234
loc_count[2] = n_ops*fm;
236
#if defined(HAVE_MPI)
238
MPI_Allreduce(&n_ops, &n_ops_min, 1, MPI_LONG, MPI_MIN,
240
MPI_Allreduce(&n_ops, &n_ops_max, 1, MPI_LONG, MPI_MAX,
242
MPI_Allreduce(&n_ops, &n_ops_tot, 1, MPI_LONG, MPI_SUM,
245
MPI_Allreduce(loc_count, glob_min, 3, MPI_DOUBLE, MPI_MIN,
247
MPI_Allreduce(loc_count, glob_max, 3, MPI_DOUBLE, MPI_MAX,
249
MPI_Allreduce(&cpu, &cpu_tot, 1, MPI_DOUBLE, MPI_SUM,
253
{ /* We should never enter here unless we have an alternative to MPI */
255
n_ops_min = n_ops; n_ops_max = n_ops; n_ops_tot = n_ops;
256
for (i = 0; i < 3; i++) {
257
glob_min[i] = loc_count[i];
258
glob_max[i] = loc_count[i];
264
fmg = 1.0 / (1.e9 * glob_max[0]); /* global flops multiplier */
266
if (n_ops_single == 0)
268
(_(" Min Max Total\n"
269
" N ops : %12ld %12ld %12ld\n"
270
" Wall clock : %12.5e %12.5e\n"
271
" CPU : %12.5e %12.5e %12.5e\n"
272
" GFLOPS : %12.5e %12.5e %12.5e\n"),
273
n_ops_min, n_ops_max, n_ops_tot,
274
glob_min[0], glob_max[0],
275
glob_min[1], glob_max[1], cpu_tot,
276
glob_min[2], glob_max[2], n_ops_tot*fmg);
280
(_(" Min Max Total Single\n"
281
" N ops : %12ld %12ld %12ld %12ld\n"
282
" Wall clock : %12.5e %12.5e\n"
283
" CPU : %12.5e %12.5e %12.5e\n"
284
" GFLOPS : %12.5e %12.5e %12.5e %12.5e\n"),
285
n_ops_min, n_ops_max, n_ops_tot, n_ops_single,
286
glob_min[0], glob_max[0],
287
glob_min[1], glob_max[1], cpu_tot,
288
glob_min[2], glob_max[2], n_ops_tot*fmg, n_ops_single*fmg);
292
/*----------------------------------------------------------------------------
293
* Simple dot product.
296
* global <-- 0 for local use, 1 for MPI sum
297
* n_runs <-- number of operation runs
298
* n_cells <-- number of cells (array size)
301
*----------------------------------------------------------------------------*/
304
_dot_product_1(int global,
315
double test_sum_mult = 1.0/n_runs;
316
double test_sum = 0.0;
317
int _global = global;
319
char type_name[] = "X.Y";
327
if (cs_glob_n_ranks == 1)
332
/* First simple local x.x version */
334
_timer_start(&wt, &cpu);
336
#if defined(HAVE_BLAS)
340
for (run_id = 0; run_id < n_runs; run_id++) {
341
double s1 = cblas_ddot(n_cells, x, 1, y, 1);
342
#if defined(HAVE_MPI)
344
double s1_glob = 0.0;
345
MPI_Allreduce(&s1, &s1_glob, 1, MPI_DOUBLE, MPI_SUM,
350
test_sum += test_sum_mult*s1;
353
_timer_stop(n_runs, &wt, &cpu);
357
"Simple local dot product %s with BLAS\n"
358
"-------------------------------------\n"),
362
"Simple global dot product %s with BLAS\n"
363
"---------------------------------------\n"),
366
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
369
_print_stats(n_ops, 0, wt, cpu);
371
#endif /* defined(HAVE_BLAS) */
375
for (run_id = 0; run_id < n_runs; run_id++) {
377
for (ii = 0; ii < n_cells; ii++)
379
#if defined(HAVE_MPI)
381
double s1_glob = 0.0;
382
MPI_Allreduce(&s1, &s1_glob, 1, MPI_DOUBLE, MPI_SUM,
387
test_sum += test_sum_mult*s1;
390
_timer_stop(n_runs, &wt, &cpu);
394
"Simple local dot product %s\n"
395
"---------------------------\n"),
399
"Simple global dot product %s\n"
400
"----------------------------\n"),
403
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
406
_print_stats(n_ops, 0, wt, cpu);
410
/*----------------------------------------------------------------------------
411
* Double local dot product.
414
* n_runs <-- number of operation runs
415
* n_cells <-- number of cells (array size)
418
*----------------------------------------------------------------------------*/
421
_dot_product_2(int n_runs,
431
double test_sum_mult = 1.0/n_runs;
432
double test_sum = 0.0;
439
/* First simple local x.x version */
441
_timer_start(&wt, &cpu);
443
#if defined(HAVE_BLAS)
447
for (run_id = 0; run_id < n_runs; run_id++) {
448
double s1 = cblas_ddot(n_cells, x, 1, x, 1);
449
double s2 = cblas_ddot(n_cells, x, 1, y, 1);
450
test_sum += test_sum_mult*(s1+s2);
453
_timer_stop(n_runs, &wt, &cpu);
456
"Double local dot product X.X, Y.Y with BLAS\n"
457
"-------------------------------------------\n"));
458
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
461
_print_stats(n_ops, 0, wt, cpu);
463
#endif /* defined(HAVE_BLAS) */
467
for (run_id = 0; run_id < n_runs; run_id++) {
470
for (ii = 0; ii < n_cells; ii++) {
474
test_sum += test_sum_mult*(s1+s2);
477
_timer_stop(n_runs, &wt, &cpu);
480
"Double local dot product X.X, Y.Y\n"
481
"---------------------------------\n"));
483
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
486
_print_stats(n_ops, 0, wt, cpu);
490
/*----------------------------------------------------------------------------
494
* n_runs <-- number of operation runs
495
* n_cells <-- number of cells (array size)
498
*----------------------------------------------------------------------------*/
503
const cs_real_t *restrict x,
504
cs_real_t *restrict y)
511
double test_sum_mult = 1.0/n_runs;
512
double test_sum = 0.0;
519
/* First simple local x.x version */
521
for (ii = 0; ii < n_cells; ii++)
524
_timer_start(&wt, &cpu);
526
#if defined(HAVE_BLAS)
530
for (run_id = 0; run_id < n_runs; run_id++) {
532
cblas_daxpy(n_cells, test_sum_mult, x, 1, y, 1);
534
test_sum += test_sum_mult*y[run_id%n_cells];
538
_timer_stop(n_runs, &wt, &cpu);
541
"Y <- aX + Y with BLAS\n"
542
"---------------------\n"));
544
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
547
_print_stats(n_ops, 0, wt, cpu);
549
#endif /* defined(HAVE_BLAS) */
553
for (run_id = 0; run_id < n_runs; run_id++) {
555
for (ii = 0; ii < n_cells; ii++)
556
y[ii] += test_sum_mult * x[ii];
558
test_sum += test_sum_mult*y[run_id%n_cells];
562
_timer_stop(n_runs, &wt, &cpu);
568
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
571
_print_stats(n_ops, 0, wt, cpu);
575
/*----------------------------------------------------------------------------
576
* Simple divisions on a vector.
579
* n_runs <-- number of operation runs
580
* n_cells <-- number of cells (array size)
581
*----------------------------------------------------------------------------*/
584
_division_test(int n_runs,
592
cs_real_t *x = NULL, *y = NULL, *z = NULL;
597
BFT_MALLOC(x, n_cells, cs_real_t);
598
BFT_MALLOC(y, n_cells, cs_real_t);
599
BFT_MALLOC(z, n_cells, cs_real_t);
606
for (ii = 0; ii < n_cells; ii++) {
608
y[ii] = 2.0 + (ii+1)%3;
611
/* Division of two vectors */
613
_timer_start(&wt, &cpu);
615
for (run_id = 0; run_id < n_runs; run_id++) {
617
for (ii = 0; ii < n_cells; ii++)
618
z[ii] = x[ii] / y[ii];
622
_timer_stop(n_runs, &wt, &cpu);
626
"----------------\n"));
628
_print_stats(n_ops, 0, wt, cpu);
632
/* Copy inverse of a vector */
634
_timer_start(&wt, &cpu);
636
for (run_id = 0; run_id < n_runs; run_id++) {
638
for (ii = 0; ii < n_cells; ii++)
643
_timer_stop(n_runs, &wt, &cpu);
647
"----------------\n"));
649
_print_stats(n_ops, 0, wt, cpu);
653
/* Inverse of a vector */
655
_timer_start(&wt, &cpu);
657
for (run_id = 0; run_id < n_runs; run_id++) {
659
for (ii = 0; ii < n_cells; ii++)
664
_timer_stop(n_runs, &wt, &cpu);
667
"Division x <- 1/x\n"
668
"-----------------\n"));
670
_print_stats(n_ops, 0, wt, cpu);
676
/*----------------------------------------------------------------------------
677
* Simple square root on a vector.
680
* n_runs <-- number of operation runs
681
* n_cells <-- number of cells (array size)
682
*----------------------------------------------------------------------------*/
685
_sqrt_test(int n_runs,
693
cs_real_t *x = NULL, *y = NULL;
698
BFT_MALLOC(x, n_cells, cs_real_t);
699
BFT_MALLOC(y, n_cells, cs_real_t);
703
for (ii = 0; ii < n_cells; ii++)
706
/* Copy square root of a vector */
708
_timer_start(&wt, &cpu);
710
for (run_id = 0; run_id < n_runs; run_id++) {
712
for (ii = 0; ii < n_cells; ii++)
717
_timer_stop(n_runs, &wt, &cpu);
723
_print_stats(n_ops, 0, wt, cpu);
727
/* In place square root of a vector */
729
_timer_start(&wt, &cpu);
731
for (run_id = 0; run_id < n_runs; run_id++) {
733
for (ii = 0; ii < n_cells; ii++)
738
_timer_stop(n_runs, &wt, &cpu);
744
_print_stats(n_ops, 0, wt, cpu);
750
/*----------------------------------------------------------------------------
751
* Measure matrix creation + destruction related performance.
754
* n_runs <-- number of operation runs
755
* type_name <-- type name
756
* type <-- matrix type
757
* symmetric <-- symmetric structure (if available)
758
* n_cells <-- number of local cells
759
* n_cells_ext <-- number of cells including ghost cells (array size)
760
* n_faces <-- local number of internal faces
761
* cell_num <-- global cell numbers (1 to n)
762
* face_cell <-- face -> cells connectivity (1 to n)
763
* halo <-- cell halo structure
764
* numbering <-- vectorization or thread-related numbering info, or NULL
765
*----------------------------------------------------------------------------*/
768
_matrix_creation_test(int n_runs,
769
const char *type_name,
770
cs_matrix_type_t type,
773
cs_int_t n_cells_ext,
775
const fvm_gnum_t *cell_num,
776
const cs_int_t *face_cell,
777
const cs_halo_t *halo,
778
const cs_numbering_t *numbering)
783
cs_matrix_t *m = NULL;
788
/* First count creation/destruction overhead */
790
_timer_start(&wt, &cpu);
792
for (run_id = 0; run_id < n_runs; run_id++) {
793
m = cs_matrix_create(type,
804
cs_matrix_destroy(&m);
807
_timer_stop(n_runs, &wt, &cpu);
810
"Matrix construction / destruction (%s)\n"
811
"---------------------------------\n"), _(type_name));
813
bft_printf(_(" (calls: %d)\n"), n_runs);
815
_print_overhead(wt, cpu);
819
/*----------------------------------------------------------------------------
820
* Measure matrix assignment performance.
823
* n_runs <-- number of operation runs
824
* type_name <-- type name
825
* type <-- matrix type
826
* sym_struct <-- symmetric structure (if available)
827
* sym_coeffs <-- symmetric coefficients
828
* n_cells <-- number of local cells
829
* n_cells_ext <-- number of cells including ghost cells (array size)
830
* n_faces <-- local number of internal faces
831
* cell_num <-- global cell numbers (1 to n)
832
* face_cell <-- face -> cells connectivity (1 to n)
833
* halo <-- cell halo structure
834
* numbering <-- vectorization or thread-related numbering info, or NULL
835
* da <-- diagonal values
836
* xa <-- extradiagonal values
837
*----------------------------------------------------------------------------*/
840
_matrix_assignment_test(int n_runs,
841
const char *type_name,
842
cs_matrix_type_t type,
843
cs_bool_t sym_struct,
844
cs_bool_t sym_coeffs,
846
cs_int_t n_cells_ext,
848
const fvm_gnum_t *cell_num,
849
const cs_int_t *face_cell,
850
const cs_halo_t *halo,
851
const cs_numbering_t *numbering,
852
const cs_real_t *restrict da,
853
const cs_real_t *restrict xa)
858
cs_matrix_t *m = NULL;
863
/* Count assignment overhead */
865
m = cs_matrix_create(type,
877
_timer_start(&wt, &cpu);
879
for (run_id = 0; run_id < n_runs; run_id++) {
880
cs_matrix_set_coefficients(m,
886
_timer_stop(n_runs, &wt, &cpu);
888
cs_matrix_destroy(&m);
891
"Matrix value assignment (%s)\n"
892
"-----------------------\n"), _(type_name));
894
bft_printf(_(" (calls: %d)\n"), n_runs);
896
_print_overhead(wt, cpu);
900
/*----------------------------------------------------------------------------
901
* Measure matrix.vector product related performance.
904
* n_runs <-- number of operation runs
905
* type_name <-- type name
906
* type <-- matrix type
907
* sym_struct <-- symmetric structure (if available)
908
* sym_coeffs <-- symmetric coefficients
909
* n_cells <-- number of local cells
910
* n_cells_ext <-- number of cells including ghost cells (array size)
911
* n_faces <-- local number of internal faces
912
* cell_num <-- global cell numbers (1 to n)
913
* face_cell <-- face -> cells connectivity (1 to n)
914
* halo <-- cell halo structure
915
* numbering <-- vectorization or thread-related numbering info, or NULL
916
* da <-- diagonal values
917
* xa <-- extradiagonal values
920
*----------------------------------------------------------------------------*/
923
_matrix_vector_test(int n_runs,
924
const char *type_name,
925
cs_matrix_type_t type,
926
cs_bool_t sym_struct,
927
cs_bool_t sym_coeffs,
929
cs_int_t n_cells_ext,
931
const fvm_gnum_t *cell_num,
932
const cs_int_t *face_cell,
933
const cs_halo_t *halo,
934
const cs_numbering_t *numbering,
935
const cs_real_t *restrict da,
936
const cs_real_t *restrict xa,
937
cs_real_t *restrict x,
938
cs_real_t *restrict y)
943
long n_ops, n_ops_glob;
945
double test_sum = 0.0;
946
cs_matrix_t *m = NULL;
951
n_ops = n_cells + n_faces*2;
953
if (cs_glob_n_ranks == 1)
956
n_ops_glob = ( cs_glob_mesh->n_g_cells
957
+ cs_glob_mesh->n_g_i_faces*2);
959
m = cs_matrix_create(type,
971
cs_matrix_set_coefficients(m,
976
/* Matrix.vector product */
978
_timer_start(&wt, &cpu);
980
for (run_id = 0; run_id < n_runs; run_id++) {
981
cs_matrix_vector_multiply(CS_PERIO_ROTA_COPY,
985
test_sum += y[n_cells-1];
987
for (ii = 0; ii < n_cells; ii++)
988
bft_printf("y[%d] = %12.4f\n", ii, y[ii]);
992
_timer_stop(n_runs, &wt, &cpu);
995
"Matrix.vector product (%s)\n"
996
"---------------------\n"), _(type_name));
998
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1001
_print_stats(n_ops, n_ops_glob, wt, cpu);
1003
/* Local timing in parallel mode */
1005
if (cs_glob_n_ranks > 1) {
1009
_timer_start(&wt, &cpu);
1011
for (run_id = 0; run_id < n_runs; run_id++) {
1012
cs_matrix_vector_multiply_nosync(m,
1015
test_sum += y[n_cells-1];
1018
_timer_stop(n_runs, &wt, &cpu);
1021
"Local matrix.vector product (%s)\n"
1022
"---------------------------\n"), _(type_name));
1024
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1027
_print_stats(n_ops, n_ops_glob, wt, cpu);
1031
/* Combined matrix.vector product: alpha.A.x + beta.y */
1034
for (ii = 0; ii < n_cells_ext; y[ii++] = 0.0);
1036
_timer_start(&wt, &cpu);
1038
for (run_id = 0; run_id < n_runs; run_id++) {
1039
cs_matrix_alpha_a_x_p_beta_y(CS_PERIO_ROTA_COPY,
1045
test_sum += y[n_cells-1];
1048
_timer_stop(n_runs, &wt, &cpu);
1051
"Matrix.vector product alpha.A.x + beta.y (%s)\n"
1052
"----------------------------------------\n"), _(type_name));
1054
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1057
_print_stats(n_ops, n_ops_glob, wt, cpu);
1059
/* (Matrix - diagonal).vector product (with diagonal structure) */
1061
cs_matrix_set_coefficients(m,
1068
_timer_start(&wt, &cpu);
1070
for (run_id = 0; run_id < n_runs; run_id++) {
1071
cs_matrix_vector_multiply(CS_PERIO_ROTA_COPY,
1075
test_sum += y[n_cells-1];
1078
_timer_stop(n_runs, &wt, &cpu);
1081
"(Matrix-diagonal).vector product (%s)\n"
1082
"--------------------------------\n"), _(type_name));
1084
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1087
_print_stats(n_ops, n_ops_glob, wt, cpu);
1089
/* (Matrix - diagonal).vector product */
1091
cs_matrix_destroy(&m);
1095
if (cs_glob_n_ranks == 1)
1098
n_ops_glob = ( cs_glob_mesh->n_g_cells
1099
+ cs_glob_mesh->n_g_i_faces*2);
1101
m = cs_matrix_create(type,
1113
cs_matrix_set_coefficients(m,
1120
_timer_start(&wt, &cpu);
1122
for (run_id = 0; run_id < n_runs; run_id++) {
1123
cs_matrix_vector_multiply(CS_PERIO_ROTA_COPY,
1127
test_sum += y[n_cells-1];
1130
_timer_stop(n_runs, &wt, &cpu);
1133
"(Matrix without diagonal).vector product (%s)\n"
1134
"----------------------------------------\n"), _(type_name));
1136
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1139
_print_stats(n_ops, n_ops_glob, wt, cpu);
1141
cs_matrix_destroy(&m);
1145
/*----------------------------------------------------------------------------
1146
* Measure matrix.vector product extradiagonal terms related performance
1147
* (symmetric matrix case).
1150
* n_faces <-- local number of internal faces
1151
* face_cell <-- face -> cells connectivity (1 to n)
1152
* xa <-- extradiagonal values
1155
*----------------------------------------------------------------------------*/
1158
_mat_vec_exdiag_native(cs_int_t n_faces,
1159
const cs_int_t *face_cell,
1160
const cs_real_t *restrict xa,
1161
cs_real_t *restrict x,
1162
cs_real_t *restrict y)
1164
cs_int_t ii, jj, face_id;
1166
/* Tell IBM compiler not to alias */
1167
#if defined(__xlc__)
1168
#pragma disjoint(*x, *y, *xa)
1171
const cs_int_t *restrict face_cel_p = face_cell;
1173
for (face_id = 0; face_id < n_faces; face_id++) {
1174
ii = *face_cel_p++ - 1;
1175
jj = *face_cel_p++ - 1;
1176
y[ii] += xa[face_id] * x[jj];
1177
y[jj] += xa[face_id] * x[ii];
1181
/*----------------------------------------------------------------------------
1182
* Measure matrix.vector product extradiagonal terms related performance
1183
* (symmetric matrix case, variant 1).
1186
* n_faces <-- local number of internal faces
1187
* face_cell <-- face -> cells connectivity (1 to n)
1188
* xa <-- extradiagonal values
1191
*----------------------------------------------------------------------------*/
1194
_mat_vec_exdiag_native_v1(cs_int_t n_faces,
1195
const cs_int_t *face_cell,
1196
const cs_real_t *restrict xa,
1197
cs_real_t *restrict x,
1198
cs_real_t *restrict y)
1200
cs_int_t ii, ii_prev, kk, face_id, kk_max;
1201
cs_real_t y_it, y_it_prev;
1203
const int l1_cache_size = 508;
1206
* 1/ Split y[ii] and y[jj] computation into 2 loops to remove compiler
1207
* data dependency assertion between y[ii] and y[jj].
1208
* 2/ keep index (*face_cel_p) in L1 cache from y[ii] loop to y[jj] loop
1209
* and xa in L2 cache.
1210
* 3/ break high frequency occurence of data dependency from one iteration
1211
* to another in y[ii] loop (nonzero matrix value on the same line ii).
1214
const cs_int_t *restrict face_cel_p = face_cell;
1218
face_id += l1_cache_size) {
1220
kk_max = CS_MIN((n_faces - face_id), l1_cache_size);
1222
/* sub-loop to compute y[ii] += xa[face_id] * x[jj] */
1224
ii = face_cel_p[0] - 1;
1226
y_it_prev = y[ii_prev] + xa[face_id] * x[face_cel_p[1] - 1];
1228
for (kk = 1; kk < kk_max; ++kk) {
1229
ii = face_cel_p[2*kk] - 1;
1230
/* y[ii] += xa[face_id+kk] * x[jj]; */
1236
y[ii_prev] = y_it_prev;
1239
y_it_prev = y_it + xa[face_id+kk] * x[face_cel_p[2*kk+1] - 1];
1243
/* sub-loop to compute y[ii] += xa[face_id] * x[jj] */
1245
for (kk = 0; kk < kk_max; ++kk) {
1246
y[face_cel_p[2*kk+1] - 1]
1247
+= xa[face_id+kk] * x[face_cel_p[2*kk] - 1];
1249
face_cel_p += 2 * l1_cache_size;
1253
/*----------------------------------------------------------------------------
1254
* Measure matrix.vector product extradiagonal terms related performance
1255
* with contribution to face-based array instead of cell-based array
1256
* (symmetric matrix case).
1259
* n_faces <-- local number of internal faces
1260
* face_cell <-- face -> cells connectivity (1 to n)
1261
* xa <-- extradiagonal values
1264
*----------------------------------------------------------------------------*/
1267
_mat_vec_exdiag_part_p1(cs_int_t n_faces,
1268
const cs_int_t *face_cell,
1269
const cs_real_t *restrict xa,
1270
cs_real_t *restrict x,
1271
cs_real_t *restrict ya)
1273
cs_int_t ii, jj, face_id;
1275
/* Tell IBM compiler not to alias */
1276
#if defined(__xlc__)
1277
#pragma disjoint(*x, *xa, *ya)
1280
const cs_int_t *restrict face_cel_p = face_cell;
1282
for (face_id = 0; face_id < n_faces; face_id++) {
1283
ii = *face_cel_p++ - 1;
1284
jj = *face_cel_p++ - 1;
1285
ya[face_id] += xa[face_id] * x[ii];
1286
ya[face_id] += xa[face_id] * x[jj];
1290
/*----------------------------------------------------------------------------
1291
* Measure matrix.vector product local extradiagonal part related performance.
1294
* n_runs <-- number of operation runs
1295
* n_cells <-- number of cells
1296
* n_cells_ext <-- number of cells including ghost cells (array size)
1297
* n_faces <-- local number of internal faces
1298
* face_cell <-- face -> cells connectivity (1 to n)
1299
* xa <-- extradiagonal values
1302
*----------------------------------------------------------------------------*/
1305
_sub_matrix_vector_test(int n_runs,
1307
cs_int_t n_cells_ext,
1309
const cs_int_t *face_cell,
1310
const cs_real_t *restrict xa,
1311
cs_real_t *restrict x,
1312
cs_real_t *restrict y)
1317
long n_ops, n_ops_glob;
1320
double test_sum = 0.0;
1327
if (cs_glob_n_ranks == 1)
1330
n_ops_glob = ( cs_glob_mesh->n_g_cells
1331
+ cs_glob_mesh->n_g_i_faces*2);
1333
for (jj = 0; jj < n_cells_ext; jj++)
1336
/* Matrix.vector product, variant 0 */
1338
_timer_start(&wt, &cpu);
1340
for (run_id = 0; run_id < n_runs; run_id++) {
1341
_mat_vec_exdiag_native(n_faces, face_cell, xa, x, y);
1342
test_sum += y[n_cells-1];
1345
_timer_stop(n_runs, &wt, &cpu);
1348
"Matrix.vector product, extradiagonal part, variant 0\n"
1349
"---------------------\n"));
1351
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1354
_print_stats(n_ops, n_ops_glob, wt, cpu);
1356
for (jj = 0; jj < n_cells_ext; jj++)
1361
/* Matrix.vector product, variant 1 */
1363
_timer_start(&wt, &cpu);
1365
for (run_id = 0; run_id < n_runs; run_id++) {
1366
_mat_vec_exdiag_native_v1(n_faces, face_cell, xa, x, y);
1367
test_sum += y[n_cells-1];
1370
_timer_stop(n_runs, &wt, &cpu);
1373
"Matrix.vector product, extradiagonal part, variant 1\n"
1374
"---------------------\n"));
1376
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1379
_print_stats(n_ops, n_ops_glob, wt, cpu);
1381
/* Matrix.vector product, contribute to faces only */
1383
BFT_MALLOC(ya, n_faces, cs_real_t);
1384
for (jj = 0; jj < n_faces; jj++)
1389
_timer_start(&wt, &cpu);
1391
for (run_id = 0; run_id < n_runs; run_id++) {
1392
_mat_vec_exdiag_part_p1(n_faces, face_cell, xa, x, ya);
1393
test_sum += y[n_cells-1];
1396
_timer_stop(n_runs, &wt, &cpu);
1401
"Matrix.vector product, face values only\n"
1402
"---------------------\n"));
1404
bft_printf(_(" (calls: %d; test sum: %12.5f)\n"),
1407
_print_stats(n_ops, n_ops_glob, wt, cpu);
1411
/*============================================================================
1412
* Public function definitions
1413
*============================================================================*/
1415
/*----------------------------------------------------------------------------
1416
* Run simple benchmarks.
1419
* mpi_trace_mode <-- indicates if timing mode (0) or MPI trace-friendly
1420
* mode (1) is to be used
1421
*----------------------------------------------------------------------------*/
1424
cs_benchmark(int mpi_trace_mode)
1426
/* Local variable definitions */
1427
/*----------------------------*/
1432
cs_real_t *x1 = NULL, *x2 = NULL;
1433
cs_real_t *y1 = NULL, *y2 = NULL;
1434
cs_real_t *da = NULL, *xa = NULL;
1436
const cs_mesh_t *mesh = cs_glob_mesh;
1437
const cs_mesh_quantities_t *mesh_v = cs_glob_mesh_quantities;
1439
size_t n_cells = mesh->n_cells;
1440
size_t n_cells_ext = mesh->n_cells_with_ghosts;
1441
size_t n_faces = mesh->n_i_faces;
1443
/* Allocate and initialize working arrays */
1444
/*-----------------------------------------*/
1446
BFT_MALLOC(x1, n_cells_ext, cs_real_t);
1447
BFT_MALLOC(x2, n_cells_ext, cs_real_t);
1449
for (ii = 0; ii < n_cells_ext; ii++) {
1450
x1[ii] = mesh_v->cell_cen[ii*3];
1451
x2[ii] = mesh_v->cell_cen[ii*3 + 1];
1454
if (CS_MEM_ALIGN > 0) {
1456
BFT_MEMALIGN(y1, CS_MEM_ALIGN, n_cells_ext, cs_real_t);
1457
BFT_MEMALIGN(y2, CS_MEM_ALIGN, n_cells_ext, cs_real_t);
1462
BFT_MALLOC(y1, n_cells_ext, cs_real_t);
1463
BFT_MALLOC(y2, n_cells_ext, cs_real_t);
1467
BFT_MALLOC(da, n_cells_ext, cs_real_t);
1468
BFT_MALLOC(xa, n_faces*2, cs_real_t);
1470
for (ii = 0; ii < n_cells_ext; ii++) {
1471
da[ii] = 1.0 + ii/n_cells_ext;
1472
xa[ii] = mesh_v->cell_cen[ii*3 + 1];
1475
for (ii = 0; ii < n_faces; ii++) {
1476
xa[ii] = 0.5*(1.0 + ii/n_faces);
1477
xa[ii + n_faces] = -0.5*(1.0 + ii/n_faces);
1484
"Benchmark mode activated\n"
1485
"========================\n"));
1487
/* Dot product test */
1488
/*------------------*/
1490
n_runs = (mpi_trace_mode) ? 1 : 10000;
1492
_dot_product_1(0, n_runs, n_cells, x1, x1);
1493
_dot_product_1(0, n_runs, n_cells, x1, x2);
1494
_dot_product_2(n_runs, n_cells, x1, x2);
1495
_axpy_(n_runs, n_cells, x1, y1);
1497
_division_test(n_runs/5, n_cells);
1498
_sqrt_test(n_runs/10, n_cells);
1500
#if defined(HAVE_MPI)
1502
if (cs_glob_n_ranks > 1) {
1504
n_runs = (mpi_trace_mode) ? 1 : 10000;
1506
_dot_product_1(1, n_runs, n_cells, x1, x1);
1507
_dot_product_1(1, n_runs, n_cells, x1, x2);
1511
#endif /* HAVE_MPI */
1516
n_runs = (mpi_trace_mode) ? 0 : 500;
1518
if (!mpi_trace_mode) {
1520
/* Creation tests */
1524
_matrix_creation_test(n_runs,
1526
CS_MATRIX_NATIVE, false,
1527
n_cells, n_cells_ext, n_faces,
1528
mesh->global_cell_num, mesh->i_face_cells,
1530
mesh->i_face_numbering);
1534
_matrix_creation_test(n_runs,
1536
CS_MATRIX_CSR, false,
1537
n_cells, n_cells_ext, n_faces,
1538
mesh->global_cell_num, mesh->i_face_cells,
1540
mesh->i_face_numbering);
1542
_matrix_creation_test(n_runs,
1544
CS_MATRIX_CSR, true,
1545
n_cells, n_cells_ext, n_faces,
1546
mesh->global_cell_num, mesh->i_face_cells,
1548
mesh->i_face_numbering);
1550
/* Assignment tests */
1554
_matrix_assignment_test(n_runs,
1556
CS_MATRIX_NATIVE, false, false,
1557
n_cells, n_cells_ext, n_faces,
1558
mesh->global_cell_num, mesh->i_face_cells,
1559
mesh->halo, mesh->i_face_numbering, da, xa);
1561
_matrix_assignment_test(n_runs,
1562
_("native, sym coeffs"),
1563
CS_MATRIX_NATIVE, false, true,
1564
n_cells, n_cells_ext, n_faces,
1565
mesh->global_cell_num, mesh->i_face_cells,
1566
mesh->halo, mesh->i_face_numbering, da, xa);
1570
_matrix_assignment_test(n_runs,
1572
CS_MATRIX_CSR, false, false,
1573
n_cells, n_cells_ext, n_faces,
1574
mesh->global_cell_num, mesh->i_face_cells,
1575
mesh->halo, mesh->i_face_numbering, da, xa);
1577
_matrix_assignment_test(n_runs,
1578
_("CSR, sym coeffs"),
1579
CS_MATRIX_CSR, false, true,
1580
n_cells, n_cells_ext, n_faces,
1581
mesh->global_cell_num, mesh->i_face_cells,
1582
mesh->halo, mesh->i_face_numbering, da, xa);
1584
_matrix_assignment_test(n_runs,
1586
CS_MATRIX_CSR, true, true,
1587
n_cells, n_cells_ext, n_faces,
1588
mesh->global_cell_num, mesh->i_face_cells,
1589
mesh->halo, mesh->i_face_numbering, da, xa);
1593
/* Matrix.vector tests */
1595
n_runs = (mpi_trace_mode) ? 1 : 1000;
1597
_matrix_vector_test(n_runs,
1599
CS_MATRIX_NATIVE, false, false,
1600
n_cells, n_cells_ext, n_faces,
1601
mesh->global_cell_num, mesh->i_face_cells, mesh->halo,
1602
mesh->i_face_numbering, da, xa, x1, y1);
1604
_matrix_vector_test(n_runs,
1605
_("native, sym coeffs"),
1606
CS_MATRIX_NATIVE, false, true,
1607
n_cells, n_cells_ext, n_faces,
1608
mesh->global_cell_num, mesh->i_face_cells, mesh->halo,
1609
mesh->i_face_numbering, da, xa, x1, y1);
1611
_matrix_vector_test(n_runs,
1613
CS_MATRIX_CSR, false, false,
1614
n_cells, n_cells_ext, n_faces,
1615
mesh->global_cell_num, mesh->i_face_cells, mesh->halo,
1616
mesh->i_face_numbering, da, xa, x1, y1);
1618
_matrix_vector_test(n_runs,
1620
CS_MATRIX_CSR, true, true,
1621
n_cells, n_cells_ext, n_faces,
1622
mesh->global_cell_num, mesh->i_face_cells, mesh->halo,
1623
mesh->i_face_numbering, da, xa, x1, y1);
1625
_sub_matrix_vector_test(n_runs,
1634
/* Free working arrays */
1635
/*---------------------*/
1648
/*----------------------------------------------------------------------------*/