/*******************************************************
*
* Copyright (c) 2003-2010 by University of Queensland
* Earth Systems Science Computational Center (ESSCC)
* http://www.uq.edu.au/esscc
*
* Primary Business: Queensland, Australia
* Licensed under the Open Software License version 3.0
* http://www.opensource.org/licenses/osl-3.0.php
*
*******************************************************/

/**************************************************************/

/* Dudley: Mesh: optimizes the distribution of DOFs across processors */
/* using ParMETIS. On return a new distribution is given and the globalDOF are relabeled */
/* accordingly, but the mesh has not been redistributed yet */

/**************************************************************/
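/* Usage sketch (illustrative only; the variable names below are hypothetical):
   on entry `distribution` holds mpiSize+1 cumulative DOF counts, on return it
   describes the optimized split and the global DOF numbering has been relabeled.

       dim_t distribution[5] = { 0, 25, 50, 75, 100 };   (4 ranks, 100 DOFs)
       Dudley_Mesh_optimizeDOFDistribution(mesh, distribution);

   Afterwards distribution[] reflects the new ownership; redistributing the mesh
   itself is a separate step. */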
#include "Mesh.h"
#include "IndexList.h"

#ifdef USE_PARMETIS
#include "parmetis.h"
#endif

/**************************************************************
   Check whether there is any processor which has no vertex. If such
   a processor exists, we do not use ParMETIS, since ParMETIS requires
   that every processor has at least one vertex (at line 129 of file
   "xyzpart.c" in ParMETIS 3.1.1, the variable "nvtxs" would be 0 if
   any processor had no vertex).
 **************************************************************/
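/* Example (hypothetical numbers): with mpiSize = 3 and distribution = { 0, 10, 10, 25 },
   rank 1 owns 10 - 10 = 0 vertices, so ret_val = 0 is broadcast and the caller falls
   back to the non-ParMETIS partitioning. */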
int Check_Inputs_For_Parmetis(dim_t mpiSize, dim_t rank, dim_t * distribution, MPI_Comm * comm)
{
    dim_t i, len;
    int ret_val = 1;

    if (rank == 0)
        for (i = 0; i < mpiSize; i++)
        {
            len = distribution[i + 1] - distribution[i];
            if (len == 0) { ret_val = 0; break; }
        }
    MPI_Bcast(&ret_val, 1, MPI_INTEGER, 0, *comm);
    if (ret_val == 0)
        printf("INFO: Parmetis is not used since some nodes have no vertex!\n");
    return ret_val;
}
/**************************************************************/

void Dudley_Mesh_optimizeDOFDistribution(Dudley_Mesh * in, dim_t * distribution)
{
    dim_t dim, i, j, k, myNumVertices, p, mpiSize, len, globalNumVertices, *partition_count = NULL,
        *new_distribution = NULL, *loc_partition_count = NULL;
    bool_t *setNewDOFId = NULL;
    index_t myFirstVertex, myLastVertex, firstVertex, lastVertex, *newGlobalDOFID = NULL;
    size_t mpiSize_size;
    index_t *partition = NULL;
    Paso_Pattern *pattern = NULL;
    Esys_MPI_rank myRank, dest, source, current_rank, rank;
    Dudley_IndexList *index_list = NULL;
    float *xyz = NULL;
    int c;
#ifdef ESYS_MPI
    MPI_Status status;
#endif

    if (in->Nodes == NULL)
        return;

    myRank = in->MPIInfo->rank;
    mpiSize = in->MPIInfo->size;
    mpiSize_size = mpiSize * sizeof(dim_t);
    dim = in->Nodes->numDim;

    /* first step is to distribute the elements according to a global distribution of the DOFs */
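    /* Illustrative example (hypothetical numbers): with distribution = { 0, 25, 50, 75, 100 }
       and myRank == 2, this rank initially owns the DOFs [50,75), i.e. myFirstVertex = 50,
       myLastVertex = 75 and myNumVertices = 25, while globalNumVertices is 100. */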
    myFirstVertex = distribution[myRank];
    myLastVertex = distribution[myRank + 1];
    myNumVertices = myLastVertex - myFirstVertex;
    globalNumVertices = distribution[mpiSize];
    len = 0;
    for (p = 0; p < mpiSize; ++p)
        len = MAX(len, distribution[p + 1] - distribution[p]);
    partition = TMPMEMALLOC(len, index_t);	/* len is used for the sending around of partition later on */
    xyz = TMPMEMALLOC(myNumVertices * dim, float);
    partition_count = TMPMEMALLOC(mpiSize + 1, dim_t);
    new_distribution = TMPMEMALLOC(mpiSize + 1, dim_t);
    newGlobalDOFID = TMPMEMALLOC(len, index_t);
    setNewDOFId = TMPMEMALLOC(in->Nodes->numNodes, bool_t);
    if (!
        (Dudley_checkPtr(partition) || Dudley_checkPtr(xyz) || Dudley_checkPtr(partition_count)
         || Dudley_checkPtr(new_distribution) || Dudley_checkPtr(newGlobalDOFID) || Dudley_checkPtr(setNewDOFId)))
    {
        dim_t *recvbuf = TMPMEMALLOC(mpiSize * mpiSize, dim_t);

        /* set the coordinates: */
        /* it is assumed that at least one node on this processor provides a coordinate */
#pragma omp parallel for private(i,j,k)
        for (i = 0; i < in->Nodes->numNodes; ++i)
        {
            k = in->Nodes->globalDegreesOfFreedom[i] - myFirstVertex;
            if ((k >= 0) && (k < myNumVertices))
            {
                for (j = 0; j < dim; ++j)
                    xyz[k * dim + j] = (float)(in->Nodes->Coordinates[INDEX2(j, i, dim)]);
            }
        }
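        /* xyz stores the coordinates of the locally owned vertices in vertex-major order:
           xyz[k * dim + j] is the j-th coordinate of local vertex k. ParMETIS uses these
           points for the geometric part of the partitioning, which is why only vertices
           in [myFirstVertex, myLastVertex) are written. */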
        index_list = TMPMEMALLOC(myNumVertices, Dudley_IndexList);
        /* ksteube CSR of DOF IDs */
        /* create the adjacency structure xadj and adjncy */
        if (!Dudley_checkPtr(index_list))
        {
#pragma omp parallel private(i)
            {
#pragma omp for schedule(static)
                for (i = 0; i < myNumVertices; ++i)
                    index_list[i].extension = NULL;
            }
            /* ksteube build CSR format */
            /* insert contributions from element matrices into the column index list index_list: */
            Dudley_IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list, myFirstVertex, myLastVertex,
                                                                      in->Elements,
                                                                      in->Nodes->globalDegreesOfFreedom,
                                                                      in->Nodes->globalDegreesOfFreedom);
            Dudley_IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list, myFirstVertex, myLastVertex,
                                                                      in->FaceElements,
                                                                      in->Nodes->globalDegreesOfFreedom,
                                                                      in->Nodes->globalDegreesOfFreedom);
            Dudley_IndexList_insertElementsWithRowRangeNoMainDiagonal(index_list, myFirstVertex, myLastVertex,
                                                                      in->Points, in->Nodes->globalDegreesOfFreedom,
                                                                      in->Nodes->globalDegreesOfFreedom);
        }

        /* create the local matrix pattern */
        pattern = Dudley_IndexList_createPattern(0, myNumVertices, index_list, 0, globalNumVertices, 0);
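        /* The Paso_Pattern built here is the distributed graph in CSR form expected by
           ParMETIS: pattern->ptr plays the role of xadj (offsets for the myNumVertices
           locally owned vertices) and pattern->index the role of adjncy (global indices
           of the connected DOFs, with the main diagonal excluded). */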
        /* clean up index list */
        if (index_list != NULL)
        {
#pragma omp parallel for private(i)
            for (i = 0; i < myNumVertices; ++i)
                Dudley_IndexList_free(index_list[i].extension);
        }

        if (Dudley_noError())
        {
            if (mpiSize > 1 && Check_Inputs_For_Parmetis(mpiSize, myRank, distribution, &(in->MPIInfo->comm)) > 0)
            {
#ifdef USE_PARMETIS
                int wgtflag = 0;	/* no vertex or edge weights supplied */
                int numflag = 0;	/* pattern->ptr is C style: starting from 0 instead of 1 */
                int ncon = 1;	/* a single balance constraint */
                int edgecut;
                int options[2] = { 0, 0 };	/* use the ParMETIS default options */
                float *tpwgts = TMPMEMALLOC(ncon * mpiSize, float);
                float *ubvec = TMPMEMALLOC(ncon, float);
                for (i = 0; i < ncon * mpiSize; i++)
                    tpwgts[i] = 1.0 / (float)mpiSize;	/* each rank gets an equal share of vertices */
                for (i = 0; i < ncon; i++)
                    ubvec[i] = 1.05;	/* allow 5% load imbalance */
                ParMETIS_V3_PartGeomKway(distribution, pattern->ptr, pattern->index, NULL, NULL, &wgtflag, &numflag, &dim, xyz, &ncon, &mpiSize, tpwgts, ubvec, options, &edgecut, partition,	/* new CPU ownership of elements */
                                         &(in->MPIInfo->comm));
                /* printf("ParMETIS number of edges cut by partitioning per processor: %d\n", edgecut/MAX(in->MPIInfo->size,1)); */
                TMPMEMFREE(ubvec);
                TMPMEMFREE(tpwgts);
#else
                for (i = 0; i < myNumVertices; ++i)
                    partition[i] = 0;	/* CPU 0 owns it */
#endif
            }
            else
            {
                for (i = 0; i < myNumVertices; ++i)
                    partition[i] = myRank;	/* this CPU owns it */
            }
        }
        Paso_Pattern_free(pattern);

        /* create a new distribution and labeling of the DOF */
        memset(new_distribution, 0, mpiSize_size);
#pragma omp parallel private(loc_partition_count)
        {
            loc_partition_count = THREAD_MEMALLOC(mpiSize, dim_t);
            memset(loc_partition_count, 0, mpiSize_size);
#pragma omp for private(i)
            for (i = 0; i < myNumVertices; ++i)
                loc_partition_count[partition[i]]++;
#pragma omp critical
            {
                /* accumulate the per-thread counts into the shared array */
                for (i = 0; i < mpiSize; ++i)
                    new_distribution[i] += loc_partition_count[i];
            }
            THREAD_MEMFREE(loc_partition_count);
        }

        /* recvbuf will be the concatenation of each CPU's contribution to new_distribution */
#ifdef ESYS_MPI
        MPI_Allgather(new_distribution, mpiSize, MPI_INT, recvbuf, mpiSize, MPI_INT, in->MPIInfo->comm);
#else
        for (i = 0; i < mpiSize; ++i)
            recvbuf[i] = new_distribution[i];
#endif
        new_distribution[0] = 0;
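        /* recvbuf is an mpiSize x mpiSize table: recvbuf[rank + mpiSize * q] is the number
           of vertices that sending rank q assigned to partition `rank`. The loop below turns
           these counts into the cumulative new_distribution[] and gives every locally owned
           vertex a new global DOF id that is contiguous within its target partition.
           Illustrative example (hypothetical counts) with mpiSize = 2: if rank 0 assigns
           { 3, 1 } and rank 1 assigns { 2, 4 } vertices to partitions 0 and 1, then
           new_distribution becomes { 0, 5, 10 } and rank 1's vertices destined for
           partition 0 receive the ids 3 and 4. */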
        for (rank = 0; rank < mpiSize; rank++)
        {
            c = 0;
            for (i = 0; i < myRank; ++i)
                c += recvbuf[rank + mpiSize * i];
            for (i = 0; i < myNumVertices; ++i)
            {
                if (rank == partition[i])
                {
                    newGlobalDOFID[i] = new_distribution[rank] + c;
                    c++;
                }
            }
            for (i = myRank + 1; i < mpiSize; ++i)
                c += recvbuf[rank + mpiSize * i];
            new_distribution[rank + 1] = new_distribution[rank] + c;
        }
        TMPMEMFREE(recvbuf);

        /* now the overlap needs to be created by sending the partition around */
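        /* Each rank only knows the new ids of the DOFs it owns, while its nodes may refer
           to DOFs owned elsewhere. The loop below therefore ring-shifts the newGlobalDOFID
           buffer through all ranks (dest/source are the neighbouring ranks modulo mpiSize),
           so every node eventually sees the relabeling of the rank owning its DOF;
           setNewDOFId marks the nodes that are still waiting for it. */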
        dest = Esys_MPIInfo_mod(mpiSize, myRank + 1);
        source = Esys_MPIInfo_mod(mpiSize, myRank - 1);
        current_rank = myRank;
#pragma omp parallel for private(i)
        for (i = 0; i < in->Nodes->numNodes; ++i)
            setNewDOFId[i] = TRUE;

        for (p = 0; p < mpiSize; ++p)
        {
            firstVertex = distribution[current_rank];
            lastVertex = distribution[current_rank + 1];
#pragma omp parallel for private(i,j,k)
            for (i = 0; i < in->Nodes->numNodes; ++i)
            {
                k = in->Nodes->globalDegreesOfFreedom[i];
                if (setNewDOFId[i] && (firstVertex <= k) && (k < lastVertex))
                {
                    in->Nodes->globalDegreesOfFreedom[i] = newGlobalDOFID[k - firstVertex];
                    setNewDOFId[i] = FALSE;
                }
            }

            if (p < mpiSize - 1)
            {	/* the final send can be skipped */
#ifdef ESYS_MPI
                MPI_Sendrecv_replace(newGlobalDOFID, len, MPI_INT,
                                     dest, in->MPIInfo->msg_tag_counter,
                                     source, in->MPIInfo->msg_tag_counter, in->MPIInfo->comm, &status);
#endif
                in->MPIInfo->msg_tag_counter++;
                current_rank = Esys_MPIInfo_mod(mpiSize, current_rank - 1);
            }
        }
        for (i = 0; i < mpiSize + 1; ++i)
            distribution[i] = new_distribution[i];
    }
    TMPMEMFREE(index_list);
    TMPMEMFREE(xyz);
    TMPMEMFREE(newGlobalDOFID);
    TMPMEMFREE(setNewDOFId);
    TMPMEMFREE(new_distribution);
    TMPMEMFREE(partition_count);
    TMPMEMFREE(partition);
    return;
}