2
Bullet Continuous Collision Detection and Physics Library
3
Copyright (c) 2003-2006 Erwin Coumans http://continuousphysics.com/Bullet/
5
This software is provided 'as-is', without any express or implied warranty.
6
In no event will the authors be held liable for any damages arising from the use of this software.
7
Permission is granted to anyone to use this software for any purpose,
8
including commercial applications, and to alter it and redistribute it freely,
9
subject to the following restrictions:
11
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13
3. This notice may not be removed or altered from any source distribution.
17
#include "BulletCollision/CollisionShapes/btTriangleIndexVertexArray.h"
18
#include "vectormath/vmInclude.h"
19
#include <stdio.h> //@todo: remove the debugging printf at some stage
20
#include "btSoftBodySolver_OpenCLSIMDAware.h"
21
#include "BulletSoftBody/btSoftBodySolverVertexBuffer.h"
22
#include "BulletSoftBody/btSoftBody.h"
23
#include "BulletCollision/CollisionShapes/btCapsuleShape.h"
26
#define WAVEFRONT_SIZE 32
27
#define WAVEFRONT_BLOCK_MULTIPLIER 2
28
#define GROUP_SIZE (WAVEFRONT_SIZE*WAVEFRONT_BLOCK_MULTIPLIER)
29
#define LINKS_PER_SIMD_LANE 16
31
static const size_t workGroupSize = GROUP_SIZE;
34
//CL_VERSION_1_1 seems broken on NVidia SDK so just disable it
36
#if (0)//CL_VERSION_1_1 == 1)
37
//OpenCL 1.1 kernels use float3
38
#define MSTRINGIFY(A) #A
39
static const char* UpdatePositionsFromVelocitiesCLString =
40
#include "OpenCLC/UpdatePositionsFromVelocities.cl"
41
static const char* SolvePositionsCLString =
42
#include "OpenCLC/SolvePositionsSIMDBatched.cl"
43
static const char* UpdateNodesCLString =
44
#include "OpenCLC/UpdateNodes.cl"
45
static const char* UpdatePositionsCLString =
46
#include "OpenCLC/UpdatePositions.cl"
47
static const char* UpdateConstantsCLString =
48
#include "OpenCLC/UpdateConstants.cl"
49
static const char* IntegrateCLString =
50
#include "OpenCLC/Integrate.cl"
51
static const char* ApplyForcesCLString =
52
#include "OpenCLC/ApplyForces.cl"
53
static const char* UpdateNormalsCLString =
54
#include "OpenCLC/UpdateNormals.cl"
55
static const char* VSolveLinksCLString =
56
#include "OpenCLC/VSolveLinks.cl"
57
static const char* SolveCollisionsAndUpdateVelocitiesCLString =
58
#include "OpenCLC/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
59
static const char* OutputToVertexArrayCLString =
60
#include "OpenCLC/OutputToVertexArray.cl"
62
////OpenCL 1.0 kernels don't use float3
63
#define MSTRINGIFY(A) #A
64
static const char* UpdatePositionsFromVelocitiesCLString =
65
#include "OpenCLC10/UpdatePositionsFromVelocities.cl"
66
static const char* SolvePositionsCLString =
67
#include "OpenCLC10/SolvePositionsSIMDBatched.cl"
68
static const char* UpdateNodesCLString =
69
#include "OpenCLC10/UpdateNodes.cl"
70
static const char* UpdatePositionsCLString =
71
#include "OpenCLC10/UpdatePositions.cl"
72
static const char* UpdateConstantsCLString =
73
#include "OpenCLC10/UpdateConstants.cl"
74
static const char* IntegrateCLString =
75
#include "OpenCLC10/Integrate.cl"
76
static const char* ApplyForcesCLString =
77
#include "OpenCLC10/ApplyForces.cl"
78
static const char* UpdateNormalsCLString =
79
#include "OpenCLC10/UpdateNormals.cl"
80
static const char* VSolveLinksCLString =
81
#include "OpenCLC10/VSolveLinks.cl"
82
static const char* SolveCollisionsAndUpdateVelocitiesCLString =
83
#include "OpenCLC10/SolveCollisionsAndUpdateVelocitiesSIMDBatched.cl"
84
static const char* OutputToVertexArrayCLString =
85
#include "OpenCLC10/OutputToVertexArray.cl"
86
#endif //CL_VERSION_1_1
90
btSoftBodyLinkDataOpenCLSIMDAware::btSoftBodyLinkDataOpenCLSIMDAware(cl_command_queue queue, cl_context ctx) :
91
m_cqCommandQue(queue),
92
m_wavefrontSize( WAVEFRONT_SIZE ),
93
m_linksPerWorkItem( LINKS_PER_SIMD_LANE ),
94
m_maxBatchesWithinWave( 0 ),
95
m_maxLinksPerWavefront( m_wavefrontSize * m_linksPerWorkItem ),
98
m_clNumBatchesAndVerticesWithinWaves( queue, ctx, &m_numBatchesAndVerticesWithinWaves, true ),
99
m_clWavefrontVerticesGlobalAddresses( queue, ctx, &m_wavefrontVerticesGlobalAddresses, true ),
100
m_clLinkVerticesLocalAddresses( queue, ctx, &m_linkVerticesLocalAddresses, true ),
101
m_clLinkStrength( queue, ctx, &m_linkStrength, false ),
102
m_clLinksMassLSC( queue, ctx, &m_linksMassLSC, false ),
103
m_clLinksRestLengthSquared( queue, ctx, &m_linksRestLengthSquared, false ),
104
m_clLinksRestLength( queue, ctx, &m_linksRestLength, false ),
105
m_clLinksMaterialLinearStiffnessCoefficient( queue, ctx, &m_linksMaterialLinearStiffnessCoefficient, false )
109
btSoftBodyLinkDataOpenCLSIMDAware::~btSoftBodyLinkDataOpenCLSIMDAware()
113
static Vectormath::Aos::Vector3 toVector3( const btVector3 &vec )
115
Vectormath::Aos::Vector3 outVec( vec.getX(), vec.getY(), vec.getZ() );
119
/** Allocate enough space in all link-related arrays to fit numLinks links */
120
void btSoftBodyLinkDataOpenCLSIMDAware::createLinks( int numLinks )
122
int previousSize = m_links.size();
123
int newSize = previousSize + numLinks;
125
btSoftBodyLinkData::createLinks( numLinks );
127
// Resize the link addresses array as well
128
m_linkAddresses.resize( newSize );
131
/** Insert the link described into the correct data structures assuming space has already been allocated by a call to createLinks */
132
void btSoftBodyLinkDataOpenCLSIMDAware::setLinkAt(
133
const LinkDescription &link,
136
btSoftBodyLinkData::setLinkAt( link, linkIndex );
138
if( link.getVertex0() > m_maxVertex )
139
m_maxVertex = link.getVertex0();
140
if( link.getVertex1() > m_maxVertex )
141
m_maxVertex = link.getVertex1();
143
// Set the link index correctly for initialisation
144
m_linkAddresses[linkIndex] = linkIndex;
147
bool btSoftBodyLinkDataOpenCLSIMDAware::onAccelerator()
152
bool btSoftBodyLinkDataOpenCLSIMDAware::moveToAccelerator()
155
success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();
156
success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();
157
success = success && m_clLinkVerticesLocalAddresses.moveToGPU();
158
success = success && m_clLinkStrength.moveToGPU();
159
success = success && m_clLinksMassLSC.moveToGPU();
160
success = success && m_clLinksRestLengthSquared.moveToGPU();
161
success = success && m_clLinksRestLength.moveToGPU();
162
success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveToGPU();
171
bool btSoftBodyLinkDataOpenCLSIMDAware::moveFromAccelerator()
174
success = success && m_clNumBatchesAndVerticesWithinWaves.moveToGPU();
175
success = success && m_clWavefrontVerticesGlobalAddresses.moveToGPU();
176
success = success && m_clLinkVerticesLocalAddresses.moveToGPU();
177
success = success && m_clLinkStrength.moveFromGPU();
178
success = success && m_clLinksMassLSC.moveFromGPU();
179
success = success && m_clLinksRestLengthSquared.moveFromGPU();
180
success = success && m_clLinksRestLength.moveFromGPU();
181
success = success && m_clLinksMaterialLinearStiffnessCoefficient.moveFromGPU();
197
btOpenCLSoftBodySolverSIMDAware::btOpenCLSoftBodySolverSIMDAware(cl_command_queue queue, cl_context ctx) :
198
btOpenCLSoftBodySolver( queue, ctx ),
199
m_linkData(queue, ctx)
201
// Initial we will clearly need to update solver constants
202
// For now this is global for the cloths linked with this solver - we should probably make this body specific
203
// for performance in future once we understand more clearly when constants need to be updated
204
m_updateSolverConstants = true;
206
m_shadersInitialized = false;
209
btOpenCLSoftBodySolverSIMDAware::~btOpenCLSoftBodySolverSIMDAware()
214
void btOpenCLSoftBodySolverSIMDAware::optimize( btAlignedObjectArray< btSoftBody * > &softBodies ,bool forceUpdate)
216
if( forceUpdate|| m_softBodySet.size() != softBodies.size() )
218
// Have a change in the soft body set so update, reloading all the data
219
getVertexData().clear();
220
getTriangleData().clear();
221
getLinkData().clear();
222
m_softBodySet.resize(0);
225
for( int softBodyIndex = 0; softBodyIndex < softBodies.size(); ++softBodyIndex )
227
btSoftBody *softBody = softBodies[ softBodyIndex ];
228
using Vectormath::Aos::Matrix3;
229
using Vectormath::Aos::Point3;
231
// Create SoftBody that will store the information within the solver
232
btOpenCLAcceleratedSoftBodyInterface* newSoftBody = new btOpenCLAcceleratedSoftBodyInterface( softBody );
233
m_softBodySet.push_back( newSoftBody );
235
m_perClothAcceleration.push_back( toVector3(softBody->getWorldInfo()->m_gravity) );
236
m_perClothDampingFactor.push_back(softBody->m_cfg.kDP);
237
m_perClothVelocityCorrectionCoefficient.push_back( softBody->m_cfg.kVCF );
238
m_perClothLiftFactor.push_back( softBody->m_cfg.kLF );
239
m_perClothDragFactor.push_back( softBody->m_cfg.kDG );
240
m_perClothMediumDensity.push_back(softBody->getWorldInfo()->air_density);
243
m_perClothFriction.push_back( softBody->getFriction() );
244
m_perClothCollisionObjects.push_back( CollisionObjectIndices(-1, -1) );
246
// Add space for new vertices and triangles in the default solver for now
247
// TODO: Include space here for tearing too later
248
int firstVertex = getVertexData().getNumVertices();
249
int numVertices = softBody->m_nodes.size();
250
// Round maxVertices to a multiple of the workgroup size so we know we're safe to run over in a given group
251
// maxVertices can be increased to allow tearing, but should be used sparingly because these extra verts will always be processed
252
int maxVertices = GROUP_SIZE*((numVertices+GROUP_SIZE)/GROUP_SIZE);
253
// Allocate space for new vertices in all the vertex arrays
254
getVertexData().createVertices( numVertices, softBodyIndex, maxVertices );
256
int firstTriangle = getTriangleData().getNumTriangles();
257
int numTriangles = softBody->m_faces.size();
258
int maxTriangles = numTriangles;
259
getTriangleData().createTriangles( maxTriangles );
261
// Copy vertices from softbody into the solver
262
for( int vertex = 0; vertex < numVertices; ++vertex )
264
Point3 multPoint(softBody->m_nodes[vertex].m_x.getX(), softBody->m_nodes[vertex].m_x.getY(), softBody->m_nodes[vertex].m_x.getZ());
265
btSoftBodyVertexData::VertexDescription desc;
267
// TODO: Position in the softbody might be pre-transformed
268
// or we may need to adapt for the pose.
269
//desc.setPosition( cloth.getMeshTransform()*multPoint );
270
desc.setPosition( multPoint );
272
float vertexInverseMass = softBody->m_nodes[vertex].m_im;
273
desc.setInverseMass(vertexInverseMass);
274
getVertexData().setVertexAt( desc, firstVertex + vertex );
277
// Copy triangles similarly
278
// We're assuming here that vertex indices are based on the firstVertex rather than the entire scene
279
for( int triangle = 0; triangle < numTriangles; ++triangle )
281
// Note that large array storage is relative to the array not to the cloth
282
// So we need to add firstVertex to each value
283
int vertexIndex0 = (softBody->m_faces[triangle].m_n[0] - &(softBody->m_nodes[0]));
284
int vertexIndex1 = (softBody->m_faces[triangle].m_n[1] - &(softBody->m_nodes[0]));
285
int vertexIndex2 = (softBody->m_faces[triangle].m_n[2] - &(softBody->m_nodes[0]));
286
btSoftBodyTriangleData::TriangleDescription newTriangle(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, vertexIndex2 + firstVertex);
287
getTriangleData().setTriangleAt( newTriangle, firstTriangle + triangle );
289
// Increase vertex triangle counts for this triangle
290
getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex0)++;
291
getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex1)++;
292
getVertexData().getTriangleCount(newTriangle.getVertexSet().vertex2)++;
295
int firstLink = getLinkData().getNumLinks();
296
int numLinks = softBody->m_links.size();
297
int maxLinks = numLinks;
299
// Allocate space for the links
300
getLinkData().createLinks( numLinks );
303
for( int link = 0; link < numLinks; ++link )
305
int vertexIndex0 = softBody->m_links[link].m_n[0] - &(softBody->m_nodes[0]);
306
int vertexIndex1 = softBody->m_links[link].m_n[1] - &(softBody->m_nodes[0]);
308
btSoftBodyLinkData::LinkDescription newLink(vertexIndex0 + firstVertex, vertexIndex1 + firstVertex, softBody->m_links[link].m_material->m_kLST);
309
newLink.setLinkStrength(1.f);
310
getLinkData().setLinkAt(newLink, firstLink + link);
313
newSoftBody->setFirstVertex( firstVertex );
314
newSoftBody->setFirstTriangle( firstTriangle );
315
newSoftBody->setNumVertices( numVertices );
316
newSoftBody->setMaxVertices( maxVertices );
317
newSoftBody->setNumTriangles( numTriangles );
318
newSoftBody->setMaxTriangles( maxTriangles );
319
newSoftBody->setFirstLink( firstLink );
320
newSoftBody->setNumLinks( numLinks );
325
updateConstants(0.f);
328
m_linkData.generateBatches();
329
m_triangleData.generateBatches();
332
// Build the shaders to match the batching parameters
338
btSoftBodyLinkData &btOpenCLSoftBodySolverSIMDAware::getLinkData()
340
// TODO: Consider setting link data to "changed" here
347
void btOpenCLSoftBodySolverSIMDAware::updateConstants( float timeStep )
350
using namespace Vectormath::Aos;
352
if( m_updateSolverConstants )
354
m_updateSolverConstants = false;
356
// Will have to redo this if we change the structure (tear, maybe) or various other possible changes
358
// Initialise link constants
359
const int numLinks = m_linkData.getNumLinks();
360
for( int linkIndex = 0; linkIndex < numLinks; ++linkIndex )
362
btSoftBodyLinkData::LinkNodePair &vertices( m_linkData.getVertexPair(linkIndex) );
363
m_linkData.getRestLength(linkIndex) = length((m_vertexData.getPosition( vertices.vertex0 ) - m_vertexData.getPosition( vertices.vertex1 )));
364
float invMass0 = m_vertexData.getInverseMass(vertices.vertex0);
365
float invMass1 = m_vertexData.getInverseMass(vertices.vertex1);
366
float linearStiffness = m_linkData.getLinearStiffnessCoefficient(linkIndex);
367
float massLSC = (invMass0 + invMass1)/linearStiffness;
368
m_linkData.getMassLSC(linkIndex) = massLSC;
369
float restLength = m_linkData.getRestLength(linkIndex);
370
float restLengthSquared = restLength*restLength;
371
m_linkData.getRestLengthSquared(linkIndex) = restLengthSquared;
379
void btOpenCLSoftBodySolverSIMDAware::solveConstraints( float solverdt )
382
using Vectormath::Aos::Vector3;
383
using Vectormath::Aos::Point3;
384
using Vectormath::Aos::lengthSqr;
385
using Vectormath::Aos::dot;
388
int numLinks = m_linkData.getNumLinks();
389
int numVertices = m_vertexData.getNumVertices();
395
m_clPerClothDampingFactor.moveToGPU();
396
m_clPerClothVelocityCorrectionCoefficient.moveToGPU();
399
// Ensure data is on accelerator
400
m_linkData.moveToAccelerator();
401
m_vertexData.moveToAccelerator();
406
prepareCollisionConstraints();
409
for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
412
for( int i = 0; i < m_linkData.m_wavefrontBatchStartLengths.size(); ++i )
414
int startWave = m_linkData.m_wavefrontBatchStartLengths[i].start;
415
int numWaves = m_linkData.m_wavefrontBatchStartLengths[i].length;
416
solveLinksForPosition( startWave, numWaves, kst, ti );
418
} // for( int iteration = 0; iteration < m_numberOfPositionIterations ; ++iteration )
421
// At this point assume that the force array is blank - we will overwrite it
422
solveCollisionsAndUpdateVelocities( 1.f/solverdt );
426
//////////////////////////////////////
430
void btOpenCLSoftBodySolverSIMDAware::solveLinksForPosition( int startWave, int numWaves, float kst, float ti )
433
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,0, sizeof(int), &startWave);
434
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,1, sizeof(int), &numWaves);
435
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,2, sizeof(float), &kst);
436
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,3, sizeof(float), &ti);
439
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,4, sizeof(cl_mem), &m_linkData.m_clNumBatchesAndVerticesWithinWaves.m_buffer);
440
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,5, sizeof(cl_mem), &m_linkData.m_clWavefrontVerticesGlobalAddresses.m_buffer);
441
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,6, sizeof(cl_mem), &m_linkData.m_clLinkVerticesLocalAddresses.m_buffer);
442
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,7, sizeof(cl_mem), &m_linkData.m_clLinksMassLSC.m_buffer);
444
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,8, sizeof(cl_mem), &m_linkData.m_clLinksRestLengthSquared.m_buffer);
445
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,9, sizeof(cl_mem), &m_vertexData.m_clVertexInverseMass.m_buffer);
446
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,10, sizeof(cl_mem), &m_vertexData.m_clVertexPosition.m_buffer);
448
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,11, WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_int2), 0);
449
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,12, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float4), 0);
450
ciErrNum = clSetKernelArg(solvePositionsFromLinksKernel,13, m_linkData.getMaxVerticesPerWavefront()*WAVEFRONT_BLOCK_MULTIPLIER*sizeof(cl_float), 0);
452
size_t numWorkItems = workGroupSize*((numWaves*WAVEFRONT_SIZE + (workGroupSize-1)) / workGroupSize);
454
ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solvePositionsFromLinksKernel,1,NULL,&numWorkItems,&workGroupSize,0,0,0);
456
if( ciErrNum!= CL_SUCCESS )
458
btAssert( 0 && "enqueueNDRangeKernel(solvePositionsFromLinksKernel)");
461
} // solveLinksForPosition
463
void btOpenCLSoftBodySolverSIMDAware::solveCollisionsAndUpdateVelocities( float isolverdt )
465
// Copy kernel parameters to GPU
466
m_vertexData.moveToAccelerator();
467
m_clPerClothFriction.moveToGPU();
468
m_clPerClothDampingFactor.moveToGPU();
469
m_clPerClothCollisionObjects.moveToGPU();
470
m_clCollisionObjectDetails.moveToGPU();
473
int numVerts = m_vertexData.getNumVertices();
474
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 0, sizeof(int), &numVerts);
475
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 1, sizeof(int), &isolverdt);
476
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 2, sizeof(cl_mem),&m_vertexData.m_clClothIdentifier.m_buffer);
477
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 3, sizeof(cl_mem),&m_vertexData.m_clVertexPreviousPosition.m_buffer);
478
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 4, sizeof(cl_mem),&m_clPerClothFriction.m_buffer);
479
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 5, sizeof(cl_mem),&m_clPerClothDampingFactor.m_buffer);
480
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 6, sizeof(cl_mem),&m_clPerClothCollisionObjects.m_buffer);
481
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 7, sizeof(cl_mem),&m_clCollisionObjectDetails.m_buffer);
482
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 8, sizeof(cl_mem),&m_vertexData.m_clVertexForceAccumulator.m_buffer);
483
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 9, sizeof(cl_mem),&m_vertexData.m_clVertexVelocity.m_buffer);
484
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 10, sizeof(cl_mem),&m_vertexData.m_clVertexPosition.m_buffer);
485
ciErrNum = clSetKernelArg(solveCollisionsAndUpdateVelocitiesKernel, 11, sizeof(CollisionShapeDescription)*16,0);
486
size_t numWorkItems = workGroupSize*((m_vertexData.getNumVertices() + (workGroupSize-1)) / workGroupSize);
490
ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue,solveCollisionsAndUpdateVelocitiesKernel, 1, NULL, &numWorkItems, &workGroupSize,0,0,0);
492
if( ciErrNum != CL_SUCCESS )
494
btAssert( 0 && "enqueueNDRangeKernel(solveCollisionsAndUpdateVelocitiesKernel)");
498
} // btOpenCLSoftBodySolverSIMDAware::updateVelocitiesFromPositionsWithoutVelocities
500
// End kernel dispatches
501
/////////////////////////////////////
505
bool btOpenCLSoftBodySolverSIMDAware::buildShaders()
507
bool returnVal = true;
509
if( m_shadersInitialized )
512
char *wavefrontMacros = new char[256];
516
"-DMAX_NUM_VERTICES_PER_WAVE=%d -DMAX_BATCHES_PER_WAVE=%d -DWAVEFRONT_SIZE=%d -DWAVEFRONT_BLOCK_MULTIPLIER=%d -DBLOCK_SIZE=%d",
517
m_linkData.getMaxVerticesPerWavefront(),
518
m_linkData.getMaxBatchesPerWavefront(),
519
m_linkData.getWavefrontSize(),
520
WAVEFRONT_BLOCK_MULTIPLIER,
521
WAVEFRONT_BLOCK_MULTIPLIER*m_linkData.getWavefrontSize());
523
updatePositionsFromVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsFromVelocitiesCLString, "UpdatePositionsFromVelocitiesKernel", "" );
524
solvePositionsFromLinksKernel = clFunctions.compileCLKernelFromString( SolvePositionsCLString, "SolvePositionsFromLinksKernel", wavefrontMacros );
525
updateVelocitiesFromPositionsWithVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdateNodesCLString, "updateVelocitiesFromPositionsWithVelocitiesKernel", "" );
526
updateVelocitiesFromPositionsWithoutVelocitiesKernel = clFunctions.compileCLKernelFromString( UpdatePositionsCLString, "updateVelocitiesFromPositionsWithoutVelocitiesKernel", "" );
527
integrateKernel = clFunctions.compileCLKernelFromString( IntegrateCLString, "IntegrateKernel", "" );
528
applyForcesKernel = clFunctions.compileCLKernelFromString( ApplyForcesCLString, "ApplyForcesKernel", "" );
529
solveCollisionsAndUpdateVelocitiesKernel = clFunctions.compileCLKernelFromString( SolveCollisionsAndUpdateVelocitiesCLString, "SolveCollisionsAndUpdateVelocitiesKernel", "" );
531
// TODO: Rename to UpdateSoftBodies
532
resetNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "ResetNormalsAndAreasKernel", "" );
533
normalizeNormalsAndAreasKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "NormalizeNormalsAndAreasKernel", "" );
534
updateSoftBodiesKernel = clFunctions.compileCLKernelFromString( UpdateNormalsCLString, "UpdateSoftBodiesKernel", "" );
536
delete [] wavefrontMacros;
539
m_shadersInitialized = true;
547
static Vectormath::Aos::Transform3 toTransform3( const btTransform &transform )
549
Vectormath::Aos::Transform3 outTransform;
550
outTransform.setCol(0, toVector3(transform.getBasis().getColumn(0)));
551
outTransform.setCol(1, toVector3(transform.getBasis().getColumn(1)));
552
outTransform.setCol(2, toVector3(transform.getBasis().getColumn(2)));
553
outTransform.setCol(3, toVector3(transform.getOrigin()));
558
static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts, btSoftBodyLinkData &linkData, int numVertices, btAlignedObjectArray < btAlignedObjectArray <int> > &wavefrontBatches )
560
// A per-batch map of truth values stating whether a given vertex is in that batch
561
// This allows us to significantly optimize the batching
562
btAlignedObjectArray <btAlignedObjectArray<bool> > mapOfVerticesInBatches;
564
for( int waveIndex = 0; waveIndex < linksForWavefronts.size(); ++waveIndex )
566
btAlignedObjectArray <int> &wavefront( linksForWavefronts[waveIndex] );
570
while( batch < wavefrontBatches.size() && !placed )
572
// Test the current batch, see if this wave shares any vertex with the waves in the batch
573
bool foundSharedVertex = false;
574
for( int link = 0; link < wavefront.size(); ++link )
576
btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
577
if( (mapOfVerticesInBatches[batch])[vertices.vertex0] || (mapOfVerticesInBatches[batch])[vertices.vertex1] )
579
foundSharedVertex = true;
583
if( !foundSharedVertex )
585
wavefrontBatches[batch].push_back( waveIndex );
586
// Insert vertices into this batch too
587
for( int link = 0; link < wavefront.size(); ++link )
589
btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
590
(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
591
(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
597
if( batch == wavefrontBatches.size() && !placed )
599
wavefrontBatches.resize( batch + 1 );
600
wavefrontBatches[batch].push_back( waveIndex );
602
// And resize map as well
603
mapOfVerticesInBatches.resize( batch + 1 );
605
// Resize maps with total number of vertices
606
mapOfVerticesInBatches[batch].resize( numVertices, false );
608
// Insert vertices into this batch too
609
for( int link = 0; link < wavefront.size(); ++link )
611
btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( wavefront[link] );
612
(mapOfVerticesInBatches[batch])[vertices.vertex0] = true;
613
(mapOfVerticesInBatches[batch])[vertices.vertex1] = true;
617
mapOfVerticesInBatches.clear();
620
// Function to remove an object from a vector maintaining correct ordering of the vector
621
template< typename T > static void removeFromVector( btAlignedObjectArray< T > &vectorToUpdate, int indexToRemove )
623
int currentSize = vectorToUpdate.size();
624
for( int i = indexToRemove; i < (currentSize-1); ++i )
626
vectorToUpdate[i] = vectorToUpdate[i+1];
628
if( currentSize > 0 )
629
vectorToUpdate.resize( currentSize - 1 );
633
* Insert element into vectorToUpdate at index index.
635
template< typename T > static void insertAtIndex( btAlignedObjectArray< T > &vectorToUpdate, int index, T element )
637
vectorToUpdate.resize( vectorToUpdate.size() + 1 );
638
for( int i = (vectorToUpdate.size() - 1); i > index; --i )
640
vectorToUpdate[i] = vectorToUpdate[i-1];
642
vectorToUpdate[index] = element;
646
* Insert into btAlignedObjectArray assuming the array is ordered and maintaining both ordering and uniqueness.
647
* ie it treats vectorToUpdate as an ordered set.
649
template< typename T > static void insertUniqueAndOrderedIntoVector( btAlignedObjectArray<T> &vectorToUpdate, T element )
652
while( index < vectorToUpdate.size() && vectorToUpdate[index] < element )
656
if( index == vectorToUpdate.size() || vectorToUpdate[index] != element )
657
insertAtIndex( vectorToUpdate, index, element );
660
static void generateLinksPerVertex( int numVertices, btSoftBodyLinkData &linkData, btAlignedObjectArray< int > &listOfLinksPerVertex, btAlignedObjectArray <int> &numLinksPerVertex, int &maxLinks )
662
for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
664
btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
665
numLinksPerVertex[nodes.vertex0]++;
666
numLinksPerVertex[nodes.vertex1]++;
668
int maxLinksPerVertex = 0;
669
for( int vertexIndex = 0; vertexIndex < numVertices; ++vertexIndex )
671
maxLinksPerVertex = btMax(numLinksPerVertex[vertexIndex], maxLinksPerVertex);
673
maxLinks = maxLinksPerVertex;
675
btAlignedObjectArray< int > linksFoundPerVertex;
676
linksFoundPerVertex.resize( numVertices, 0 );
678
listOfLinksPerVertex.resize( maxLinksPerVertex * numVertices );
680
for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
682
btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
685
int vertexIndex = nodes.vertex0;
686
int linkForVertex = linksFoundPerVertex[nodes.vertex0];
687
int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
689
listOfLinksPerVertex[linkAddress] = linkIndex;
691
linksFoundPerVertex[nodes.vertex0] = linkForVertex + 1;
695
int vertexIndex = nodes.vertex1;
696
int linkForVertex = linksFoundPerVertex[nodes.vertex1];
697
int linkAddress = vertexIndex * maxLinksPerVertex + linkForVertex;
699
listOfLinksPerVertex[linkAddress] = linkIndex;
701
linksFoundPerVertex[nodes.vertex1] = linkForVertex + 1;
706
static void computeBatchingIntoWavefronts(
707
btSoftBodyLinkData &linkData,
709
int linksPerWorkItem,
710
int maxLinksPerWavefront,
711
btAlignedObjectArray < btAlignedObjectArray <int> > &linksForWavefronts,
712
btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > &batchesWithinWaves, /* wave, batch, links in batch */
713
btAlignedObjectArray< btAlignedObjectArray< int > > &verticesForWavefronts /* wavefront, vertex */
718
// Attempt generation of larger batches of links.
719
btAlignedObjectArray< bool > processedLink;
720
processedLink.resize( linkData.getNumLinks() );
721
btAlignedObjectArray< int > listOfLinksPerVertex;
722
int maxLinksPerVertex = 0;
724
// Count num vertices
726
for( int linkIndex = 0; linkIndex < linkData.getNumLinks(); ++linkIndex )
728
btSoftBodyLinkData::LinkNodePair nodes( linkData.getVertexPair(linkIndex) );
729
numVertices = btMax( numVertices, nodes.vertex0 + 1 );
730
numVertices = btMax( numVertices, nodes.vertex1 + 1 );
733
// Need list of links per vertex
734
// Compute valence of each vertex
735
btAlignedObjectArray <int> numLinksPerVertex;
736
numLinksPerVertex.resize(0);
737
numLinksPerVertex.resize( numVertices, 0 );
739
generateLinksPerVertex( numVertices, linkData, listOfLinksPerVertex, numLinksPerVertex, maxLinksPerVertex );
744
for( int vertex = 0; vertex < 10; ++vertex )
746
for( int link = 0; link < numLinksPerVertex[vertex]; ++link )
748
int linkAddress = vertex * maxLinksPerVertex + link;
753
// At this point we know what links we have for each vertex so we can start batching
755
// We want a vertex to start with, let's go with 0
756
int currentVertex = 0;
757
int linksProcessed = 0;
759
btAlignedObjectArray <int> verticesToProcess;
761
while( linksProcessed < linkData.getNumLinks() )
764
int nextWavefront = linksForWavefronts.size();
765
linksForWavefronts.resize( nextWavefront + 1 );
766
btAlignedObjectArray <int> &linksForWavefront(linksForWavefronts[nextWavefront]);
767
verticesForWavefronts.resize( nextWavefront + 1 );
768
btAlignedObjectArray<int> &vertexSet( verticesForWavefronts[nextWavefront] );
770
linksForWavefront.resize(0);
772
// Loop to find enough links to fill the wavefront
773
// Stopping if we either run out of links, or fill it
774
while( linksProcessed < linkData.getNumLinks() && linksForWavefront.size() < maxLinksPerWavefront )
776
// Go through the links for the current vertex
777
for( int link = 0; link < numLinksPerVertex[currentVertex] && linksForWavefront.size() < maxLinksPerWavefront; ++link )
779
int linkAddress = currentVertex * maxLinksPerVertex + link;
780
int linkIndex = listOfLinksPerVertex[linkAddress];
782
// If we have not already processed this link, add it to the wavefront
783
// Claim it as another processed link
784
// Add the vertex at the far end to the list of vertices to process.
785
if( !processedLink[linkIndex] )
787
linksForWavefront.push_back( linkIndex );
789
processedLink[linkIndex] = true;
790
int v0 = linkData.getVertexPair(linkIndex).vertex0;
791
int v1 = linkData.getVertexPair(linkIndex).vertex1;
792
if( v0 == currentVertex )
793
verticesToProcess.push_back( v1 );
795
verticesToProcess.push_back( v0 );
798
if( verticesToProcess.size() > 0 )
800
// Get the element on the front of the queue and remove it
801
currentVertex = verticesToProcess[0];
802
removeFromVector( verticesToProcess, 0 );
804
// If we've not yet processed all the links, find the first unprocessed one
805
// and select one of its vertices as the current vertex
806
if( linksProcessed < linkData.getNumLinks() )
809
while( processedLink[searchLink] )
811
currentVertex = linkData.getVertexPair(searchLink).vertex0;
816
// We have either finished or filled a wavefront
817
for( int link = 0; link < linksForWavefront.size(); ++link )
819
int v0 = linkData.getVertexPair( linksForWavefront[link] ).vertex0;
820
int v1 = linkData.getVertexPair( linksForWavefront[link] ).vertex1;
821
insertUniqueAndOrderedIntoVector( vertexSet, v0 );
822
insertUniqueAndOrderedIntoVector( vertexSet, v1 );
824
// Iterate over links mapped to the wave and batch those
825
// We can run a batch on each cycle trivially
827
batchesWithinWaves.resize( batchesWithinWaves.size() + 1 );
828
btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWave( batchesWithinWaves[batchesWithinWaves.size()-1] );
831
for( int link = 0; link < linksForWavefront.size(); ++link )
833
int linkIndex = linksForWavefront[link];
834
btSoftBodyLinkData::LinkNodePair vertices = linkData.getVertexPair( linkIndex );
838
while( batch < batchesWithinWave.size() && !placed )
840
bool foundSharedVertex = false;
841
if( batchesWithinWave[batch].size() >= wavefrontSize )
843
// If we have already filled this batch, move on to another
844
foundSharedVertex = true;
846
for( int link2 = 0; link2 < batchesWithinWave[batch].size(); ++link2 )
848
btSoftBodyLinkData::LinkNodePair vertices2 = linkData.getVertexPair( (batchesWithinWave[batch])[link2] );
850
if( vertices.vertex0 == vertices2.vertex0 ||
851
vertices.vertex1 == vertices2.vertex0 ||
852
vertices.vertex0 == vertices2.vertex1 ||
853
vertices.vertex1 == vertices2.vertex1 )
855
foundSharedVertex = true;
860
if( !foundSharedVertex )
862
batchesWithinWave[batch].push_back( linkIndex );
868
if( batch == batchesWithinWave.size() && !placed )
870
batchesWithinWave.resize( batch + 1 );
871
batchesWithinWave[batch].push_back( linkIndex );
879
void btSoftBodyLinkDataOpenCLSIMDAware::generateBatches()
881
btAlignedObjectArray < btAlignedObjectArray <int> > linksForWavefronts;
882
btAlignedObjectArray < btAlignedObjectArray <int> > wavefrontBatches;
883
btAlignedObjectArray< btAlignedObjectArray < btAlignedObjectArray <int> > > batchesWithinWaves;
884
btAlignedObjectArray< btAlignedObjectArray< int > > verticesForWavefronts; // wavefronts, vertices in wavefront as an ordered set
886
// Group the links into wavefronts
887
computeBatchingIntoWavefronts( *this, m_wavefrontSize, m_linksPerWorkItem, m_maxLinksPerWavefront, linksForWavefronts, batchesWithinWaves, verticesForWavefronts );
890
// Batch the wavefronts
891
generateBatchesOfWavefronts( linksForWavefronts, *this, m_maxVertex, wavefrontBatches );
893
m_numWavefronts = linksForWavefronts.size();
895
// At this point we have a description of which links we need to process in each wavefront
897
// First correctly fill the batch ranges vector
898
int numBatches = wavefrontBatches.size();
899
m_wavefrontBatchStartLengths.resize(0);
901
for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
903
int wavesInBatch = wavefrontBatches[batchIndex].size();
904
int nextPrefixSum = prefixSum + wavesInBatch;
905
m_wavefrontBatchStartLengths.push_back( BatchPair( prefixSum, nextPrefixSum - prefixSum ) );
907
prefixSum += wavesInBatch;
910
// Also find max number of batches within a wave
911
m_maxBatchesWithinWave = 0;
912
m_maxVerticesWithinWave = 0;
913
m_numBatchesAndVerticesWithinWaves.resize( m_numWavefronts );
914
for( int waveIndex = 0; waveIndex < m_numWavefronts; ++waveIndex )
916
// See if the number of batches in this wave is greater than the current maxium
917
int batchesInCurrentWave = batchesWithinWaves[waveIndex].size();
918
int verticesInCurrentWave = verticesForWavefronts[waveIndex].size();
919
m_maxBatchesWithinWave = btMax( batchesInCurrentWave, m_maxBatchesWithinWave );
920
m_maxVerticesWithinWave = btMax( verticesInCurrentWave, m_maxVerticesWithinWave );
923
// Add padding values both for alignment and as dudd addresses within LDS to compute junk rather than branch around
924
m_maxVerticesWithinWave = 16*((m_maxVerticesWithinWave/16)+2);
926
// Now we know the maximum number of vertices per-wave we can resize the global vertices array
927
m_wavefrontVerticesGlobalAddresses.resize( m_maxVerticesWithinWave * m_numWavefronts );
929
// Grab backup copies of all the link data arrays for the sorting process
930
btAlignedObjectArray<btSoftBodyLinkData::LinkNodePair> m_links_Backup(m_links);
931
btAlignedObjectArray<float> m_linkStrength_Backup(m_linkStrength);
932
btAlignedObjectArray<float> m_linksMassLSC_Backup(m_linksMassLSC);
933
btAlignedObjectArray<float> m_linksRestLengthSquared_Backup(m_linksRestLengthSquared);
934
//btAlignedObjectArray<Vectormath::Aos::Vector3> m_linksCLength_Backup(m_linksCLength);
935
//btAlignedObjectArray<float> m_linksLengthRatio_Backup(m_linksLengthRatio);
936
btAlignedObjectArray<float> m_linksRestLength_Backup(m_linksRestLength);
937
btAlignedObjectArray<float> m_linksMaterialLinearStiffnessCoefficient_Backup(m_linksMaterialLinearStiffnessCoefficient);
939
// Resize to a wavefront sized batch per batch per wave so we get perfectly coherent memory accesses.
940
m_links.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
941
m_linkVerticesLocalAddresses.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
942
m_linkStrength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
943
m_linksMassLSC.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
944
m_linksRestLengthSquared.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
945
m_linksRestLength.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
946
m_linksMaterialLinearStiffnessCoefficient.resize( m_maxBatchesWithinWave * m_wavefrontSize * m_numWavefronts );
948
// Then re-order links into wavefront blocks
950
// Total number of wavefronts moved. This will decide the ordering of sorted wavefronts.
951
int wavefrontCount = 0;
953
// Iterate over batches of wavefronts, then wavefronts in the batch
954
for( int batchIndex = 0; batchIndex < numBatches; ++batchIndex )
956
btAlignedObjectArray <int> &batch( wavefrontBatches[batchIndex] );
957
int wavefrontsInBatch = batch.size();
960
for( int wavefrontIndex = 0; wavefrontIndex < wavefrontsInBatch; ++wavefrontIndex )
963
int originalWavefrontIndex = batch[wavefrontIndex];
964
btAlignedObjectArray< int > &wavefrontVertices( verticesForWavefronts[originalWavefrontIndex] );
965
int verticesUsedByWavefront = wavefrontVertices.size();
967
// Copy the set of vertices into the correctly structured array for use on the device
968
// Fill the non-vertices with -1s
969
// so we can mask out those reads
970
for( int vertex = 0; vertex < verticesUsedByWavefront; ++vertex )
972
m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = wavefrontVertices[vertex];
974
for( int vertex = verticesUsedByWavefront; vertex < m_maxVerticesWithinWave; ++vertex )
976
m_wavefrontVerticesGlobalAddresses[m_maxVerticesWithinWave * wavefrontCount + vertex] = -1;
979
// Obtain the set of batches within the current wavefront
980
btAlignedObjectArray < btAlignedObjectArray <int> > &batchesWithinWavefront( batchesWithinWaves[originalWavefrontIndex] );
981
// Set the size of the batches for use in the solver, correctly ordered
982
NumBatchesVerticesPair batchesAndVertices;
983
batchesAndVertices.numBatches = batchesWithinWavefront.size();
984
batchesAndVertices.numVertices = verticesUsedByWavefront;
985
m_numBatchesAndVerticesWithinWaves[wavefrontCount] = batchesAndVertices;
988
// Now iterate over batches within the wavefront to structure the links correctly
989
for( int wavefrontBatch = 0; wavefrontBatch < batchesWithinWavefront.size(); ++wavefrontBatch )
991
btAlignedObjectArray <int> &linksInBatch( batchesWithinWavefront[wavefrontBatch] );
992
int wavefrontBatchSize = linksInBatch.size();
994
int batchAddressInTarget = m_maxBatchesWithinWave * m_wavefrontSize * wavefrontCount + m_wavefrontSize * wavefrontBatch;
996
for( int linkIndex = 0; linkIndex < wavefrontBatchSize; ++linkIndex )
998
int originalLinkAddress = linksInBatch[linkIndex];
999
// Reorder simple arrays trivially
1000
m_links[batchAddressInTarget + linkIndex] = m_links_Backup[originalLinkAddress];
1001
m_linkStrength[batchAddressInTarget + linkIndex] = m_linkStrength_Backup[originalLinkAddress];
1002
m_linksMassLSC[batchAddressInTarget + linkIndex] = m_linksMassLSC_Backup[originalLinkAddress];
1003
m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = m_linksRestLengthSquared_Backup[originalLinkAddress];
1004
m_linksRestLength[batchAddressInTarget + linkIndex] = m_linksRestLength_Backup[originalLinkAddress];
1005
m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = m_linksMaterialLinearStiffnessCoefficient_Backup[originalLinkAddress];
1007
// The local address is more complicated. We need to work out where a given vertex will end up
1008
// by searching the set of vertices for this link and using the index as the local address
1009
btSoftBodyLinkData::LinkNodePair localPair;
1010
btSoftBodyLinkData::LinkNodePair globalPair = m_links[batchAddressInTarget + linkIndex];
1011
localPair.vertex0 = wavefrontVertices.findLinearSearch( globalPair.vertex0 );
1012
localPair.vertex1 = wavefrontVertices.findLinearSearch( globalPair.vertex1 );
1013
m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
1015
for( int linkIndex = wavefrontBatchSize; linkIndex < m_wavefrontSize; ++linkIndex )
1017
// Put 0s into these arrays for padding for cleanliness
1018
m_links[batchAddressInTarget + linkIndex] = btSoftBodyLinkData::LinkNodePair(0, 0);
1019
m_linkStrength[batchAddressInTarget + linkIndex] = 0.f;
1020
m_linksMassLSC[batchAddressInTarget + linkIndex] = 0.f;
1021
m_linksRestLengthSquared[batchAddressInTarget + linkIndex] = 0.f;
1022
m_linksRestLength[batchAddressInTarget + linkIndex] = 0.f;
1023
m_linksMaterialLinearStiffnessCoefficient[batchAddressInTarget + linkIndex] = 0.f;
1026
// For local addresses of junk data choose a set of addresses just above the range of valid ones
1027
// and cycling tyhrough % 16 so that we don't have bank conficts between all dud addresses
1028
// The valid addresses will do scatter and gather in the valid range, the junk ones should happily work
1029
// off the end of that range so we need no control
1030
btSoftBodyLinkData::LinkNodePair localPair;
1031
localPair.vertex0 = verticesUsedByWavefront + (linkIndex % 16);
1032
localPair.vertex1 = verticesUsedByWavefront + (linkIndex % 16);
1033
m_linkVerticesLocalAddresses[batchAddressInTarget + linkIndex] = localPair;
1045
} // void btSoftBodyLinkDataDX11SIMDAware::generateBatches()