1611 lines
69 KiB
C
1611 lines
69 KiB
C
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* *
|
||
|
* Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
|
||
|
* All rights reserved. Email: russ@q12.org Web: www.q12.org *
|
||
|
* *
|
||
|
* This library is free software; you can redistribute it and/or *
|
||
|
* modify it under the terms of EITHER: *
|
||
|
* (1) The GNU Lesser General Public License as published by the Free *
|
||
|
* Software Foundation; either version 2.1 of the License, or (at *
|
||
|
* your option) any later version. The text of the GNU Lesser *
|
||
|
* General Public License is included with this library in the *
|
||
|
* file LICENSE.TXT. *
|
||
|
* (2) The BSD-style license that is included with this library in *
|
||
|
* the file LICENSE-BSD.TXT. *
|
||
|
* *
|
||
|
* This library is distributed in the hope that it will be useful, *
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
|
||
|
* LICENSE.TXT and LICENSE-BSD.TXT for more details. *
|
||
|
* *
|
||
|
*************************************************************************/
|
||
|
|
||
|
/*
|
||
|
* Code style improvements and optimizations by Oleh Derevenko ????-2024
|
||
|
* L1Straight cooperative solving code of ThreadedEquationSolverLDLT copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
|
||
|
*/
|
||
|
|
||
|
#ifndef _ODE_FASTLSOLVE_IMPL_H_
|
||
|
#define _ODE_FASTLSOLVE_IMPL_H_
|
||
|
|
||
|
|
||
|
/* solve L*X=B, with B containing 1 right hand sides.
|
||
|
* L is an n*n lower triangular matrix with ones on the diagonal.
|
||
|
* L is stored by rows and its leading dimension is lskip.
|
||
|
* B is an n*1 matrix that contains the right hand sides.
|
||
|
* B is stored by columns and its leading dimension is also lskip.
|
||
|
* B is overwritten with X.
|
||
|
* this processes blocks of 4*4.
|
||
|
* if this is in the factorizer source file, n must be a multiple of 4.
|
||
|
*/
|
||
|
|
||
|
template<unsigned int b_stride>
|
||
|
void solveL1Straight (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
|
||
|
{
|
||
|
dIASSERT(rowCount != 0);
|
||
|
|
||
|
/* compute all 4 x 1 blocks of X */
|
||
|
unsigned blockStartRow = 0;
|
||
|
bool subsequentPass = false;
|
||
|
bool goForLoopX4 = rowCount >= 4;
|
||
|
const unsigned loopX4LastRow = goForLoopX4 ? rowCount - 4 : 0;
|
||
|
for (; goForLoopX4; subsequentPass = true, goForLoopX4 = (blockStartRow += 4) <= loopX4LastRow)
|
||
|
{
|
||
|
/* declare variables - Z matrix, p and q vectors, etc */
|
||
|
const dReal *ptrLElement;
|
||
|
dReal *ptrBElement;
|
||
|
|
||
|
dReal Z11, Z21, Z31, Z41;
|
||
|
|
||
|
/* compute all 4 x 1 block of X, from rows i..i+4-1 */
|
||
|
if (subsequentPass)
|
||
|
{
|
||
|
ptrLElement = L + (1 + blockStartRow) * rowSkip;
|
||
|
ptrBElement = B;
|
||
|
/* set the Z matrix to 0 */
|
||
|
Z11 = 0; Z21 = 0; Z31 = 0; Z41 = 0;
|
||
|
|
||
|
/* the inner loop that computes outer products and adds them to Z */
|
||
|
for (unsigned columnCounter = blockStartRow; ; )
|
||
|
{
|
||
|
dReal q1, p1, p2, p3, p4;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[0];
|
||
|
p2 = ptrLElement[0];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[0];
|
||
|
p4 = ptrLElement[0 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[1 * b_stride];
|
||
|
p3 = ptrLElement[1];
|
||
|
p4 = ptrLElement[1 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[1];
|
||
|
p2 = ptrLElement[1];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[2 * b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[2];
|
||
|
p2 = ptrLElement[2];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[2];
|
||
|
p4 = ptrLElement[2 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[3 * b_stride];
|
||
|
p3 = ptrLElement[3];
|
||
|
p4 = ptrLElement[3 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[3];
|
||
|
p2 = ptrLElement[3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
if (columnCounter > 12)
|
||
|
{
|
||
|
columnCounter -= 12;
|
||
|
|
||
|
/* advance pointers */
|
||
|
ptrLElement += 12;
|
||
|
ptrBElement += 12 * b_stride;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-8 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-8];
|
||
|
p2 = ptrLElement[-8];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-8];
|
||
|
p4 = ptrLElement[-8 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-7 * (int)b_stride];
|
||
|
p3 = ptrLElement[-7];
|
||
|
p4 = ptrLElement[-7 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-7];
|
||
|
p2 = ptrLElement[-7];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-6 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-6];
|
||
|
p2 = ptrLElement[-6];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-6];
|
||
|
p4 = ptrLElement[-6 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-5 * (int)b_stride];
|
||
|
p3 = ptrLElement[-5];
|
||
|
p4 = ptrLElement[-5 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-5];
|
||
|
p2 = ptrLElement[-5];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-4 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-4];
|
||
|
p2 = ptrLElement[-4];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-4];
|
||
|
p4 = ptrLElement[-4 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-3 * (int)b_stride];
|
||
|
p3 = ptrLElement[-3];
|
||
|
p4 = ptrLElement[-3 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-3];
|
||
|
p2 = ptrLElement[-3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-2 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-2];
|
||
|
p2 = ptrLElement[-2];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-2];
|
||
|
p4 = ptrLElement[-2 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-1 * (int)b_stride];
|
||
|
p3 = ptrLElement[-1];
|
||
|
p4 = ptrLElement[-1 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-1];
|
||
|
p2 = ptrLElement[-1];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z21 += p2 * q1;
|
||
|
Z31 += p3 * q1;
|
||
|
Z41 += p4 * q1;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* advance pointers */
|
||
|
ptrLElement += 4;
|
||
|
ptrBElement += 4 * b_stride;
|
||
|
|
||
|
if ((columnCounter -= 4) == 0)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
/* end of inner loop */
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrLElement = L + rowSkip/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
|
||
|
ptrBElement = B;
|
||
|
/* set the Z matrix to 0 */
|
||
|
Z11 = 0; Z21 = 0; Z31 = 0; Z41 = 0;
|
||
|
}
|
||
|
|
||
|
/* finish computing the X(i) block */
|
||
|
dReal Y11, Y21, Y31, Y41;
|
||
|
{
|
||
|
Y11 = ptrBElement[0 * b_stride] - Z11;
|
||
|
ptrBElement[0 * b_stride] = Y11;
|
||
|
}
|
||
|
{
|
||
|
dReal p2 = ptrLElement[0];
|
||
|
Y21 = ptrBElement[1 * b_stride] - Z21 - p2 * Y11;
|
||
|
ptrBElement[1 * b_stride] = Y21;
|
||
|
}
|
||
|
ptrLElement += rowSkip;
|
||
|
{
|
||
|
dReal p3 = ptrLElement[0];
|
||
|
dReal p3_1 = ptrLElement[1];
|
||
|
Y31 = ptrBElement[2 * b_stride] - Z31 - p3 * Y11 - p3_1 * Y21;
|
||
|
ptrBElement[2 * b_stride] = Y31;
|
||
|
}
|
||
|
{
|
||
|
dReal p4 = ptrLElement[rowSkip];
|
||
|
dReal p4_1 = ptrLElement[1 + rowSkip];
|
||
|
dReal p4_2 = ptrLElement[2 + rowSkip];
|
||
|
Y41 = ptrBElement[3 * b_stride] - Z41 - p4 * Y11 - p4_1 * Y21 - p4_2 * Y31;
|
||
|
ptrBElement[3 * b_stride] = Y41;
|
||
|
}
|
||
|
/* end of outer loop */
|
||
|
}
|
||
|
|
||
|
/* compute rows at end that are not a multiple of block size */
|
||
|
for (; !subsequentPass || blockStartRow < rowCount; subsequentPass = true, ++blockStartRow)
|
||
|
{
|
||
|
/* compute all 1 x 1 block of X, from rows i..i+1-1 */
|
||
|
dReal *ptrBElement;
|
||
|
|
||
|
dReal Z11, Z12;
|
||
|
|
||
|
if (subsequentPass)
|
||
|
{
|
||
|
ptrBElement = B;
|
||
|
/* set the Z matrix to 0 */
|
||
|
Z11 = 0; Z12 = 0;
|
||
|
|
||
|
const dReal *ptrLElement = L + blockStartRow * rowSkip;
|
||
|
|
||
|
/* the inner loop that computes outer products and adds them to Z */
|
||
|
unsigned columnCounter = blockStartRow;
|
||
|
for (bool exitLoop = columnCounter < 4; !exitLoop; exitLoop = false)
|
||
|
{
|
||
|
dReal p1, p2, q1, q2;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[0];
|
||
|
p2 = ptrLElement[1];
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
q2 = ptrBElement[1 * b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[2];
|
||
|
p2 = ptrLElement[3];
|
||
|
q1 = ptrBElement[2 * b_stride];
|
||
|
q2 = ptrBElement[3 * b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
|
||
|
if (columnCounter >= (12 + 4))
|
||
|
{
|
||
|
columnCounter -= 12;
|
||
|
|
||
|
/* advance pointers */
|
||
|
ptrLElement += 12;
|
||
|
ptrBElement += 12 * b_stride;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[-8];
|
||
|
p2 = ptrLElement[-7];
|
||
|
q1 = ptrBElement[-8 * (int)b_stride];
|
||
|
q2 = ptrBElement[-7 * (int)b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[-6];
|
||
|
p2 = ptrLElement[-5];
|
||
|
q1 = ptrBElement[-6 * (int)b_stride];
|
||
|
q2 = ptrBElement[-5 * (int)b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[-4];
|
||
|
p2 = ptrLElement[-3];
|
||
|
q1 = ptrBElement[-4 * (int)b_stride];
|
||
|
q2 = ptrBElement[-3 * (int)b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[-2];
|
||
|
p2 = ptrLElement[-1];
|
||
|
q1 = ptrBElement[-2 * (int)b_stride];
|
||
|
q2 = ptrBElement[-1 * (int)b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* advance pointers */
|
||
|
ptrLElement += 4;
|
||
|
ptrBElement += 4 * b_stride;
|
||
|
|
||
|
if ((columnCounter -= 4) < 4)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
/* end of inner loop */
|
||
|
}
|
||
|
|
||
|
/* compute left-over iterations */
|
||
|
if ((columnCounter & 2) != 0)
|
||
|
{
|
||
|
dReal p1, p2, q1, q2;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[0];
|
||
|
p2 = ptrLElement[1];
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
q2 = ptrBElement[1 * b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
Z12 += p2 * q2;
|
||
|
|
||
|
/* advance pointers */
|
||
|
ptrLElement += 2;
|
||
|
ptrBElement += 2 * b_stride;
|
||
|
}
|
||
|
|
||
|
if ((columnCounter & 1) != 0)
|
||
|
{
|
||
|
dReal p1, q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
p1 = ptrLElement[0];
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z11 += p1 * q1;
|
||
|
|
||
|
/* advance pointers */
|
||
|
// ptrLElement += 1; -- not needed anymore
|
||
|
ptrBElement += 1 * b_stride;
|
||
|
}
|
||
|
|
||
|
/* finish computing the X(i) block */
|
||
|
dReal Y11 = ptrBElement[0 * b_stride] - (Z11 + Z12);
|
||
|
ptrBElement[0 * b_stride] = Y11;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
template<unsigned int block_step>
|
||
|
/*static */
|
||
|
sizeint ThreadedEquationSolverLDLT::estimateCooperativelySolvingL1StraightMemoryRequirement(unsigned rowCount, SolvingL1StraightMemoryEstimates &ref_solvingMemoryEstimates)
|
||
|
{
|
||
|
unsigned blockCount = deriveSolvingL1StraightBlockCount(rowCount, block_step);
|
||
|
sizeint descriptorSizeRequired = dEFFICIENT_SIZE(sizeof(cellindexint) * blockCount);
|
||
|
sizeint contextSizeRequired = dEFFICIENT_SIZE(sizeof(SolveL1StraightCellContext) * (CCI__MAX + 1) * blockCount);
|
||
|
ref_solvingMemoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);
|
||
|
|
||
|
sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
|
||
|
return totalSizeRequired;
|
||
|
}
|
||
|
|
||
|
template<unsigned int block_step>
|
||
|
/*static */
|
||
|
void ThreadedEquationSolverLDLT::initializeCooperativelySolveL1StraightMemoryStructures(unsigned rowCount,
|
||
|
atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *dUNUSED(cellContexts))
|
||
|
{
|
||
|
unsigned blockCount = deriveSolvingL1StraightBlockCount(rowCount, block_step);
|
||
|
|
||
|
out_blockCompletionProgress = 0;
|
||
|
memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
|
||
|
}
|
||
|
|
||
|
template<unsigned int block_step, unsigned int b_stride>
|
||
|
void ThreadedEquationSolverLDLT::participateSolvingL1Straight(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
|
||
|
volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
|
||
|
SolveL1StraightCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
|
||
|
{
|
||
|
const unsigned lookaheadRange = 32;
|
||
|
const unsigned blockCount = deriveSolvingL1StraightBlockCount(rowCount, block_step), lastBlock = blockCount - 1;
|
||
|
/* compute rows at end that are not a multiple of block size */
|
||
|
const unsigned loopX1RowCount = rowCount % block_step;
|
||
|
|
||
|
BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
|
||
|
|
||
|
unsigned completedBlocks = refBlockCompletionProgress;
|
||
|
unsigned currentBlock = completedBlocks;
|
||
|
dIASSERT(completedBlocks <= blockCount);
|
||
|
|
||
|
for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
|
||
|
{
|
||
|
bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
|
||
|
bool goAssigningTheResult = false, stayWithinTheBlock = false;
|
||
|
|
||
|
dReal Z[block_step];
|
||
|
dReal Y[block_step];
|
||
|
|
||
|
dReal *ptrBElement;
|
||
|
|
||
|
CellContextInstance previousContextInstance;
|
||
|
unsigned completedColumnBlock;
|
||
|
bool partialBlock;
|
||
|
|
||
|
for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
|
||
|
{
|
||
|
if (testDescriptor == INVALID_CELLDESCRIPTOR)
|
||
|
{
|
||
|
// Invalid descriptor is the indication that the row has been fully calculated
|
||
|
// Test if this was the last row and break out if so.
|
||
|
if (currentBlock + 1 == blockCount)
|
||
|
{
|
||
|
exitLoop = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// Treat detected row advancement as a row processed
|
||
|
// blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
CooperativeAtomics::AtomicReadReorderBarrier();
|
||
|
// It is necessary to read up to date completedBblocks value after the descriptor retrieval
|
||
|
// as otherwise the logic below breaks
|
||
|
completedBlocks = refBlockCompletionProgress;
|
||
|
|
||
|
if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
|
||
|
{
|
||
|
completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
|
||
|
dIASSERT(completedColumnBlock < currentBlock || (completedColumnBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
|
||
|
dIASSERT(completedColumnBlock <= completedBlocks); // Since the descriptor is not locked
|
||
|
|
||
|
if (completedColumnBlock == completedBlocks && currentBlock != completedBlocks)
|
||
|
{
|
||
|
dIASSERT(completedBlocks < currentBlock);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
|
||
|
{
|
||
|
if (completedColumnBlock != 0)
|
||
|
{
|
||
|
CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
|
||
|
previousContextInstance = contextInstance;
|
||
|
|
||
|
const SolveL1StraightCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
|
||
|
sourceContext.loadPrecalculatedZs(Z);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
previousContextInstance = CCI__MIN;
|
||
|
SolveL1StraightCellContext::initializePrecalculatedZs(Z);
|
||
|
}
|
||
|
|
||
|
goForLockedBlockPrimaryCalculation = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
testDescriptor = blockProgressDescriptors[currentBlock];
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
cellindexint verificativeDescriptor;
|
||
|
bool verificationFailure = false;
|
||
|
|
||
|
completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
|
||
|
dIASSERT(completedColumnBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
|
||
|
|
||
|
if (completedColumnBlock != 0)
|
||
|
{
|
||
|
CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
|
||
|
const SolveL1StraightCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
|
||
|
sourceContext.loadPrecalculatedZs(Z);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
SolveL1StraightCellContext::initializePrecalculatedZs(Z);
|
||
|
}
|
||
|
|
||
|
if (completedColumnBlock != 0 && completedColumnBlock <= currentBlock)
|
||
|
{
|
||
|
// Make sure the descriptor is re-read after the precalculates
|
||
|
CooperativeAtomics::AtomicReadReorderBarrier();
|
||
|
}
|
||
|
|
||
|
if (completedColumnBlock <= currentBlock)
|
||
|
{
|
||
|
verificativeDescriptor = blockProgressDescriptors[currentBlock];
|
||
|
verificationFailure = verificativeDescriptor != testDescriptor;
|
||
|
}
|
||
|
|
||
|
if (!verificationFailure)
|
||
|
{
|
||
|
dIASSERT(completedColumnBlock <= currentBlock + 1);
|
||
|
|
||
|
goForLockedBlockDuplicateCalculation = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
testDescriptor = verificativeDescriptor;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (exitLoop)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (goForLockedBlockPrimaryCalculation)
|
||
|
{
|
||
|
blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
|
||
|
|
||
|
// Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
|
||
|
bool handleComputationTakenOver = false, rowEndReached = false;
|
||
|
|
||
|
const dReal *ptrLElement;
|
||
|
unsigned finalColumnBlock;
|
||
|
|
||
|
/* check if this is not the partial block of fewer rows */
|
||
|
if (currentBlock != lastBlock || loopX1RowCount == 0)
|
||
|
{
|
||
|
partialBlock = false;
|
||
|
|
||
|
if (currentBlock != 0)
|
||
|
{
|
||
|
ptrLElement = L + (sizeint)(1 + currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
|
||
|
ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;
|
||
|
|
||
|
/* the inner loop that computes outer products and adds them to Z */
|
||
|
finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
|
||
|
dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);
|
||
|
|
||
|
for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
|
||
|
{
|
||
|
dReal q1, p1, p2, p3, p4;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[0];
|
||
|
p2 = ptrLElement[0];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[0];
|
||
|
p4 = ptrLElement[0 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[1 * b_stride];
|
||
|
p3 = ptrLElement[1];
|
||
|
p4 = ptrLElement[1 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[1];
|
||
|
p2 = ptrLElement[1];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[2 * b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[2];
|
||
|
p2 = ptrLElement[2];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[2];
|
||
|
p4 = ptrLElement[2 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[3 * b_stride];
|
||
|
p3 = ptrLElement[3];
|
||
|
p4 = ptrLElement[3 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[3];
|
||
|
p2 = ptrLElement[3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
if (columnCounter > 3)
|
||
|
{
|
||
|
columnCounter -= 3;
|
||
|
|
||
|
ptrLElement += 3 * block_step;
|
||
|
ptrBElement += 3 * block_step * b_stride;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-8 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-8];
|
||
|
p2 = ptrLElement[-8];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-8];
|
||
|
p4 = ptrLElement[-8 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-7 * (int)b_stride];
|
||
|
p3 = ptrLElement[-7];
|
||
|
p4 = ptrLElement[-7 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-7];
|
||
|
p2 = ptrLElement[-7];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-6 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-6];
|
||
|
p2 = ptrLElement[-6];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-6];
|
||
|
p4 = ptrLElement[-6 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-5 * (int)b_stride];
|
||
|
p3 = ptrLElement[-5];
|
||
|
p4 = ptrLElement[-5 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-5];
|
||
|
p2 = ptrLElement[-5];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-4 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-4];
|
||
|
p2 = ptrLElement[-4];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-4];
|
||
|
p4 = ptrLElement[-4 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-3 * (int)b_stride];
|
||
|
p3 = ptrLElement[-3];
|
||
|
p4 = ptrLElement[-3 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-3];
|
||
|
p2 = ptrLElement[-3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-2 * (int)b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[-2];
|
||
|
p2 = ptrLElement[-2];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[-2];
|
||
|
p4 = ptrLElement[-2 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-1 * (int)b_stride];
|
||
|
p3 = ptrLElement[-1];
|
||
|
p4 = ptrLElement[-1 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[-1];
|
||
|
p2 = ptrLElement[-1];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrLElement += block_step;
|
||
|
ptrBElement += block_step * b_stride;
|
||
|
|
||
|
if (--columnCounter == 0)
|
||
|
{
|
||
|
if (finalColumnBlock == currentBlock)
|
||
|
{
|
||
|
rowEndReached = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// Take a look if anymore columns have been completed...
|
||
|
completedBlocks = refBlockCompletionProgress;
|
||
|
dIASSERT(completedBlocks >= finalColumnBlock);
|
||
|
|
||
|
if (completedBlocks == finalColumnBlock)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// ...continue if so.
|
||
|
unsigned columnCompletedSoFar = finalColumnBlock;
|
||
|
finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
|
||
|
columnCounter = finalColumnBlock - columnCompletedSoFar;
|
||
|
}
|
||
|
}
|
||
|
/* end of inner loop */
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrLElement = L + (sizeint)(1/* + currentBlock * block_step*/) * rowSkip/* + completedColumnBlock * block_step*/;
|
||
|
ptrBElement = B/* + (sizeint)(completedColumnBlock * block_step) * b_stride*/;
|
||
|
dIASSERT(completedColumnBlock == 0);
|
||
|
|
||
|
rowEndReached = true;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
partialBlock = true;
|
||
|
|
||
|
if (currentBlock != 0)
|
||
|
{
|
||
|
dReal tempZ[dMACRO_MAX(block_step - 1U, 1U)] = { REAL(0.0), };
|
||
|
|
||
|
ptrLElement = L + (sizeint)(/*1 + */currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
|
||
|
ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;
|
||
|
|
||
|
/* the inner loop that computes outer products and adds them to Z */
|
||
|
finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
|
||
|
dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);
|
||
|
|
||
|
for (unsigned partialRow = 0, columnCompletedSoFar = completedColumnBlock; ; )
|
||
|
{
|
||
|
dReal Z1 = 0, Z2 = 0, Z3 = 0, Z4 = 0;
|
||
|
|
||
|
for (unsigned columnCounter = finalColumnBlock - columnCompletedSoFar; ; )
|
||
|
{
|
||
|
dReal q1, q2, q3, q4, p1, p2, p3, p4;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
q2 = ptrBElement[1 * b_stride];
|
||
|
q3 = ptrBElement[2 * b_stride];
|
||
|
q4 = ptrBElement[3 * b_stride];
|
||
|
p1 = ptrLElement[0];
|
||
|
p2 = ptrLElement[1];
|
||
|
p3 = ptrLElement[2];
|
||
|
p4 = ptrLElement[3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z1 += p1 * q1;
|
||
|
Z2 += p2 * q2;
|
||
|
Z3 += p3 * q3;
|
||
|
Z4 += p4 * q4;
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
if (columnCounter > 3)
|
||
|
{
|
||
|
columnCounter -= 3;
|
||
|
|
||
|
ptrLElement += 3 * block_step;
|
||
|
ptrBElement += 3 * block_step * b_stride;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-8 * (int)b_stride];
|
||
|
q2 = ptrBElement[-7 * (int)b_stride];
|
||
|
q3 = ptrBElement[-6 * (int)b_stride];
|
||
|
q4 = ptrBElement[-5 * (int)b_stride];
|
||
|
p1 = ptrLElement[-8];
|
||
|
p2 = ptrLElement[-7];
|
||
|
p3 = ptrLElement[-6];
|
||
|
p4 = ptrLElement[-5];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z1 += p1 * q1;
|
||
|
Z2 += p2 * q2;
|
||
|
Z3 += p3 * q3;
|
||
|
Z4 += p4 * q4;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[-4 * (int)b_stride];
|
||
|
q2 = ptrBElement[-3 * (int)b_stride];
|
||
|
q3 = ptrBElement[-2 * (int)b_stride];
|
||
|
q4 = ptrBElement[-1 * (int)b_stride];
|
||
|
p1 = ptrLElement[-4];
|
||
|
p2 = ptrLElement[-3];
|
||
|
p3 = ptrLElement[-2];
|
||
|
p4 = ptrLElement[-1];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z1 += p1 * q1;
|
||
|
Z2 += p2 * q2;
|
||
|
Z3 += p3 * q3;
|
||
|
Z4 += p4 * q4;
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrLElement += block_step;
|
||
|
ptrBElement += block_step * b_stride;
|
||
|
|
||
|
if (--columnCounter == 0)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
/* end of inner loop */
|
||
|
}
|
||
|
|
||
|
tempZ[partialRow] += Z1 + Z2 + Z3 + Z4;
|
||
|
|
||
|
if (++partialRow == loopX1RowCount)
|
||
|
{
|
||
|
// Here switch is used to avoid accessing Z by parametrized index.
|
||
|
// So far all the accesses were performed by explicit constants
|
||
|
// what lets the compiler treat Z elements as individual variables
|
||
|
// rather than array elements.
|
||
|
Z[0] += tempZ[0];
|
||
|
|
||
|
if (loopX1RowCount >= 2)
|
||
|
{
|
||
|
Z[1] += tempZ[1];
|
||
|
|
||
|
if (loopX1RowCount > 2)
|
||
|
{
|
||
|
Z[2] += tempZ[2];
|
||
|
}
|
||
|
}
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
if (finalColumnBlock == currentBlock)
|
||
|
{
|
||
|
if (loopX1RowCount > 2)
|
||
|
{
|
||
|
// Correct the LElement so that it points to the second row
|
||
|
//
|
||
|
// Note, that ff there is just one partial row, it does not matter that
|
||
|
// the LElement will remain pointing at the first row,
|
||
|
// since the former is not going to be used in that case.
|
||
|
ptrLElement -= /*(sizeint)*/rowSkip/* * (loopX1RowCount - 2)*/; dIASSERT(loopX1RowCount == 3);
|
||
|
}
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
rowEndReached = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// Check whether any more columns have been completed...
|
||
|
completedBlocks = refBlockCompletionProgress;
|
||
|
dIASSERT(completedBlocks >= finalColumnBlock);
|
||
|
|
||
|
if (completedBlocks == finalColumnBlock)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
std::fill(tempZ, tempZ + loopX1RowCount, REAL(0.0));
|
||
|
partialRow = 0;
|
||
|
|
||
|
// Correct the LElement pointer to continue at the first partial row
|
||
|
ptrLElement -= (sizeint)rowSkip * (loopX1RowCount - 1);
|
||
|
|
||
|
// ...continue if so.
|
||
|
columnCompletedSoFar = finalColumnBlock;
|
||
|
finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrLElement += rowSkip - (finalColumnBlock - columnCompletedSoFar) * block_step;
|
||
|
ptrBElement -= (sizeint)((finalColumnBlock - columnCompletedSoFar) * block_step) * b_stride;
|
||
|
}
|
||
|
/* end of loop by individual rows */
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrLElement = L + (sizeint)(1/* + currentBlock * block_step*/) * rowSkip/* + completedColumnBlock * block_step*/;
|
||
|
ptrBElement = B/* + (sizeint)(completedColumnBlock * block_step) * b_stride*/;
|
||
|
dIASSERT(completedColumnBlock == 0);
|
||
|
|
||
|
rowEndReached = true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (rowEndReached)
|
||
|
{
|
||
|
// Check whether there is still a need to proceed or if the computation has been taken over by another thread
|
||
|
cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
|
||
|
|
||
|
if (blockProgressDescriptors[currentBlock] == oldDescriptor)
|
||
|
{
|
||
|
/* finish computing the X(i) block */
|
||
|
if (!partialBlock)
|
||
|
{
|
||
|
Y[0] = ptrBElement[0 * b_stride] - Z[0];
|
||
|
|
||
|
dReal p2 = ptrLElement[0];
|
||
|
Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];
|
||
|
|
||
|
ptrLElement += rowSkip;
|
||
|
|
||
|
dReal p3 = ptrLElement[0];
|
||
|
dReal p3_1 = ptrLElement[1];
|
||
|
Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
|
||
|
|
||
|
dReal p4 = ptrLElement[rowSkip];
|
||
|
dReal p4_1 = ptrLElement[1 + rowSkip];
|
||
|
dReal p4_2 = ptrLElement[2 + rowSkip];
|
||
|
Y[3] = ptrBElement[3 * b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
Y[0] = ptrBElement[0 * b_stride] - Z[0];
|
||
|
|
||
|
if (loopX1RowCount >= 2)
|
||
|
{
|
||
|
dReal p2 = ptrLElement[0];
|
||
|
Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];
|
||
|
|
||
|
if (loopX1RowCount > 2)
|
||
|
{
|
||
|
dReal p3 = ptrLElement[0 + rowSkip];
|
||
|
dReal p3_1 = ptrLElement[1 + rowSkip];
|
||
|
Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
|
||
|
}
|
||
|
}
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
|
||
|
// Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
|
||
|
CooperativeAtomics::AtomicReadReorderBarrier();
|
||
|
|
||
|
// The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
|
||
|
// and the computation result is valid.
|
||
|
if (blockProgressDescriptors[currentBlock] == oldDescriptor)
|
||
|
{
|
||
|
// Assign the results to the result context (possibly in parallel with other threads
|
||
|
// that could and ought to be assigning exactly the same values)
|
||
|
SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
|
||
|
resultContext.storePrecalculatedZs(Y);
|
||
|
|
||
|
// Assign the result assignment progress descriptor
|
||
|
cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
|
||
|
CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
|
||
|
|
||
|
// Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
|
||
|
goAssigningTheResult = true;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// Otherwise, go on competing for copying the results
|
||
|
handleComputationTakenOver = true;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
handleComputationTakenOver = true;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// If the final column has not been reached yet, store current values to the context.
|
||
|
// Select the other context instance as the previous one might be read by other threads.
|
||
|
CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
|
||
|
SolveL1StraightCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
|
||
|
destinationContext.storePrecalculatedZs(Z);
|
||
|
|
||
|
// Unlock the row until more columns can be used
|
||
|
cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
|
||
|
cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalColumnBlock, nextContextInstance, false);
|
||
|
// The descriptor might have been updated by a competing thread
|
||
|
if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
|
||
|
{
|
||
|
// Adjust the ptrBElement to point to the result area...
|
||
|
ptrBElement = B + (sizeint)(currentBlock * block_step) * b_stride;
|
||
|
// ...and go on handling the case
|
||
|
handleComputationTakenOver = true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (handleComputationTakenOver)
|
||
|
{
|
||
|
cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
|
||
|
// This can only happen if the row was (has become) the uppermost not fully completed one
|
||
|
// and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
|
||
|
if (existingDescriptor != INVALID_CELLDESCRIPTOR)
|
||
|
{
|
||
|
// If not fully completed this must be the final stage of the result assignment into the matrix
|
||
|
dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
|
||
|
|
||
|
// Go on competing copying the result as anyway the block is the topmost not completed one
|
||
|
// and since there was competition for it, there is no other work that can be done right now.
|
||
|
const SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
|
||
|
resultContext.loadPrecalculatedZs(Y);
|
||
|
|
||
|
goAssigningTheResult = true;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// everything is over -- just go handling next blocks
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else if (goForLockedBlockDuplicateCalculation)
|
||
|
{
|
||
|
blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
|
||
|
|
||
|
bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
|
||
|
|
||
|
/* declare variables */
|
||
|
const dReal *ptrLElement;
|
||
|
|
||
|
if (completedColumnBlock < currentBlock)
|
||
|
{
|
||
|
/* check if this is not the partial block of fewer rows */
|
||
|
if (currentBlock != lastBlock || loopX1RowCount == 0)
|
||
|
{
|
||
|
partialBlock = false;
|
||
|
|
||
|
ptrLElement = L + (sizeint)(1 + currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
|
||
|
ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;
|
||
|
|
||
|
/* the inner loop that computes outer products and adds them to Z */
|
||
|
unsigned finalColumnBlock = currentBlock;
|
||
|
dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
|
||
|
|
||
|
unsigned lastCompletedColumn = completedColumnBlock;
|
||
|
unsigned columnCounter = finalColumnBlock - completedColumnBlock;
|
||
|
for (bool exitInnerLoop = false; !exitInnerLoop; exitInnerLoop = --columnCounter == 0)
|
||
|
{
|
||
|
dReal q1, p1, p2, p3, p4;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[0];
|
||
|
p2 = ptrLElement[0];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[0];
|
||
|
p4 = ptrLElement[0 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[1 * b_stride];
|
||
|
p3 = ptrLElement[1];
|
||
|
p4 = ptrLElement[1 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[1];
|
||
|
p2 = ptrLElement[1];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[2 * b_stride];
|
||
|
p1 = (ptrLElement - rowSkip)[2];
|
||
|
p2 = ptrLElement[2];
|
||
|
ptrLElement += rowSkip;
|
||
|
p3 = ptrLElement[2];
|
||
|
p4 = ptrLElement[2 + rowSkip];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
|
||
|
/* load p and q values */
|
||
|
q1 = ptrBElement[3 * b_stride];
|
||
|
p3 = ptrLElement[3];
|
||
|
p4 = ptrLElement[3 + rowSkip];
|
||
|
ptrLElement -= rowSkip;
|
||
|
p1 = (ptrLElement - rowSkip)[3];
|
||
|
p2 = ptrLElement[3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
Z[0] += p1 * q1;
|
||
|
Z[1] += p2 * q1;
|
||
|
Z[2] += p3 * q1;
|
||
|
Z[3] += p4 * q1;
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
// Check if the primary solver thread has not made any progress
|
||
|
cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
|
||
|
unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
|
||
|
|
||
|
if (newCompletedColumn != lastCompletedColumn)
|
||
|
{
|
||
|
// Check, this is the first change the current thread detects.
|
||
|
// There is absolutely no reason in code for the computation to stop/resume twice
|
||
|
// while the current thread is competing.
|
||
|
dIASSERT(lastCompletedColumn == completedColumnBlock);
|
||
|
|
||
|
if (descriptorVerification == INVALID_CELLDESCRIPTOR)
|
||
|
{
|
||
|
skipToHandlingSubsequentRows = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (newCompletedColumn == currentBlock + 1)
|
||
|
{
|
||
|
skiptoCopyingResult = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// Check if the current thread is behind
|
||
|
if (newCompletedColumn > finalColumnBlock - columnCounter)
|
||
|
{
|
||
|
// If so, go starting over one more time
|
||
|
blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
|
||
|
stayWithinTheBlock = true;
|
||
|
skipToHandlingSubsequentRows = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// If current thread is ahead, just save new completed column for further comparisons and go on calculating
|
||
|
lastCompletedColumn = newCompletedColumn;
|
||
|
}
|
||
|
|
||
|
/* advance pointers */
|
||
|
ptrLElement += block_step;
|
||
|
ptrBElement += block_step * b_stride;
|
||
|
/* end of inner loop */
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
partialBlock = true;
|
||
|
|
||
|
dReal tempZ[dMACRO_MAX(block_step - 1U, 1U)] = { REAL(0.0), };
|
||
|
|
||
|
ptrLElement = L + (sizeint)(/*1 + */currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
|
||
|
ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;
|
||
|
|
||
|
/* the inner loop that computes outer products and adds them to Z */
|
||
|
unsigned finalColumnBlock = currentBlock;
|
||
|
dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
|
||
|
|
||
|
unsigned lastCompletedColumn = completedColumnBlock;
|
||
|
for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
|
||
|
{
|
||
|
dReal q1, q2, q3, q4;
|
||
|
|
||
|
/* load q values */
|
||
|
q1 = ptrBElement[0 * b_stride];
|
||
|
q2 = ptrBElement[1 * b_stride];
|
||
|
q3 = ptrBElement[2 * b_stride];
|
||
|
q4 = ptrBElement[3 * b_stride];
|
||
|
|
||
|
for (unsigned partialRow = 0; ; )
|
||
|
{
|
||
|
dReal p1, p2, p3, p4;
|
||
|
|
||
|
/* load p values */
|
||
|
p1 = ptrLElement[0];
|
||
|
p2 = ptrLElement[1];
|
||
|
p3 = ptrLElement[2];
|
||
|
p4 = ptrLElement[3];
|
||
|
|
||
|
/* compute outer product and add it to the Z matrix */
|
||
|
tempZ[partialRow] += p1 * q1 + p2 * q2 + p3 * q3 + p4 * q4;
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
if (++partialRow == loopX1RowCount)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
ptrLElement += rowSkip;
|
||
|
}
|
||
|
|
||
|
// Check if the primary solver thread has not made any progress
|
||
|
cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
|
||
|
unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
|
||
|
|
||
|
if (newCompletedColumn != lastCompletedColumn)
|
||
|
{
|
||
|
// Check, this is the first change the current thread detects.
|
||
|
// There is absolutely no reason in code for the computation to stop/resume twice
|
||
|
// while the current thread is competing.
|
||
|
dIASSERT(lastCompletedColumn == completedColumnBlock);
|
||
|
|
||
|
if (descriptorVerification == INVALID_CELLDESCRIPTOR)
|
||
|
{
|
||
|
skipToHandlingSubsequentRows = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (newCompletedColumn == currentBlock + 1)
|
||
|
{
|
||
|
skiptoCopyingResult = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// Check if the current thread is behind
|
||
|
if (newCompletedColumn > finalColumnBlock - columnCounter)
|
||
|
{
|
||
|
// If so, go starting over one more time
|
||
|
blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
|
||
|
stayWithinTheBlock = true;
|
||
|
skipToHandlingSubsequentRows = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// If current thread is ahead, just save new completed column for further comparisons and go on calculating
|
||
|
lastCompletedColumn = newCompletedColumn;
|
||
|
}
|
||
|
|
||
|
ptrLElement += block_step;
|
||
|
ptrBElement += block_step * b_stride;
|
||
|
|
||
|
if (--columnCounter == 0)
|
||
|
{
|
||
|
// Here nested ifs are used to avoid accessing Z by parametrized index.
|
||
|
// So far all the accesses were performed by explicit constants
|
||
|
// what lets the compiler treat Z elements as individual variables
|
||
|
// rather than array elements.
|
||
|
Z[0] += tempZ[0];
|
||
|
|
||
|
if (loopX1RowCount >= 2)
|
||
|
{
|
||
|
Z[1] += tempZ[1];
|
||
|
|
||
|
if (loopX1RowCount > 2)
|
||
|
{
|
||
|
Z[2] += tempZ[2];
|
||
|
|
||
|
// Correct the LElement so that it points to the second row
|
||
|
//
|
||
|
// Note, that if there is just one partial row, it does not matter that
|
||
|
// the LElement will remain pointing at the first row,
|
||
|
// since the former is not going to be used in that case.
|
||
|
ptrLElement -= /*(sizeint)*/rowSkip/* * (loopX1RowCount - 2)*/; dIASSERT(loopX1RowCount == 3);
|
||
|
}
|
||
|
}
|
||
|
dSASSERT(block_step == 4);
|
||
|
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
/* advance pointers */
|
||
|
ptrLElement -= (sizeint)rowSkip * (loopX1RowCount - 1);
|
||
|
/* end of inner loop */
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else if (completedColumnBlock > currentBlock)
|
||
|
{
|
||
|
dIASSERT(completedColumnBlock == currentBlock + 1);
|
||
|
|
||
|
partialBlock = currentBlock == lastBlock && loopX1RowCount != 0;
|
||
|
|
||
|
skiptoCopyingResult = true;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
|
||
|
|
||
|
partialBlock = rowCount < block_step;
|
||
|
|
||
|
/* assign the pointers appropriately and go on computing the results */
|
||
|
ptrLElement = L + (sizeint)(1/* + currentBlock * block_step*/) * rowSkip/* + completedColumnBlock * block_step*/;
|
||
|
ptrBElement = B/* + (sizeint)(completedColumnBlock * block_step) * b_stride*/;
|
||
|
}
|
||
|
|
||
|
if (!skipToHandlingSubsequentRows)
|
||
|
{
|
||
|
if (!skiptoCopyingResult)
|
||
|
{
|
||
|
if (!partialBlock)
|
||
|
{
|
||
|
Y[0] = ptrBElement[0 * b_stride] - Z[0];
|
||
|
|
||
|
dReal p2 = ptrLElement[0];
|
||
|
Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];
|
||
|
|
||
|
ptrLElement += rowSkip;
|
||
|
|
||
|
dReal p3 = ptrLElement[0];
|
||
|
dReal p3_1 = ptrLElement[1];
|
||
|
Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
|
||
|
|
||
|
dReal p4 = ptrLElement[rowSkip];
|
||
|
dReal p4_1 = ptrLElement[1 + rowSkip];
|
||
|
dReal p4_2 = ptrLElement[2 + rowSkip];
|
||
|
Y[3] = ptrBElement[3 * b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
Y[0] = ptrBElement[0 * b_stride] - Z[0];
|
||
|
|
||
|
if (loopX1RowCount >= 2)
|
||
|
{
|
||
|
dReal p2 = ptrLElement[0];
|
||
|
Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];
|
||
|
|
||
|
if (loopX1RowCount > 2)
|
||
|
{
|
||
|
dReal p3 = ptrLElement[0 + rowSkip];
|
||
|
dReal p3_1 = ptrLElement[1 + rowSkip];
|
||
|
Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
|
||
|
}
|
||
|
}
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
|
||
|
CooperativeAtomics::AtomicReadReorderBarrier();
|
||
|
|
||
|
// Use atomic load to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
|
||
|
cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
|
||
|
|
||
|
if (existingDescriptor == INVALID_CELLDESCRIPTOR)
|
||
|
{
|
||
|
// Everything is over -- proceed to subsequent rows
|
||
|
skipToHandlingSubsequentRows = true;
|
||
|
}
|
||
|
else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
|
||
|
{
|
||
|
// The values computed above may not be valid. Copy the values already in the result context.
|
||
|
skiptoCopyingResult = true;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
|
||
|
// and the computation result is valid.
|
||
|
cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
|
||
|
|
||
|
// Assign the results to the result context (possibly in parallel with other threads
|
||
|
// that could and ought to be assigning exactly the same values)
|
||
|
SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
|
||
|
resultContext.storePrecalculatedZs(Y);
|
||
|
|
||
|
// Assign the result assignment progress descriptor
|
||
|
CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
|
||
|
|
||
|
// Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!skipToHandlingSubsequentRows)
|
||
|
{
|
||
|
if (skiptoCopyingResult)
|
||
|
{
|
||
|
// Extract the result values stored in the result context
|
||
|
const SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
|
||
|
resultContext.loadPrecalculatedZs(Y);
|
||
|
|
||
|
ptrBElement = B + (sizeint)(currentBlock * block_step) * b_stride;
|
||
|
}
|
||
|
|
||
|
goAssigningTheResult = true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (goAssigningTheResult)
|
||
|
{
|
||
|
cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
|
||
|
// Check if the assignment has not been completed yet
|
||
|
if (existingDescriptor != INVALID_CELLDESCRIPTOR)
|
||
|
{
|
||
|
// Assign the computation results to the B vector
|
||
|
if (!partialBlock)
|
||
|
{
|
||
|
ptrBElement[0 * b_stride] = Y[0];
|
||
|
ptrBElement[1 * b_stride] = Y[1];
|
||
|
ptrBElement[2 * b_stride] = Y[2];
|
||
|
ptrBElement[3 * b_stride] = Y[3];
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ptrBElement[0 * b_stride] = Y[0];
|
||
|
|
||
|
if (loopX1RowCount >= 2)
|
||
|
{
|
||
|
ptrBElement[1 * b_stride] = Y[1];
|
||
|
|
||
|
if (loopX1RowCount > 2)
|
||
|
{
|
||
|
ptrBElement[2 * b_stride] = Y[2];
|
||
|
}
|
||
|
}
|
||
|
dSASSERT(block_step == 4);
|
||
|
}
|
||
|
|
||
|
ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
|
||
|
dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
|
||
|
|
||
|
// And assign the completed status no matter what
|
||
|
CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// everything is over -- just go handling next blocks
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!stayWithinTheBlock)
|
||
|
{
|
||
|
completedBlocks = refBlockCompletionProgress;
|
||
|
|
||
|
if (completedBlocks == blockCount)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
currentBlock += 1;
|
||
|
|
||
|
bool lookaheadBoundaryReached = false;
|
||
|
|
||
|
if (currentBlock == blockCount || completedBlocks == 0)
|
||
|
{
|
||
|
lookaheadBoundaryReached = true;
|
||
|
}
|
||
|
else if (currentBlock >= completedBlocks + lookaheadRange)
|
||
|
{
|
||
|
lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
|
||
|
}
|
||
|
else if (currentBlock < completedBlocks)
|
||
|
{
|
||
|
// Treat detected row advancement as a row processed
|
||
|
// blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
|
||
|
|
||
|
currentBlock = completedBlocks;
|
||
|
}
|
||
|
|
||
|
if (lookaheadBoundaryReached)
|
||
|
{
|
||
|
dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did we not compete???
|
||
|
|
||
|
// If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
|
||
|
if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
|
||
|
{
|
||
|
// Abandon job if too few blocks remain
|
||
|
if (blockCount - completedBlocks <= ownThreadIndex)
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// If there was some progress, just continue to the next pass
|
||
|
blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
|
||
|
}
|
||
|
|
||
|
currentBlock = completedBlocks;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
#endif
|