horror/thirdparty/ode-0.16.5/ode/src/fastlsolve_impl.h



/*************************************************************************
 *                                                                       *
 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
 * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
 *                                                                       *
 * This library is free software; you can redistribute it and/or         *
 * modify it under the terms of EITHER:                                  *
 *   (1) The GNU Lesser General Public License as published by the Free  *
 *       Software Foundation; either version 2.1 of the License, or (at  *
 *       your option) any later version. The text of the GNU Lesser      *
 *       General Public License is included with this library in the     *
 *       file LICENSE.TXT.                                               *
 *   (2) The BSD-style license that is included with this library in     *
 *       the file LICENSE-BSD.TXT.                                       *
 *                                                                       *
 * This library is distributed in the hope that it will be useful,       *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
 * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
 *                                                                       *
 *************************************************************************/

/*
 * Code style improvements and optimizations by Oleh Derevenko ????-2024
 * L1Straight cooperative solving code of ThreadedEquationSolverLDLT copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")  
 */

#ifndef _ODE_FASTLSOLVE_IMPL_H_
#define _ODE_FASTLSOLVE_IMPL_H_


/* solve L*X=B, with B containing 1 right hand sides.
 * L is an n*n lower triangular matrix with ones on the diagonal.
 * L is stored by rows and its leading dimension is lskip.
 * B is an n*1 matrix that contains the right hand sides.
 * B is stored by columns and its leading dimension is also lskip.
 * B is overwritten with X.
 * this processes blocks of 4*4.
 * if this is in the factorizer source file, n must be a multiple of 4.
 */

template<unsigned int b_stride>
void solveL1Straight (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
{
    dIASSERT(rowCount != 0);

    /* compute all 4 x 1 blocks of X */
    unsigned blockStartRow = 0;
    bool subsequentPass = false;
    bool goForLoopX4 = rowCount >= 4;
    const unsigned loopX4LastRow = goForLoopX4 ? rowCount - 4 : 0;
    for (; goForLoopX4; subsequentPass = true, goForLoopX4 = (blockStartRow += 4) <= loopX4LastRow) 
    {
        /* declare variables - Z matrix, p and q vectors, etc */
        const dReal *ptrLElement;
        dReal *ptrBElement;

        dReal Z11, Z21, Z31, Z41;

        /* compute all 4 x 1 block of X, from rows i..i+4-1 */
        if (subsequentPass)
        {
            ptrLElement = L + (1 + blockStartRow) * rowSkip;
            ptrBElement = B;
            /* set the Z matrix to 0 */
            Z11 = 0; Z21 = 0; Z31 = 0; Z41 = 0;

            /* the inner loop that computes outer products and adds them to Z */
            for (unsigned columnCounter = blockStartRow; ; )
            {
                dReal q1, p1, p2, p3, p4;

                /* load p and q values */
                q1 = ptrBElement[0 * b_stride];
                p1 = (ptrLElement - rowSkip)[0];
                p2 = ptrLElement[0];
                ptrLElement += rowSkip;
                p3 = ptrLElement[0];
                p4 = ptrLElement[0 + rowSkip];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z21 += p2 * q1;
                Z31 += p3 * q1;
                Z41 += p4 * q1;

                /* load p and q values */
                q1 = ptrBElement[1 * b_stride];
                p3 = ptrLElement[1];
                p4 = ptrLElement[1 + rowSkip];
                ptrLElement -= rowSkip;
                p1 = (ptrLElement - rowSkip)[1];
                p2 = ptrLElement[1];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z21 += p2 * q1;
                Z31 += p3 * q1;
                Z41 += p4 * q1;

                /* load p and q values */
                q1 = ptrBElement[2 * b_stride];
                p1 = (ptrLElement - rowSkip)[2];
                p2 = ptrLElement[2];
                ptrLElement += rowSkip;
                p3 = ptrLElement[2];
                p4 = ptrLElement[2 + rowSkip];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z21 += p2 * q1;
                Z31 += p3 * q1;
                Z41 += p4 * q1;

                /* load p and q values */
                q1 = ptrBElement[3 * b_stride];
                p3 = ptrLElement[3];
                p4 = ptrLElement[3 + rowSkip];
                ptrLElement -= rowSkip;
                p1 = (ptrLElement - rowSkip)[3];
                p2 = ptrLElement[3];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z21 += p2 * q1;
                Z31 += p3 * q1;
                Z41 += p4 * q1;

                if (columnCounter > 12)
                {
                    columnCounter -= 12;

                    /* advance pointers */
                    ptrLElement += 12;
                    ptrBElement += 12 * b_stride;

                    /* load p and q values */
                    q1 = ptrBElement[-8 * (int)b_stride];
                    p1 = (ptrLElement - rowSkip)[-8];
                    p2 = ptrLElement[-8];
                    ptrLElement += rowSkip;
                    p3 = ptrLElement[-8];
                    p4 = ptrLElement[-8 + rowSkip];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-7 * (int)b_stride];
                    p3 = ptrLElement[-7];
                    p4 = ptrLElement[-7 + rowSkip];
                    ptrLElement -= rowSkip;
                    p1 = (ptrLElement - rowSkip)[-7];
                    p2 = ptrLElement[-7];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-6 * (int)b_stride];
                    p1 = (ptrLElement - rowSkip)[-6];
                    p2 = ptrLElement[-6];
                    ptrLElement += rowSkip;
                    p3 = ptrLElement[-6];
                    p4 = ptrLElement[-6 + rowSkip];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-5 * (int)b_stride];
                    p3 = ptrLElement[-5];
                    p4 = ptrLElement[-5 + rowSkip];
                    ptrLElement -= rowSkip;
                    p1 = (ptrLElement - rowSkip)[-5];
                    p2 = ptrLElement[-5];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-4 * (int)b_stride];
                    p1 = (ptrLElement - rowSkip)[-4];
                    p2 = ptrLElement[-4];
                    ptrLElement += rowSkip;
                    p3 = ptrLElement[-4];
                    p4 = ptrLElement[-4 + rowSkip];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-3 * (int)b_stride];
                    p3 = ptrLElement[-3];
                    p4 = ptrLElement[-3 + rowSkip];
                    ptrLElement -= rowSkip;
                    p1 = (ptrLElement - rowSkip)[-3];
                    p2 = ptrLElement[-3];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-2 * (int)b_stride];
                    p1 = (ptrLElement - rowSkip)[-2];
                    p2 = ptrLElement[-2];
                    ptrLElement += rowSkip;
                    p3 = ptrLElement[-2];
                    p4 = ptrLElement[-2 + rowSkip];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;

                    /* load p and q values */
                    q1 = ptrBElement[-1 * (int)b_stride];
                    p3 = ptrLElement[-1];
                    p4 = ptrLElement[-1 + rowSkip];
                    ptrLElement -= rowSkip;
                    p1 = (ptrLElement - rowSkip)[-1];
                    p2 = ptrLElement[-1];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z21 += p2 * q1;
                    Z31 += p3 * q1;
                    Z41 += p4 * q1;
                }
                else
                {
                    /* advance pointers */
                    ptrLElement += 4;
                    ptrBElement += 4 * b_stride;

                    if ((columnCounter -= 4) == 0)
                    {
                        break;
                    }
                }
                /* end of inner loop */
            }
        }
        else
        {
            ptrLElement = L + rowSkip/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
            ptrBElement = B;
            /* set the Z matrix to 0 */
            Z11 = 0; Z21 = 0; Z31 = 0; Z41 = 0;
        }

        /* finish computing the X(i) block */
        dReal Y11, Y21, Y31, Y41;
        {
            Y11 = ptrBElement[0 * b_stride] - Z11;
            ptrBElement[0 * b_stride] = Y11;
        }
        {
            dReal p2 = ptrLElement[0];
            Y21 = ptrBElement[1 * b_stride] - Z21 - p2 * Y11;
            ptrBElement[1 * b_stride] = Y21;
        }
        ptrLElement += rowSkip;
        {
            dReal p3 = ptrLElement[0];
            dReal p3_1 = ptrLElement[1];
            Y31 = ptrBElement[2 * b_stride] - Z31 - p3 * Y11 - p3_1 * Y21;
            ptrBElement[2 * b_stride] = Y31;
        }
        {
            dReal p4 = ptrLElement[rowSkip];
            dReal p4_1 = ptrLElement[1 + rowSkip];
            dReal p4_2 = ptrLElement[2 + rowSkip];
            Y41 = ptrBElement[3 * b_stride] - Z41 - p4 * Y11 - p4_1 * Y21 - p4_2 * Y31;
            ptrBElement[3 * b_stride] = Y41;
        }
        /* end of outer loop */
    }

    /* compute rows at end that are not a multiple of block size */
    for (; !subsequentPass || blockStartRow < rowCount; subsequentPass = true, ++blockStartRow) 
    {
        /* compute all 1 x 1 block of X, from rows i..i+1-1 */
        dReal *ptrBElement;

        dReal Z11, Z12;

        if (subsequentPass)
        {
            ptrBElement = B;
            /* set the Z matrix to 0 */
            Z11 = 0; Z12 = 0;

            const dReal *ptrLElement = L + blockStartRow * rowSkip;

            /* the inner loop that computes outer products and adds them to Z */
            unsigned columnCounter = blockStartRow;
            for (bool exitLoop = columnCounter < 4; !exitLoop; exitLoop = false) 
            {
                dReal p1, p2, q1, q2;

                /* load p and q values */
                p1 = ptrLElement[0];
                p2 = ptrLElement[1];
                q1 = ptrBElement[0 * b_stride];
                q2 = ptrBElement[1 * b_stride];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z12 += p2 * q2;

                /* load p and q values */
                p1 = ptrLElement[2];
                p2 = ptrLElement[3];
                q1 = ptrBElement[2 * b_stride];
                q2 = ptrBElement[3 * b_stride];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z12 += p2 * q2;

                if (columnCounter >= (12 + 4))
                {
                    columnCounter -= 12;

                    /* advance pointers */
                    ptrLElement += 12;
                    ptrBElement += 12 * b_stride;

                    /* load p and q values */
                    p1 = ptrLElement[-8];
                    p2 = ptrLElement[-7];
                    q1 = ptrBElement[-8 * (int)b_stride];
                    q2 = ptrBElement[-7 * (int)b_stride];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z12 += p2 * q2;

                    /* load p and q values */
                    p1 = ptrLElement[-6];
                    p2 = ptrLElement[-5];
                    q1 = ptrBElement[-6 * (int)b_stride];
                    q2 = ptrBElement[-5 * (int)b_stride];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z12 += p2 * q2;

                    /* load p and q values */
                    p1 = ptrLElement[-4];
                    p2 = ptrLElement[-3];
                    q1 = ptrBElement[-4 * (int)b_stride];
                    q2 = ptrBElement[-3 * (int)b_stride];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z12 += p2 * q2;

                    /* load p and q values */
                    p1 = ptrLElement[-2];
                    p2 = ptrLElement[-1];
                    q1 = ptrBElement[-2 * (int)b_stride];
                    q2 = ptrBElement[-1 * (int)b_stride];

                    /* compute outer product and add it to the Z matrix */
                    Z11 += p1 * q1;
                    Z12 += p2 * q2;
                }
                else
                {
                    /* advance pointers */
                    ptrLElement += 4;
                    ptrBElement += 4 * b_stride;

                    if ((columnCounter -= 4) < 4)
                    {
                        break;
                    }
                }
                /* end of inner loop */
            }

            /* compute left-over iterations */
            if ((columnCounter & 2) != 0) 
            {
                dReal p1, p2, q1, q2;

                /* load p and q values */
                p1 = ptrLElement[0];
                p2 = ptrLElement[1];
                q1 = ptrBElement[0 * b_stride];
                q2 = ptrBElement[1 * b_stride];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;
                Z12 += p2 * q2;

                /* advance pointers */
                ptrLElement += 2;
                ptrBElement += 2 * b_stride;
            }

            if ((columnCounter & 1) != 0)
            {
                dReal p1, q1;

                /* load p and q values */
                p1 = ptrLElement[0];
                q1 = ptrBElement[0 * b_stride];

                /* compute outer product and add it to the Z matrix */
                Z11 += p1 * q1;

                /* advance pointers */
                // ptrLElement += 1; -- not needed anymore
                ptrBElement += 1 * b_stride;
            }

            /* finish computing the X(i) block */
            dReal Y11 = ptrBElement[0 * b_stride] - (Z11 + Z12);
            ptrBElement[0 * b_stride] = Y11;
        }
    }
}


template<unsigned int block_step>
/*static */
sizeint ThreadedEquationSolverLDLT::estimateCooperativelySolvingL1StraightMemoryRequirement(unsigned rowCount, SolvingL1StraightMemoryEstimates &ref_solvingMemoryEstimates)
{
    unsigned blockCount = deriveSolvingL1StraightBlockCount(rowCount, block_step);
    sizeint descriptorSizeRequired = dEFFICIENT_SIZE(sizeof(cellindexint) * blockCount);
    sizeint contextSizeRequired = dEFFICIENT_SIZE(sizeof(SolveL1StraightCellContext) * (CCI__MAX + 1) * blockCount);
    ref_solvingMemoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);

    sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
    return totalSizeRequired;
}

template<unsigned int block_step>
/*static */
void ThreadedEquationSolverLDLT::initializeCooperativelySolveL1StraightMemoryStructures(unsigned rowCount, 
    atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *dUNUSED(cellContexts))
{
    unsigned blockCount = deriveSolvingL1StraightBlockCount(rowCount, block_step);

    out_blockCompletionProgress = 0;
    memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
}

template<unsigned int block_step, unsigned int b_stride>
void ThreadedEquationSolverLDLT::participateSolvingL1Straight(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip, 
    volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/, 
    SolveL1StraightCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
{
    const unsigned lookaheadRange = 32;
    const unsigned blockCount = deriveSolvingL1StraightBlockCount(rowCount, block_step), lastBlock = blockCount - 1;
    /* compute rows at end that are not a multiple of block size */
    const unsigned loopX1RowCount = rowCount % block_step;

    BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;

    unsigned completedBlocks = refBlockCompletionProgress;
    unsigned currentBlock = completedBlocks;
    dIASSERT(completedBlocks <= blockCount);

    for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
    {
        bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
        bool goAssigningTheResult = false, stayWithinTheBlock = false;

        dReal Z[block_step];
        dReal Y[block_step];

        dReal *ptrBElement;

        CellContextInstance previousContextInstance;
        unsigned completedColumnBlock;
        bool partialBlock;

        for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
        {
            if (testDescriptor == INVALID_CELLDESCRIPTOR)
            {
                // Invalid descriptor is the indication that the row has been fully calculated
                // Test if this was the last row and break out if so.
                if (currentBlock + 1 == blockCount)
                {
                    exitLoop = true;
                    break;
                }

                // Treat detected row advancement as a row processed
                // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
                break;
            }

            CooperativeAtomics::AtomicReadReorderBarrier();
            // It is necessary to read up to date completedBblocks value after the descriptor retrieval
            // as otherwise the logic below breaks
            completedBlocks = refBlockCompletionProgress;

            if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
            {
                completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
                dIASSERT(completedColumnBlock < currentBlock || (completedColumnBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
                dIASSERT(completedColumnBlock <= completedBlocks); // Since the descriptor is not locked

                if (completedColumnBlock == completedBlocks && currentBlock != completedBlocks)
                {
                    dIASSERT(completedBlocks < currentBlock);
                    break;
                }

                if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
                {
                    if (completedColumnBlock != 0)
                    {
                        CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
                        previousContextInstance = contextInstance;

                        const SolveL1StraightCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
                        sourceContext.loadPrecalculatedZs(Z);
                    }
                    else
                    {
                        previousContextInstance = CCI__MIN;
                        SolveL1StraightCellContext::initializePrecalculatedZs(Z);
                    }

                    goForLockedBlockPrimaryCalculation = true;
                    break;
                }

                if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
                {
                    break;
                }

                testDescriptor = blockProgressDescriptors[currentBlock];
            }
            else
            {
                if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
                {
                    break;
                }

                cellindexint verificativeDescriptor;
                bool verificationFailure = false;

                completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
                dIASSERT(completedColumnBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block

                if (completedColumnBlock != 0)
                {
                    CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
                    const SolveL1StraightCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
                    sourceContext.loadPrecalculatedZs(Z);
                }
                else
                {
                    SolveL1StraightCellContext::initializePrecalculatedZs(Z);
                }

                if (completedColumnBlock != 0 && completedColumnBlock <= currentBlock)
                {
                    // Make sure the descriptor is re-read after the precalculates
                    CooperativeAtomics::AtomicReadReorderBarrier();
                }

                if (completedColumnBlock <= currentBlock)
                {
                    verificativeDescriptor = blockProgressDescriptors[currentBlock];
                    verificationFailure = verificativeDescriptor != testDescriptor;
                }

                if (!verificationFailure)
                {
                    dIASSERT(completedColumnBlock <= currentBlock + 1);

                    goForLockedBlockDuplicateCalculation = true;
                    break;
                }

                testDescriptor = verificativeDescriptor;
            }
        }

        if (exitLoop)
        {
            break;
        }

        if (goForLockedBlockPrimaryCalculation)
        {
            blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;

            // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
            bool handleComputationTakenOver = false, rowEndReached = false;

            const dReal *ptrLElement;
            unsigned finalColumnBlock;

            /* check if this is not the partial block of fewer rows */
            if (currentBlock != lastBlock || loopX1RowCount == 0)
            {
                partialBlock = false;

                if (currentBlock != 0)
                {
                    ptrLElement = L + (sizeint)(1 + currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
                    ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;

                    /* the inner loop that computes outer products and adds them to Z */
                    finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
                    dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);

                    for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
                    {
                        dReal q1, p1, p2, p3, p4;

                        /* load p and q values */
                        q1 = ptrBElement[0 * b_stride];
                        p1 = (ptrLElement - rowSkip)[0];
                        p2 = ptrLElement[0];
                        ptrLElement += rowSkip;
                        p3 = ptrLElement[0];
                        p4 = ptrLElement[0 + rowSkip];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;

                        /* load p and q values */
                        q1 = ptrBElement[1 * b_stride];
                        p3 = ptrLElement[1];
                        p4 = ptrLElement[1 + rowSkip];
                        ptrLElement -= rowSkip;
                        p1 = (ptrLElement - rowSkip)[1];
                        p2 = ptrLElement[1];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;

                        /* load p and q values */
                        q1 = ptrBElement[2 * b_stride];
                        p1 = (ptrLElement - rowSkip)[2];
                        p2 = ptrLElement[2];
                        ptrLElement += rowSkip;
                        p3 = ptrLElement[2];
                        p4 = ptrLElement[2 + rowSkip];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;

                        /* load p and q values */
                        q1 = ptrBElement[3 * b_stride];
                        p3 = ptrLElement[3];
                        p4 = ptrLElement[3 + rowSkip];
                        ptrLElement -= rowSkip;
                        p1 = (ptrLElement - rowSkip)[3];
                        p2 = ptrLElement[3];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;
                        dSASSERT(block_step == 4);

                        if (columnCounter > 3)
                        {
                            columnCounter -= 3;

                            ptrLElement += 3 * block_step;
                            ptrBElement += 3 * block_step * b_stride;

                            /* load p and q values */
                            q1 = ptrBElement[-8 * (int)b_stride];
                            p1 = (ptrLElement - rowSkip)[-8];
                            p2 = ptrLElement[-8];
                            ptrLElement += rowSkip;
                            p3 = ptrLElement[-8];
                            p4 = ptrLElement[-8 + rowSkip];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-7 * (int)b_stride];
                            p3 = ptrLElement[-7];
                            p4 = ptrLElement[-7 + rowSkip];
                            ptrLElement -= rowSkip;
                            p1 = (ptrLElement - rowSkip)[-7];
                            p2 = ptrLElement[-7];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-6 * (int)b_stride];
                            p1 = (ptrLElement - rowSkip)[-6];
                            p2 = ptrLElement[-6];
                            ptrLElement += rowSkip;
                            p3 = ptrLElement[-6];
                            p4 = ptrLElement[-6 + rowSkip];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-5 * (int)b_stride];
                            p3 = ptrLElement[-5];
                            p4 = ptrLElement[-5 + rowSkip];
                            ptrLElement -= rowSkip;
                            p1 = (ptrLElement - rowSkip)[-5];
                            p2 = ptrLElement[-5];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-4 * (int)b_stride];
                            p1 = (ptrLElement - rowSkip)[-4];
                            p2 = ptrLElement[-4];
                            ptrLElement += rowSkip;
                            p3 = ptrLElement[-4];
                            p4 = ptrLElement[-4 + rowSkip];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-3 * (int)b_stride];
                            p3 = ptrLElement[-3];
                            p4 = ptrLElement[-3 + rowSkip];
                            ptrLElement -= rowSkip;
                            p1 = (ptrLElement - rowSkip)[-3];
                            p2 = ptrLElement[-3];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-2 * (int)b_stride];
                            p1 = (ptrLElement - rowSkip)[-2];
                            p2 = ptrLElement[-2];
                            ptrLElement += rowSkip;
                            p3 = ptrLElement[-2];
                            p4 = ptrLElement[-2 + rowSkip];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;

                            /* load p and q values */
                            q1 = ptrBElement[-1 * (int)b_stride];
                            p3 = ptrLElement[-1];
                            p4 = ptrLElement[-1 + rowSkip];
                            ptrLElement -= rowSkip;
                            p1 = (ptrLElement - rowSkip)[-1];
                            p2 = ptrLElement[-1];

                            /* compute outer product and add it to the Z matrix */
                            Z[0] += p1 * q1;
                            Z[1] += p2 * q1;
                            Z[2] += p3 * q1;
                            Z[3] += p4 * q1;
                            dSASSERT(block_step == 4);
                        }
                        else
                        {
                            ptrLElement += block_step;
                            ptrBElement += block_step * b_stride;

                            if (--columnCounter == 0)
                            {
                                if (finalColumnBlock == currentBlock)
                                {
                                    rowEndReached = true;
                                    break;
                                }

                                // Take a look if anymore columns have been completed...
                                completedBlocks = refBlockCompletionProgress;
                                dIASSERT(completedBlocks >= finalColumnBlock);

                                if (completedBlocks == finalColumnBlock)
                                {
                                    break;
                                }

                                // ...continue if so.
                                unsigned columnCompletedSoFar = finalColumnBlock;
                                finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
                                columnCounter = finalColumnBlock - columnCompletedSoFar;
                            }
                        }
                        /* end of inner loop */
                    }
                }
                else
                {
                    ptrLElement = L + (sizeint)(1/* + currentBlock * block_step*/) * rowSkip/* + completedColumnBlock * block_step*/;
                    ptrBElement = B/* + (sizeint)(completedColumnBlock * block_step) * b_stride*/;
                    dIASSERT(completedColumnBlock == 0);

                    rowEndReached = true;
                }
            }
            else
            {
                partialBlock = true;

                if (currentBlock != 0)
                {
                    dReal tempZ[dMACRO_MAX(block_step - 1U, 1U)] = { REAL(0.0), };

                    ptrLElement = L + (sizeint)(/*1 + */currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
                    ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;

                    /* the inner loop that computes outer products and adds them to Z */
                    finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
                    dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);

                    for (unsigned partialRow = 0, columnCompletedSoFar = completedColumnBlock; ; )
                    {
                        dReal Z1 = 0, Z2 = 0, Z3 = 0, Z4 = 0;

                        for (unsigned columnCounter = finalColumnBlock - columnCompletedSoFar; ; )
                        {
                            dReal q1, q2, q3, q4, p1, p2, p3, p4;

                            /* load p and q values */
                            q1 = ptrBElement[0 * b_stride];
                            q2 = ptrBElement[1 * b_stride];
                            q3 = ptrBElement[2 * b_stride];
                            q4 = ptrBElement[3 * b_stride];
                            p1 = ptrLElement[0];
                            p2 = ptrLElement[1];
                            p3 = ptrLElement[2];
                            p4 = ptrLElement[3];

                            /* compute outer product and add it to the Z matrix */
                            Z1 += p1 * q1;
                            Z2 += p2 * q2;
                            Z3 += p3 * q3;
                            Z4 += p4 * q4;
                            dSASSERT(block_step == 4);

                            if (columnCounter > 3)
                            {
                                columnCounter -= 3;

                                ptrLElement += 3 * block_step;
                                ptrBElement += 3 * block_step * b_stride;
                            
                                /* load p and q values */
                                q1 = ptrBElement[-8 * (int)b_stride];
                                q2 = ptrBElement[-7 * (int)b_stride];
                                q3 = ptrBElement[-6 * (int)b_stride];
                                q4 = ptrBElement[-5 * (int)b_stride];
                                p1 = ptrLElement[-8];
                                p2 = ptrLElement[-7];
                                p3 = ptrLElement[-6];
                                p4 = ptrLElement[-5];

                                /* compute outer product and add it to the Z matrix */
                                Z1 += p1 * q1;
                                Z2 += p2 * q2;
                                Z3 += p3 * q3;
                                Z4 += p4 * q4;

                                /* load p and q values */
                                q1 = ptrBElement[-4 * (int)b_stride];
                                q2 = ptrBElement[-3 * (int)b_stride];
                                q3 = ptrBElement[-2 * (int)b_stride];
                                q4 = ptrBElement[-1 * (int)b_stride];
                                p1 = ptrLElement[-4];
                                p2 = ptrLElement[-3];
                                p3 = ptrLElement[-2];
                                p4 = ptrLElement[-1];

                                /* compute outer product and add it to the Z matrix */
                                Z1 += p1 * q1;
                                Z2 += p2 * q2;
                                Z3 += p3 * q3;
                                Z4 += p4 * q4;
                                dSASSERT(block_step == 4);
                            }
                            else
                            {
                                ptrLElement += block_step;
                                ptrBElement += block_step * b_stride;

                                if (--columnCounter == 0)
                                {
                                    break;
                                }
                            }
                            /* end of inner loop */
                        }

                        tempZ[partialRow] += Z1 + Z2 + Z3 + Z4;

                        if (++partialRow == loopX1RowCount)
                        {
                            // Here switch is used to avoid accessing Z by parametrized index. 
                            // So far all the accesses were performed by explicit constants
                            // what lets the compiler treat Z elements as individual variables 
                            // rather than array elements.
                            Z[0] += tempZ[0];

                            if (loopX1RowCount >= 2)
                            {
                                Z[1] += tempZ[1];

                                if (loopX1RowCount > 2)
                                {
                                    Z[2] += tempZ[2];
                                }
                            }
                            dSASSERT(block_step == 4);

                            if (finalColumnBlock == currentBlock)
                            {
                                if (loopX1RowCount > 2)
                                {
                                    // Correct the LElement so that it points to the second row
                                    //
                                    // Note, that ff there is just one partial row, it does not matter that 
                                    // the LElement will remain pointing at the first row, 
                                    // since the former is not going to be used in that case.
                                    ptrLElement -= /*(sizeint)*/rowSkip/* * (loopX1RowCount - 2)*/; dIASSERT(loopX1RowCount == 3);
                                }
                                dSASSERT(block_step == 4);

                                rowEndReached = true;
                                break;
                            }

                            // Take a look if anymore columns have been completed...
                            completedBlocks = refBlockCompletionProgress;
                            dIASSERT(completedBlocks >= finalColumnBlock);

                            if (completedBlocks == finalColumnBlock)
                            {
                                break;
                            }

                            std::fill(tempZ, tempZ + loopX1RowCount, REAL(0.0));
                            partialRow = 0;

                            // Correct the LElement pointer to continue at the first partial row
                            ptrLElement -= (sizeint)rowSkip * (loopX1RowCount - 1);

                            // ...continue if so.
                            columnCompletedSoFar = finalColumnBlock;
                            finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
                        }
                        else
                        {
                            ptrLElement += rowSkip - (finalColumnBlock - columnCompletedSoFar) * block_step;
                            ptrBElement -= (sizeint)((finalColumnBlock - columnCompletedSoFar) * block_step) * b_stride;
                        }
                        /* end of loop by individual rows */
                    }
                }
                else
                {
                    ptrLElement = L + (sizeint)(1/* + currentBlock * block_step*/) * rowSkip/* + completedColumnBlock * block_step*/;
                    ptrBElement = B/* + (sizeint)(completedColumnBlock * block_step) * b_stride*/;
                    dIASSERT(completedColumnBlock == 0);

                    rowEndReached = true;
                }
            }

            if (rowEndReached)
            {
                // Check whether there is still a need to proceed or if the computation has been taken over by another thread
                cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);

                if (blockProgressDescriptors[currentBlock] == oldDescriptor)
                {
                    /* finish computing the X(i) block */
                    if (!partialBlock)
                    {
                        Y[0] = ptrBElement[0 * b_stride] - Z[0];

                        dReal p2 = ptrLElement[0];
                        Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];

                        ptrLElement += rowSkip;

                        dReal p3 = ptrLElement[0];
                        dReal p3_1 = ptrLElement[1];
                        Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];

                        dReal p4 = ptrLElement[rowSkip];
                        dReal p4_1 = ptrLElement[1 + rowSkip];
                        dReal p4_2 = ptrLElement[2 + rowSkip];
                        Y[3] = ptrBElement[3 * b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
                        dSASSERT(block_step == 4);
                    }
                    else
                    {
                        Y[0] = ptrBElement[0 * b_stride] - Z[0];

                        if (loopX1RowCount >= 2)
                        {
                            dReal p2 = ptrLElement[0];
                            Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];

                            if (loopX1RowCount > 2)
                            {
                                dReal p3 = ptrLElement[0 + rowSkip];
                                dReal p3_1 = ptrLElement[1 + rowSkip];
                                Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
                            }
                        }
                        dSASSERT(block_step == 4);
                    }

                    // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
                    CooperativeAtomics::AtomicReadReorderBarrier();

                    // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet 
                    // and the computation result is valid.
                    if (blockProgressDescriptors[currentBlock] == oldDescriptor)
                    {
                        // Assign the results to the result context (possibly in parallel with other threads 
                        // that could and ought to be assigning exactly the same values)
                        SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
                        resultContext.storePrecalculatedZs(Y);

                        // Assign the result assignment progress descriptor
                        cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
                        CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored

                        // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
                        goAssigningTheResult = true;
                    }
                    else
                    {
                        // Otherwise, go on competing for copying the results
                        handleComputationTakenOver = true;
                    }
                }
                else
                {
                    handleComputationTakenOver = true;
                }
            }
            else
            {
                // If the final column has not been reached yet, store current values to the context.
                // Select the other context instance as the previous one might be read by other threads.
                CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
                SolveL1StraightCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
                destinationContext.storePrecalculatedZs(Z);

                // Unlock the row until more columns can be used
                cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
                cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalColumnBlock, nextContextInstance, false);
                // The descriptor might have been updated by a competing thread
                if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
                {
                    // Adjust the ptrBElement to point to the result area...
                    ptrBElement = B + (sizeint)(currentBlock * block_step) * b_stride;
                    // ...and go on handling the case
                    handleComputationTakenOver = true;
                }
            }

            if (handleComputationTakenOver)
            {
                cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
                // This can only happen if the row was (has become) the uppermost not fully completed one
                // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
                if (existingDescriptor != INVALID_CELLDESCRIPTOR)
                {
                    // If not fully completed this must be the final stage of the result assignment into the matrix
                    dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));

                    // Go on competing copying the result as anyway the block is the topmost not completed one
                    // and since there was competition for it, there is no other work that can be done right now.
                    const SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
                    resultContext.loadPrecalculatedZs(Y);

                    goAssigningTheResult = true;
                }
                else 
                {
                    // everything is over -- just go handling next blocks
                }
            }
        }
        else if (goForLockedBlockDuplicateCalculation)
        {
            blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;

            bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;

            /* declare variables */
            const dReal *ptrLElement;

            if (completedColumnBlock < currentBlock)
            {
                /* check if this is not the partial block of fewer rows */
                if (currentBlock != lastBlock || loopX1RowCount == 0)
                {
                    partialBlock = false;

                    ptrLElement = L + (sizeint)(1 + currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
                    ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;

                    /* the inner loop that computes outer products and adds them to Z */
                    unsigned finalColumnBlock = currentBlock;
                    dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?

                    unsigned lastCompletedColumn = completedColumnBlock;
                    unsigned columnCounter = finalColumnBlock - completedColumnBlock;
                    for (bool exitInnerLoop = false; !exitInnerLoop; exitInnerLoop = --columnCounter == 0)
                    {
                        dReal q1, p1, p2, p3, p4;

                        /* load p and q values */
                        q1 = ptrBElement[0 * b_stride];
                        p1 = (ptrLElement - rowSkip)[0];
                        p2 = ptrLElement[0];
                        ptrLElement += rowSkip;
                        p3 = ptrLElement[0];
                        p4 = ptrLElement[0 + rowSkip];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;

                        /* load p and q values */
                        q1 = ptrBElement[1 * b_stride];
                        p3 = ptrLElement[1];
                        p4 = ptrLElement[1 + rowSkip];
                        ptrLElement -= rowSkip;
                        p1 = (ptrLElement - rowSkip)[1];
                        p2 = ptrLElement[1];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;

                        /* load p and q values */
                        q1 = ptrBElement[2 * b_stride];
                        p1 = (ptrLElement - rowSkip)[2];
                        p2 = ptrLElement[2];
                        ptrLElement += rowSkip;
                        p3 = ptrLElement[2];
                        p4 = ptrLElement[2 + rowSkip];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;

                        /* load p and q values */
                        q1 = ptrBElement[3 * b_stride];
                        p3 = ptrLElement[3];
                        p4 = ptrLElement[3 + rowSkip];
                        ptrLElement -= rowSkip;
                        p1 = (ptrLElement - rowSkip)[3];
                        p2 = ptrLElement[3];

                        /* compute outer product and add it to the Z matrix */
                        Z[0] += p1 * q1;
                        Z[1] += p2 * q1;
                        Z[2] += p3 * q1;
                        Z[3] += p4 * q1;
                        dSASSERT(block_step == 4);

                        // Check if the primary solver thread has not made any progress
                        cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
                        unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);

                        if (newCompletedColumn != lastCompletedColumn)
                        {
                            // Check, this is the first change the current thread detects.
                            // There is absolutely no reason in code for the computation to stop/resume twice 
                            // while the current thread is competing.
                            dIASSERT(lastCompletedColumn == completedColumnBlock);

                            if (descriptorVerification == INVALID_CELLDESCRIPTOR)
                            {
                                skipToHandlingSubsequentRows = true;
                                break;
                            }

                            if (newCompletedColumn == currentBlock + 1)
                            {
                                skiptoCopyingResult = true;
                                break;
                            }

                            // Check if the current thread is behind
                            if (newCompletedColumn > finalColumnBlock - columnCounter)
                            {
                                // If so, go starting over one more time
                                blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
                                stayWithinTheBlock = true;
                                skipToHandlingSubsequentRows = true;
                                break;
                            }

                            // If current thread is ahead, just save new completed column for further comparisons and go on calculating
                            lastCompletedColumn = newCompletedColumn;
                        }

                        /* advance pointers */
                        ptrLElement += block_step;
                        ptrBElement += block_step * b_stride;
                        /* end of inner loop */
                    }
                }
                else
                {
                    partialBlock = true;

                    dReal tempZ[dMACRO_MAX(block_step - 1U, 1U)] = { REAL(0.0), };

                    ptrLElement = L + (sizeint)(/*1 + */currentBlock * block_step) * rowSkip + completedColumnBlock * block_step;
                    ptrBElement = B + (sizeint)(completedColumnBlock * block_step) * b_stride;

                    /* the inner loop that computes outer products and adds them to Z */
                    unsigned finalColumnBlock = currentBlock;
                    dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?

                    unsigned lastCompletedColumn = completedColumnBlock;
                    for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
                    {
                        dReal q1, q2, q3, q4;

                        /* load q values */
                        q1 = ptrBElement[0 * b_stride];
                        q2 = ptrBElement[1 * b_stride];
                        q3 = ptrBElement[2 * b_stride];
                        q4 = ptrBElement[3 * b_stride];

                        for (unsigned partialRow = 0; ; )
                        {
                            dReal p1, p2, p3, p4;

                            /* load p values */
                            p1 = ptrLElement[0];
                            p2 = ptrLElement[1];
                            p3 = ptrLElement[2];
                            p4 = ptrLElement[3];

                            /* compute outer product and add it to the Z matrix */
                            tempZ[partialRow] += p1 * q1 + p2 * q2 + p3 * q3 + p4 * q4;
                            dSASSERT(block_step == 4);

                            if (++partialRow == loopX1RowCount)
                            {
                                break;
                            }

                            ptrLElement += rowSkip;
                        }

                        // Check if the primary solver thread has not made any progress
                        cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
                        unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);

                        if (newCompletedColumn != lastCompletedColumn)
                        {
                            // Check, this is the first change the current thread detects.
                            // There is absolutely no reason in code for the computation to stop/resume twice 
                            // while the current thread is competing.
                            dIASSERT(lastCompletedColumn == completedColumnBlock);

                            if (descriptorVerification == INVALID_CELLDESCRIPTOR)
                            {
                                skipToHandlingSubsequentRows = true;
                                break;
                            }

                            if (newCompletedColumn == currentBlock + 1)
                            {
                                skiptoCopyingResult = true;
                                break;
                            }

                            // Check if the current thread is behind
                            if (newCompletedColumn > finalColumnBlock - columnCounter)
                            {
                                // If so, go starting over one more time
                                blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
                                stayWithinTheBlock = true;
                                skipToHandlingSubsequentRows = true;
                                break;
                            }

                            // If current thread is ahead, just save new completed column for further comparisons and go on calculating
                            lastCompletedColumn = newCompletedColumn;
                        }

                        ptrLElement += block_step;
                        ptrBElement += block_step * b_stride;

                        if (--columnCounter == 0)
                        {
                            // Here switch is used to avoid accessing Z by parametrized index. 
                            // So far all the accesses were performed by explicit constants
                            // what lets the compiler treat Z elements as individual variables 
                            // rather than array elements.
                            Z[0] += tempZ[0];

                            if (loopX1RowCount >= 2)
                            {
                                Z[1] += tempZ[1];

                                if (loopX1RowCount > 2)
                                {
                                    Z[2] += tempZ[2];

                                    // Correct the LElement so that it points to the second row
                                    //
                                    // Note, that if there is just one partial row, it does not matter that 
                                    // the LElement will remain pointing at the first row, 
                                    // since the former is not going to be used in that case.
                                    ptrLElement -= /*(sizeint)*/rowSkip/* * (loopX1RowCount - 2)*/; dIASSERT(loopX1RowCount == 3);
                                }
                            }
                            dSASSERT(block_step == 4);

                            break;
                        }

                        /* advance pointers */
                        ptrLElement -= (sizeint)rowSkip * (loopX1RowCount - 1);
                        /* end of inner loop */
                    }
                }
            }
            else if (completedColumnBlock > currentBlock)
            {
                dIASSERT(completedColumnBlock == currentBlock + 1);

                partialBlock = currentBlock == lastBlock && loopX1RowCount != 0;

                skiptoCopyingResult = true;
            }
            else
            {
                dIASSERT(currentBlock == 0); // Execution can get here within the very first block only

                partialBlock = rowCount < block_step;

                /* assign the pointers appropriately and go on computing the results */
                ptrLElement = L + (sizeint)(1/* + currentBlock * block_step*/) * rowSkip/* + completedColumnBlock * block_step*/;
                ptrBElement = B/* + (sizeint)(completedColumnBlock * block_step) * b_stride*/;
            }

            if (!skipToHandlingSubsequentRows)
            {
                if (!skiptoCopyingResult)
                {
                    if (!partialBlock)
                    {
                        Y[0] = ptrBElement[0 * b_stride] - Z[0];

                        dReal p2 = ptrLElement[0];
                        Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];

                        ptrLElement += rowSkip;

                        dReal p3 = ptrLElement[0];
                        dReal p3_1 = ptrLElement[1];
                        Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];

                        dReal p4 = ptrLElement[rowSkip];
                        dReal p4_1 = ptrLElement[1 + rowSkip];
                        dReal p4_2 = ptrLElement[2 + rowSkip];
                        Y[3] = ptrBElement[3 * b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
                        dSASSERT(block_step == 4);
                    }
                    else
                    {
                        Y[0] = ptrBElement[0 * b_stride] - Z[0];

                        if (loopX1RowCount >= 2)
                        {
                            dReal p2 = ptrLElement[0];
                            Y[1] = ptrBElement[1 * b_stride] - Z[1] - p2 * Y[0];

                            if (loopX1RowCount > 2)
                            {
                                dReal p3 = ptrLElement[0 + rowSkip];
                                dReal p3_1 = ptrLElement[1 + rowSkip];
                                Y[2] = ptrBElement[2 * b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
                            }
                        }
                        dSASSERT(block_step == 4);
                    }

                    CooperativeAtomics::AtomicReadReorderBarrier();

                    // Use atomic load to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
                    cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];

                    if (existingDescriptor == INVALID_CELLDESCRIPTOR)
                    {
                        // Everything is over -- proceed to subsequent rows
                        skipToHandlingSubsequentRows = true;
                    }
                    else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
                    {
                        // The values computed above may not be valid. Copy the values already in the result context.
                        skiptoCopyingResult = true;
                    }
                    else
                    {
                        // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet 
                        // and the computation result is valid.
                        cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused

                        // Assign the results to the result context (possibly in parallel with other threads 
                        // that could and ought to be assigning exactly the same values)
                        SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
                        resultContext.storePrecalculatedZs(Y);

                        // Assign the result assignment progress descriptor
                        CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored

                        // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
                    }
                }

                if (!skipToHandlingSubsequentRows)
                {
                    if (skiptoCopyingResult)
                    {
                        // Extract the result values stored in the result context
                        const SolveL1StraightCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
                        resultContext.loadPrecalculatedZs(Y);

                        ptrBElement = B + (sizeint)(currentBlock * block_step) * b_stride;
                    }

                    goAssigningTheResult = true;
                }
            }
        }

        if (goAssigningTheResult)
        {
            cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
            // Check if the assignment has not been completed yet
            if (existingDescriptor != INVALID_CELLDESCRIPTOR)
            {
                // Assign the computation results to the B vector
                if (!partialBlock)
                {
                    ptrBElement[0 * b_stride] = Y[0];
                    ptrBElement[1 * b_stride] = Y[1];
                    ptrBElement[2 * b_stride] = Y[2];
                    ptrBElement[3 * b_stride] = Y[3];
                    dSASSERT(block_step == 4);
                }
                else
                {
                    ptrBElement[0 * b_stride] = Y[0];

                    if (loopX1RowCount >= 2)
                    {
                        ptrBElement[1 * b_stride] = Y[1];

                        if (loopX1RowCount > 2)
                        {
                            ptrBElement[2 * b_stride] = Y[2];
                        }
                    }
                    dSASSERT(block_step == 4);
                }

                ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
                dIASSERT(refBlockCompletionProgress >= currentBlock + 1);

                // And assign the completed status no matter what
                CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
            }
            else 
            {
                // everything is over -- just go handling next blocks
            }
        }

        if (!stayWithinTheBlock)
        {
            completedBlocks = refBlockCompletionProgress;

            if (completedBlocks == blockCount)
            {
                break;
            }

            currentBlock += 1;

            bool lookaheadBoundaryReached = false;

            if (currentBlock == blockCount || completedBlocks == 0)
            {
                lookaheadBoundaryReached = true;
            }
            else if (currentBlock >= completedBlocks + lookaheadRange)
            {
                lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
            }
            else if (currentBlock < completedBlocks)
            {
                // Treat detected row advancement as a row processed
                // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it

                currentBlock = completedBlocks;
            }

            if (lookaheadBoundaryReached)
            {
                dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did we not compete???

                // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
                if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
                {
                    // Abandon job if too few blocks remain
                    if (blockCount - completedBlocks <= ownThreadIndex)
                    {
                        break;
                    }

                    blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
                }
                else
                {
                    // If there was some progress, just continue to the next pass
                    blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
                }

                currentBlock = completedBlocks;
            }
        }
    }
}


#endif