//----------------------------------------------------------------------------
//
// setup.cpp
//
// PrimProcessor setup methods.
//
// Copyright (C) Microsoft Corporation, 1997.
//
//----------------------------------------------------------------------------

#include "rgb_pch.h"
#pragma hdrstop

#include "d3dutil.h"
#include "setup.hpp"
#include "attrs_mh.h"
#include "tstp_mh.h"
#include "walk_mh.h"
#include "rsdbg.hpp"

// NOTE(review): presumably sets up the per-file state used by the debug
// output macros (RSDPF, RSDPFM, RSASSERT) used below -- confirm against
// rsdbg.hpp.
DBG_DECLARE_FILE();

//----------------------------------------------------------------------------
//
// MINMAX3
//
// Stores the smallest of three integer values in iMin and the largest
// in iMax.
//
// This is a statement macro: it expands to a bare if/else chain, so it
// should be used as a statement of its own.  The value arguments are
// evaluated more than once and therefore must not have side effects.
//
//----------------------------------------------------------------------------

#define MINMAX3(iV0, iV1, iV2, iMin, iMax)                                    \
    if ((iV0) <= (iV1))                                                       \
    {                                                                         \
        if ((iV1) <= (iV2))                                                   \
        {                                                                     \
            /* Already ascending: iV0 <= iV1 <= iV2. */                       \
            (iMin) = (iV0);                                                   \
            (iMax) = (iV2);                                                   \
        }                                                                     \
        else                                                                  \
        {                                                                     \
            /* iV1 is the largest; the smallest is iV0 or iV2. */             \
            (iMin) = ((iV0) <= (iV2)) ? (iV0) : (iV2);                        \
            (iMax) = (iV1);                                                   \
        }                                                                     \
    }                                                                         \
    else if ((iV1) <= (iV2))                                                  \
    {                                                                         \
        /* iV1 is the smallest; the largest is iV0 or iV2. */                 \
        (iMin) = (iV1);                                                       \
        (iMax) = ((iV0) <= (iV2)) ? (iV2) : (iV0);                            \
    }                                                                         \
    else                                                                      \
    {                                                                         \
        /* Strictly descending: iV0 > iV1 > iV2. */                           \
        (iMin) = (iV2);                                                       \
        (iMax) = (iV0);                                                       \
    }

// NEEDS_NORMALIZE3
//
// Reports whether a set of three RHW values has to be normalized.
//
// The intended test determines whether any of the given values is less
// than zero or greater than one.  Negative zero counts as less than zero,
// so that test produces some false positives, but that's OK:
//
//     ((ASUINT32(fV0) | ASUINT32(fV1) | ASUINT32(fV2)) > INT32_FLOAT_ONE)
//
// ATTENTION The range test is wiped out for now, so the macro always
// evaluates to 1 and does not look at its arguments.  A test for W too
// close to zero is still needed to avoid numerical problems.

#define NEEDS_NORMALIZE3(fV0, fV1, fV2) (1)

//----------------------------------------------------------------------------
//
// PrimProcessor::NormalizeTriRHW
//
// D3DTLVERTEX.dvRHW can be anything, but the internal structures only
// allow for values in the range [0, 1].  This routine brings the RHWs of
// a triangle into range by locating the largest of the three and
// multiplying every vertex's RHW by NORMALIZED_RHW_MAX / largest.
//
// The vertices are modified in place.  The incoming values are stashed
// in m_dvV0RHW, m_dvV1RHW and m_dvV2RHW so that they can be put back
// afterwards.
//
// NOTE(review): the largest RHW is used as a divisor without any check
// for zero -- presumably callers never pass three RHWs whose largest
// value is zero; confirm.
//
//----------------------------------------------------------------------------

void
PrimProcessor::NormalizeTriRHW(LPD3DTLVERTEX pV0, LPD3DTLVERTEX pV1,
                               LPD3DTLVERTEX pV2)
{
    // Stash the incoming values before anything is rescaled.
    m_dvV0RHW = pV0->dvRHW;
    m_dvV1RHW = pV1->dvRHW;
    m_dvV2RHW = pV2->dvRHW;

#if DBG
    // Debug builds complain about negative RHWs but carry on regardless.
    if (FLOAT_LTZ(pV0->dvRHW) ||
        FLOAT_LTZ(pV1->dvRHW) ||
        FLOAT_LTZ(pV2->dvRHW))
    {
        RSDPF(("Triangle RHW out of range %f,%f,%f\n",
               pV0->dvRHW, pV1->dvRHW, pV2->dvRHW));
    }
#endif

    // Locate the largest RHW.  Nothing has been written to the vertices
    // yet, so working from local copies is equivalent to re-reading them.
    FLOAT fRHW0 = pV0->dvRHW;
    FLOAT fRHW1 = pV1->dvRHW;
    FLOAT fRHW2 = pV2->dvRHW;
    FLOAT fLargest;

    if (fRHW0 < fRHW1)
    {
        // Vertex 0 is out of the running.
        fLargest = (fRHW1 < fRHW2) ? fRHW2 : fRHW1;
    }
    else if (fRHW1 < fRHW2)
    {
        // Vertex 1 is out of the running.
        fLargest = (fRHW0 < fRHW2) ? fRHW2 : fRHW0;
    }
    else
    {
        // Vertex 0 is at least as large as 1, which is at least as large
        // as 2.
        fLargest = fRHW0;
    }

    // The factor that maps the largest value onto NORMALIZED_RHW_MAX.
    FLOAT fToNormalized = NORMALIZED_RHW_MAX / fLargest;

    // Rescale each vertex in place.  The values are deliberately re-read
    // through the pointers rather than taken from the copies above.
    pV0->dvRHW *= fToNormalized;
    pV1->dvRHW *= fToNormalized;
    pV2->dvRHW *= fToNormalized;

#ifdef DBG_RHW_NORM
    // Optional trace of the values before and after scaling.
    RSDPF(("%f,%f,%f - %f,%f,%f\n",
           m_dvV0RHW, m_dvV1RHW, m_dvV2RHW,
           pV0->dvRHW, pV1->dvRHW, pV2->dvRHW));
#endif
}

//----------------------------------------------------------------------------
//
// PrimProcessor::TriSetup
//
// Takes three vertices and does triangle setup, filling in both a
// primitive structure for the triangle and a span structure for the first
// span.  All internal intermediates and DY values are computed.
//
// Uses the current D3DI_RASTPRIM and D3DI_RASTSPAN so these pointers must
// be valid before calling this routine.
//
// Parameters:
//   pV0, pV1, pV2 - The triangle's vertices in their original order.  The
//                   routine sorts its own copies of the pointers by screen
//                   Y.  The dvRHW fields of the vertices may be rescaled
//                   while TriSetup_Start runs but are restored before
//                   returning.
//
// Returns whether the triangle was kept or not.  Culled triangles return
// FALSE, as do zero-area triangles, triangles that lie outside the clip
// rect and triangles that do not cover any pixels.  Note that parts of
// m_StpCtx (pFlatVtx, the edge deltas, iY, ...) have already been written
// by the time some of these rejections are detected.
//
//----------------------------------------------------------------------------

BOOL
PrimProcessor::TriSetup(LPD3DTLVERTEX pV0,
                        LPD3DTLVERTEX pV1,
                        LPD3DTLVERTEX pV2)
{
    // Preserve original first vertex for flat shading reference.
    // This has to happen before the sort below reorders the pointers.
    m_StpCtx.pFlatVtx = pV0;

    //
    // Sort vertices in Y.
    // This can cause ordering changes from the original vertex set
    // so track reversals.
    //
    // Determinant computation and culling could be done before this.
    // Doing so causes headaches with computing deltas up front, though,
    // because the edges may change during sorting.
    //
    // After the sort pV0 has the smallest Y and pV2 the largest.
    // uReversed is set for the orderings that swap two of the original
    // vertices; orderings that merely rotate the three vertices leave
    // it clear.
    //

    LPD3DTLVERTEX pVTmp;
    UINT uReversed;

    uReversed = 0;
    if (pV0->dvSY <= pV1->dvSY)
    {
        if (pV1->dvSY <= pV2->dvSY)
        {
            // Sorted.
        }
        else if (pV0->dvSY <= pV2->dvSY)
        {
            // Sorted order is 0 2 1.
            pVTmp = pV1;
            pV1 = pV2;
            pV2 = pVTmp;
            uReversed = 1;
        }
        else
        {
            // Sorted order is 2 0 1.
            pVTmp = pV0;
            pV0 = pV2;
            pV2 = pV1;
            pV1 = pVTmp;
        }
    }
    else if (pV1->dvSY < pV2->dvSY)
    {
        if (pV0->dvSY <= pV2->dvSY)
        {
            // Sorted order is 1 0 2.
            pVTmp = pV0;
            pV0 = pV1;
            pV1 = pVTmp;
            uReversed = 1;
        }
        else
        {
            // Sorted order is 1 2 0.
            pVTmp = pV0;
            pV0 = pV1;
            pV1 = pV2;
            pV2 = pVTmp;
        }
    }
    else
    {
        // Sorted order is 2 1 0.
        pVTmp = pV0;
        pV0 = pV2;
        pV2 = pVTmp;
        uReversed = 1;
    }

    // Screen coordinates of the sorted vertices.
    FLOAT fX0 = pV0->dvSX;
    FLOAT fX1 = pV1->dvSX;
    FLOAT fX2 = pV2->dvSX;
    FLOAT fY0 = pV0->dvSY;
    FLOAT fY1 = pV1->dvSY;
    FLOAT fY2 = pV2->dvSY;

    //
    // Compute x,y deltas.
    // These are the 0-1 and 0-2 edge vectors of the sorted triangle.
    //
    m_StpCtx.fDX10 = fX1 - fX0;
    m_StpCtx.fDX20 = fX2 - fX0;
    m_StpCtx.fDY10 = fY1 - fY0;
    m_StpCtx.fDY20 = fY2 - fY0;

    //
    // Compute determinant and do culling.
    // The determinant is the 2D cross product of the two edge vectors
    // computed above.
    //
    FLOAT fDet;

    fDet = m_StpCtx.fDX20 * m_StpCtx.fDY10 - m_StpCtx.fDX10 * m_StpCtx.fDY20;
    if (FLOAT_EQZ(fDet))
    {
        // No area, so bail out
        return FALSE;
    }

    // Get sign of determinant.
    UINT uDetCcw = FLOAT_GTZ(fDet) ? 1 : 0;

    // If culling is off the cull sign to check against is set to a
    // value that can't be matched so this single check is sufficient
    // for all three culling cases.
    //
    // Fold in sign reversal here rather than in uDetCcw because
    // we need the true sign later to determine whether the long edge is
    // to the left or the right.
    if ((uDetCcw ^ uReversed) == m_StpCtx.pCtx->uCullFaceSign)
    {
        return FALSE;
    }

    // Snap bounding vertex Y's to pixel centers and check for trivial reject.
    // The comparisons treat Clip.bottom as an exclusive bound.

    m_StpCtx.iY = ICEILF(fY0);
    m_iY2 = ICEILF(fY2);

    if (m_StpCtx.iY >= m_StpCtx.pCtx->Clip.bottom ||
        m_iY2 <= m_StpCtx.pCtx->Clip.top)
    {
        return FALSE;
    }

    INT iX0 = ICEILF(fX0);
    INT iX1 = ICEILF(fX1);
    INT iX2 = ICEILF(fX2);

    // Start 2 - 0 edge DXDY divide so that it's overlapped with the
    // integer processing done during X clip checking.  The assumption
    // is that it's nearly zero cost when overlapped so it's worth
    // it to start it even when the clip check rejects the triangle.
    //
    // NOTE(review): presumably no other floating-point work may be placed
    // between FLD_BEGIN_DIVIDE and the matching FSTP_END_DIVIDE, and the
    // function must not return between them -- confirm against the macro
    // definitions before reordering anything here.
    FLOAT fDX20, fDY20, fDXDY20;

    // Need to use stack variables so the assembly can understand the
    // address.
    fDX20 = m_StpCtx.fDX20;
    fDY20 = m_StpCtx.fDY20;
    FLD_BEGIN_DIVIDE(fDX20, fDY20, fDXDY20);

    // Computing the X triangle bounds involves quite a few operations,
    // but it allows for both trivial rejection and trivial acceptance.
    // Given that guard band clipping can lead to a lot of trivial rejections
    // and that there will usually be a lot of trivial acceptance cases,
    // the work is worth it.

    INT iMinX, iMaxX;
    BOOL bXAccept;

    MINMAX3(iX0, iX1, iX2, iMinX, iMaxX);

    m_iXWidth = iMaxX - iMinX;

    // Use X bounds for trivial reject and accept.
    // The result is only recorded in bXAccept here; the actual return is
    // deferred until the overlapped divide has been completed below.
    if (iMinX >= m_StpCtx.pCtx->Clip.right ||
        iMaxX <= m_StpCtx.pCtx->Clip.left ||
        m_iXWidth <= 0)
    {
        bXAccept = FALSE;
    }
    else
    {
        if (iMinX >= m_StpCtx.pCtx->Clip.left &&
            iMaxX <= m_StpCtx.pCtx->Clip.right)
        {
            // The whole X extent is inside the clip rect.
            // NOTE(review): the flag is only ever set here, never cleared,
            // so uFlags is presumably reset elsewhere per primitive --
            // confirm.
            m_StpCtx.uFlags |= PRIMF_TRIVIAL_ACCEPT_X;
        }
        else
        {
            RSDPFM((RSM_XCLIP, "XClip bounds %5d - %5d, %5d\n",
                    iMinX, iMaxX, m_iXWidth));
        }

        bXAccept = TRUE;
    }

    // Complete divide.
    FSTP_END_DIVIDE(fDXDY20);

    if (!bXAccept)
    {
        return FALSE;
    }

    // Clamp triangle Y's to clip rect.

    m_iY1 = ICEILF(fY1);

    if (m_StpCtx.iY < m_StpCtx.pCtx->Clip.top)
    {
        RSDPFM((RSM_YCLIP, "YClip iY %d to %d\n",
                m_StpCtx.iY, m_StpCtx.pCtx->Clip.top));

        m_StpCtx.iY = m_StpCtx.pCtx->Clip.top;

        // The middle vertex can only be above the clip rect if the
        // top vertex is, so it is checked only in this case.
        if (m_iY1 < m_StpCtx.pCtx->Clip.top)
        {
            RSDPFM((RSM_YCLIP, "YClip iY1 %d to %d\n",
                    m_iY1, m_StpCtx.pCtx->Clip.top));

            m_iY1 = m_StpCtx.pCtx->Clip.top;
        }
    }

    if (m_iY1 > m_StpCtx.pCtx->Clip.bottom)
    {
        // The middle vertex is below the clip rect, so the bottom
        // vertex is too.
        RSDPFM((RSM_YCLIP, "YClip iY1 %d, iY2 %d to %d\n",
                m_iY1, m_iY2, m_StpCtx.pCtx->Clip.bottom));

        m_iY1 = m_StpCtx.pCtx->Clip.bottom;
        m_iY2 = m_StpCtx.pCtx->Clip.bottom;
    }
    else if (m_iY2 > m_StpCtx.pCtx->Clip.bottom)
    {
        RSDPFM((RSM_YCLIP, "YClip iY2 %d to %d\n",
                m_iY2, m_StpCtx.pCtx->Clip.bottom));

        m_iY2 = m_StpCtx.pCtx->Clip.bottom;
    }

    // Compute Y subpixel correction.  This will include any Y
    // offset due to clamping.
    m_StpCtx.fDY = m_StpCtx.iY - fY0;

    // Compute trapezoid heights.  These will be restricted to
    // lie in the clip rect.
    // m_uHeight10 covers the scanlines between vertex 0 and vertex 1,
    // m_uHeight21 those between vertex 1 and vertex 2.

    RSASSERT(m_iY1 >= m_StpCtx.iY && m_iY2 >= m_iY1);

    m_uHeight10 = m_iY1 - m_StpCtx.iY;
    m_uHeight21 = m_iY2 - m_iY1;

    m_uHeight20 = m_uHeight10 + m_uHeight21;
    if (m_uHeight20 == 0)
    {
        // Triangle doesn't cover any pixels.
        return FALSE;
    }

    // Debug dump of the accepted triangle.
    RSDPFM((RSM_TRIS, "Tstp (%.4f,%.4f) (%.4f,%.4f) (%.4f,%.4f)\n",
            fX0, fY0, fX1, fY1, fX2, fY2));
    RSDPFM((RSM_TRIS, "    (%.4f,%.4f : %.4f,%.4f) %d:%d det %.4f\n",
            m_StpCtx.fDX10, m_StpCtx.fDY10,
            m_StpCtx.fDX20, m_StpCtx.fDY20,
            m_uHeight10, m_uHeight21, fDet));
    RSDPFM((RSM_Z, "    Z (%f) (%f) (%f)\n",
        pV0->dvSZ, pV1->dvSZ, pV2->dvSZ));
    RSDPFM((RSM_DIFF, "    diff (0x%08X) (0x%08X) (0x%08X)\n",
            pV0->dcColor, pV1->dcColor, pV2->dcColor));
    // NOTE(review): the "didx" line prints the same dcColor values as
    // the "diff" line above -- confirm that this is intended.
    RSDPFM((RSM_DIDX, "    didx (0x%08X) (0x%08X) (0x%08X)\n",
            pV0->dcColor, pV1->dcColor, pV2->dcColor));
    RSDPFM((RSM_SPEC, "    spec (0x%08X) (0x%08X) (0x%08X)\n",
            pV0->dcSpecular & 0xffffff, pV1->dcSpecular & 0xffffff,
            pV2->dcSpecular & 0xffffff));
    RSDPFM((RSM_OOW, "    OoW (%f) (%f) (%f)\n",
            pV0->dvRHW, pV1->dvRHW, pV2->dvRHW));
    RSDPFM((RSM_TEX1, "    Tex1 (%f,%f) (%f,%f) (%f,%f)\n",
            pV0->dvTU, pV0->dvTV, pV1->dvTU, pV1->dvTV,
            pV2->dvTU, pV2->dvTV));
    RSDPFM((RSM_FOG, "    Fog (0x%02X) (0x%02X) (0x%02X)\n",
            RGBA_GETALPHA(pV0->dcSpecular),
            RGBA_GETALPHA(pV1->dcSpecular),
            RGBA_GETALPHA(pV2->dcSpecular)));

    // Compute dx/dy for edges and initial X's.

    // fDX starts out as the X distance travelled along the 0-2 edge
    // over the Y subpixel correction, which makes fX20 the X of that
    // edge on the first scanline.
    m_StpCtx.fDX = m_StpCtx.fDY * fDXDY20;
    FLOAT fX20 = fX0 + m_StpCtx.fDX;

    // X20.iV is not set here; it is assigned further down once the
    // side of the long edge is known.
    ComputeIntCarry(fX20, fDXDY20, &m_StpCtx.X20);
    // Keep floating-point copies of the no-carry and carry steps.
    m_StpCtx.fX20NC = (FLOAT)m_StpCtx.X20.iNC;
    m_StpCtx.fX20CY = (FLOAT)m_StpCtx.X20.iCY;

    RSDPFM((RSM_TRIS, "    edge20  %f dxdy %f\n", fX20, fDXDY20));
    RSDPFM((RSM_TRIS, "            (?.%d d %d nc %d cy %d)\n",
            m_StpCtx.X20.iFrac,
            m_StpCtx.X20.iDFrac, m_StpCtx.X20.iNC, m_StpCtx.X20.iCY));

    // The 0-1 edge is only set up when the upper trapezoid has
    // scanlines in it.
    if (m_uHeight10 > 0)
    {
        FLOAT fDXDY10;
        FLOAT fX10;

#ifdef CHECK_VERTICAL
        // This case probably doesn't occur enough to justify the code.
        if (FLOAT_EQZ(m_StpCtx.fDX10))
        {
            fDXDY10 = g_fZero;
            fX10 = fX0;
        }
        else
#endif
        {
            fDXDY10 = m_StpCtx.fDX10 / m_StpCtx.fDY10;
            fX10 = fX0 + m_StpCtx.fDY * fDXDY10;
        }

        m_StpCtx.X10.iV = ICEILF(fX10);
        ComputeIntCarry(fX10, fDXDY10, &m_StpCtx.X10);

        RSDPFM((RSM_TRIS, "    edge10  %f dxdy %f\n", fX10, fDXDY10));
        RSDPFM((RSM_TRIS, "            (%d.%d d %d nc %d cy %d)\n",
                m_StpCtx.X10.iV, m_StpCtx.X10.iFrac,
                m_StpCtx.X10.iDFrac, m_StpCtx.X10.iNC, m_StpCtx.X10.iCY));
    }
#if DBG
    else
    {
        // Make it easier to detect when an invalid edge is used.
        memset(&m_StpCtx.X10, 0, sizeof(m_StpCtx.X10));
    }
#endif

    // Likewise the 1-2 edge is only set up when the lower trapezoid
    // has scanlines in it.
    if (m_uHeight21 > 0)
    {
        FLOAT fDXDY21;
        FLOAT fX21;

#ifdef CHECK_VERTICAL
        // This case probably doesn't occur enough to justify the code.
        if (FLOAT_COMPARE(fX1, ==, fX2))
        {
            fDXDY21 = g_fZero;
            fX21 = fX1;
        }
        else
#endif
        {
            fDXDY21 = (fX2 - fX1) / (fY2 - fY1);
            // This edge starts at the (possibly clamped) scanline m_iY1
            // so its subpixel correction is relative to vertex 1.
            fX21 = fX1 + (m_iY1 - fY1) * fDXDY21;
        }

        m_StpCtx.X21.iV = ICEILF(fX21);
        ComputeIntCarry(fX21, fDXDY21, &m_StpCtx.X21);

        RSDPFM((RSM_TRIS, "    edge21  %f dxdy %f\n", fX21, fDXDY21));
        RSDPFM((RSM_TRIS, "            (%d.%d d %d nc %d cy %d)\n",
                m_StpCtx.X21.iV, m_StpCtx.X21.iFrac,
                m_StpCtx.X21.iDFrac, m_StpCtx.X21.iNC, m_StpCtx.X21.iCY));
    }
#if DBG
    else
    {
        // Make it easier to detect when an invalid edge is used.
        memset(&m_StpCtx.X21, 0, sizeof(m_StpCtx.X21));
    }
#endif

    // The edge walker always walks the long edge so it may either
    // be a left or a right edge.  Determine what side the long edge
    // is and perform appropriate snapping and subpixel adjustment
    // computations.
    //
    // The clip-clamped initial X pixel position is also computed and
    // any necessary offset added into the subpixel correction delta.

    if (uDetCcw)
    {
        // Long edge (0-2) is to the right.

        m_StpCtx.uFlags |= TRIF_X_DEC;
        m_StpCtx.pPrim->uFlags = D3DI_RASTPRIM_X_DEC;

        m_StpCtx.X20.iV = ICEILF(fX20) - 1;

        // Other edges are left edges.  Bias them back by one
        // so that the span width computation can do R - L
        // rather than R - L + 1.
        // The bias is applied to both edges unconditionally, including
        // an edge that was skipped above because its trapezoid is empty.
        m_StpCtx.X10.iV--;
        m_StpCtx.X21.iV--;

        // Clamp the initial X position.
        if (m_StpCtx.X20.iV >= m_StpCtx.pCtx->Clip.right)
        {
            m_StpCtx.iX = m_StpCtx.pCtx->Clip.right - 1;
        }
        else
        {
            m_StpCtx.iX = m_StpCtx.X20.iV;
        }
    }
    else
    {
        // Long edge (0-2) is to the left.

        // NOTE(review): TRIF_X_DEC is not cleared from m_StpCtx.uFlags
        // on this path, so it is presumably clear on entry -- confirm.
        m_StpCtx.pPrim->uFlags = 0;

        m_StpCtx.X20.iV = ICEILF(fX20);

        // Other edges are right edges.  The ICEILF snapping done
        // already leaves them off by one so that R - L works.

        // Clamp the initial X position.
        if (m_StpCtx.X20.iV < m_StpCtx.pCtx->Clip.left)
        {
            m_StpCtx.iX = m_StpCtx.pCtx->Clip.left;
        }
        else
        {
            m_StpCtx.iX = m_StpCtx.X20.iV;
        }
    }

    // Update X subpixel correction.  This delta includes any
    // offsetting due to clamping of the initial pixel position.
    m_StpCtx.fDX += m_StpCtx.iX - fX20;

    RSDPFM((RSM_TRIS, "    subp    %f,%f\n", m_StpCtx.fDX, m_StpCtx.fDY));

    // Compute span-to-span steps for buffer pointers.
    // The no-carry step moves down one scanline and across by the
    // long edge's no-carry X step.
    m_StpCtx.DAttrNC.ipSurface = m_StpCtx.pCtx->iSurfaceStride +
        m_StpCtx.X20.iNC * m_StpCtx.pCtx->iSurfaceStep;
    m_StpCtx.DAttrNC.ipZ = m_StpCtx.pCtx->iZStride +
        m_StpCtx.X20.iNC * m_StpCtx.pCtx->iZStep;

    // Start one over determinant divide.  Done after the multiplies
    // since integer multiplies require some of the FP unit.
    // As with the earlier overlapped divide, only integer work is done
    // until the matching FSTP_END_DIVIDE.

    FLOAT fOoDet;

    FLD_BEGIN_DIVIDE(g_fOne, fDet, fOoDet);

    // The carry step differs from the no-carry step by one pixel, in
    // the direction given by comparing the carry and no-carry X steps.
    if (m_StpCtx.X20.iCY > m_StpCtx.X20.iNC)
    {
        m_StpCtx.DAttrCY.ipSurface = m_StpCtx.DAttrNC.ipSurface +
            m_StpCtx.pCtx->iSurfaceStep;
        m_StpCtx.DAttrCY.ipZ = m_StpCtx.DAttrNC.ipZ + m_StpCtx.pCtx->iZStep;
    }
    else
    {
        m_StpCtx.DAttrCY.ipSurface = m_StpCtx.DAttrNC.ipSurface -
            m_StpCtx.pCtx->iSurfaceStep;
        m_StpCtx.DAttrCY.ipZ = m_StpCtx.DAttrNC.ipZ - m_StpCtx.pCtx->iZStep;
    }

    //
    // Compute attribute functions.
    //

    // Set pure X/Y step deltas for surface and Z so that DX, DY, CY and NC all
    // have complete information and can be used interchangeably.
    if (m_StpCtx.uFlags & TRIF_X_DEC)
    {
        // Spans are walked with decreasing X so the X step is negated.
        m_StpCtx.DAttrDX.ipSurface = -m_StpCtx.pCtx->iSurfaceStep;
        m_StpCtx.DAttrDX.ipZ = -m_StpCtx.pCtx->iZStep;
    }
    else
    {
        m_StpCtx.DAttrDX.ipSurface = m_StpCtx.pCtx->iSurfaceStep;
        m_StpCtx.DAttrDX.ipZ = m_StpCtx.pCtx->iZStep;
    }
    m_StpCtx.DAttrDY.ipSurface = m_StpCtx.pCtx->iSurfaceStride;
    m_StpCtx.DAttrDY.ipZ = m_StpCtx.pCtx->iZStride;

    // Finish overlapped divide.
    FSTP_END_DIVIDE(fOoDet);

    m_StpCtx.fOoDet = fOoDet;

    // The PrimProcessor is created zeroed out so the initial
    // state is FP clean.  Later uses may put FP values in slots but
    // they should still be valid, so the optional computations here
    // should never result in FP garbage.  It should therefore be
    // OK to use any mixture of attribute handlers since there should
    // never be any case where FP garbage will creep in.

    // Tracks whether the vertices' RHWs were rescaled and therefore
    // need to be put back before returning.
    BOOL bNorm;

    // USED checks cannot be combined since TEX_USED is a multibit check.
    if ((m_StpCtx.uFlags & PRIMSF_TEX_USED) &&
        (m_StpCtx.uFlags & PRIMSF_PERSP_USED) &&
        (m_uPpFlags & PPF_NORMALIZE_RHW) &&
        NEEDS_NORMALIZE3(pV0->dvRHW, pV1->dvRHW, pV2->dvRHW))
    {
        // This modifies the caller's vertices in place and saves the
        // original RHWs in m_dvV0RHW, m_dvV1RHW and m_dvV2RHW.
        NormalizeTriRHW(pV0, pV1, pV2);
        bNorm = TRUE;
    }
    else
    {
        bNorm = FALSE;
    }

    // Hand the sorted vertices to the attribute setup.
    TriSetup_Start(&m_StpCtx, pV0, pV1, pV2);

    if (bNorm)
    {
        // Put the original RHWs back so the caller's vertex data is
        // left as it was passed in.
        pV0->dvRHW = m_dvV0RHW;
        pV1->dvRHW = m_dvV1RHW;
        pV2->dvRHW = m_dvV2RHW;
    }

    return TRUE;
}