/* Copyright (c) 2015  Gerald Knizia
 * 
 * This file is part of the IboView program (see: http://www.iboview.org)
 * 
 * IboView is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3.
 * 
 * IboView is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with bfint (LICENSE). If not, see http://www.gnu.org/licenses/
 * 
 * Please see IboView documentation in README.txt for:
 * -- A list of included external software and their licenses. The included
 *    external software's copyright is not touched by this agreement.
 * -- Notes on re-distribution and contributions to/further development of
 *    the IboView software
 */

#include "CxAlgebra.h"
#include "CtDfti.h"
#include "Ir.h"
#include "CtMatrix.h"
using namespace ct;

inline void MXMA(double const *pA, size_t iRowStA, size_t iColStA, double const *pB, size_t iRowStB, size_t iColStB, double *pOut, size_t iRowStOut, size_t iColStOut, size_t nRows, size_t nLink, size_t nCols) {
   FMatrixView const
      A(const_cast<double*>(pA), nRows, nLink, iRowStA, iColStA),
      B(const_cast<double*>(pB), nLink, nCols, iRowStB, iColStB);
   FMatrixView
      Out(pOut, nRows, nCols, iRowStOut, iColStOut);
   Mxm(Out, A, B);
}

inline void MXMB(double const *pA, size_t iRowStA, size_t iColStA, double const *pB, size_t iRowStB, size_t iColStB, double *pOut, size_t iRowStOut, size_t iColStOut, size_t nRows, size_t nLink, size_t nCols) {
   FMatrixView const
      A(const_cast<double*>(pA), nRows, nLink, iRowStA, iColStA),
      B(const_cast<double*>(pB), nLink, nCols, iRowStB, iColStB);
   FMatrixView
      Out(pOut, nRows, nCols, iRowStOut, iColStOut);
   Mxm(Out, A, B, 1.0, MXM_Add);
}


#ifdef _MSC_VER
   // OpenBLAS DGEMV crashes on me if executed in OpenMP-loop. No idea.
   #pragma message( "WARNING: REPLACING REAL DGEMV BY MxmLame AS OPENBLAS/OPENMP WORKAROUND!")
   #define Mxv MxvLame
#endif


namespace dfti {


enum {
   CLOCK_EvalBfn = 0,
   CLOCK_FormRho = 1,
   CLOCK_FormXcMat = 2,
   CLOCK_EvalDftfun = 3,
   CLOCK_GridWtDeriv = 4, // for gradient code.
   CLOCK_CoreGrad = 5,
   CLOCK_TimerCount = 6
};

enum {
   iNoCenter = 0xffffffff
};

struct FDftiResultSet;
struct FDensitySet;
struct FDensitySetInput;





// #define RESUME_CPU_CLOCK(iClockId)
// #define PAUSE_CPU_CLOCK(iClockId)

// note: comment with /* */ to avoid multi-line comment warnings.

#define RESUME_CPU_CLOCK(iClockId) \
   if ( Args.MeasureTime() ) \
      Timings[(iClockId)] -= Second()

#define PAUSE_CPU_CLOCK(iClockId) \
   if ( Args.MeasureTime() ) \
      Timings[(iClockId)] += Second()


struct FDensitySetInput
{
   double
      *pRdm, *pOccOrb;
   size_t
      nBf, nOcc;
   size_t
      *pMap, nMap;
   bool
      UseOrb,
      Gradient; // keep data for gradient evaluation
   uint
      nComp;
   FDensitySetInput() {};
   FDensitySetInput(double *pRdm_, double *pOccOrb_, size_t nBf_, size_t nOcc_,
         size_t *pMap_, size_t nMap_, bool UseOrb_, bool Gradient_, uint nComp_)
      : pRdm(pRdm_), pOccOrb(pOccOrb_), nBf(nBf_), nOcc(nOcc_), pMap(pMap_), nMap(nMap_),
        UseOrb(UseOrb_), Gradient(Gradient_), nComp(nComp_)
   {};
};

// densities for one spin component.
struct FDensitySet
{
   double
      *pRho,      // electron density: rho(rg) = \sum_i [phi_i(rg)]^2 (len: nPt)
      *pRhoGrd,   // gradient of electron density: d/dx_i rho(rg)  (len: 3*nPt)
      *pTau,      // kinetic term: \sum_i dot(grad phi_i(rg), grad phi_i(rg)) (len: nPt)
      *pUpsilon,  // laplacian of electron density: div grad rho(rg) (len: nPt)
      *pBxRho;    // may be 0. Otherwise pOrbVal contracted with density matrix: arrays nGridPt x nBx x nMap.
   size_t
      nGridPt;
   uint
      nDiff,
      nBx; // number of components of pOrbVal stored in pBxRho (1: only density; 4: density+gradient)
   bool
      MakeTau,
      AuxExpandXc;
   double
      fElec;

   void Init(size_t nGridPt_, uint nDiff_, bool MakeTau_, bool AuxExpandXc_, FMemoryStack &Mem);
   // evaluate densities (+gradients)
   void Eval(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, double const *pGridWt, FMemoryStack &Mem);
   void EvalAux(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, double const *pGridWt, FMemoryStack &Mem);


   // calculate and accumulate xc matrix contribution.
   void AccXcMatrix(double *pXcTriang, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem);
   void AccXcMatrixAux(double *pXcVec, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem);

   // accumulate symmetric contribution mu(r) vdrho(r) nu(r) to given xc matrix.
   // beware of mysterious flags.
   void AccXcSym1(double *pXc, double const *pVdRho, double const *pGridWt, double const *pOrbVal, size_t nMapSt, size_t nMap, uint Flags, FMemoryStack &Mem);

   double *MakePhi(double const *pIn, size_t nPts, size_t Stride, FDensitySetInput const &p, FMemoryStack &Mem) const;
   double *MakeBxRho(double const *pIn, size_t nPts, size_t Stride, FDensitySetInput const &p, FMemoryStack &Mem) const;

   double *CompressAuxVec(double const *pInput, FDensitySetInput const &p, FMemoryStack &Mem) const;
};

void FDensitySet::Init(size_t nGridPt_, uint nDiff_, bool MakeTau_, bool AuxExpandXc_, FMemoryStack &Mem)
{
   nGridPt = nGridPt_;
   nDiff = nDiff_;
   MakeTau = MakeTau_;
   AuxExpandXc = AuxExpandXc_;

   pBxRho = 0;
   nBx = 0;

   Mem.Alloc(pRho, nGridPt);
   if ( nDiff >= 1 )
      Mem.Alloc(pRhoGrd, 3 * nGridPt);
   if ( nDiff >= 2 || MakeTau )
      Mem.Alloc(pTau, nGridPt);
   if ( nDiff >= 2 )
      Mem.Alloc(pUpsilon, nGridPt);
}



enum FAlgOpFlags{
   ALGOP_Add = 0x01,  // add to output instead of replacing it.
   ALGOP_Symmetrize = 0x02 // copy lower triangle to upper triangle.
};


// r = dot(x,y)
template<class FScalar>
FScalar Dot2( FScalar const *IR_RP x, FScalar const *IR_RP y, size_t n )
{
   FScalar
      r = 0;
   size_t
      i = 0;
   for ( ; i < (n & (~3)); i += 4 ) {
      r += x[i]   * y[i];
      r += x[i+1] * y[i+1];
      r += x[i+2] * y[i+2];
      r += x[i+3] * y[i+3];
   }
   for ( ; i < n; ++ i ) {
      r += x[i] * y[i];
   }
   return r;
}


template
double Dot2<double>( double const *IR_RP x, double const *IR_RP y, size_t n );

// r[i] += f * x[i]
void Add2( double *IR_RP r, double const *IR_RP x, double f, std::size_t n )
{
   std::size_t
      i = 0;
   for ( ; i < (n & (~3)); i += 4 ) {
      r[i]   += f * x[i];
      r[i+1] += f * x[i+1];
      r[i+2] += f * x[i+2];
      r[i+3] += f * x[i+3];
   }
   for ( ; i < n; ++ i ) {
      r[i] += f * x[i];
   }
}


// r[i] += f * x[i] * y[i]
void Add2( double *IR_RP r, double const *IR_RP x, double const *IR_RP y, double f, std::size_t n )
{
   std::size_t
      i = 0;
   for ( ; i < (n & (~3)); i += 4 ) {
      r[i]   += f * x[i]   * y[i];
      r[i+1] += f * x[i+1] * y[i+1];
      r[i+2] += f * x[i+2] * y[i+2];
      r[i+3] += f * x[i+3] * y[i+3];
   }
   for ( ; i < n; ++ i ) {
      r[i] += f * x[i] * y[i];
   }
}



// ddots multiple strided sets of vectors.
void DotN(double *pOut, double Factor, double const *pA, size_t nStrideA, double const *pB, size_t nStrideB, size_t nPoints, size_t nSets, uint Flags = 0)
{
   for ( size_t iSet = 0; iSet < nSets; ++ iSet ){
      double
         f = Factor * Dot2(&pA[iSet * nStrideA], &pB[iSet * nStrideB], nPoints);

      if ( Flags == 0 )
         pOut[iSet] = f;
      else {
         assert(Flags == ALGOP_Add);
         pOut[iSet] += f;
      }
   }
}

// daxpys multiple strided sets of vectors.
void AccN(double *pOut, double Factor, double const *pA, size_t nStrideA, double const *pB, size_t nStrideB, size_t nPoints, size_t nSets, uint Flags = 0)
{
   if ( (Flags & ALGOP_Add) == 0 )
      // clear output.
      for ( size_t iPt = 0; iPt < nPoints; ++ iPt )
         pOut[iPt] = 0;

   for ( size_t iSet = 0; iSet < nSets; ++ iSet )
      Add2(pOut, &pA[iSet * nStrideA], &pB[iSet * nStrideB], Factor, nPoints);
}

void FDensitySet::Eval(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, double const *pGridWt, FMemoryStack &Mem)
{
   if (AuxExpandXc)
      return EvalAux(pOrbVal, p, ForceNonNegative, pGridWt, Mem);
}


double *FDensitySet::CompressAuxVec(double const *pInput, FDensitySetInput const &p, FMemoryStack &Mem) const
{
   double *r;
   Mem.Alloc(r, p.nMap);
   for (size_t iMap = 0; iMap < p.nMap; ++ iMap)
      r[iMap] = pInput[p.pMap[iMap]];
   return r;
}

// evaluate densities via auxiliary expansion of density.
void FDensitySet::EvalAux(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, double const *pGridWt, FMemoryStack &Mem)
{
   void
      *pBeginOfStorage = Mem.Alloc(0); // note: NOT freed in gradient case! (for keeping bxrho!)
   uint
      nComp = p.nComp;
   size_t
      nMap = (size_t)p.nMap,
      nMapSt = nGridPt * nComp; // stride between two basis function entries in OrbVal.

   double
      *pAuxDen = CompressAuxVec(p.pRdm, p, Mem); // compress input density to Map dimensions.
   if ( nDiff == 0 ) {
      // LDA.
      Mxv(pRho,1, pOrbVal,1,nMapSt, pAuxDen,1, nGridPt, nMap);
   } else if ( nDiff == 1 && !MakeTau ) {
      // GGA without tau. Most common case.
      Mxv(pRho,1, pOrbVal,1,nMapSt, pAuxDen,1, nGridPt, nMap);
      Mxv(&pRhoGrd[0*nGridPt],1, &pOrbVal[1*nGridPt],1,nMapSt, pAuxDen,1, nGridPt, nMap, false, 1.0);
      Mxv(&pRhoGrd[1*nGridPt],1, &pOrbVal[2*nGridPt],1,nMapSt, pAuxDen,1, nGridPt, nMap, false, 1.0);
      Mxv(&pRhoGrd[2*nGridPt],1, &pOrbVal[3*nGridPt],1,nMapSt, pAuxDen,1, nGridPt, nMap, false, 1.0);
   } else if ( nDiff == 1 && MakeTau ) {
      assert(0);
   } else if ( nDiff == 2 ) {
      assert(0);
   } else {
      assert(0);
   }

   // make sure that density has not gone negative due to some
   // unfortunate numerical cancellation. Count electrons (for grid accuracy) BEFORE doing that.
   fElec = Dot(pRho, pGridWt, nGridPt);
   if ( ForceNonNegative )
      for ( size_t iPt = 0; iPt < nGridPt; ++ iPt )
         if ( pRho[iPt] < 0 )
            pRho[iPt] = 0;
   // ^- UKS spin densities can be negative

   if ( !p.Gradient )
      Mem.Free(pBeginOfStorage);
}


// void TransformAbToCo(double *pAC, double *pBO, size_t n)
// {
//    assert( pAC != 0 && pBO != 0 );
//    for ( size_t i = 0; i < n; ++ i ) {
//       double c = pAC[i] + pBO[i];
//       double o = pAC[i] - pBO[i];
//       pAC[i] = c;
//       pBO[i] = o;
//    };
// }
//
// // transform a pair of density sets for alpha/beta densities to closed/open densities
// void TransformAbToCo(FDensitySet &AC, FDensitySet &BO, size_t nGridPt)
// {
//    if ( AC.pRho )
//       TransformAbToCo(AC.pRho, BO.pRho, nGridPt);
//    if ( AC.pRhoGrd )
//       TransformAbToCo(AC.pRhoGrd, BO.pRhoGrd, 3*nGridPt);
//    if ( AC.pTau )
//       TransformAbToCo(AC.pTau, BO.pTau, nGridPt);
//    if ( AC.pUpsilon )
//       TransformAbToCo(AC.pUpsilon, BO.pUpsilon, nGridPt);
// }

struct FDftiResultSet
{
   double
      *pZk,        // dft integrand Exc
      *pVdRho,      // [d/d rho] Exc
      *pVdSigma,    // [d/d sigmaxx] Exc, where xx = cc for closed case and oo for open case
      *pVdSigmaXY,  // [d/d sigmaxy] Exc, where xy = co (for both cases)
      *pVdTau,      // [d/d tau] Exc
      *pVdUpsilon;  // [d/d upsilon] Exc

   void Init(size_t nGridPt, uint nDiff, bool MakeTau, double *pVdSigmaXY_, FMemoryStack &Mem);
};

void FDftiResultSet::Init(size_t nGridPt, uint nDiff, bool MakeTau, double *pVdSigmaXY_, FMemoryStack &Mem)
{
   Mem.ClearAlloc(pZk, nGridPt);
   Mem.ClearAlloc(pVdRho, nGridPt);
   if ( nDiff >= 1 ) {
      Mem.ClearAlloc(pVdSigma, nGridPt);
      if ( pVdSigmaXY_ )
         pVdSigmaXY = pVdSigmaXY_; // store link if provided by other case.
      else
         Mem.ClearAlloc(pVdSigmaXY, nGridPt);
   }
   if ( nDiff >= 2 || MakeTau )
      Mem.ClearAlloc(pVdTau, nGridPt);
   if ( nDiff >= 2 )
      Mem.ClearAlloc(pVdUpsilon, nGridPt);
}

// void FDensitySet::MakeZm(double *&pZm, double *pOrbVal, double *pRhoGrd, size_t nGridPt, size_t nMap, size_t nMapSt, FMemoryStack &Mem)
// {
//    // form Zm(r) := (grad mu(r))*(grad rho(r))
//    Mem.Alloc(pZm, nGridPt * nMap);
//    for ( size_t iMap = 0; iMap < nMap; ++ iMap )
//    {
//       double *pVal = &pOrbVal[nMapSt * iMap] + nGridPt; // start of gradient.
//       double *pOut = &pZm[nMap * iMap];
//       for ( size_t iPt = 0; iPt < nGridPt; ++ iPt )
//       {
//          pOut[iPt] =
//             pRhoGrd[iPt]             * pVal[iPt +   nGridPt] +
//             pRhoGrd[iPt +   nGridPt] * pVal[iPt +   nGridPt] +
//             pRhoGrd[iPt + 2*nGridPt] * pVal[iPt + 2*nGridPt];
//       };
// };




// accumulate xc matrix contribution to triangular full-dimension xc matrix at pXcTriang.
// X denotes the open/closed case at hand (e.g., 'c' when making closed-shell exchange), Y the other case.
// For example, when making the open-shell exchange matrix, then Sigma is SigmaOO and SigmaXY is SigmaCO.
void FDensitySet::AccXcMatrix(double *pXcTriang, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem)
{
   if (AuxExpandXc)
      return AccXcMatrixAux(pXcTriang, pOrbVal, r, pRhoGrdY, p, pGridWt, Mem);
}


// accumulate xc matrix contribution to triangular full-dimension xc matrix at pXcTriang.
// X denotes the open/closed case at hand (e.g., 'c' when making closed-shell exchange), Y the other case.
// For example, when making the open-shell exchange matrix, then Sigma is SigmaOO and SigmaXY is SigmaCO.
void FDensitySet::AccXcMatrixAux(double *pXcVec, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem)
{
   void
      *pBeginOfStorage = Mem.Alloc(0);
   uint
      nComp = p.nComp;
   size_t
      nMap = (size_t)p.nMap,
      nMapSt = nGridPt * nComp; // stride between two basis function entries in OrbVal.
   double
      *pXc;
   Mem.Alloc(pXc, nMap);

   if ( nDiff == 0 ) {
      // LDA:
      //    k_{\mu\nu}(r) = \mu(r) vrho(r) \nu(r)
      // ... that's it already.
      double *pVdRhoWt;
      Mem.Alloc(pVdRhoWt, nGridPt);
      for (size_t iPt = 0; iPt < nGridPt; ++ iPt)
         pVdRhoWt[iPt] = r.pVdRho[iPt] * pGridWt[iPt];
      Mxv(pXc,1, pOrbVal,nMapSt,1, pVdRhoWt,1, nMap, nGridPt);
//       Mxv(pXc,1, pOrbVal,nMapSt,1, r.VdRho,1, nMap, nGridPt);
   } else if ( nDiff == 1 ) {
      assert(!MakeTau);
//       if ( MakeTau ) {
//          // GGA with tau: same as GGA without tau, but with additional term
//          //    k_{\mu\nu}(r) += vtau(r) [\grad \mu(r)] [\grad \nu(r)]
//          // Do that first (because of mystery symmetrization in AccXcSym1).
//          AccXcSym1(pXc, r.pVdTau, pGridWt, &pOrbVal[1*nGridPt], nMapSt, nMap, 0, Mem);
//          AccXcSym1(pXc, r.pVdTau, pGridWt, &pOrbVal[2*nGridPt], nMapSt, nMap, ALGOP_Add, Mem);
//          AccXcSym1(pXc, r.pVdTau, pGridWt, &pOrbVal[3*nGridPt], nMapSt, nMap, ALGOP_Add | ALGOP_Symmetrize, Mem);
//       }

      // auxiliary version:
      //
      //    k[A] += \sum_g  w(g) vrho(g) A(g) + \sum_g w(g) D[Ecx(r)]/D[grad rho(r)] * grad A(r)
      //    k[A] += \sum_g  w(g) vrho(g) A(g) + \sum_g w(g) vsigma(g) 2 (\grad rho(r))*(\grad A(r))
      //
      // second term comes from derivative D[sigma]/D[grad rho(r)] = 2 grad rho(r)
      // since sigma = [grad rho(r)] x [grad rho(r)]
      double *pVdRhoWt, *pVdGrdWt;
      Mem.Alloc(pVdRhoWt, nGridPt);
      for (size_t iPt = 0; iPt < nGridPt; ++ iPt)
         pVdRhoWt[iPt] = r.pVdRho[iPt] * pGridWt[iPt];
      Mem.Alloc(pVdGrdWt, 3*nGridPt);
      for (size_t iPt = 0; iPt < nGridPt; ++ iPt) {
         pVdGrdWt[iPt + 0*nGridPt] = 2. * pGridWt[iPt] * r.pVdSigma[iPt] * pRhoGrd[iPt + 0*nGridPt];
         pVdGrdWt[iPt + 1*nGridPt] = 2. * pGridWt[iPt] * r.pVdSigma[iPt] * pRhoGrd[iPt + 1*nGridPt];
         pVdGrdWt[iPt + 2*nGridPt] = 2. * pGridWt[iPt] * r.pVdSigma[iPt] * pRhoGrd[iPt + 2*nGridPt];
      }
      Mxv(pXc,1, pOrbVal,nMapSt,1, pVdRhoWt,1, nMap, nGridPt);
      Mxv(pXc,1, &pOrbVal[1*nGridPt],nMapSt,1, &pVdGrdWt[0*nGridPt],1, nMap, nGridPt, true, 1.0);
      Mxv(pXc,1, &pOrbVal[2*nGridPt],nMapSt,1, &pVdGrdWt[1*nGridPt],1, nMap, nGridPt, true, 1.0);
      Mxv(pXc,1, &pOrbVal[3*nGridPt],nMapSt,1, &pVdGrdWt[2*nGridPt],1, nMap, nGridPt, true, 1.0);
   } else {
      assert_rt(0);
      // fixme: add code here once other stuff works (MGGA).
   }

   // expand Map dimension & add to previous vxc vector.
   for ( size_t iMapCol = 0; iMapCol < nMap; ++ iMapCol ) {
      pXcVec[p.pMap[iMapCol]] += pXc[iMapCol];
   }

   Mem.Free(pBeginOfStorage);
}


struct FDftiJob {
   FDftiArgs const
      &Args;
   FDftGrid::FGridBlock const
      *pGridBlock;
   // outputs.
   double
      *pFockC, // note: these are assumed to be thread-local (i.e., lock-free)
      *pFockO,
      *pGradient, // geometric gradient (in case iMakeGrad=1)
      *pDfuEnergies; // accumulated dft energies, one for each functional.
   size_t
      nGridPt,
      nDiff,    // derivative order of functional
      nDiffBf,  // derivative order of basis functions we need to evaluate (>= nDiff; required for Gradients)
      nComp,    // number of orbital components at pOrbVal (value; value,dx,dy,dz; value,dx,dy,dz,dxx,dxy,...)
      iGridBlockCenter;
      // ^- atom/center index on which the current block of grid points in locked.
      //    only used in gradient calculations, otherwise set to iNoCenter
   FMemoryStack
      &Mem; // note: this one is assumed to be thread-local

   double
      *pGridWt;
   double const
      *pGridPt, *pGridWt_In;

   double
      Timings[CLOCK_TimerCount],
      fElec; // integrated density on the block.

   FDftiJob(double *pDfuEnergies, double *pFockC, double *pFockO, double *pGrad, FDftGrid::FGridBlock const *pGridBlock, FDftiArgs &Args, FMemoryStack &Mem);
   void Run();


   void EvalBfn(double *&pOrbVal, size_t *&pMap, size_t &nMap, ptrdiff_t *&pCenterIndices);
   void FormSigma(double *&pSigma, double const *pRhoGrdA, double const *pRhoGrdB);

   void EvalGridWtDeriv(double *&pWtGrd);

   // calculate and accumulate DFTI analytic gradient (core part)
   void AccCoreGradient(double *pGrad, double const *pOrbVal, ptrdiff_t const *pCenterIndices, FDftiResultSet const &r,
     FDensitySet &d, FDensitySetInput const &DenInp, double const *pRhoGrdY, double const *pSigmaXY, bool bGridGrad);
   void AccCoreGradientAux(double *pGrad, double const *pOrbVal, ptrdiff_t const *pCenterIndices, FDftiResultSet const &r,
     FDensitySet &d, FDensitySetInput const &DenInp, double const *pRhoGrdY, double const *pSigmaXY, bool bGridGrad);

   // calculate and accumulate DFTI analytic gradient (grid weight derivative part)
   void AccGridWtGradient(double *pGrad, double const *pZk);
private:
   FDftiJob(FDftiJob const&); // not implemented
   void operator = (FDftiJob const&); // not implemented.
};


FDftiJob::FDftiJob(double *pDfuEnergies_, double *pFockC_, double *pFockO_, double *pGrad_, FDftGrid::FGridBlock const *pGridBlock_, FDftiArgs &Args_, FMemoryStack &Mem_)
   : Args(Args_), pGridBlock(pGridBlock_), pFockC(pFockC_), pFockO(pFockO_), pGradient(pGrad_), pDfuEnergies(pDfuEnergies_), iGridBlockCenter(pGridBlock_->iAtomicCenter), Mem(Mem_)
{
   nGridPt = pGridBlock->nPt();
   pGridPt = &Args.pDftGrid->Positions[pGridBlock->iFirst][0];
   // make a copy of the grid weights---we might modify them under some conditions.
   Mem.Alloc(pGridWt, nGridPt);
   pGridWt_In = &Args.pDftGrid->Weights[pGridBlock->iFirst];
   assert(sizeof(pGridWt[0]) == sizeof(Args.pDftGrid->Weights[0]));
   memcpy(&pGridWt[0], &pGridWt_In[0], sizeof(pGridWt[0])*nGridPt);

   nDiff = Args.pXcFn->NeedSigma() ? 1 : 0;
   nDiffBf = nDiff;
   if (Args.Flags & DFTI_MakeGradient)
      nDiffBf += 1;

   // number of derivative components.
   nComp = ((nDiffBf+1)*(nDiffBf+2)*(nDiffBf+3))/6;

   for ( uint i = 0; i < CLOCK_TimerCount; ++ i )
      Timings[i] = 0;
   fElec = 0;
}




// note: allocates pSigma on this->Mem.
void FDftiJob::FormSigma(double *&pSigma, double const *pRhoGrdA, double const *pRhoGrdB)
{
   // sigmaAB = [grad rhoA]*[grad rhoB]
   Mem.Alloc(pSigma, nGridPt);
   for ( size_t iPt = 0; iPt < nGridPt; ++ iPt ){
      pSigma[iPt] = pRhoGrdA[iPt            ] * pRhoGrdB[iPt            ] +
                    pRhoGrdA[iPt +   nGridPt] * pRhoGrdB[iPt +   nGridPt] +
                    pRhoGrdA[iPt + 2*nGridPt] * pRhoGrdB[iPt + 2*nGridPt];
   }
}




void FDftiJob::Run()
{
   TMemoryLock<char>
      pBaseOfStorage(0, &Mem);

   // evaluate the basis functions (+derivatives) on the grid.
   RESUME_CPU_CLOCK(CLOCK_EvalBfn);
   double
      *pOrbVal;
   size_t
      nMap, *pMap;
   ptrdiff_t
      *pCenterIndices;
   EvalBfn(pOrbVal, pMap, nMap, pCenterIndices);
   PAUSE_CPU_CLOCK(CLOCK_EvalBfn);
   if ( nMap == 0 )
      return; // no functions left after screening.

   bool
      UseOrbitals = false;

   // make electron densities (+derivatives).
   RESUME_CPU_CLOCK(CLOCK_FormRho);
   FDensitySet
      DenC = FDensitySet(), DenO = FDensitySet();
   FDensitySetInput
      DenInpC(Args.pDenC, Args.pOccOrbC, Args.nBf(), Args.nOccC,
         pMap, nMap, UseOrbitals, Args.MakeGradient(), nComp),
      DenInpO;

   bool
      ForceNonNegative = true;
//    if ( Args.GridDensityFlags == 2 )
//       // just evaluate densities -- for some applications these might not be
//       // entirely positive.
//       ForceNonNegative = false;
   if (Args.UseAuxiliaryExpansion())
      ForceNonNegative = false; // see below.

   DenC.Init(nGridPt, nDiff, Args.NeedTau(), Args.UseAuxiliaryExpansion(), Mem);
   DenC.Eval(pOrbVal, DenInpC, ForceNonNegative, pGridWt, Mem);

   if ( Args.OpenShell() ) {
      DenInpO = DenInpC;
      DenInpO.pRdm = Args.pDenO;
      DenInpO.pOccOrb = Args.pOccOrbO;
      DenInpO.nOcc = Args.nOccO;

      DenO.Init(nGridPt, nDiff, Args.NeedTau(), Args.UseAuxiliaryExpansion(), Mem);
      DenO.Eval(pOrbVal, DenInpO, false, pGridWt, Mem);

//       bool
//          bHaveAB = (bool)Args.iUseAB && UseOrbitals;
//       if ( bHaveAB )
//          // input was alpha/beta orbitals, thus we have alpha/beta densities now.
//          // Transform this into closed/open densities, which the rest of the
//          // code expects.
//          TransformAbToCo(DenC, DenO, nGridPt);
   }
   if (Args.UseAuxiliaryExpansion()) {
      // flip weights and densities of points with negative density. reduced gradients (sigma) and taus
      // should be unaffected by this, since they are bilinear in the density. Note that we made a copy
      // of the grid weights for this purpose.
      // WARNING: assumes closed/open input, not A/B input!
      for (size_t iGridPt = 0; iGridPt < nGridPt; ++ iGridPt)
         if (DenC.pRho[iGridPt] < 0) {
            DenC.pRho[iGridPt] *= -1.;
            pGridWt[iGridPt] *= -1.;
            if (DenO.pRho)
               DenO.pRho[iGridPt] *= -1.;
         }
   }
   PAUSE_CPU_CLOCK(CLOCK_FormRho);

//    // if requested, store densities to external array or add densities from
//    // external array.
//    if ( Args.GridDensityFlags != 0 ) {
//       uint
//          nExtSt = Args.nGridDensitySt;
//       double
//          *pExtDenC = &Args.pGridDensitiesC[nExtSt * iFirstGridPt],
//          *pExtDenO = &Args.pGridDensitiesO[nExtSt * iFirstGridPt];
//       if ( Args.pGridDensitiesC != 0 )
//          DenC.StoreOrLoad(pExtDenC, nExtSt, Args.GridDensityFlags);
//       if ( Args.pGridDensitiesO != 0 && Args.OpenShell() )
//          DenO.StoreOrLoad(pExtDenO, nExtSt, Args.GridDensityFlags);
//    };
//
//    if ( Args.GridDensityFlags == 2 )
//       return; // done here.

   // form intermediates sigma_cc/sigma_co/sigma_oo.
   double
      *pSigmaCC = 0, *pSigmaOO = 0, *pSigmaCO = 0;
   if ( nDiff >= 1 ) {
      FormSigma(pSigmaCC, DenC.pRhoGrd, DenC.pRhoGrd);
      if ( Args.OpenShell() ){
         FormSigma(pSigmaCO, DenC.pRhoGrd, DenO.pRhoGrd);
         FormSigma(pSigmaOO, DenO.pRhoGrd, DenO.pRhoGrd);
      }
   }

   // allocate output data: functional kernels and derivatives
   // of kernel with respect to input data.
   // zk,vrhoc,vrhoo,vsigmacc,vsigmaco,vsigmaoo,vtauc,vtauo,vupsilonc,vupsilono
//    FDftiResultSet
//       ResC = {0}, ResO = {0};
// ^- why does g++ warn about this? This is perfectly well defined in both C and C++! Annoying.
   FDftiResultSet
      ResC = FDftiResultSet(), ResO = FDftiResultSet();
   ResC.Init(nGridPt, nDiff, Args.NeedTau(), 0, Mem);
   ResO.Init(nGridPt, nDiff, Args.NeedTau(), ResC.pVdSigmaXY, Mem);

   // branch out to fortran in order to evaluate the functional
   // and the split energy contributions (=sum zk(g) wt(g), but split according to functional)
   RESUME_CPU_CLOCK(CLOCK_EvalDftfun);
   double
      *pTmp;
   Mem.Alloc(pTmp, 20*nGridPt);
   DFTFUN_CXX(1, Args.OpenShell(), nDiff, nGridPt, pGridWt,
              DenC.pRho, DenO.pRho, pSigmaCC, pSigmaCO, pSigmaOO, DenC.pTau, DenO.pTau, DenC.pUpsilon, DenO.pUpsilon,
              // density functional outputs (summed over functionals)
              ResC.pZk, ResC.pVdRho, ResO.pVdRho, ResC.pVdSigma, ResC.pVdSigmaXY, ResO.pVdSigma, ResC.pVdTau, ResO.pVdTau, ResC.pVdUpsilon, ResO.pVdUpsilon,
              // energy outputs (split according to functions; one output per ndftu; accumulated)
              pDfuEnergies,
              // buffer (for accumulating zk)
              pTmp,
              // flags specifying input data
              (FORTINT)(DenC.pTau != 0), (FORTINT)(DenC.pUpsilon != 0));
   Mem.Free(pTmp);
   PAUSE_CPU_CLOCK(CLOCK_EvalDftfun);

   if ( Args.pfElecTotal )
      fElec = DenC.fElec;
//       fElec = Dot(DenC.pRho, pGridWt, nGridPt);

   if ( Args.MakeXc() ) {
      // assemble & accumulate xc matrices.
      RESUME_CPU_CLOCK(CLOCK_FormXcMat);
      DenC.AccXcMatrix(pFockC, pOrbVal, ResC, DenO.pRhoGrd, DenInpC, pGridWt, Mem);
      if ( Args.OpenShell() )
         DenO.AccXcMatrix(pFockO, pOrbVal, ResO, DenC.pRhoGrd, DenInpO, pGridWt, Mem);
      PAUSE_CPU_CLOCK(CLOCK_FormXcMat);
   }

}


// #ifdef HAVE_GRID_KERNEL
bool Vec3Eq(double const *pA, double const *pB) {
   return pA[0] == pB[0] && pA[1] == pB[1] && pA[2] == pB[2];
}

// pOrbValAo: format nGridPt * nComp * nAo  (after: nGridPt * nComp * nMap)
void IrEvalBfn(double *pOrbValAo, size_t *pMap, size_t &nMap, double const *pGridPt, size_t iStGridPt, size_t nGridPt, uint nDiff, FRawBasis const *pBasis, double ThrOrb, double LogThrOrb, FMemoryStack &Mem)
{
   TMemoryLock<char>
      pBaseOfMemory(0, &Mem);
   FMatrixView const
      Grid(const_cast<double*>(pGridPt), iStGridPt, nGridPt);
   size_t
//       nGridPt = Grid.nCols,
      nComp = 1,
      nAo = pBasis->nFn();
   IR_SUPPRESS_UNUSED_WARNING(nAo);
   if ( nDiff == 1 ) nComp = 4;
   if ( nDiff == 2 ) nComp = 10;

   // convert basis format for low level driver input...
   FRawBasis::FShellArray
      &ShellsA = const_cast<FRawBasis*>(pBasis)->Shells;

   nMap = 0;
//    assert(Grid.nRowSt == 1 && Grid.nRows == 3);
   assert(Grid.nRowSt == 1);

   size_t
      iFnBase = 0;
   ir::FRawShell
      *pLastBf = &ShellsA[0];
   for ( ir::FRawShell *pFirstBf = &ShellsA[0]; pFirstBf != &ShellsA[0] + ShellsA.size(); pFirstBf = pLastBf ) {
      pLastBf = pFirstBf;
      size_t nFn = 0;
      while ( pLastBf != &ShellsA[0] + ShellsA.size() && Vec3Eq(pLastBf->vCen, pFirstBf->vCen) ) {
         nFn += pLastBf->nFn();
         ++pLastBf;
      }
      size_t
         nBfStride = nGridPt*nComp;
      EvalShellGroupOnGrid(&pOrbValAo[nBfStride*nMap], nGridPt, nBfStride, // comp stride, bf stride
         pMap, nMap, iFnBase, &Grid(0,0), Grid.nColSt, nGridPt,
         pFirstBf, pLastBf, &pFirstBf->vCen[0],
         nDiff, ThrOrb, LogThrOrb, Mem);
      iFnBase += nFn;
   }
   assert(iFnBase == nAo && nMap <= nAo);
   assert(pLastBf == &ShellsA[0] + ShellsA.size());
}
// #endif




void FDftiJob::EvalBfn(double *&pOrbVal, size_t *&pMap, size_t &nMap, ptrdiff_t *&pCenterIndices)
{
   size_t
      nBf = Args.nBf();
   Mem.Alloc(pCenterIndices, nBf); // used in gradient evaluation
   Mem.Alloc(pMap, nBf);
   Mem.Alloc(pOrbVal, nGridPt * nComp * nBf);
//    Mem.Alloc(1000000);
//    xout << boost::format("      nGridPt: %i  nComp: %i   nBf: %i   nDiffBf: %i") % nGridPt % nComp % nBf % nDiffBf << std::endl;

//    // make temporary space for orbs-on-grid-routine (this way it doesn't
//    // have to allocate memory itself)
//    double
//       *pBuf;
//    uint
//       nBuf = 250000;
//    Mem.Alloc(pBuf, nBuf);
//
//    AIC_EVAL_BFN_ON_GRID( pOrbVal, nComp, pCenterIndices, pMap, nMap,
//       Args.infg, Args.expt, Args.cc, Args.sph, Args.nGrp, nBf, Args.pCenters,
//       pGridPt, nGridPt, nDiffBf, Args.ThrOrb, Args.LogThrOrb, pBuf, nBuf );
//    // ^- note that this is actually a C++ function (in AicFB.cpp). We just call
//    //    its exported Fortran interface.
//    FMatrixView
// //       Grid(&Args.pDftGrid->Positions[pGridBlock->iFirst][0], 3, nGridPt);
//       Grid(const_cast<double*>(pGridPt), 3, nGridPt);
   IrEvalBfn(pOrbVal, pMap, nMap, &pGridPt[0], 3, nGridPt, nDiffBf, Args.pBasis, Args.ThrOrb, Args.LogThrOrb, Mem);

   // assemble indices of basis function centers.
   {
      FRawBasis
         *pBasis = Args.pBasis;
      TMemoryLock<size_t>
         AllCenters(nBf, &Mem);
      size_t iFnTotal = 0;
      for (size_t iSh = 0; iSh < pBasis->Shells.size(); ++ iSh) {
         size_t iCen = pBasis->ShellCenters[iSh];
//          for (size_t iFn = 0; iFn < pBasis->Shells[iSh].nFn(); ++ iFn) {
// //             xout << " fn(" << iFnTotal << ") icen = " << iCen << std::endl;
//             AllCenters[iFnTotal] = iCen;
//             ++ iFnTotal;
//          }
// ^- FIXME: this crashes in mpp mode. WHY?
         size_t nFn = pBasis->nFn(iSh);
         for (size_t iFn = 0; iFn < nFn; ++ iFn) {
//             xout << " fn(" << iFnTotal << ") icen = " << iCen << std::endl;
//             AllCenters[iFnTotal + iFn] = iCen;
            AllCenters[pBasis->iFn(iSh) + iFn] = iCen;
         }
         assert_rt(iFnTotal == pBasis->iFn(iSh));
         iFnTotal += nFn;
      }
      assert_rt(iFnTotal == nBf);
      for ( size_t iMap = 0; iMap < (size_t)nMap; ++ iMap ) {
         assert_rt(pMap[iMap] < nBf);
         pCenterIndices[iMap] = AllCenters[pMap[iMap]];
      }
   }


   // fix up pMap and pCenterIndices from 1-based addressing into 0-based addressing.
   // (FIXME: IR version does 0-based maps automatically?)
//    for ( size_t iMap = 0; iMap < (size_t)nMap; ++ iMap ) {
//       pMap[iMap] -= 1;
// //       pCenterIndices[iMap] -= 1;
//    }
#if 0
   xout << "CenIdx = [";
   for ( uint iMap = 0; iMap < nMap; ++ iMap ) {
      if ( iMap != 0 ) xout << " ";
      xout << pCenterIndices[iMap];
   }
   xout << "]" << std::endl;
#endif
}





void AccXc(FDftiArgs &Args, FLog &Log, FMemoryStack &Mem_)
{

   size_t
      nDftFunc = 1,
      nFockSize = Args.nFockSize();

   // clear output & timings
   memset(Args.pDfuEnergies, 0, sizeof(double) * nDftFunc);
   if (Args.pfElecTotal)
      *Args.pfElecTotal = 0;

   size_t nJobs = Args.pDftGrid->GridBlocks.size();
   double Timings[CLOCK_TimerCount];
   if (1) {
//       Args.Flags |= DFTI_MeasureTime;
      Args.pTimings = &Timings[0];
      memset(Args.pTimings, 0, sizeof(double) * CLOCK_TimerCount);
   }

   FMemoryStackArray MemStacks(Mem_);

   bool Aborted = false;
   #pragma omp parallel for schedule(dynamic)
   //for (size_t iJob = 0; iJob < nJobs; ++ iJob ) {
   for (int iJob__ = 0; iJob__ < int(nJobs); ++ iJob__ ) {
      size_t iJob = size_t(iJob__); // for OpenMP.
      FMemoryStack &Mem = MemStacks.GetStackOfThread();

      if (!Log.StatusOkay())
         Aborted = true;
      #pragma omp flush (Aborted)
      if (!Aborted) {
         // ^- both "break" and "continue" seem to be problemaic..
         TMemoryLock<char>
            pBaseOfMemory(0, &Mem);
         FDftGrid::FGridBlock const
            *pGridBlock = &Args.pDftGrid->GridBlocks[iJob];

//          FMemoryStack2
//             // FIXME: this is a hack around the workspace memory size computation...
//             // the *much* better alternative would be to do this op with less space.
//             Mem1(2*nFockSize * sizeof(double) + 100000);
//          TMemoryLock<char> pHackyFreeMe1(0, &Mem1);
         FMemoryStack &Mem1 = Mem;

         double
            *pDfuEnergies = 0,
            *pFockC = 0,
            *pFockO = 0,
            *pGrad = 0;
         Mem.ClearAlloc(pDfuEnergies, nDftFunc);
         if (Args.MakeXc()) {
            Mem1.ClearAlloc(pFockC, nFockSize);
            Mem1.ClearAlloc(pFockO, nFockSize);
         }
         if (Args.MakeGradient()) {
            Mem.ClearAlloc(pGrad, 3*Args.nCenters());
         }

         FDftiJob
            Job(pDfuEnergies, pFockC, pFockO, pGrad, pGridBlock, Args, Mem);
         Job.Run();

         // hmhm... my Molpro version does this in a much more clever fashion.
         // (only joins threads once). Can this be done with openmp?
         // FIXME: this should really be fixed. Current version needs a ClearAlloc on the output quantities,
         // for every single grid block! Lame.
         #pragma omp critical
         {
            Add2(Args.pDfuEnergies, pDfuEnergies, 1.0, nDftFunc);
            if (Args.MakeXc()) {
               Add2(Args.pFockC, pFockC, 1.0, nFockSize);
               if (Args.OpenShell())
                  Add2(Args.pFockO, pFockO, 1.0, nFockSize);
            }
            if (Args.MakeGradient()) {
               Add2(Args.pGradient, pGrad, 1.0, 3*Args.nCenters());
            }
            if (Args.MeasureTime())
               Add2(Args.pTimings, Job.Timings, 1.0, CLOCK_TimerCount);
            if (Args.pfElecTotal)
               *Args.pfElecTotal += Job.fElec;
         }
      }
   }

   Log.CheckStatus();
   assert_rt(!Aborted); // you should not be here if this was set.

   if (Args.MeasureTime()) {
      Log.WriteTiming("DFT (basis fn.)", Args.pTimings[CLOCK_EvalBfn]);
      Log.WriteTiming("DFT (densities)", Args.pTimings[CLOCK_FormRho]);
      if (Args.MakeXc())
         Log.WriteTiming("DFT (xc matrix)", Args.pTimings[CLOCK_FormXcMat]);
      Log.WriteTiming("DFT (functional)", Args.pTimings[CLOCK_EvalDftfun]);
      if (Args.MakeGradient())
         Log.WriteTiming("DFT (core grad.)", Args.pTimings[CLOCK_CoreGrad]);
      if (Args.MakeGradient() && Args.MakeGridGradient())
         Log.WriteTiming("DFT (grid grad.)", Args.pTimings[CLOCK_GridWtDeriv]);
   }
}

} // namespace dfti
