/* Copyright (c) 2015  Gerald Knizia
 * 
 * This file is part of the IboView program (see: http://www.iboview.org)
 * 
 * IboView is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3.
 * 
 * IboView is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with bfint (LICENSE). If not, see http://www.gnu.org/licenses/
 * 
 * Please see IboView documentation in README.txt for:
 * -- A list of included external software and their licenses. The included
 *    external software's copyright is not touched by this agreement.
 * -- Notes on re-distribution and contributions to/further development of
 *    the IboView software
 */

// #include <iostream>
#include <cmath>
#include <boost/format.hpp>
using boost::format;
#include "Ir.h"
#include "CxDiis.h"
#include "CtRhf.h"
#include "CtIo.h"
#include "CtTiming.h"
#include "CxPodArray.h"
#if 0
   #include "RDTSC.h"
#else
   #define RESET_CLOCKS
   #define RESUME_CLOCK(x)
   #define PAUSE_CLOCK(x)
#endif

#include "CtConstants.h"
#include "CtDft.h"

#ifdef _MSC_VER
   // OpenBLAS DGEMV crashes on me if executed in OpenMP-loop. No idea.
   #pragma message( "WARNING: REPLACING REAL DGEMV BY MxmLame AS OPENBLAS/OPENMP WORKAROUND!")
   #define Mxv MxvLame
#endif


namespace ct {


FFockComponentBuilder::~FFockComponentBuilder()
{}

void FFockComponentBuilder::AccFock(FMatrixView &FockC, FMatrixView &FockO, FBasisSet *pBasis, FMatrixView const &COccC, FMatrixView const &COccO, uint Flags, FMemoryStack &Mem)
{
   size_t
      nBf = pBasis->nFn();
   if (!((FockC.nRows == nBf && FockC.nCols == nBf) &&
         (FockO.pData == 0 || (FockO.nRows == nBf && FockO.nCols == nBf)) &&
         (COccC.nRows == nBf) &&
         (COccO.pData == 0 || COccO.nRows == nBf)))
      throw std::runtime_error("FFockComponentBuilder: Input orbitals not consistent with orbital basis sets.");
   IR_SUPPRESS_UNUSED_WARNING(Flags);
   IR_SUPPRESS_UNUSED_WARNING(Mem);
}

void FFockComponentBuilder::PrintEnergyContribs()
{
   m_Log.Write("FFockComponentBuilder::PrintEnergyContribs: not implemented for current Fock builder. Fix this.");
}




// forms the integral matrix set (\mu\nu|F) for a single set of F shells.
// \mu comes from OrbBasis1, \nu comes from OrbBasis2. If both pointers are identical,
// symmetric integrals will be assumed [(\mu\nu|F) = (\nu\mu|F)]  and this symmetry will be used.
double *FormIntMNF(ir::FRawShell const &ShF,
   ir::FIntegralKernel *pIntKernel, FRawBasis const *pOrbBasisA, FRawBasis const *pOrbBasisB, FRawBasis const *pFitBasis, FMemoryStack &Mem, FMatrixView ScrDen, double fThr)
{
   size_t
      nAoA = pOrbBasisA->nFn(),
      nAoB = pOrbBasisB->nFn(),
      nFnF = ShF.nFn();
   double
      *pMNF;
   bool
      Symmetric = (pOrbBasisA == pOrbBasisB);
   Mem.ClearAlloc(pMNF, nAoA * nAoB * nFnF);

   std::size_t nIntTotal = 0, nIntRetained = 0;

   for ( size_t iShB = 0; iShB < pOrbBasisB->Shells.size(); ++ iShB ){
      ir::FRawShell const &ShB = pOrbBasisB->Shells[iShB];
      size_t nFnB = ShB.nFn();
      size_t iShA_First = Symmetric ? iShB : 0;
      for ( size_t iShA = iShA_First; iShA < pOrbBasisA->Shells.size(); ++ iShA ) {
         ir::FRawShell const &ShA = pOrbBasisA->Shells[iShA];
         size_t nFnA = ShA.nFn();
         nIntTotal += nFnA * nFnB;

//          if (ScrDen(iShA, iShB) < fThr)
//             continue;
         double
            fDistSqAB = DistSq(FVector3(ShA.vCen), FVector3(ShB.vCen));
         if (ShA.pRange && ShB.pRange && sqr(ShA.MaxCoRange() + ShB.MaxCoRange()) < fDistSqAB)
            continue;
         size_t
            Strides[3] = {1, nFnA, nFnA * nFnB};

//          double fElecInPair = 0.;
//          for (size_t iA = 0; iA < nFnA; ++ iA)
//             for (size_t iB = 0; iB < nFnB; ++ iB)
// //                fElecInPair += ScrDen((pOrbBasisA->iFn(iShA) + iA), (pOrbBasisB->iFn(iShB) + iB));
//                fElecInPair += sqr(ScrDen((pOrbBasisA->iFn(iShA) + iA), (pOrbBasisB->iFn(iShB) + iB)));
//          if (std::abs(fElecInPair) < sqr(fThr))
//             continue;
         nIntRetained += nFnA * nFnB;

         double
            *pIntData;
         Mem.Alloc(pIntData, nFnA * nFnB * nFnF );

         RESUME_CLOCK(100)
         ir::EvalInt2e3c(pIntData, Strides, &ShA, &ShB, &ShF,1, 1.0, pIntKernel, Mem);
         PAUSE_CLOCK(100)

         if (Symmetric) {
            assert(pOrbBasisA == pOrbBasisB && nAoA == nAoB);
            for ( size_t iF = 0; iF < nFnF; ++ iF )
               for ( size_t iB = 0; iB < nFnB; ++ iB )
                  for ( size_t iA = 0; iA < nFnA; ++ iA ) {
                     double
                        f = pIntData[iA + nFnA * (iB + nFnB * iF)];
                     // assign to (\mu\nu| and (\nu\mu|. (int has perm symmetry).
                     pMNF[(pOrbBasisA->iFn(iShA) + iA) + nAoA*(pOrbBasisA->iFn(iShB) + iB) + nAoA*nAoA*iF] = f;
                     pMNF[(pOrbBasisA->iFn(iShB) + iB) + nAoA*(pOrbBasisA->iFn(iShA) + iA) + nAoA*nAoA*iF] = f;
                  }
         } else {
            for ( size_t iF = 0; iF < nFnF; ++ iF )
               for ( size_t iB = 0; iB < nFnB; ++ iB )
                  for ( size_t iA = 0; iA < nFnA; ++ iA ) {
                     double
                        f = pIntData[iA + nFnA * (iB + nFnB * iF)];
                     pMNF[(pOrbBasisA->iFn(iShA) + iA) + nAoA*(pOrbBasisB->iFn(iShB) + iB) + nAoA*nAoB*iF] = f;
                  }
         }

         Mem.Free(pIntData);
      }
   }

//    if (&ShF == &pFitBasis->Shells[0] && fThr != 0.) {
//    if (&ShF == &pFitBasis->Shells[0]) {
//       double f = 0, g = 0;
//       for (std::size_t i = 0; i < ScrDen.GetStridedSize(); ++ i)
//          f += ScrDen.pData[i];
//       for (std::size_t i = 0; i < nAo; ++ i)
//          g += ScrDen.pData[i*(nAo+1)];
//       xout << format("ScrAB: %i of %i (\\mu\\nu| integrals evaluated (%.2f%% screened, %.6f elec total / %.6f diag).\n")
//          % nIntRetained % nIntTotal % (100.*(1. - nIntRetained/static_cast<double>(nIntTotal))) % f % g;
//    }
//    if (&ShF == &pFitBasis->Shells[0]) {
//       xout << format("ScrAB: %i of %i (\\mu\\nu| integrals evaluated (%.2f%% screened).\n")
//          % nIntRetained % nIntTotal % (100.*(1. - nIntRetained/static_cast<double>(nIntTotal)));
//    }

   return pMNF;
   IR_SUPPRESS_UNUSED_WARNING(pFitBasis);
   IR_SUPPRESS_UNUSED_WARNING(ScrDen);
   IR_SUPPRESS_UNUSED_WARNING(fThr);
}


double *FormIntMNF(ir::FRawShell const &ShF,
   ir::FIntegralKernel *pIntKernel, FRawBasis const *pOrbBasis, FRawBasis const *pFitBasis, FMemoryStack &Mem, FMatrixView ScrDen, double fThr)
{
   return FormIntMNF(ShF, pIntKernel, pOrbBasis, pOrbBasis, pFitBasis, Mem, ScrDen, fThr);
}







FFockComponentBuilderDfCoulXcCached::FFockComponentBuilderDfCoulXcCached(FDfJkOptions const &JkOptions_, FDftGridParams const &GridParams_, std::string const &XcFunctionalName_, FLog &Log_, FTimerSet *pTimers_)
   : FFockComponentBuilder(Log_, pTimers_), XcFunctionalName(XcFunctionalName_), JkOptions(JkOptions_), GridParams(GridParams_)
{
   if ( !XcFunctionalName.empty() ) {
      g_pXcFunctional = new FXcFunctional(XcFunctionalName);
      pXcFn = g_pXcFunctional;
      m_Log.Write("\n"+pXcFn->Desc());
   }
}

FFockComponentBuilderDfCoulXcCached::~FFockComponentBuilderDfCoulXcCached()
{
}

void FFockComponentBuilderDfCoulXcCached::Init(FWfDecl const &WfDecl_, FBasisSet *pOrbBasis_, FAtomSet const &Atoms_, FHfOptions const &Options_, FMemoryStack &Mem)
{
   if (pXcFn) {
      FTimer
         tDftGrid;
      pDftGrid = new FDftGrid(Atoms_, GridParams);
      m_Log.Write(" Generated DFT grid with {} points for {} atoms in {:.2} sec.\n", pDftGrid->Points.size(), Atoms_.size(), (double)tDftGrid);
//       xout << "" << format(pTimingFmt) % "DFT integration grid" % (double)tDftGrid; xout.flush();
   }

   pAtoms = &Atoms_;
   pOrbBasis = pOrbBasis_;
   nAo = pOrbBasis->nFn();

   // Make fitting coefficients J^{-1/2}.
   FTimer TimerJcd;
   if (1) {
      pFitBasis = new FBasisSet(*pAtoms, BASIS_JFit);
   } else {
      pFitBasis = new FBasisSet(*pAtoms, BASIS_JkFit);
      m_Log.Write(" NOTE: Using JkFit basis as JFit. Fix this!");
   }
//    xout << *pFitBasis;
   pFitBasisRaw = pFitBasis->pRawBasis.get();
   pOrbBasisRaw = pOrbBasis->pRawBasis.get();

   nFit = pFitBasis->nFn();
   Jcd = MakeStackMatrix(nFit, nFit, Mem);
   MakeIntMatrix(Jcd, *pFitBasis, *pFitBasis, FKrn2i_Direct(&ir::g_IrCoulombKernel), Mem);
   CalcCholeskyFactors(Jcd);
   m_Log.WriteTiming("fitting metric (coul)", (double)TimerJcd);

   // make the cached integrals themselves.
   size_t
      nAoTr = nAo * (nAo+1)/2;
   Int3ixStorage.resize(nAoTr * nFit);
   Int3ix = FMatrixView(&Int3ixStorage[0], nAoTr, nFit);

   FTimer Timer3ix;
   FMatrixView ScrDen;
   {
      FMemoryStackArray MemStacks(Mem);
      #pragma omp parallel for schedule(dynamic)
      for ( int iShF__ = 0; iShF__ < int(pFitBasisRaw->Shells.size()); ++ iShF__ ) {
         size_t iShF = size_t(iShF__); // that one's for OpenMP.
      //for ( size_t iShF = 0; iShF < pFitBasisRaw->Shells.size(); ++ iShF ) {
         if (m_Log.StatusOkay()) {
            FMemoryStack &Mem1 = MemStacks.GetStackOfThread();
            ir::FRawShell const &ShF = pFitBasisRaw->Shells[iShF];
            size_t nFnF = ShF.nFn();
//             FMemoryStack2
//                // FIXME: this is a hack around the workspace memory size computation...
//                // the *much* better alternative would be to do this op with less space.
//                Mem1(nAo * nAo * nFnF * sizeof(double) + 100000);

            double
               // (\mu\nu|F): nAo x nAo x nFnF
               *pMNF = FormIntMNF(ShF, &ir::g_IrCoulombKernel, pOrbBasisRaw, pFitBasisRaw, Mem1, ScrDen, 0.);

            for (size_t iF = 0; iF < nFnF; ++ iF) {
               FMatrixView
                  MN(pMNF + nAo*nAo*iF, nAo, nAo);
               MN.TriangularReduce(1, &Int3ix(0, iF + pFitBasisRaw->iFn(iShF)));

               assert(MN.TriangularStorageSize1() == nAoTr);
            }

            Mem1.Free(pMNF);
         }
      }
   }
   m_Log.CheckStatus(); // may raise exception.

   double
      fIntSizeMb = double(Int3ix.GetStridedSize()) * double(sizeof(Int3ix[0])) / double(1<<20);
//    xout << format(pTimingFmt) % str(format("3-index integrals (%.2f MB)") % fIntSizeMb) % (double)Timer3ix; xout.flush();
//    xout << format(pTimingFmt) % str(format("3-index integrals (%i MB)") % (size_t)fIntSizeMb) % (double)Timer3ix; xout.flush();
   m_Log.WriteTiming(fmt::format("3-index integrals ({} MB)", (size_t)fIntSizeMb), (double)Timer3ix);
   IR_SUPPRESS_UNUSED_WARNING(Options_);
   IR_SUPPRESS_UNUSED_WARNING(WfDecl_);
}


namespace dfti {

enum FDftiFlags {
   DFTI_AuxiliaryExpand = 0x01,
   DFTI_MakeXc = 0x02,
   DFTI_MakeGradient = 0x04,
   DFTI_MakeGridGradient = 0x08, // in the case of a gradient computation, include the gradient of the integration grid.
   DFTI_MeasureTime = 0x10
};

enum {
   CLOCK_EvalBfn = 0,
   CLOCK_FormRho = 1,
   CLOCK_FormXcMat = 2,
   CLOCK_EvalDftfun = 3,
   CLOCK_GridWtDeriv = 4, // for gradient code.
   CLOCK_CoreGrad = 5,
   CLOCK_TimerCount = 6
};

enum {
   iNoCenter = 0xffffffff
};

struct FDftiResultSet;
struct FDensitySet;
struct FDensitySetInput;

inline void MXMA(double const *pA, size_t iRowStA, size_t iColStA, double const *pB, size_t iRowStB, size_t iColStB, double *pOut, size_t iRowStOut, size_t iColStOut, size_t nRows, size_t nLink, size_t nCols) {
   FMatrixView const
      A(const_cast<double*>(pA), nRows, nLink, iRowStA, iColStA),
      B(const_cast<double*>(pB), nLink, nCols, iRowStB, iColStB);
   FMatrixView
      Out(pOut, nRows, nCols, iRowStOut, iColStOut);
   Mxm(Out, A, B);
}

inline void MXMB(double const *pA, size_t iRowStA, size_t iColStA, double const *pB, size_t iRowStB, size_t iColStB, double *pOut, size_t iRowStOut, size_t iColStOut, size_t nRows, size_t nLink, size_t nCols) {
   FMatrixView const
      A(const_cast<double*>(pA), nRows, nLink, iRowStA, iColStA),
      B(const_cast<double*>(pB), nLink, nCols, iRowStB, iColStB);
   FMatrixView
      Out(pOut, nRows, nCols, iRowStOut, iColStOut);
   Mxm(Out, A, B, 1.0, MXM_Add);
}


// #define RESUME_CPU_CLOCK(iClockId)
// #define PAUSE_CPU_CLOCK(iClockId)

// note: comment with /* */ to avoid multi-line comment warnings.

#define RESUME_CPU_CLOCK(iClockId) \
   if ( Args.MeasureTime() ) \
      Timings[(iClockId)] -= Second()

#define PAUSE_CPU_CLOCK(iClockId) \
   if ( Args.MeasureTime() ) \
      Timings[(iClockId)] += Second()



struct FDftiArgs
{
   uint32_t
      Flags; // combination of DFTI_*.

   // energies, energy gradients, and triangular fock matrices (output) or Fock potential expansion coefficients.
   double *pDfuEnergies; double *pGradient; double *pFockC; double *pFockO;
   // triangular density matrices (input) in case of regular XC, expansion coefficients
   // in terms of auxiliary basis in other case.
   // if pDenO == 0, open-shell contributions are not evaluated.
   double *pDenC; double *pDenO;
   double *pOccOrbC; double *pOccOrbO;
   size_t nOccC; size_t nOccO;

   // basis over which the orbitals are expanded (regular) or auxiliary basis.
   FRawBasis *pBasis;

   // the functional.
   FXcFunctional *pXcFn;

   // integration grid.
   FDftGrid const *pDftGrid;

   double ThrOrb; double LogThrOrb; // threshold for orbital-on-grid screening

   double *pTimings;
public:
   bool OpenShell() const { return pDenO != 0; };
   bool MakeGradient() const { return Flags & DFTI_MakeGradient; };
   bool MakeXc() const { return Flags & DFTI_MakeXc; };
   bool MakeGridGradient() const { return Flags & DFTI_MakeGridGradient; }
   bool UseAuxiliaryExpansion() const { return Flags & DFTI_AuxiliaryExpand; };
   bool MeasureTime() const { return Flags & DFTI_MeasureTime; }
   bool NeedTau() const { return pXcFn->NeedTau(); }
   bool NeedSigma() const { return pXcFn->NeedSigma(); }
   size_t nBf() const { return pBasis->nFn(); }; // number of basis functions in pBasis
   size_t nCenters() const { return pBasis->nCen(); }; // number of centers (required to determine gradient dimension)
   size_t nFockSize() const {
      size_t nBf_ = nBf();
      if (UseAuxiliaryExpansion())
         return nBf_;
      else
         return (nBf_*(nBf_+1))/2;
   }
};

struct FDensitySetInput
{
   double
      *pRdm, *pOccOrb;
   size_t
      nBf, nOcc;
   size_t
      *pMap, nMap;
   bool
      UseOrb,
      Gradient; // keep data for gradient evaluation
   uint
      nComp;
   FDensitySetInput() {};
   FDensitySetInput(double *pRdm_, double *pOccOrb_, size_t nBf_, size_t nOcc_,
         size_t *pMap_, size_t nMap_, bool UseOrb_, bool Gradient_, uint nComp_)
      : pRdm(pRdm_), pOccOrb(pOccOrb_), nBf(nBf_), nOcc(nOcc_), pMap(pMap_), nMap(nMap_),
        UseOrb(UseOrb_), Gradient(Gradient_), nComp(nComp_)
   {};
};

// densities for one spin component.
struct FDensitySet
{
   double
      *pRho,      // electron density: rho(rg) = \sum_i [phi_i(rg)]^2 (len: nPt)
      *pRhoGrd,   // gradient of electron density: d/dx_i rho(rg)  (len: 3*nPt)
      *pTau,      // kinetic term: \sum_i dot(grad phi_i(rg), grad phi_i(rg)) (len: nPt)
      *pUpsilon,  // laplacian of electron density: div grad rho(rg) (len: nPt)
      *pBxRho;    // may be 0. Otherwise pOrbVal contracted with density matrix: arrays nGridPt x nBx x nMap.
   size_t
      nGridPt;
   uint
      nDiff,
      nBx; // number of components of pOrbVal stored in pBxRho (1: only density; 4: density+gradient)
   bool
      MakeTau,
      AuxExpandXc;

   void Init(size_t nGridPt_, uint nDiff_, bool MakeTau_, bool AuxExpandXc_, FMemoryStack &Mem);
   // evaluate densities (+gradients)
   void Eval(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, FMemoryStack &Mem);
   void EvalAux(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, FMemoryStack &Mem);


   // calculate and accumulate xc matrix contribution.
   void AccXcMatrix(double *pXcTriang, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem);
   void AccXcMatrixAux(double *pXcVec, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem);

   // accumulate symmetric contribution mu(r) vdrho(r) nu(r) to given xc matrix.
   // beware of mysterious flags.
   void AccXcSym1(double *pXc, double const *pVdRho, double const *pGridWt, double const *pOrbVal, size_t nMapSt, size_t nMap, uint Flags, FMemoryStack &Mem);

   double *MakePhi(double const *pIn, size_t nPts, size_t Stride, FDensitySetInput const &p, FMemoryStack &Mem) const;
   double *MakeBxRho(double const *pIn, size_t nPts, size_t Stride, FDensitySetInput const &p, FMemoryStack &Mem) const;

   double *CompressAuxVec(double const *pInput, FDensitySetInput const &p, FMemoryStack &Mem) const;
};

void FDensitySet::Init(size_t nGridPt_, uint nDiff_, bool MakeTau_, bool AuxExpandXc_, FMemoryStack &Mem)
{
   nGridPt = nGridPt_;
   nDiff = nDiff_;
   MakeTau = MakeTau_;
   AuxExpandXc = AuxExpandXc_;

   pBxRho = 0;
   nBx = 0;

   Mem.Alloc(pRho, nGridPt);
   if ( nDiff >= 1 )
      Mem.Alloc(pRhoGrd, 3 * nGridPt);
   if ( nDiff >= 2 || MakeTau )
      Mem.Alloc(pTau, nGridPt);
   if ( nDiff >= 2 )
      Mem.Alloc(pUpsilon, nGridPt);
}



enum FAlgOpFlags{
   ALGOP_Add = 0x01,  // add to output instead of replacing it.
   ALGOP_Symmetrize = 0x02 // copy lower triangle to upper triangle.
};


// r = dot(x,y)
template<class FScalar>
FScalar Dot2( FScalar const *IR_RP x, FScalar const *IR_RP y, size_t n )
{
   FScalar
      r = 0;
   size_t
      i = 0;
   for ( ; i < (n & (~3)); i += 4 ) {
      r += x[i]   * y[i];
      r += x[i+1] * y[i+1];
      r += x[i+2] * y[i+2];
      r += x[i+3] * y[i+3];
   }
   for ( ; i < n; ++ i ) {
      r += x[i] * y[i];
   }
   return r;
}


template
double Dot2<double>( double const *IR_RP x, double const *IR_RP y, size_t n );

// r[i] += f * x[i]
void Add2( double *IR_RP r, double const *IR_RP x, double f, std::size_t n )
{
   std::size_t
      i = 0;
   for ( ; i < (n & (~3)); i += 4 ) {
      r[i]   += f * x[i];
      r[i+1] += f * x[i+1];
      r[i+2] += f * x[i+2];
      r[i+3] += f * x[i+3];
   }
   for ( ; i < n; ++ i ) {
      r[i] += f * x[i];
   }
}


// r[i] += f * x[i] * y[i]
void Add2( double *IR_RP r, double const *IR_RP x, double const *IR_RP y, double f, std::size_t n )
{
   std::size_t
      i = 0;
   for ( ; i < (n & (~3)); i += 4 ) {
      r[i]   += f * x[i]   * y[i];
      r[i+1] += f * x[i+1] * y[i+1];
      r[i+2] += f * x[i+2] * y[i+2];
      r[i+3] += f * x[i+3] * y[i+3];
   }
   for ( ; i < n; ++ i ) {
      r[i] += f * x[i] * y[i];
   }
}



// ddots multiple strided sets of vectors.
void DotN(double *pOut, double Factor, double const *pA, size_t nStrideA, double const *pB, size_t nStrideB, size_t nPoints, size_t nSets, uint Flags = 0)
{
   for ( size_t iSet = 0; iSet < nSets; ++ iSet ){
      double
         f = Factor * Dot2(&pA[iSet * nStrideA], &pB[iSet * nStrideB], nPoints);

      if ( Flags == 0 )
         pOut[iSet] = f;
      else {
         assert(Flags == ALGOP_Add);
         pOut[iSet] += f;
      }
   }
}

// daxpys multiple strided sets of vectors.
void AccN(double *pOut, double Factor, double const *pA, size_t nStrideA, double const *pB, size_t nStrideB, size_t nPoints, size_t nSets, uint Flags = 0)
{
   if ( (Flags & ALGOP_Add) == 0 )
      // clear output.
      for ( size_t iPt = 0; iPt < nPoints; ++ iPt )
         pOut[iPt] = 0;

   for ( size_t iSet = 0; iSet < nSets; ++ iSet )
      Add2(pOut, &pA[iSet * nStrideA], &pB[iSet * nStrideB], Factor, nPoints);
}

void FDensitySet::Eval(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, FMemoryStack &Mem)
{
   if (AuxExpandXc)
      return EvalAux(pOrbVal, p, ForceNonNegative, Mem);
}


double *FDensitySet::CompressAuxVec(double const *pInput, FDensitySetInput const &p, FMemoryStack &Mem) const
{
   double *r;
   Mem.Alloc(r, p.nMap);
   for (size_t iMap = 0; iMap < p.nMap; ++ iMap)
      r[iMap] = pInput[p.pMap[iMap]];
   return r;
}

// evaluate densities via auxiliary expansion of density.
void FDensitySet::EvalAux(double const *pOrbVal, FDensitySetInput const &p, bool ForceNonNegative, FMemoryStack &Mem)
{
   void
      *pBeginOfStorage = Mem.Alloc(0); // note: NOT freed in gradient case! (for keeping bxrho!)
   uint
      nComp = p.nComp;
   size_t
      nMap = (size_t)p.nMap,
      nMapSt = nGridPt * nComp; // stride between two basis function entries in OrbVal.

   double
      *pAuxDen = CompressAuxVec(p.pRdm, p, Mem); // compress input density to Map dimensions.
   if ( nDiff == 0 ) {
      // LDA.
      Mxv(pRho,1, pOrbVal,1,nMapSt, pAuxDen,1, nGridPt, nMap);
   } else if ( nDiff == 1 && !MakeTau ) {
      // GGA without tau. Most common case.
      Mxv(pRho,1, pOrbVal,1,nMapSt, pAuxDen,1, nGridPt, nMap);
      Mxv(&pRhoGrd[0*nGridPt],1, &pOrbVal[1*nGridPt],1,nMapSt, pAuxDen,1, nGridPt, nMap, false, 1.0);
      Mxv(&pRhoGrd[1*nGridPt],1, &pOrbVal[2*nGridPt],1,nMapSt, pAuxDen,1, nGridPt, nMap, false, 1.0);
      Mxv(&pRhoGrd[2*nGridPt],1, &pOrbVal[3*nGridPt],1,nMapSt, pAuxDen,1, nGridPt, nMap, false, 1.0);
   } else if ( nDiff == 1 && MakeTau ) {
      assert(0);
   } else if ( nDiff == 2 ) {
      assert(0);
   } else {
      assert(0);
   }

   // make sure that density has not gone negative due to some
   // unfortunate numerical cancellation
   if ( ForceNonNegative )
      for ( size_t iPt = 0; iPt < nGridPt; ++ iPt )
         if ( pRho[iPt] < 0 )
            pRho[iPt] = 0;
   // ^- UKS spin densities can be negative

   if ( !p.Gradient )
      Mem.Free(pBeginOfStorage);
}


// void TransformAbToCo(double *pAC, double *pBO, size_t n)
// {
//    assert( pAC != 0 && pBO != 0 );
//    for ( size_t i = 0; i < n; ++ i ) {
//       double c = pAC[i] + pBO[i];
//       double o = pAC[i] - pBO[i];
//       pAC[i] = c;
//       pBO[i] = o;
//    };
// }
//
// // transform a pair of density sets for alpha/beta densities to closed/open densities
// void TransformAbToCo(FDensitySet &AC, FDensitySet &BO, size_t nGridPt)
// {
//    if ( AC.pRho )
//       TransformAbToCo(AC.pRho, BO.pRho, nGridPt);
//    if ( AC.pRhoGrd )
//       TransformAbToCo(AC.pRhoGrd, BO.pRhoGrd, 3*nGridPt);
//    if ( AC.pTau )
//       TransformAbToCo(AC.pTau, BO.pTau, nGridPt);
//    if ( AC.pUpsilon )
//       TransformAbToCo(AC.pUpsilon, BO.pUpsilon, nGridPt);
// }

struct FDftiResultSet
{
   double
      *pZk,        // dft integrand Exc
      *pVdRho,      // [d/d rho] Exc
      *pVdSigma,    // [d/d sigmaxx] Exc, where xx = cc for closed case and oo for open case
      *pVdSigmaXY,  // [d/d sigmaxy] Exc, where xy = co (for both cases)
      *pVdTau,      // [d/d tau] Exc
      *pVdUpsilon;  // [d/d upsilon] Exc

   void Init(size_t nGridPt, uint nDiff, bool MakeTau, double *pVdSigmaXY_, FMemoryStack &Mem);
};

void FDftiResultSet::Init(size_t nGridPt, uint nDiff, bool MakeTau, double *pVdSigmaXY_, FMemoryStack &Mem)
{
   Mem.ClearAlloc(pZk, nGridPt);
   Mem.ClearAlloc(pVdRho, nGridPt);
   if ( nDiff >= 1 ) {
      Mem.ClearAlloc(pVdSigma, nGridPt);
      if ( pVdSigmaXY_ )
         pVdSigmaXY = pVdSigmaXY_; // store link if provided by other case.
      else
         Mem.ClearAlloc(pVdSigmaXY, nGridPt);
   }
   if ( nDiff >= 2 || MakeTau )
      Mem.ClearAlloc(pVdTau, nGridPt);
   if ( nDiff >= 2 )
      Mem.ClearAlloc(pVdUpsilon, nGridPt);
}

// void FDensitySet::MakeZm(double *&pZm, double *pOrbVal, double *pRhoGrd, size_t nGridPt, size_t nMap, size_t nMapSt, FMemoryStack &Mem)
// {
//    // form Zm(r) := (grad mu(r))*(grad rho(r))
//    Mem.Alloc(pZm, nGridPt * nMap);
//    for ( size_t iMap = 0; iMap < nMap; ++ iMap )
//    {
//       double *pVal = &pOrbVal[nMapSt * iMap] + nGridPt; // start of gradient.
//       double *pOut = &pZm[nMap * iMap];
//       for ( size_t iPt = 0; iPt < nGridPt; ++ iPt )
//       {
//          pOut[iPt] =
//             pRhoGrd[iPt]             * pVal[iPt +   nGridPt] +
//             pRhoGrd[iPt +   nGridPt] * pVal[iPt +   nGridPt] +
//             pRhoGrd[iPt + 2*nGridPt] * pVal[iPt + 2*nGridPt];
//       };
// };




// accumulate xc matrix contribution to triangular full-dimension xc matrix at pXcTriang.
// X denotes the open/closed case at hand (e.g., 'c' when making closed-shell exchange), Y the other case.
// For example, when making the open-shell exchange matrix, then Sigma is SigmaOO and SigmaXY is SigmaCO.
void FDensitySet::AccXcMatrix(double *pXcTriang, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem)
{
   if (AuxExpandXc)
      return AccXcMatrixAux(pXcTriang, pOrbVal, r, pRhoGrdY, p, pGridWt, Mem);
}


// accumulate xc matrix contribution to triangular full-dimension xc matrix at pXcTriang.
// X denotes the open/closed case at hand (e.g., 'c' when making closed-shell exchange), Y the other case.
// For example, when making the open-shell exchange matrix, then Sigma is SigmaOO and SigmaXY is SigmaCO.
void FDensitySet::AccXcMatrixAux(double *pXcVec, double *pOrbVal, FDftiResultSet const &r,
      double *pRhoGrdY, FDensitySetInput const &p, double const *pGridWt, FMemoryStack &Mem)
{
   void
      *pBeginOfStorage = Mem.Alloc(0);
   uint
      nComp = p.nComp;
   size_t
      nMap = (size_t)p.nMap,
      nMapSt = nGridPt * nComp; // stride between two basis function entries in OrbVal.
   double
      *pXc;
   Mem.Alloc(pXc, nMap);

   if ( nDiff == 0 ) {
      // LDA:
      //    k_{\mu\nu}(r) = \mu(r) vrho(r) \nu(r)
      // ... that's it already.
      double *pVdRhoWt;
      Mem.Alloc(pVdRhoWt, nGridPt);
      for (size_t iPt = 0; iPt < nGridPt; ++ iPt)
         pVdRhoWt[iPt] = r.pVdRho[iPt] * pGridWt[iPt];
      Mxv(pXc,1, pOrbVal,nMapSt,1, pVdRhoWt,1, nMap, nGridPt);
//       Mxv(pXc,1, pOrbVal,nMapSt,1, r.VdRho,1, nMap, nGridPt);
   } else if ( nDiff == 1 ) {
      assert(!MakeTau);
//       if ( MakeTau ) {
//          // GGA with tau: same as GGA without tau, but with additional term
//          //    k_{\mu\nu}(r) += vtau(r) [\grad \mu(r)] [\grad \nu(r)]
//          // Do that first (because of mystery symmetrization in AccXcSym1).
//          AccXcSym1(pXc, r.pVdTau, pGridWt, &pOrbVal[1*nGridPt], nMapSt, nMap, 0, Mem);
//          AccXcSym1(pXc, r.pVdTau, pGridWt, &pOrbVal[2*nGridPt], nMapSt, nMap, ALGOP_Add, Mem);
//          AccXcSym1(pXc, r.pVdTau, pGridWt, &pOrbVal[3*nGridPt], nMapSt, nMap, ALGOP_Add | ALGOP_Symmetrize, Mem);
//       }

      // auxiliary version:
      //
      //    k[A] += \sum_g  w(g) vrho(g) A(g) + \sum_g w(g) D[Ecx(r)]/D[grad rho(r)] * grad A(r)
      //    k[A] += \sum_g  w(g) vrho(g) A(g) + \sum_g w(g) vsigma(g) 2 (\grad rho(r))*(\grad A(r))
      //
      // second term comes from derivative D[sigma]/D[grad rho(r)] = 2 grad rho(r)
      // since sigma = [grad rho(r)] x [grad rho(r)]
      double *pVdRhoWt, *pVdGrdWt;
      Mem.Alloc(pVdRhoWt, nGridPt);
      for (size_t iPt = 0; iPt < nGridPt; ++ iPt)
         pVdRhoWt[iPt] = r.pVdRho[iPt] * pGridWt[iPt];
      Mem.Alloc(pVdGrdWt, 3*nGridPt);
      for (size_t iPt = 0; iPt < nGridPt; ++ iPt) {
         pVdGrdWt[iPt + 0*nGridPt] = 2. * pGridWt[iPt] * r.pVdSigma[iPt] * pRhoGrd[iPt + 0*nGridPt];
         pVdGrdWt[iPt + 1*nGridPt] = 2. * pGridWt[iPt] * r.pVdSigma[iPt] * pRhoGrd[iPt + 1*nGridPt];
         pVdGrdWt[iPt + 2*nGridPt] = 2. * pGridWt[iPt] * r.pVdSigma[iPt] * pRhoGrd[iPt + 2*nGridPt];
      }
      Mxv(pXc,1, pOrbVal,nMapSt,1, pVdRhoWt,1, nMap, nGridPt);
      Mxv(pXc,1, &pOrbVal[1*nGridPt],nMapSt,1, &pVdGrdWt[0*nGridPt],1, nMap, nGridPt, true, 1.0);
      Mxv(pXc,1, &pOrbVal[2*nGridPt],nMapSt,1, &pVdGrdWt[1*nGridPt],1, nMap, nGridPt, true, 1.0);
      Mxv(pXc,1, &pOrbVal[3*nGridPt],nMapSt,1, &pVdGrdWt[2*nGridPt],1, nMap, nGridPt, true, 1.0);
   } else {
      assert_rt(0);
      // fixme: add code here once other stuff works (MGGA).
   }

   // expand Map dimension & add to previous vxc vector.
   for ( size_t iMapCol = 0; iMapCol < nMap; ++ iMapCol ) {
      pXcVec[p.pMap[iMapCol]] += pXc[iMapCol];
   }

   Mem.Free(pBeginOfStorage);
}



// struct FDftiJobAux {
//    double
//       *pVxcAuxC, // coefficients of auxiliary expansion of xc functional
//       *pRhoAuxC; // coefficients of auxiliary expansion of electron density
//    // shells covered by this job (typically: all shells on a given atom).
//    ir::FRawShell
//       *pShFs;
//    size_t
//       nShF;
//    // grid block covered by this job
//    FDftGrid::FGridBlock
//       *pGridBlock;
// }

struct FDftiJob {
   FDftiArgs const
      &Args;
   FDftGrid::FGridBlock const
      *pGridBlock;
   // outputs.
   double
      *pFockC, // note: these are assumed to be thread-local (i.e., lock-free)
      *pFockO,
      *pGradient, // geometric gradient (in case iMakeGrad=1)
      *pDfuEnergies; // accumulated dft energies, one for each functional.
   size_t
      nGridPt,
      nDiff,    // derivative order of functional
      nDiffBf,  // derivative order of basis functions we need to evaluate (>= nDiff; required for Gradients)
      nComp,    // number of orbital components at pOrbVal (value; value,dx,dy,dz; value,dx,dy,dz,dxx,dxy,...)
      iGridBlockCenter;
      // ^- atom/center index on which the current block of grid points in locked.
      //    only used in gradient calculations, otherwise set to iNoCenter
   FMemoryStack
      &Mem; // note: this one is assumed to be thread-local

   double const
      *pGridPt, *pGridWt;

   double
      Timings[CLOCK_TimerCount];

   FDftiJob(double *pDfuEnergies, double *pFockC, double *pFockO, double *pGrad, FDftGrid::FGridBlock const *pGridBlock, FDftiArgs &Args, FMemoryStack &Mem);
   void Run();


   void EvalBfn(double *&pOrbVal, size_t *&pMap, size_t &nMap, ptrdiff_t *&pCenterIndices);
   void FormSigma(double *&pSigma, double const *pRhoGrdA, double const *pRhoGrdB);

   void EvalGridWtDeriv(double *&pWtGrd);

   // calculate and accumulate DFTI analytic gradient (core part)
   void AccCoreGradient(double *pGrad, double const *pOrbVal, ptrdiff_t const *pCenterIndices, FDftiResultSet const &r,
     FDensitySet &d, FDensitySetInput const &DenInp, double const *pRhoGrdY, double const *pSigmaXY, bool bGridGrad);

   // calculate and accumulate DFTI analytic gradient (grid weight derivative part)
   void AccGridWtGradient(double *pGrad, double const *pZk);
private:
   FDftiJob(FDftiJob const&); // not implemented
   void operator = (FDftiJob const&); // not implemented.
};


FDftiJob::FDftiJob(double *pDfuEnergies_, double *pFockC_, double *pFockO_, double *pGrad_, FDftGrid::FGridBlock const *pGridBlock_, FDftiArgs &Args_, FMemoryStack &Mem_)
   : Args(Args_), pGridBlock(pGridBlock_), pFockC(pFockC_), pFockO(pFockO_), pGradient(pGrad_), pDfuEnergies(pDfuEnergies_), Mem(Mem_)
{
   nGridPt = pGridBlock->nPt();
   pGridPt = &Args.pDftGrid->Positions[pGridBlock->iFirst][0];
   pGridWt = &Args.pDftGrid->Weights[pGridBlock->iFirst];

   nDiff = Args.pXcFn->NeedSigma() ? 1 : 0;
   nDiffBf = nDiff;
   if (Args.Flags & DFTI_MakeGradient)
      nDiffBf += 1;

   // number of derivative components.
   nComp = ((nDiffBf+1)*(nDiffBf+2)*(nDiffBf+3))/6;

   for ( uint i = 0; i < CLOCK_TimerCount; ++ i )
      Timings[i] = 0;
}




// note: allocates pSigma on this->Mem.
void FDftiJob::FormSigma(double *&pSigma, double const *pRhoGrdA, double const *pRhoGrdB)
{
   // sigmaAB = [grad rhoA]*[grad rhoB]
   Mem.Alloc(pSigma, nGridPt);
   for ( size_t iPt = 0; iPt < nGridPt; ++ iPt ){
      pSigma[iPt] = pRhoGrdA[iPt            ] * pRhoGrdB[iPt            ] +
                    pRhoGrdA[iPt +   nGridPt] * pRhoGrdB[iPt +   nGridPt] +
                    pRhoGrdA[iPt + 2*nGridPt] * pRhoGrdB[iPt + 2*nGridPt];
   }
}




void FDftiJob::Run()
{
   TMemoryLock<char>
      pBaseOfStorage(0, &Mem);

   // evaluate the basis functions (+derivatives) on the grid.
   RESUME_CPU_CLOCK(CLOCK_EvalBfn);
   double
      *pOrbVal;
   size_t
      nMap, *pMap;
   ptrdiff_t
      *pCenterIndices;
   EvalBfn(pOrbVal, pMap, nMap, pCenterIndices);
   PAUSE_CPU_CLOCK(CLOCK_EvalBfn);
   if ( nMap == 0 )
      return; // no functions left after screening.

   bool
      UseOrbitals = false;

   // make electron densities (+derivatives).
   RESUME_CPU_CLOCK(CLOCK_FormRho);
   FDensitySet
      DenC = FDensitySet(), DenO = FDensitySet();
   FDensitySetInput
      DenInpC(Args.pDenC, Args.pOccOrbC, Args.nBf(), Args.nOccC,
         pMap, nMap, UseOrbitals, Args.MakeGradient(), nComp),
      DenInpO;

   bool
      ForceNonNegative = true;
//    if ( Args.GridDensityFlags == 2 )
//       // just evaluate densities -- for some applications these might not be
//       // entirely positive.
//       ForceNonNegative = false;

   DenC.Init(nGridPt, nDiff, Args.NeedTau(), Args.UseAuxiliaryExpansion(), Mem);
   DenC.Eval(pOrbVal, DenInpC, ForceNonNegative, Mem);
   if ( Args.OpenShell() ) {
      DenInpO = DenInpC;
      DenInpO.pRdm = Args.pDenO;
      DenInpO.pOccOrb = Args.pOccOrbO;
      DenInpO.nOcc = Args.nOccO;

      DenO.Init(nGridPt, nDiff, Args.NeedTau(), Args.UseAuxiliaryExpansion(), Mem);
      DenO.Eval(pOrbVal, DenInpO, false, Mem);

//       bool
//          bHaveAB = (bool)Args.iUseAB && UseOrbitals;
//       if ( bHaveAB )
//          // input was alpha/beta orbitals, thus we have alpha/beta densities now.
//          // Transform this into closed/open densities, which the rest of the
//          // code expects.
//          TransformAbToCo(DenC, DenO, nGridPt);
   }
   PAUSE_CPU_CLOCK(CLOCK_FormRho);

//    // if requested, store densities to external array or add densities from
//    // external array.
//    if ( Args.GridDensityFlags != 0 ) {
//       uint
//          nExtSt = Args.nGridDensitySt;
//       double
//          *pExtDenC = &Args.pGridDensitiesC[nExtSt * iFirstGridPt],
//          *pExtDenO = &Args.pGridDensitiesO[nExtSt * iFirstGridPt];
//       if ( Args.pGridDensitiesC != 0 )
//          DenC.StoreOrLoad(pExtDenC, nExtSt, Args.GridDensityFlags);
//       if ( Args.pGridDensitiesO != 0 && Args.OpenShell() )
//          DenO.StoreOrLoad(pExtDenO, nExtSt, Args.GridDensityFlags);
//    };
//
//    if ( Args.GridDensityFlags == 2 )
//       return; // done here.

   // form intermediates sigma_cc/sigma_co/sigma_oo.
   double
      *pSigmaCC = 0, *pSigmaOO = 0, *pSigmaCO = 0;
   if ( nDiff >= 1 ) {
      FormSigma(pSigmaCC, DenC.pRhoGrd, DenC.pRhoGrd);
      if ( Args.OpenShell() ){
         FormSigma(pSigmaCO, DenC.pRhoGrd, DenO.pRhoGrd);
         FormSigma(pSigmaOO, DenO.pRhoGrd, DenO.pRhoGrd);
      }
   }

   // allocate output data: functional kernels and derivatives
   // of kernel with respect to input data.
   // zk,vrhoc,vrhoo,vsigmacc,vsigmaco,vsigmaoo,vtauc,vtauo,vupsilonc,vupsilono
//    FDftiResultSet
//       ResC = {0}, ResO = {0};
// ^- why does g++ warn about this? This is perfectly well defined in both C and C++! Annoying.
   FDftiResultSet
      ResC = FDftiResultSet(), ResO = FDftiResultSet();
   ResC.Init(nGridPt, nDiff, Args.NeedTau(), 0, Mem);
   ResO.Init(nGridPt, nDiff, Args.NeedTau(), ResC.pVdSigmaXY, Mem);

   // branch out to fortran in order to evaluate the functional
   // and the split energy contributions (=sum zk(g) wt(g), but split according to functional)
   RESUME_CPU_CLOCK(CLOCK_EvalDftfun);
   double
      *pTmp;
   Mem.Alloc(pTmp, 20*nGridPt);
   DFTFUN_CXX(1, Args.OpenShell(), nDiff, nGridPt, pGridWt,
              DenC.pRho, DenO.pRho, pSigmaCC, pSigmaCO, pSigmaOO, DenC.pTau, DenO.pTau, DenC.pUpsilon, DenO.pUpsilon,
              // density functional outputs (summed over functionals)
              ResC.pZk, ResC.pVdRho, ResO.pVdRho, ResC.pVdSigma, ResC.pVdSigmaXY, ResO.pVdSigma, ResC.pVdTau, ResO.pVdTau, ResC.pVdUpsilon, ResO.pVdUpsilon,
              // energy outputs (split according to functions; one output per ndftu; accumulated)
              pDfuEnergies,
              // buffer (for accumulating zk)
              pTmp,
              // flags specifying input data
              (FORTINT)(DenC.pTau != 0), (FORTINT)(DenC.pUpsilon != 0));
   Mem.Free(pTmp);
   PAUSE_CPU_CLOCK(CLOCK_EvalDftfun);

   if ( Args.MakeXc() ) {
      // assemble & accumulate xc matrices.
      RESUME_CPU_CLOCK(CLOCK_FormXcMat);
      DenC.AccXcMatrix(pFockC, pOrbVal, ResC, DenO.pRhoGrd, DenInpC, pGridWt, Mem);
      if ( Args.OpenShell() )
         DenO.AccXcMatrix(pFockO, pOrbVal, ResO, DenC.pRhoGrd, DenInpO, pGridWt, Mem);
      PAUSE_CPU_CLOCK(CLOCK_FormXcMat);
   }

}


// #ifdef HAVE_GRID_KERNEL
bool Vec3Eq(double const *pA, double const *pB) {
   return pA[0] == pB[0] && pA[1] == pB[1] && pA[2] == pB[2];
}

// pOrbValAo: format nGridPt * nComp * nAo  (after: nGridPt * nComp * nMap)
void IrEvalBfn(double *pOrbValAo, size_t *pMap, size_t &nMap, FMatrixView Grid, uint nDiff, FRawBasis const *pBasis, double ThrOrb, double LogThrOrb, FMemoryStack &Mem)
{
   TMemoryLock<char>
      pBaseOfMemory(0, &Mem);
   size_t
      nGridPt = Grid.nCols,
      nComp = 1,
      nAo = pBasis->nFn();
   IR_SUPPRESS_UNUSED_WARNING(nAo);
   if ( nDiff == 1 ) nComp = 4;
   if ( nDiff == 2 ) nComp = 10;

   // convert basis format for low level driver input...
   FRawBasis::FShellArray
      &ShellsA = const_cast<FRawBasis*>(pBasis)->Shells;

   nMap = 0;
//    assert(Grid.nRowSt == 1 && Grid.nRows == 3);
   assert(Grid.nRowSt == 1);

   size_t
      iFnBase = 0;
   ir::FRawShell
      *pLastBf = &ShellsA[0];
   for ( ir::FRawShell *pFirstBf = &ShellsA[0]; pFirstBf != &ShellsA[0] + ShellsA.size(); pFirstBf = pLastBf ) {
      pLastBf = pFirstBf;
      size_t nFn = 0;
      while ( pLastBf != &ShellsA[0] + ShellsA.size() && Vec3Eq(pLastBf->vCen, pFirstBf->vCen) ) {
         nFn += pLastBf->nFn();
         ++pLastBf;
      }
      size_t
         nBfStride = nGridPt*nComp;
      EvalShellGroupOnGrid(&pOrbValAo[nBfStride*nMap], nGridPt, nBfStride, // comp stride, bf stride
         pMap, nMap, iFnBase, &Grid(0,0), Grid.nColSt, nGridPt,
         pFirstBf, pLastBf, &pFirstBf->vCen[0],
         nDiff, ThrOrb, LogThrOrb, Mem);
      iFnBase += nFn;
   }
   assert(iFnBase == nAo && nMap <= nAo);
   assert(pLastBf == &ShellsA[0] + ShellsA.size());
}
// #endif




void FDftiJob::EvalBfn(double *&pOrbVal, size_t *&pMap, size_t &nMap, ptrdiff_t *&pCenterIndices)
{
   size_t
      nBf = Args.nBf();
   Mem.Alloc(pCenterIndices, nBf); // used in gradient evaluation
   Mem.Alloc(pMap, nBf);
   Mem.Alloc(pOrbVal, nGridPt * nComp * nBf);

//    // make temporary space for orbs-on-grid-routine (this way it doesn't
//    // have to allocate memory itself)
//    double
//       *pBuf;
//    uint
//       nBuf = 250000;
//    Mem.Alloc(pBuf, nBuf);
//
//    AIC_EVAL_BFN_ON_GRID( pOrbVal, nComp, pCenterIndices, pMap, nMap,
//       Args.infg, Args.expt, Args.cc, Args.sph, Args.nGrp, nBf, Args.pCenters,
//       pGridPt, nGridPt, nDiffBf, Args.ThrOrb, Args.LogThrOrb, pBuf, nBuf );
//    // ^- note that this is actually a C++ function (in AicFB.cpp). We just call
//    //    its exported Fortran interface.
   FMatrixView
//       Grid(&Args.pDftGrid->Positions[pGridBlock->iFirst][0], 3, nGridPt);
      Grid(const_cast<double*>(pGridPt), 3, nGridPt);
   IrEvalBfn(pOrbVal, pMap, nMap, Grid, nDiff, Args.pBasis, Args.ThrOrb, Args.LogThrOrb, Mem);

   // fix up pMap and pCenterIndices from 1-based addressing into 0-based addressing.
   // (FIXME: IR version does 0-based maps automatically?)
//    for ( size_t iMap = 0; iMap < (size_t)nMap; ++ iMap ) {
//       pMap[iMap] -= 1;
// //       pCenterIndices[iMap] -= 1;
//    }
#if 0
   xout << "CenIdx = [";
   for ( uint iMap = 0; iMap < nMap; ++ iMap ) {
      if ( iMap != 0 ) xout << " ";
      xout << pCenterIndices[iMap];
   }
   xout << "]" << std::endl;
#endif
}





void AccXc(FDftiArgs &Args, FLog &Log, FMemoryStack &Mem_)
{

   size_t
      nDftFunc = 1,
      nFockSize = Args.nFockSize();

   // clear output & timings
   memset(Args.pDfuEnergies, 0, sizeof(double) * nDftFunc);

   size_t nJobs = Args.pDftGrid->GridBlocks.size();
   double Timings[CLOCK_TimerCount];
   if (1) {
//       Args.Flags |= DFTI_MeasureTime;
      Args.pTimings = &Timings[0];
      memset(Args.pTimings, 0, sizeof(double) * CLOCK_TimerCount);
   }

   FMemoryStackArray MemStacks(Mem_);

   bool Aborted = false;
   #pragma omp parallel for schedule(dynamic)
   //for (size_t iJob = 0; iJob < nJobs; ++ iJob ) {
   for (int iJob__ = 0; iJob__ < int(nJobs); ++ iJob__ ) {
      size_t iJob = size_t(iJob__); // for OpenMP.
      FMemoryStack &Mem = MemStacks.GetStackOfThread();

      if (!Log.StatusOkay())
         Aborted = true;
      #pragma omp flush (Aborted)
      if (!Aborted) {
         // ^- both "break" and "continue" seem to be problemaic..
         TMemoryLock<char>
            pBaseOfMemory(0, &Mem);
         FDftGrid::FGridBlock const
            *pGridBlock = &Args.pDftGrid->GridBlocks[iJob];

//          FMemoryStack2
//             // FIXME: this is a hack around the workspace memory size computation...
//             // the *much* better alternative would be to do this op with less space.
//             Mem1(2*nFockSize * sizeof(double) + 100000);
//          TMemoryLock<char> pHackyFreeMe1(0, &Mem1);
         FMemoryStack &Mem1 = Mem;

         double
            *pDfuEnergies = 0,
            *pFockC = 0,
            *pFockO = 0,
            *pGrad = 0;
         Mem.ClearAlloc(pDfuEnergies, nDftFunc);
         if (Args.MakeXc()) {
            Mem1.ClearAlloc(pFockC, nFockSize);
            Mem1.ClearAlloc(pFockO, nFockSize);
         }
         if (Args.MakeGradient()) {
            Mem.ClearAlloc(pGrad, 3*Args.nCenters());
         }

         FDftiJob
            Job(pDfuEnergies, pFockC, pFockO, pGrad, pGridBlock, Args, Mem);
         Job.Run();

         // hmhm... my Molpro version does this in a much more clever fashion.
         // (only joins threads once). Can this be done with openmp?
         // FIXME: this should really be fixed. Current version needs a ClearAlloc on the output quantities,
         // for every single grid block! Lame.
         #pragma omp critical
         {
            Add2(Args.pDfuEnergies, pDfuEnergies, 1.0, nDftFunc);
            if (Args.MakeXc()) {
               Add2(Args.pFockC, pFockC, 1.0, nFockSize);
               if (Args.OpenShell())
                  Add2(Args.pFockO, pFockO, 1.0, nFockSize);
            }
            if (Args.MakeGradient()) {
               Add2(Args.pGradient, pGrad, 1.0, 3*Args.nCenters());
            }
            if (Args.MeasureTime())
               Add2(Args.pTimings, Job.Timings, 1.0, CLOCK_TimerCount);
         }
      }
   }

   Log.CheckStatus();
   assert_rt(!Aborted); // you should not be here if this was set.

   if (Args.MeasureTime()) {
      xout << format(pTimingFmt) % "DFT (basis fn.)" % Args.pTimings[CLOCK_EvalBfn];
      xout << format(pTimingFmt) % "DFT (densities)" % Args.pTimings[CLOCK_FormRho];
      if (Args.MakeXc())
         xout << format(pTimingFmt) % "DFT (xc matrix)" % Args.pTimings[CLOCK_FormXcMat];
      xout << format(pTimingFmt) % "DFT (functional)" % Args.pTimings[CLOCK_EvalDftfun];
      if (Args.MakeGradient())
         xout << format(pTimingFmt) % "DFT (core grad.)" % Args.pTimings[CLOCK_CoreGrad];
      if (Args.MakeGradient() && Args.MakeGridGradient())
      xout << format(pTimingFmt) % "DFT (grid grad.)" % Args.pTimings[CLOCK_GridWtDeriv];
   }
}





} // namespace dfti


void FFockComponentBuilderDfCoulXcCached::AccFock(FMatrixView &FockC, FMatrixView &FockO, FBasisSet *pBasis, FMatrixView const &OrbC, FMatrixView const &OrbO, uint Flags, FMemoryStack &Mem)
{
   FFockComponentBuilder::AccFock(FockC, FockO, pBasis, OrbC, OrbO, Flags, Mem); // consistency checks.

   bool AuxExpandXc = true;
//    AuxExpandXc = false;

   if (pBasis != pOrbBasis)
      throw std::runtime_error("FFockComponentBuilderDfCoulXcCached must now be used with orb-projected initial guess (not Fock-projected guess!)");
//    if (pBasis != pOrbBasis) {
//       m_pTimers->Enter(0x201, "DF-J/XC (Guess)");
//       // hm hm. This might be a problem since we project Fock matrices...
//       // (note: not sure if this actually *accumulates* matrices, instead of overwriting them)
//       FockC.Clear();
//       MakeCoul(FockC, &ir::g_IrCoulombKernel, &*pBasis->pRawBasis, &*pFitBasis->pRawBasis, OrbC, OrbO, Jcd, Mem);
//
//       if (pXcFn) {
//          using namespace dfti;
//
//          size_t nGu = pBasis->nFn();
//
//          FStackMatrix
//             DenC(nGu, nGu, &Mem);
//          Mxm(DenC, OrbC, Transpose(OrbC));
//          DenC.TriangularReduce(1, 0, 1.);
//
//          FStackMatrix
//             FockTr(nGu, nGu, &Mem);
//          FockTr.Clear();
//
//          EnergyXc = 0.;
//          FDftiArgs DftiArgs = {
//             DFTI_MakeXc,
//             &EnergyXc, 0, FockTr.pData, 0,
//             DenC.pData, 0, 0, 0, 0, 0, // no orbitals and no open-shell density provided.
//             pBasis->pRawBasis.get(),
//             pXcFn.get(),
//             pDftGrid.get(),
//             1e-10, 40 // 1e-1 * ThrDen?
//          };
//          AccXc(DftiArgs, m_Log, Mem);
//          FockTr.TriangularExpand();
//          Add(FockC, FockTr, 1.0);
//          xout << format(pResultFmt) % "Density functional" % EnergyXc;
//       }
//
//       m_pTimers->Leave(0x201, "DF-J/XC (Guess)");
//       return;
//    }

   if ( nAo != OrbC.nRows || nAo != OrbO.nRows || nFit != Jcd.nRows )
      throw std::runtime_error("AccFock: Input orbitals not consistent with orbital basis set.");

   m_pTimers->Enter(0x200, "DF-J/XC");
   FStackMatrix
      Density(nAo, nAo, &Mem), // OrbC x OrbC.T + OrbO x OrbO.T
      jgamma(nFit, 1, &Mem),   // (A|rs) gamma[r,s].
      AuxDen(nFit, 1, &Mem);   // (A|B)^{-1} jgamma[B]
   m_pTimers->Resume(0x201, "DF-J/XC (RDM)");
   Mxm(Density, OrbC, Transpose(OrbC));
   Mxm(Density, OrbO, Transpose(OrbO), MXM_Add);

   Density.TriangularReduce(1, 0, 2.); // reduce to triangular form; multiply off diagonal elements by 2.0
   m_pTimers->Pause(0x201, "DF-J/XC (RDM)");

   m_pTimers->Resume(0x202, "DF-J/XC (1x density fit)");
   Mxva(jgamma.pData, Transpose(Int3ix), Density.pData);

   Move(AuxDen, jgamma);

   //   Solve[Jcd[AB] D[\nu B i]] -> D[\nu A I]  to get density coefficients
   CholeskySolve(AuxDen, Jcd);
   m_pTimers->Pause(0x202, "DF-J/XC (Density fit)");

   FStackMatrix
      AuxDen1(nFit, 1, &Mem);
   Move(AuxDen1, AuxDen);

   if (1) {
      m_pTimers->Resume(0x203, "DF-J/XC (coul. energy)");
      // make coulomb energy from 2ix integrals:
      // Ecoulomb = .5 j[A] (A|B) j[B]
      // (since we use a robust fit this is equal to the density matrix contraction.
      // the advantage here is that we can do it also in the xc auxiliary expansion case,
      // where we will not get a pure j matrix).
//       FStackMatrix
//          cgamma(nFit, 1, &Mem);
//       Move(cgamma, jgamma);
//       TriangularMxm(Transpose(cgamma), Jcd, 'R');
//       EnergyCoulomb = .5 * dfti::Dot2(cgamma.pData, cgamma.pData, nFit);
      EnergyCoulomb = .5 * dfti::Dot2(AuxDen.pData, jgamma.pData, nFit);
      m_pTimers->Pause(0x203, "DF-J/XC (coul. energy)");
   }

   if (pXcFn && AuxExpandXc) {
      m_pTimers->Resume(0x204, "DF-J/XC (xc contrib.)");
      using namespace dfti;
      FStackMatrix
         vxc(nFit, 1, &Mem);

      vxc.Clear();
      FDftiArgs DftiArgs = {
         DFTI_MakeXc | DFTI_AuxiliaryExpand,
         &EnergyXc, 0, vxc.pData, 0,
         AuxDen.pData, 0, 0, 0, 0, 0, // no orbitals and no open-shell density provided.
         pFitBasisRaw,
         pXcFn.get(),
         pDftGrid.get(),
         1e-10, 40, 0 // 1e-1 * ThrDen?
      };
      AccXc(DftiArgs, m_Log, Mem);
//       vxc.Print(xout, "AUXILIARY XC POTENTIAL (pure, pre-J^1).");
      if (0) {
         FStackMatrix
            vxcd(nFit, 1, &Mem),
            Scd(nFit, nFit, &Mem);
         MakeIntMatrix(Scd, *pFitBasis, *pFitBasis, FKrn2i_Direct(&ir::g_IrOverlapKernel), Mem);
//          CalcCholeskyFactors(Scd);
         Move(vxcd, vxc);
//          CholeskySolve(vxcd, Scd);
//          Mxm(vxcd, Scd, vxc);
         vxcd.Print(xout, "AUXILIARY XC POTENTIAL (S^-1).");
         xout << format(" <vxc,auxden> = %18.12f\n") % Dot2(vxcd.pData, AuxDen1.pData, nFit);
      }
      CholeskySolve(vxc, Jcd);
      Add(AuxDen, vxc);
//       AuxDen1.Print(xout, "AUXILIARY DENSITY.");
//       vxc.Print(xout, "DENSITY-LIKE XC POTENTIAL.");
//       xout << format(pResultFmt) % "Density functional" % DfuEnergy;
//       xout << format(pResultFmt) % "Functional energy" % DfuEnergy;
//       EnergyXc
      m_pTimers->Pause(0x204, "DF-J/XC (xc contrib.)");
   }



   // recalculate integrals to form the coulomb matrix.
//    FormIntMNF_ContractF(Coul, pIntKernel, pOrbBasis, pFitBasis, AuxDen.pData, Mem);
   FStackMatrix
      FockTr(nAo, nAo, &Mem);

   m_pTimers->Resume(0x205, "DF-J/XC (j/vxc matrix)");
   Mxva(FockTr.pData, Int3ix, AuxDen.pData);
   m_pTimers->Pause(0x205, "DF-J/XC (j/vxc matrix)");

   // coulomb energy.
//    EnergyCoulomb = .5 * dfti::Dot2(Density.pData, FockTr.pData, FockTr.TriangularStorageSize1());


//    FockTr.Clear(); // FIXME: REMOVE THIS.
   if (pXcFn && !AuxExpandXc) {
      m_pTimers->Resume(0x204, "DF-J/XC (xc contrib.)");
      using namespace dfti;
      FStackMatrix
         DenC(nAo, nAo, &Mem);
      Mxm(DenC, OrbC, Transpose(OrbC));
      DenC.TriangularReduce(1, 0, 1.);

      FDftiArgs DftiArgs = {
         DFTI_MakeXc,
         &EnergyXc, 0, FockTr.pData, 0,
         DenC.pData, 0, 0, 0, 0, 0, // no orbitals and no open-shell density provided.
         pOrbBasisRaw,
         pXcFn.get(),
         pDftGrid.get(),
         1e-10, 40, 0 // 1e-1 * ThrDen?
      };
      AccXc(DftiArgs, m_Log, Mem);
//       xout << format(pResultFmt) % "Density functional" % DfuEnergy;
//       xout << format(pResultFmt) % "Functional energy" % DfuEnergy;
      m_pTimers->Pause(0x204, "DF-J/XC (xc contrib.)");
   }

   Energy = EnergyCoulomb + EnergyXc;

   FockTr.TriangularExpand();
//    FockTr.Print(xout, "FOCK + XC");
//    throw std::runtime_error("absicht.");
   Add(FockC, FockTr, 1.0);

   m_pTimers->Leave(0x200, "DF-J/XC");
}

void FFockComponentBuilderDfCoulXcCached::PrintEnergyContribs()
{
   m_Log.WriteResult("Coulomb energy", EnergyCoulomb);
   m_Log.WriteResult("Density functional energy", EnergyXc);
}

















} // namespace ct
