/* Copyright (c) 2015  Gerald Knizia
 * 
 * This file is part of the IboView program (see: http://www.iboview.org)
 * 
 * IboView is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3.
 * 
 * IboView is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with bfint (LICENSE). If not, see http://www.gnu.org/licenses/
 * 
 * Please see IboView documentation in README.txt for:
 * -- A list of included external software and their licenses. The included
 *    external software's copyright is not touched by this agreement.
 * -- Notes on re-distribution and contributions to/further development of
 *    the IboView software
 */

#include <algorithm> // for std::swap
#include <cmath>
#include "CxVec3.h"
#include "IrAmrr.h"
#include "Ir.h"
// #include <immintrin.h>
#if 0
   #include "RDTSC.h"
#else
   #define RESUME_CLOCK(x)
   #define PAUSE_CLOCK(x)
#endif
// using ct::FMemoryStack;
using namespace ct;
using std::size_t;
typedef ct::TVector3<double>
   FVec3;

// #define CONTRACT_WITH_BLAS
#ifdef CONTRACT_WITH_BLAS
   #include "CxAlgebra.h" // for DGER (vxv)
#endif

// #include <stdio.h>    // FIXME: remove this
// #include <iostream> // FIXME: remove this

// references:
//   [1]: PCCP 8 3072 (2006), doi: 10.1039/b605188j
//   [2]: PCCP 6 5119 (2004), doi: 10.1039/b413539c

namespace ir {

// Returns x multiplied by itself; works for any type T providing operator*.
template<class T>
inline T sqr(T x)
{
   T const xSquared = x * x;
   return xSquared;
}

// Returns x^(3/2), evaluated as x * sqrt(x).
inline double pow15(double x)
{
   double const fRoot = std::sqrt(x);
   return x * fRoot;
}

inline double DistSq3(double const *pA, double const *pB) {
   return sqr(pA[0] - pB[0]) + sqr(pA[1] - pB[1]) + sqr(pA[2] - pB[2]);
}

// Component-wise difference of two 3-vectors: pOut[i] = pA[i] - pB[i], i = 0..2.
// Components are processed in ascending order.
inline void SubVec3(double *pOut, double const *pA, double const *pB)
{
   for (int iComp = 0; iComp != 3; ++ iComp)
      pOut[iComp] = pA[iComp] - pB[iComp];
}

// // pOut += f * pIn
// static void Add2(double *IR_RP pOut, double const *IR_RP pIn, double f, size_t n)
// {
//    size_t i = 0;
//    for ( ; i < (n & ~3); i += 4 ) {
//       pOut[i]   += f * pIn[i];
//       pOut[i+1] += f * pIn[i+1];
//       pOut[i+2] += f * pIn[i+2];
//       pOut[i+3] += f * pIn[i+3];
//    }
//    for ( ; i != n; ++ i ) {
//       pOut[i] += f * pIn[i];
//    }
// }


// Hooks for aligning the contraction intermediates. As defined here they are
// no-ops: ALIGN_CO_MEM(Mem) expands to nothing, and ALIGN_CO_SIZE_DBL(n) and
// ALIGN_CO_SIZE_DBLx(n) expand to their argument n unchanged.
#define ALIGN_CO_MEM(Mem)
#define ALIGN_CO_SIZE_DBL(n) n
#define ALIGN_CO_SIZE_DBLx(n) n

#ifdef __AVX__
   // pOut[i] += f * pIn[i] for all i < n.
   // Plain loop without manual unrolling. Original author's note: this
   // version works okay if compiled with -march=native; in practice it seems
   // to work similarly well as other versions in which stuff is actually
   // aligned.
   static void Add2(double *IR_RP pOut, double const *IR_RP pIn, double f, size_t n)
   {
      size_t i = 0;
      while (i < n) {
         pOut[i] += f * pIn[i];
         ++ i;
      }
   }
#else
   // pOut[i] += f * pIn[i] for all i < n.
   // Manually unrolled by four; the up to three remaining elements are
   // handled by a short trailing loop.
   static void Add2(double *IR_RP pOut, double const *IR_RP pIn, double f, size_t n)
   {
      size_t const
         nBlocked = n - (n & 3); // largest multiple of 4 which is <= n
      size_t
         i = 0;
      while (i < nBlocked) {
         pOut[i]   += f * pIn[i];
         pOut[i+1] += f * pIn[i+1];
         pOut[i+2] += f * pIn[i+2];
         pOut[i+3] += f * pIn[i+3];
         i += 4;
      }
      for ( ; i < n; ++ i)
         pOut[i] += f * pIn[i];
   }
#endif




// Shell-pair screening test. Returns false only if both shells carry range
// data (pRange) and the squared distance of their centers exceeds the square
// of the sum of their MaxCoRange() values; returns true otherwise.
static bool IsWithinRange(FRawShell const *pA, FRawShell const *pB)
{
   if (pA->pRange && pB->pRange)
      return DistSq3(pA->vCen, pB->vCen) <= ir::sqr(pA->MaxCoRange() + pB->MaxCoRange());
   return true; // at least one shell has no screening data.
}

// Primitive-pair screening test for primitive iExpA of shell A and primitive
// iExpB of shell B, given the squared center distance fDistSqAB. Returns false
// only if both shells carry range data (pRange) and fDistSqAB exceeds the
// square of ExpRange(iExpA) + ExpRange(iExpB).
static bool IsPrimitiveWithinRange(FRawShell const *pA, uint iExpA, FRawShell const *pB, uint iExpB, double fDistSqAB)
{
   if (pA->pRange && pB->pRange)
      return fDistSqAB <= ir::sqr(pA->ExpRange(iExpA) + pB->ExpRange(iExpB));
   return true; // at least one shell has no screening data.
}

// Contraction-pair screening test for contraction iCoA of shell A and
// contraction iCoB of shell B, given the squared center distance fDistSqAB.
// Returns false only if both shells carry range data (pRange) and fDistSqAB
// exceeds the square of CoRange(iCoA) + CoRange(iCoB).
static bool IsContractionWithinRange(FRawShell const *pA, uint iCoA, FRawShell const *pB, uint iCoB, double fDistSqAB)
{
   if (pA->pRange && pB->pRange)
      return fDistSqAB <= ir::sqr(pA->CoRange(iCoA) + pB->CoRange(iCoB));
   return true; // at least one shell has no screening data.
}

// Convenience overload: computes the squared distance between the centers of
// shells A and B and forwards to the five-argument IsPrimitiveWithinRange.
static bool IsPrimitiveWithinRange(FRawShell const *pA, uint iExpA, FRawShell const *pB, uint iExpB)
{
   double const
      fDistSqAB = DistSq3(pA->vCen, pB->vCen);
   return IsPrimitiveWithinRange(pA, iExpA, pB, iExpB, fDistSqAB);
}


// Auxiliary quantities for the product of two primitive Gaussians with
// exponents ExpA and ExpB, centered at the 3-vectors vA and vB.
// All members are computed once in the constructor.
struct FGaussProduct
{
   double
      ExpA, ExpB, // exponents of the primitives
      Eta, // ExpA + ExpB
      InvEta, // 1/(ExpA+ExpB)
      Exp, // exponent of the primitive product: ExpA*ExpB/(ExpA+ExpB)
      DistSq; // squared distance between A and B
   double
      vCen[3], // exponent-weighted center (ExpA*vA + ExpB*vB)/(ExpA+ExpB)
      vAmB[3]; // vA - vB

   // vA, vB: pointers to three doubles each (centers of the primitives).
   // ExpA_, ExpB_: exponents of the primitives. No check for Eta == 0 is made.
   FGaussProduct(double const *vA, double ExpA_, double const *vB, double ExpB_)
      : ExpA(ExpA_), ExpB(ExpB_), Eta(ExpA_ + ExpB_), InvEta(1./Eta),
        Exp(ExpA * ExpB * InvEta), DistSq(0)
   {
      for ( unsigned i = 0; i < 3; ++ i ) {
         vCen[i] = InvEta * (ExpA * vA[i] + ExpB * vB[i]);
         vAmB[i] = vA[i] - vB[i];
         DistSq += vAmB[i] * vAmB[i];
      }
   }

   // returns exp(-Exp * DistSq). Declared const such that it can also be
   // invoked on const objects.
   double Sab() const {
      return std::exp(-Exp * DistSq);
   }
};


// Allocates nC+1 offsets on Mem and fills them with the running sum of the
// nFn() values of the shells pCs[0..nC): entry iC is the number of functions
// preceding shell iC, and entry nC is the total number of functions.
// Mem is aligned to 16 after the allocation. The array is not freed here.
static size_t *MakeFnOffsets(FRawShell const *pCs, size_t nC, FMemoryStack &Mem)
{
   size_t *pOffsets;
   Mem.Alloc(pOffsets, nC+1);
   Mem.Align(16);

   size_t nFnSoFar = 0;
   for (size_t iShell = 0; iShell != nC; ++ iShell) {
      pOffsets[iShell] = nFnSoFar;
      nFnSoFar += pCs[iShell].nFn();
   }
   pOffsets[nC] = nFnSoFar;
   return pOffsets;
}

// Dummy double which is handed as last argument to the Mem.AllocN(...) and
// Mem.ClearAllocN(...) calls in this file; their results are stored in
// double* variables.
// NOTE(review): presumably only the type of this argument matters (selecting
// the element type of the allocation), not its value -- confirm against
// FMemoryStack.
static double
   dbl = 0.; // dummy.

#ifdef CONTRACT_WITH_BLAS

   // accumulate matrix nSize  at pIn, corresponding to primitive iExpC,
   // to nSize x nCo at pOut.
   // BLAS variant: a single DGER call with scale factor 1.0. The coefficient
   // vector starts at pC->pCo + iExpC and is traversed with increment pC->nExp.
   // NOTE(review): presumably DGER is the BLAS rank-1 update
   // (pOut += 1.0 * pIn x co^T with leading dimension nSize) -- confirm
   // against the declaration in CxAlgebra.h.
   static void Contract1(double *pOut, double *pIn, size_t nSize, FRawShell const *pC, uint iExpC)
   {
      DGER(nSize, pC->nCo, 1.0, pIn, 1, pC->pCo + iExpC, pC->nExp, pOut, nSize);
   }

   #ifdef INCLUDE_OPTIONALS
   // same as Contract1, but additionally multiply with the supplied float.
   // BLAS variant: identical DGER call as in Contract1, with fScale passed
   // as scale factor instead of 1.0.
   static void Contract1f(double *pOut, double *pIn, double fScale, size_t nSize, FRawShell const *pC, uint iExpC)
   {
      DGER(nSize, pC->nCo, fScale, pIn, 1, pC->pCo + iExpC, pC->nExp, pOut, nSize);
   }

   // same as Contract1f, but process multiple sets with strides and holes.
   // The input consists of nSets sets of nSize entries each; set iSet starts
   // at pIn[StrideIn*iSet] (requires nSize <= StrideIn). The output is dense:
   // set iSet starts at pOut[nSize*iSet] and the leading dimension handed to
   // DGER is nSize*nSets.
   static void Contract1fh(double *pOut, size_t nSize, double *pIn, size_t StrideIn,
      double fScale, size_t nSets, FRawShell const *pC, uint iExpC)
   {
      assert(nSize <= StrideIn);
      if (StrideIn == nSize)
         // no holes in the input: treat all sets as one vector of length nSize*nSets.
         DGER(nSize*nSets, pC->nCo, fScale, pIn, 1, pC->pCo + iExpC, pC->nExp, pOut, nSize*nSets);
      else {
         // input has holes: one DGER call per set.
         for (size_t iSet = 0; iSet < nSets; ++ iSet)
            DGER(nSize, pC->nCo, fScale, &pIn[StrideIn*iSet], 1, pC->pCo + iExpC, pC->nExp, &pOut[nSize*iSet], nSize*nSets);
      }
   }
   #endif // INCLUDE_OPTIONALS

#else
   // accumulate matrix nSize  at pIn, corresponding to primitive iExpC,
   // to nSize x nCo at pOut.
   static void Contract1(double *pOut, double *pIn, size_t nSize, FRawShell const *pC, uint iExpC)
   {
      for (uint iCoC = 0; iCoC < pC->nCo; ++ iCoC) {
         double fCoC = pC->fCo(iExpC,iCoC);
         if (fCoC != 0)
            Add2(&pOut[nSize*iCoC], pIn, fCoC, nSize);
      }
   }

   #ifdef INCLUDE_OPTIONALS
   // same as Contract1, but additionally multiply with the supplied float.
   static void Contract1f(double *pOut, double *pIn, double fScale, size_t nSize, FRawShell const *pC, uint iExpC)
   {
      for (uint iCoC = 0; iCoC < pC->nCo; ++ iCoC) {
         double fCo = fScale * pC->fCo(iExpC,iCoC);
         if (fCo != 0)
            Add2(&pOut[nSize*iCoC], pIn, fCo, nSize);
      }
   }

   // same as Contract1f, but process multiple sets with strides and holes.
   static void Contract1fh(double *pOut, size_t nSize, double *pIn, size_t StrideIn,
      double fScale, size_t nSets, FRawShell const *pC, uint iExpC)
   {
      assert(nSize <= StrideIn);
      if (StrideIn == nSize)
         return Contract1f(pOut, pIn, fScale, nSize*nSets, pC, iExpC);
      for (uint iCoC = 0; iCoC < pC->nCo; ++ iCoC) {
         double fCo = fScale * pC->fCo(iExpC,iCoC);
         if (fCo != 0) {
            for (size_t iSet = 0; iSet < nSets; ++ iSet) {
               Add2(&pOut[nSize*(iSet + nSets*iCoC)], &pIn[StrideIn*iSet], fCo, nSize);
            }
         }
      }
   }
   #endif // INCLUDE_OPTIONALS
#endif // CONTRACT_WITH_BLAS

#if 0
// this is the base version of the 2e3c integral routine. It is currently
// replaced by the generalized version below, which separates the contraction
// work from the primitive work to support derivative integrals and inline-
// contracting drivers. It is retained here in order to illustrate what the
// generalized version does.

void EvalInt2e3c_(double *pOut, size_t *Strides,
    FRawShell const *pA, FRawShell const *pB, FRawShell const *pCs, size_t nC,
    double Prefactor, FIntegralKernel const *pKernel, FMemoryStack &Mem)
{
   void
      *pBaseOfMemory = Mem.Alloc(0);
   size_t
      StrideA = Strides[0], StrideB = Strides[1], StrideC = Strides[2];
   if (pA->l < pB->l) { // <- OsrrC only implemented for la >= lb.
      std::swap(pA, pB);
      std::swap(StrideA, StrideB);
   }

   // count number of C functions and find largest lc for memory purposes.
   size_t
      *piFnC = MakeFnOffsets(pCs, nC, Mem),
      nFnC_Total = piFnC[nC];
   uint
      lc_Max = 0;
   for (size_t iC = 0; iC < nC; ++ iC)
      lc_Max = std::max(lc_Max, (uint)pCs[iC].l);
   // allocate intermediates
   size_t
      nCartX_AB = nCartX(pA->l + pB->l),
      nCartX_Am1 = nCartX(pA->l-1),
      nCartX_B = nCartX(pB->l),
      nCartX_ABmA = nCartX_AB - nCartX_Am1,
      nShA_CartXB = pA->nSh() * nCartX_B;
   // intermediates for primitive integrals
   double
      // Kernel derivatives (-d/dT)^m of (00|00) integral
      *pGm = Mem.AllocN(pA->l + pB->l + lc_Max + 1, dbl),
      // (a0|0) intermediate
      *p_A00 = Mem.AllocN(nCartX_AB, dbl),
      *p_A0C_sh_mem = Mem.AllocN(nCartX_ABmA * (2*lc_Max+1), dbl),
      *pMemOsrrB = Mem.AllocN(nCartX_AB * nCartX(lc_Max), dbl);
   // intermediates for contractions
   double
      // intermediates (a0|c) with AB primitives and C contracted, a = la..lab
      *p_A0C_ppc = Mem.AllocN(nCartX_ABmA * nFnC_Total, dbl),
      // intermediates (a0|c) with A,C contracted, a = la..lab.
      *p_A0C_cpc = Mem.AllocN(nCartX_ABmA * nFnC_Total * pA->nCo, dbl),
      // intermediates (a0|c) with A,B,C all contracted, a = la..lab.
      *p_A0C_ccc = Mem.ClearAllocN(nCartX_ABmA * nFnC_Total * pA->nCo * pB->nCo, dbl),
      // intermediates (xa|c) with A,B,C contracted and (xa| = nCartX(lb) x (2*la+1)
      *p_xAC_ccc = Mem.AllocN(nShA_CartXB * nFnC_Total * pA->nCo * pB->nCo, dbl);

//    printf("size on p_A0C_ccc = %i  expected: %i\n", p_xAC_ccc-p_A0C_ccc, nCartX_ABmA * nFnC_Total * pA->nCo * pB->nCo);
   FVec3
      vAmB = FVec3(pA->vCen) - FVec3(pB->vCen);
   double
//       fRangeKernel = sqr(pKernel->MaxRange()),
      fDistSqAB = LengthSq(vAmB);

   for (uint iExpB = 0; iExpB < pB->nExp; ++ iExpB)
   {
      memset(p_A0C_cpc, 0, nCartX_ABmA * nFnC_Total * pA->nCo * sizeof(*p_A0C_cpc));
      for (uint iExpA = 0; iExpA < pA->nExp; ++ iExpA)
      {
         // skip if Dist(A,B) < Range(A) + Range(B)
         if (!IsPrimitiveWithinRange(pA, iExpA, pB, iExpB, fDistSqAB))
            continue;

         FGaussProduct
            OvAB(pA->vCen, pA->pExp[iExpA], pB->vCen, pB->pExp[iExpB]);
            // ^- P == OvAB.vCen
         double
            Sab = std::exp(-OvAB.Exp * fDistSqAB), // [1] (6)
            PmA[3];
         SubVec3(PmA, OvAB.vCen, pA->vCen);

         memset(p_A0C_ppc, 0, nCartX_ABmA * nFnC_Total * sizeof(*p_A0C_ppc));
         for (size_t iC = 0; iC < nC; ++ iC) {
            FRawShell const
               *pC = &pCs[iC];
            uint
               TotalL = pA->l + pB->l + pC->l;
            for (uint iExpC = 0; iExpC < pC->nExp; ++ iExpC)
            {
               FGaussProduct
                  OvPC(OvAB.vCen, OvAB.Eta, pC->vCen, pC->pExp[iExpC]);
               double
                  *PmC = OvPC.vAmB,
                  rho = OvPC.Exp, // [1] (3)
                  T = rho * OvPC.DistSq,
                  Factor = pow15(M_PI * OvPC.InvEta) * Sab * Prefactor; // [1] (7)

               // make I[m] = (00|0)^m, m = 0..TotalL (inclusive)
               pKernel->EvalGm(pGm, rho, T, TotalL, Factor);

               // make (a0|0)^m for a = 0..lab with lab + la+lb.
               OsrrA(p_A00, pGm + pC->l, (pA->l + pB->l), PmA[0], PmA[1], PmA[2],
                  PmC[0], PmC[1], PmC[2], rho, OvAB.InvEta);

               // make (a0|c) for a = la..lab, c = 0..lc.
               double
                  *p_A0C_sh;
               if (pC->l == 0) {
                  p_A0C_sh = p_A00 + nCartX_Am1;
               } else {
                  p_A0C_sh = p_A0C_sh_mem;
                  OsrrB_3c_shc(p_A0C_sh, p_A00, pMemOsrrB, pA->l, (pA->l + pB->l), pC->l,
                     PmC[0], PmC[1], PmC[2], OvPC.InvEta, rho/pC->pExp[iExpC]);
               }

               // (a0|c) with solid harmonic c is ready now. Just need to add it to
               // its contractions.
               Contract1(&p_A0C_ppc[nCartX_ABmA * piFnC[iC]], p_A0C_sh,
                  nCartX_ABmA*pC->nSh(), pC, iExpC);
            } // c exponents
         } // c shells
         // p_A0C_ppc should be done now. Contract A and B.
         Contract1(p_A0C_cpc, p_A0C_ppc, nCartX_ABmA * nFnC_Total, pA, iExpA);
      } // a exponents
      Contract1(p_A0C_ccc, p_A0C_cpc, (nCartX_ABmA * nFnC_Total * pA->nCo), pB, iExpB);
   } // b exponents

   // transform A to solid harmonics by factoring nCartX(lab) into nCartX(lb) x Slm(A).
   ShTrA_XY(p_xAC_ccc, p_A0C_ccc, pA->l, (pA->l + pB->l), nFnC_Total * pA->nCo * pB->nCo);

   // we now have nCartX(lb) x nShA x nFnC_Total x nCoA x nCoB at p_xAC_ccc.
   // we still need to move the angular momentum from a to b and to write the
   // output integrals to their final destination.
   for (uint iCoB = 0; iCoB < pB->nCo; ++ iCoB)
      for (uint iCoA = 0; iCoA < pA->nCo; ++ iCoA) {
         for (uint iFnC = 0; iFnC < nFnC_Total; ++ iFnC) {
            uint
               iFnA = iCoA * pA->nSh(), iFnB = iCoB * pB->nSh();
            OsrrC(
               &pOut[iFnA*StrideA + iFnB*StrideB + iFnC*StrideC], StrideA, StrideB,
               &p_xAC_ccc[nShA_CartXB * (iFnC + (nFnC_Total * (iCoA + pA->nCo * iCoB)))],
               vAmB[0], vAmB[1], vAmB[2], pB->l, pA->nSh() );
         };
      }

   Mem.Free(pBaseOfMemory);
}

//          if ( Sab < 1e-10 )
//             continue;
//          if (!IsPrimitiveWithinRange(pA, iExpA, pB, iExpB, fDistSqAB) && Sab > 1e-10) {
//             printf("Problem: A[l=%i exp=%f rg=%f]  B[l=%i  exp=%f  rg=%f]  fDistAB = %10.3f  Sab = %.2e\n",
//                   pA->l, pA->pExp[iExpA], pA->ExpRange(iExpA),
//                   pB->l, pB->pExp[iExpB], pB->ExpRange(iExpB),  std::sqrt(fDistSqAB), Sab);
//          }

//          if (!IsContractionWithinRange(pA, iCoA, pB, iCoB, fDistSqAB)) {
//             uint
//                iFnA = iCoA * pA->nSh(), iFnB = iCoB * pB->nSh();
//             for (uint iFnC = 0; iFnC < nFnC_Total; ++ iFnC)
//                for (uint iB = iFnB; iB != iFnB + pB->nSh(); ++ iB)
//                   for (uint iA = iFnA; iA != iFnA + pA->nSh(); ++ iA)
//                      pOut[iA*StrideA + iB*StrideB + iFnC*StrideC] = 0.;
//             continue;
//          }
/*
// evaluate (ab|c) for given range of shells [pA[0]..pA[nA]) x [pB[0]..pB[nB]) x [pC[0]..pC[nC])
// Output integral [ia,ib,ic] is stored at pOut[Strides[0]*ia + Strides[1]*ib + Strides[2]*ic],
// and integrals are multiplied by Prefactor.
void EvalInt2e3c(double *pOut, size_t *Strides,
    FRawShell const *pAs, size_t nA, FRawShell const *pBs, size_t nB, FRawShell const *pCs, size_t nC,
    double Prefactor, FIntegralKernel const *pKernel, FMemoryStack &Mem)
{
   for ( size_t iA = 0; iA < nA; ++ iA ) {
      for ( size_t iB = 0; iB < nB; ++ iB ) {
         // ...
      }
   }
};






void EvalInt2e3c(double *pOut, size_t *Strides,
    FRawShell const &A, FRawShell const &B, FRawShell const &C,
    double Prefactor, FIntegralKernel const *pKernel, FMemoryStack &Mem)
{
   return EvalInt2e3c(pOut, Strides, &A,1, &B,1, &C,1, pKernel, Mem);
};*/
#endif // 0



// without inlining the 3c main driver function (a prerequisite for inlining the
// virtual functions) there can be substantial call overhead. Not disastrous
// yet, but quite noticeable.
// As the driver function is rather large, compilers might be inclined to ignore
// our ``inline'' directive. Try to tell them that we really mean it.
//
// IR_FORCE_INLINE expands to the compiler-specific forced-inlining specifier
// where one is known (__GNUC__ or _MSC_VER defined), and to plain ``inline''
// for all other compilers.
#ifdef __GNUC__ // g++
   #define IR_FORCE_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
   #define IR_FORCE_INLINE __forceinline
#else
   #define IR_FORCE_INLINE inline
#endif


// this auxiliary structure covers common functionality of 3-center shell
// drivers for direct 2e3c integrals, derivative integrals, and inline-
// contracting variants of the same.
//
// Derived classes implement the contraction hooks declared below and call
// EvalDrv() to run the primitive loops. EvalDrv invokes the hooks in this order:
//    BeginContractions()
//       for each iExpA:   BeginContraction1(iExpA)
//          for each iExpB (not screened out):   BeginContraction2(iExpA, iExpB)
//             for each C shell and C primitive:   ContractPrimitiveA0C(...)
//          EndContraction2(iExpA, iExpB)
//       EndContraction1(iExpA)
//    EndContractions()
struct FInt3cShellFactory
{
   // note: if pA_->l < pB_->l, the roles of A and B are exchanged and SwapAB
   // is set. Derived classes see the exchanged pA/pB.
   FInt3cShellFactory(FRawShell const *pA_, FRawShell const *pB_, FRawShell const *pCs_, size_t nC_,
                      double Prefactor_, FIntegralKernel const *pKernel_, FMemoryStack &Mem_)
      : pA(pA_), pB(pB_), pCs(pCs_), nC(nC_), Prefactor(Prefactor_), pKernel(pKernel_), Mem(Mem_)
   {
      SwapAB = pA->l < pB->l;
      if (SwapAB) // <- OsrrC only implemented for la >= lb.
         std::swap(pA, pB);
   }

   // this class has virtual functions and is meant to be derived from;
   // a virtual destructor makes destruction via base pointers well-defined.
   virtual ~FInt3cShellFactory() {}
protected:
   // main driver: sets up intermediates (InitDrv), loops over the primitives
   // of A, B, and all C shells, and calls the contraction hooks.
   // lab_Min/lab_Max: range of angular momenta a of the (a0| intermediates
   // handed to ContractPrimitiveA0C. ShTrC: if set, the c components of
   // (a0|c) are solid harmonics (2lc+1), otherwise cartesians (nCartY(lc)).
   IR_FORCE_INLINE void EvalDrv(uint lab_Min, uint lab_Max, bool ShTrC);

   FRawShell const
      *pA, *pB, *pCs; // shells A and B (possibly exchanged, see SwapAB), and array of nC shells C.
   size_t
      nC; // number of shells in pCs
   double
      Prefactor; // multiplied into the primitive integral prefactor in EvalDrv.
   FIntegralKernel const
      *pKernel; // supplies the Gm(rho,T) via EvalGm.
   FMemoryStack
      &Mem; // storage for all intermediates.

   virtual void BeginContractions() = 0;
   virtual void BeginContraction1(uint iExpA) = 0;
   virtual void BeginContraction2(uint iExpA, uint iExpB) = 0;
   // p_A0C: primitive (a0|c) integrals for primitives iExpA, iExpB, iExpC;
   // iC: index of shell pC in pCs; iFnC: function offset of shell iC (piFnC[iC]).
   virtual void ContractPrimitiveA0C(double *p_A0C, uint iExpA, uint iExpB, uint iExpC, size_t iC, size_t iFnC, FRawShell const *pC) = 0;
   virtual void EndContraction2(uint iExpA, uint iExpB) = 0;
   virtual void EndContraction1(uint iExpA) = 0;
   virtual void EndContractions() = 0;
private:
   void
      *pBaseOfMemory; // state of Mem at the start of InitDrv; passed to Mem.Free at the end of EvalDrv.
   double
      *pGm, *p_A00, *p_A0C_mem, *pMemOsrrB; // primitive intermediates allocated in InitDrv.
   inline void InitDrv(uint lab_Min, uint lab_Max, bool ShTrC);
protected:
   bool
      SwapAB; // set if A and B were swapped due to la < lb.
   // the following members are set in InitDrv, i.e., they are valid only
   // during EvalDrv (in particular, inside of the contraction hooks).
   FVec3
      vAmB; // pA->vCen - pB->vCen
   size_t
      *piFnC, // function offsets of the C shells (nC+1 entries)
      nFnC_Total; // total number of functions in the C shells (piFnC[nC])
   size_t
      nCartX_ABmx, nCartX_ABmn, nCartX_ABac; // nCartX(lab_Max), nCartX(lab_Min-1), and their difference.
};

// Prepares a driver run: remembers the current state of Mem (pBaseOfMemory),
// computes vAmB and the function offsets of the C shells, and allocates the
// intermediates for the primitive integrals on Mem.
//    lab_Min, lab_Max: range of angular momenta a of the (a0| intermediates.
//    ShTrC: if set, space for (2*lc+1) c components per shell is reserved,
//       otherwise for nCartY(lc) components.
// Nothing allocated here is freed in this function; EvalDrv releases it via
// Mem.Free(pBaseOfMemory).
void FInt3cShellFactory::InitDrv(uint lab_Min, uint lab_Max, bool ShTrC)
{
   pBaseOfMemory = Mem.Alloc(0);
   Mem.Align(16);

   vAmB = FVec3(pA->vCen) - FVec3(pB->vCen);

   // count number of C functions and find largest lc for memory purposes.
   piFnC = MakeFnOffsets(pCs, nC, Mem);
   nFnC_Total = piFnC[nC];
   uint
      lc_Max = 0;
   for (size_t iC = 0; iC < nC; ++ iC)
      lc_Max = std::max(lc_Max, (uint)pCs[iC].l);
   // largest number of c components of any C shell
   size_t
      nCompC_Max = ShTrC? (2*lc_Max+1) : nCartY(lc_Max);

   // allocate intermediates for primitive integrals
   nCartX_ABmx = nCartX(lab_Max); // number of (a0| components for a in 0..lab_Max
   nCartX_ABmn = nCartX(lab_Min-1); // number of (a0| components for a in 0..lab_Min-1
//    nCartX_ABac = nCartX_ABmx - nCartX_ABmn; // actual number of components: difference
   nCartX_ABac = ALIGN_CO_SIZE_DBL(nCartX_ABmx - nCartX_ABmn); // actual number of components: difference

   // Kernel derivatives (-d/dT)^m of (00|00) integral
   pGm = Mem.AllocN(lab_Max + lc_Max + 1, dbl);

   // (a0|0) intermediate
   p_A00 = Mem.AllocN(nCartX_ABmx, dbl);
   ALIGN_CO_MEM(Mem)
   // (a0|c) intermediate and scratch space handed to the OsrrB routine.
   p_A0C_mem = Mem.AllocN(nCartX_ABac * nCompC_Max, dbl);
   pMemOsrrB = Mem.AllocN(nCartX_ABmx * nCartX(lc_Max), dbl);

   // reference the screening helpers which may be unused in this file.
   IR_SUPPRESS_UNUSED_WARNING(ir::IsWithinRange);
   IR_SUPPRESS_UNUSED_WARNING(ir::IsContractionWithinRange);
   IR_SUPPRESS_UNUSED_WARNING(static_cast<bool (*)(FRawShell const *, uint, FRawShell const *, uint, double)>(ir::IsPrimitiveWithinRange));
   IR_SUPPRESS_UNUSED_WARNING(static_cast<bool (*)(FRawShell const *, uint, FRawShell const *, uint)>(ir::IsPrimitiveWithinRange));
}

// Main primitive loop of the 3-center drivers. For each primitive pair (A,B)
// which passes the range screening, and each primitive of each C shell, the
// primitive (a0|c) integrals for a = lab_Min..lab_Max are generated and handed
// to ContractPrimitiveA0C. The Begin*/End* hooks are called around the
// A, B loops to let the derived class organize its contraction work.
//    ShTrC: selects OsrrB_3c_shc (c as solid harmonics) if set, and
//       OsrrB_3c_cac (c as cartesians) otherwise.
// All memory allocated on Mem since the start of InitDrv (including whatever
// the hooks allocated) is released at the end via Mem.Free(pBaseOfMemory).
void FInt3cShellFactory::EvalDrv(uint lab_Min, uint lab_Max, bool ShTrC)
{
   InitDrv(lab_Min, lab_Max, ShTrC);

   double
//       fRangeKernelSq = sqr(pKernel->MaxRange()),
      fDistSqAB = LengthSq(vAmB);

   // choose the function to use for the OsrrB recurrence relation. We have
   // a normal one (producing (a0|c) with nCartY(lc) cartesian cs) and an inline-
   // ShTr'ing one (producing (a0|c) with (2lc+1) solid harmonic cs)
   typedef
      void (*FOsrrBFn)(double *IR_RP pOut, double const *IR_RP pIn, double *IR_RP pMem, int la, unsigned lab, unsigned lc, double fPmQx, double fPmQy, double fPmQz, double InvEtaABC, double riz);
   FOsrrBFn
      OsrrB_3c = ShTrC? OsrrB_3c_shc : OsrrB_3c_cac;

   BeginContractions();
   for (uint iExpA = 0; iExpA < pA->nExp; ++ iExpA)
   {
      BeginContraction1(iExpA);
      for (uint iExpB = 0; iExpB < pB->nExp; ++ iExpB)
      {
         // skip if Dist(A,B) > Range(A) + Range(B), i.e., if the primitives
         // are too far apart. Note that in this case neither
         // BeginContraction2 nor EndContraction2 is called for the pair.
         if (!IsPrimitiveWithinRange(pA, iExpA, pB, iExpB, fDistSqAB))
            continue;

         FGaussProduct
            OvAB(pA->vCen, pA->pExp[iExpA], pB->vCen, pB->pExp[iExpB]);
            // ^- P == OvAB.vCen
         double
            Sab = std::exp(-OvAB.Exp * fDistSqAB), // [1] (6)
            PmA[3];
         SubVec3(PmA, OvAB.vCen, pA->vCen);

         BeginContraction2(iExpA, iExpB);
         for (size_t iC = 0; iC < nC; ++ iC) {
            FRawShell const
               *pC = &pCs[iC];
//             double
//                fDistSqACK = DistSq3(pA->vCen, pC->vCen) - fRangeKernelSq,
//                fDistSqBCK = DistSq3(pB->vCen, pC->vCen) - fRangeKernelSq;
            uint
               TotalL = lab_Max + pC->l;
            for (uint iExpC = 0; iExpC < pC->nExp; ++ iExpC)
            {
//                if (!IsPrimitiveWithinRange(pA, iExpA, pC, iExpC, fDistSqACK) ||
//                    !IsPrimitiveWithinRange(pB, iExpB, pC, iExpC, fDistSqBCK))
//                   continue;

               // combine the AB product Gaussian (center P, exponent Eta) with
               // the C primitive.
               FGaussProduct
                  OvPC(OvAB.vCen, OvAB.Eta, pC->vCen, pC->pExp[iExpC]);
               double
                  *PmC = OvPC.vAmB,
                  rho = OvPC.Exp, // [1] (3)
                  T = rho * OvPC.DistSq,
                  Factor = pow15(M_PI * OvPC.InvEta) * Sab * Prefactor; // [1] (7)

               // make I[m] = (00|0)^m, m = 0..TotalL (inclusive)
               pKernel->EvalGm(pGm, rho, T, TotalL, Factor);

               // make (a0|0)^m for a = 0..lab_Max.
               OsrrA(p_A00, pGm + pC->l, lab_Max, PmA[0], PmA[1], PmA[2],
                  PmC[0], PmC[1], PmC[2], rho, OvAB.InvEta);

               // make (a0|c) for a = lab_Min..lab_Max, c = 0..lc.
               double
                  *p_A0C;
// #ifndef ALIGN_CO
               if (pC->l == 0) {
                  // s-type c: the (a0|0) integrals can be used directly;
                  // just skip the components with a < lab_Min.
                  p_A0C = p_A00 + nCartX_ABmn;
               } else {
// #endif
                  p_A0C = p_A0C_mem;
                  OsrrB_3c(p_A0C, p_A00, pMemOsrrB, lab_Min, lab_Max, pC->l,
                     PmC[0], PmC[1], PmC[2], OvPC.InvEta, rho/pC->pExp[iExpC]);
               }

               // (a0|c) is ready now. Just need use it in its contractions.
               ContractPrimitiveA0C(p_A0C, iExpA, iExpB, iExpC, iC, piFnC[iC], pC);
            } // c exponents
         } // c shells
         EndContraction2(iExpA, iExpB);
      } // b exponents
      EndContraction1(iExpA);
   } // a exponents
   EndContractions();

   Mem.Free(pBaseOfMemory);
}

// calculates normal integrals (ab|c) and stores them at
//    pOut[ia * StrideA + ib * StrideB + ic * StrideC].
//
// The primitive loops are run by FInt3cShellFactory::EvalDrv; this class
// supplies the contraction hooks. Primitive (a0|c) integrals are contracted
// in three stages (over C, then B, then A) and finally transformed to solid
// harmonics in A and redistributed between A and B by OsrrC.
// note: pA/pB here refer to the shells *after* the base class exchanged them
// in case of la < lb (SwapAB). StrideA/StrideB are exchanged to match in
// SetupCoBuffers.
struct FInt2e3cShellFactory : public FInt3cShellFactory
{
   FInt2e3cShellFactory(
      double *pOut_, size_t StrideA_, size_t StrideB_, size_t StrideC_,
      FRawShell const *pA_, FRawShell const *pB_, FRawShell const *pCs_, size_t nC_, double Prefactor_, FIntegralKernel const *pKernel_, FMemoryStack &Mem_)
      : FInt3cShellFactory(pA_, pB_, pCs_, nC_, Prefactor_, pKernel_, Mem_),
        pOut(pOut_), StrideA(StrideA_), StrideB(StrideB_), StrideC(StrideC_)
   {}

   // evaluate the integrals and write them to pOut.
   // note: SetupCoBuffers exchanges the stored strides if SwapAB is set;
   // this object is therefore meant for a single call of Eval().
   void Eval() {
      EvalDrv(pA->l, pA->l + pB->l, true);
   }
protected:
   double
      *pOut;
   size_t
      StrideA, StrideB, StrideC, nFnC_Out;

   // intermediates for contractions
   double
      // intermediates (a0|c) with AB primitives and C contracted, a = la..lab
      *p_A0C_ppc,
      // intermediates (a0|c) with B,C contracted, a = la..lab.
      *p_A0C_cpc,
      // intermediates (a0|c) with A,B,C all contracted, a = la..lab.
      *p_A0C_ccc;

   // allocates the three contraction buffers for nFnC_ output functions on C.
   // Only p_A0C_ccc is cleared here; the other two are cleared in
   // BeginContraction1/BeginContraction2.
   IR_FORCE_INLINE void SetupCoBuffers(size_t nFnC_) {
      assert(nCartX_ABac == (nCartX(pA->l + pB->l) - nCartX((signed)pA->l-1)));
      nFnC_Out = nFnC_;

      if (SwapAB)
         std::swap(StrideA, StrideB);

      // allocate intermediates for partially contracted A0C integrals.
      p_A0C_ppc = Mem.AllocN(nCartX_ABac * nFnC_Out, dbl);
      p_A0C_cpc = Mem.AllocN(nCartX_ABac * nFnC_Out * pB->nCo, dbl);
      p_A0C_ccc = Mem.ClearAllocN(nCartX_ABac * nFnC_Out * pB->nCo * pA->nCo, dbl);
   }

   IR_FORCE_INLINE void BeginContractions() { // override
      SetupCoBuffers(nFnC_Total);
   }

   // new A primitive: reset the B,C-contracted intermediate.
   IR_FORCE_INLINE void BeginContraction1(uint /*iExpA*/) { // override
      memset(p_A0C_cpc, 0, nCartX_ABac * nFnC_Out * pB->nCo * sizeof(*p_A0C_cpc));
   }

   // new (A,B) primitive pair: reset the C-contracted intermediate.
   IR_FORCE_INLINE void BeginContraction2(uint /*iExpA*/, uint /*iExpB*/){ // override
      memset(p_A0C_ppc, 0, nCartX_ABac * nFnC_Out * sizeof(*p_A0C_ppc));
   }

   // add primitive (a0|c) integrals of C primitive iExpC to the contractions of shell pC.
   IR_FORCE_INLINE void ContractPrimitiveA0C(double *p_A0C, uint /*iExpA*/, uint /*iExpB*/, uint iExpC, size_t /*iC*/, size_t iFnC, FRawShell const *pC) { // override
      Contract1(&p_A0C_ppc[nCartX_ABac * iFnC], p_A0C,
         nCartX_ABac*pC->nSh(), pC, iExpC);
   }

   // all C primitives done for this (A,B) pair: contract over B.
   IR_FORCE_INLINE void EndContraction2(uint /*iExpA*/, uint iExpB) { // override
      Contract1(p_A0C_cpc, p_A0C_ppc, nCartX_ABac * nFnC_Out, pB, iExpB);
   }

   // all B primitives done for this A primitive: contract over A.
   IR_FORCE_INLINE void EndContraction1(uint iExpA) { // override
      Contract1(p_A0C_ccc, p_A0C_cpc, (nCartX_ABac * nFnC_Out * pB->nCo), pA, iExpA);
   }

   IR_FORCE_INLINE void EndContractions() { // override
      // transform A to solid harmonics by factoring nCartX(lab) into nCartX(lb) x Slm(A).
      size_t
         nShA_CartXB = (2*pA->l+1) * nCartX(pB->l);
      double
         // intermediates (xa|c) with A,B,C contracted and (xa| = nCartX(lb) x (2*la+1)
         *p_xAC_ccc = Mem.AllocN(nShA_CartXB * nFnC_Out * pB->nCo * pA->nCo, dbl);
      ShTrA_XY(p_xAC_ccc, p_A0C_ccc, pA->l, (pA->l + pB->l), nFnC_Out * pB->nCo * pA->nCo);

      // we now have nCartX(lb) x nShA x nFnC_Out x nCoB x nCoA at p_xAC_ccc
      // (B contractions run faster than A contractions).
      // we still need to move the angular momentum from a to b and to write the
      // output integrals to their final destination.
      for (size_t iCoB = 0; iCoB < pB->nCo; ++ iCoB) {
         size_t
            iFnB = iCoB * pB->nSh();
         for (size_t iCoA = 0; iCoA < pA->nCo; ++ iCoA) {
            size_t
               iFnA = iCoA * pA->nSh();
            for (size_t iFnC = 0; iFnC < nFnC_Out; ++ iFnC) {
               OsrrC(
                  &pOut[iFnA*StrideA + iFnB*StrideB + iFnC*StrideC], StrideA, StrideB,
                  &p_xAC_ccc[nShA_CartXB * (iFnC + (nFnC_Out * (iCoB + pB->nCo * iCoA)))],
                  vAmB[0], vAmB[1], vAmB[2], pB->l, pA->nSh() );
            }
         }
      }
   }
};


void EvalInt2e3c(double *pOut, size_t *Strides,
    FRawShell const *pA, FRawShell const *pB, FRawShell const *pCs, size_t nC,
    double Prefactor, FIntegralKernel const *pKernel, FMemoryStack &Mem)
{
//    EvalInt2e3c_(pOut, Strides, pA, pB, pCs, nC, Prefactor, pKernel, Mem);
   FInt2e3cShellFactory(pOut, Strides[0], Strides[1], Strides[2],
      pA, pB, pCs, nC, Prefactor, pKernel, Mem).Eval();
}




// output: contracted kernels Fm(rho,T), format: (TotalL+1) x nCoA x nCoC
void Int2e2c_EvalCoKernels(double *pCoFmT, uint TotalL,
    FRawShell const *pA, FRawShell const *pC,
    double PrefactorExt, FIntegralKernel const *pKernel, FMemoryStack &Mem)
{
   double
      t = DistSq3(pA->vCen, pC->vCen),
      *pFmT;
   Mem.Alloc(pFmT, TotalL + 1); // FmT for current primitive.

   // loop over primitives (that's all the per primitive stuff there is)
   for (uint iExpC = 0; iExpC < pC->nExp; ++ iExpC)
   for (uint iExpA = 0; iExpA < pA->nExp; ++ iExpA)
   {
      double
         Alpha = pA->pExp[iExpA],
         Gamma = pC->pExp[iExpC],
         InvEta = 1./(Alpha + Gamma),
         Rho = (Alpha * Gamma)*InvEta, // = (Alpha * Gamma)*/(Alpha + Gamma)
         Prefactor = (M_PI*InvEta)*std::sqrt(M_PI*InvEta); // = (M_PI/(Alpha+Gamma))^{3/2}

      Prefactor *= PrefactorExt;
      if(pC->l) Prefactor *= std::pow( 1.0/(2*Gamma), (int)pC->l); // <- use Hermites with D Ax := [1/(2 alpha)] \partial/\partial A_i.
      if(pA->l) Prefactor *= std::pow(-1.0/(2*Alpha), (int)pA->l); // <- -1 because \partial_A R \propto -\partial_B R!

      // calculate derivatives (D/Dt)^m exp(-rho t) with t = (A-C)^2.
      pKernel->EvalGm(pFmT, Rho, Rho*t, TotalL, Prefactor);

      // convert from Gm(rho,T) to Fm(rho,T) by absorbing powers of rho
      // (those would normally be present in the R of the MDRR)
      double
         RhoPow = 1.;
      for ( uint i = 0; i < TotalL + 1; ++ i ){
         pFmT[i] *= RhoPow;
         RhoPow *= 2*Rho;
      }

      // contract (lamely). However, normally either nCo
      // or nExp, or TotalL (or even all of them at the same time)
      // will be small, so I guess it's okay.
      for (uint iCoC = 0; iCoC < pC->nCo; ++ iCoC)
      for (uint iCoA = 0; iCoA < pA->nCo; ++ iCoA) {
         double CoAC = pC->pCo[iExpC + pC->nExp*iCoC] *
                       pA->pCo[iExpA + pA->nExp*iCoA];
         Add2(&pCoFmT[(TotalL+1)*(iCoA + pA->nCo*iCoC)],
               pFmT, CoAC, (TotalL+1));
      }
   }

   Mem.Free(pFmT);
}

// write (2*la+1) x (2*lc+1) x nCoA x nCoC matrix to final destination.
static void Scatter2e2c(double *IR_RP pOut, size_t StrideA, size_t StrideC,
   double const *IR_RP pIn, size_t la, size_t lc, size_t nComp, size_t nCoA, size_t nCoC, bool Add)
{
   size_t nShA = 2*la+1, nShC = 2*lc+1;
   if ( Add ) {
      for (size_t iCoC = 0; iCoC < nCoC; ++ iCoC)
         for (size_t iCoA = 0; iCoA < nCoA; ++ iCoA)
            for (size_t iShC = 0; iShC < nShC; ++ iShC)
               for (size_t iShA = 0; iShA < nShA; ++ iShA)
                  pOut[(iShA + nShA*iCoA)*StrideA + (iShC + nShC*iCoC)*StrideC]
                     += pIn[iShA + nShA * (iShC + nShC * nComp * (iCoA + nCoA * iCoC))];
   } else {
      for (size_t iCoC = 0; iCoC < nCoC; ++ iCoC)
         for (size_t iCoA = 0; iCoA < nCoA; ++ iCoA)
            for (size_t iShC = 0; iShC < nShC; ++ iShC)
               for (size_t iShA = 0; iShA < nShA; ++ iShA)
                  pOut[(iShA + nShA*iCoA)*StrideA + (iShC + nShC*iCoC)*StrideC]
                      = pIn[iShA + nShA * (iShC + nShC * nComp *(iCoA + nCoA * iCoC))];
   }
}

// Forms [SlmY(TotalLab)] x nCoA x nCoC.
// Outputs: pOutR receives a newly allocated array of nCartY(TotalLab) entries
// per contraction pair, and TotalCo the number of contraction pairs
// (pA->nCo * pC->nCo).
// Allocates memory on Mem which is not released here. pOutR is the first of
// these allocations; the caller is expected to free pOutR.
// NOTE(review): presumably Mem.Free(pOutR) then also releases the two later
// allocations made here -- confirm against FMemoryStack.
// Only LaplaceOrder 0 and 1 are supported (asserted).
static void Int2e2c_EvalCoShY(double *&pOutR, unsigned &TotalCo, FRawShell const *pA, FRawShell const *pC, double Prefactor,
   unsigned TotalLab, unsigned LaplaceOrder, FIntegralKernel const *pKernel, FMemoryStack &Mem)
{
   FVec3
      R; // pA->vCen - pC->vCen
   SubVec3(R, pA->vCen, pC->vCen);
   TotalCo = pA->nCo * pC->nCo;

   // each laplace operator requires two additional derivative orders.
   unsigned const
      TotalL = TotalLab + 2 * LaplaceOrder;
   double
      *pCoFmT, // contracted kernels, (TotalL+1) per contraction pair.
      *pDataR_LapC; // scratch for the laplace case.
   Mem.Alloc(pOutR, nCartY(TotalLab) * TotalCo);
   Mem.Alloc(pDataR_LapC, nCartY(TotalL));

   // the kernel routine accumulates into its output: start from zero.
   Mem.ClearAlloc(pCoFmT, (TotalL+1) * TotalCo);
   Int2e2c_EvalCoKernels(pCoFmT, TotalL, pA, pC, Prefactor, pKernel, Mem);

   for (uint iCoC = 0; iCoC < pC->nCo; ++ iCoC) {
      for (uint iCoA = 0; iCoA < pA->nCo; ++ iCoA) {
         // note: if skipping stuff here due to screening, the output must
         // be wiped unless Add == true!
         double
            *pFmT = &pCoFmT[(TotalL+1)*(iCoA + pA->nCo*iCoC)],
            *pDataR_ = &pOutR[nCartY(TotalLab) * (iCoA + pA->nCo*iCoC)];
         if (LaplaceOrder != 0) {
            assert(LaplaceOrder == 1);
            ShellMdrr(pDataR_LapC, pFmT, R[0], R[1], R[2], TotalL);
            // note: a simple way of getting higher laplace derivatives is to
            // just apply ShellLaplace function multiple times.
            // At this moment this is not implemented since it is not required.
            ShellLaplace(pDataR_, pDataR_LapC, LaplaceOrder, TotalL - 2);
         } else {
            // no laplace operator: write directly to the output.
            ShellMdrr(pDataR_, pFmT, R[0], R[1], R[2], TotalL);
         }
      }
   }
}


// evaluate 2-electron 2-center integrals <a|krn * laplace^LaplaceOrder|c>
// note: to obtain the kinetic energy operator, pass an overlap kernel
//       and supply -.5 as Prefactor (ekin = -.5 laplace).
// if add is given: increment the output instead of overwriting it.
void EvalInt2e2c_LaplaceC( double *pOut, size_t StrideA, size_t StrideC,
    FRawShell const *pA, FRawShell const *pC, double Prefactor, bool Add,
    unsigned LaplaceOrder, FIntegralKernel const *pKernel, FMemoryStack &Mem )
{
//    if (pA->l > pC->l) {
//       // ^- FIXME: isn't this the wrong way around? Note also that the function works fine
//       //    with either order... this is just a question of efficiency in ShTrA_YY. Need to look
//       //    up which version is better (factor large la first or factor small la first).
//       std::swap(pA, pC);
//       std::swap(StrideA, StrideC);
//    }
   uint
      lc = pC->l, la = pA->l,
      TotalCo;
   double
      *pDataR, *pR1, *pFinal;
   Int2e2c_EvalCoShY(pDataR, TotalCo, pA, pC, Prefactor, la + lc, LaplaceOrder, pKernel, Mem);

   Mem.Alloc(pR1, nCartY(la)*(2*lc+1) * TotalCo);
   Mem.Alloc(pFinal, (2*la+1)*(2*lc+1) * TotalCo);

   ShTrA_YY(pR1, pDataR, lc, (la + lc), TotalCo);
   ShTrA_YY(pFinal, pR1, la, la, (2*lc + 1)*TotalCo);
   // now: (2*la+1) x (2*lc+1) x nCoA x nCoC
   Scatter2e2c(pOut, StrideA, StrideC, pFinal, la, lc, 1, pA->nCo, pC->nCo, Add);

   Mem.Free(pDataR);

   // ^- note: this is a very unconventional way of evaluating integrals
   // (precontracted kernels, *no* OsrrC/HRR type recursion, etc). A few years
   // ago I tried hard to make this work for 3/4-c integrals too (would be the
   // coolest integral core ever), but I do not think it can be done for more
   // than two-center integrals. It *can*, however, be done for three- and four
   // /index/ integrals, as long as in (ab|cd) both a/b and c/d, respectively,
   // sit on the same /center/. Such integrals are required in semi-empirical
   // methods like AM1/PM3/DFTB/etc, and I think this is a much better way of
   // doing them than the Slater-Koster tables used in these fields.
}

// void EvalInt2e2c_LaplaceC( double *pOut, size_t StrideA, size_t StrideC,
//     FRawShell const *pA, FRawShell const *pC, double Prefactor, bool Add,
//     unsigned LaplaceOrder, FIntegralKernel const *pKernel, FMemoryStack &Mem )
// {
//    if (pA->l > pC->l) { // <- isn't this the wrong way around?
//       std::swap(pA, pC);
//       std::swap(StrideA, StrideC);
//    }
//    FVec3
//       R;
//    SubVec3(R, pA->vCen, pC->vCen);
//    uint
//       lc = pC->l, la = pA->l,
//       TotalL = la + lc + 2*LaplaceOrder,
//       TotalCo = pA->nCo * pC->nCo;
//    double
//       *pCoFmT, *pDataR_LapC, *pDataR, *pR1, *pFinal;
//    Mem.ClearAlloc(pCoFmT, (TotalL+1) * TotalCo);
//    Int2e2c_EvalCoKernels(pCoFmT, TotalL, pA, pC, Prefactor, pKernel, Mem);
//
//    Mem.Alloc(pDataR_LapC, nCartY(TotalL));
//    Mem.Alloc(pDataR, nCartY(la+lc) * TotalCo);
//    Mem.Alloc(pR1, nCartY(la)*(2*lc+1) * TotalCo);
//    Mem.Alloc(pFinal, (2*la+1)*(2*lc+1) * TotalCo);
//
//    for (uint iCoC = 0; iCoC < pC->nCo; ++ iCoC)
//    for (uint iCoA = 0; iCoA < pA->nCo; ++ iCoA) {
//       // note: if skipping stuff here due to screening, the output must
//       // be wiped unless Add == true!
//       double
//          *pFmT = &pCoFmT[(TotalL+1)*(iCoA + pA->nCo*iCoC)],
//          *pDataR_ = &pDataR[nCartY(la+lc) * (iCoA + pA->nCo*iCoC)];
//       if (LaplaceOrder == 0)
//          ShellMdrr(pDataR_, pFmT, R[0], R[1], R[2], TotalL);
//       else {
//          ShellMdrr(pDataR_LapC, pFmT, R[0], R[1], R[2], TotalL);
//          // note: a simple way of getting higher derivatives is to
//          // just apply this function multiple times.
//          ShellLaplace(pDataR_, pDataR_LapC, LaplaceOrder, la+lc);
//       }
//    }
//    ShTrA_YY(pR1, pDataR, lc, (la + lc), TotalCo);
//    ShTrA_YY(pFinal, pR1, la, la, (2*lc + 1)*TotalCo);
//    // now: (2*la+1) x (2*lc+1) x nCoA x nCoC
//    Scatter2e2c(pOut, StrideA, StrideC, pFinal, la, lc, 1, pA->nCo, pC->nCo, Add);
//
//    Mem.Free(pCoFmT);
//
//    // ^- note: this is a very unconventional way of evaluating integrals
//    // (precontracted kernels, *no* OsrrC/HRR type recursion, etc). A few years
//    // ago I tried hard to make this work for 3/4-c integrals too (would be the
//    // coolest integral core ever), but I do not think it can be done for more
//    // than two-center integrals. It *can*, however, be done for three- and four
//    // /index/ integrals, as long as in (ab|cd) both a/b and c/d, respectively,
//    // sit on the same /center/. Such integrals are required in semi-empirical
//    // methods like AM1/PM3/DFTB/etc, and I think this is a much better way of
//    // doing them than the Slater-Koster tables used in these fields.
// }



// evaluate 1st derivative of 2-electron 2-center integrals <a|krn * laplace|c>
// note: to obtain the kinetic energy operator, pass an overlap kernel
//       and supply -.5 as Prefactor (ekin = -.5 laplace).
// if add is given: increment the output instead of overwriting it.
void EvalInt2e2c1d_LaplaceC( double *pOutAxyz, double *pOutCxyz, size_t StrideA, size_t StrideC, size_t StrideDeriv,
    FRawShell const *pA, FRawShell const *pC, double Prefactor, bool Add,
    unsigned LaplaceOrder, FIntegralKernel const *pKernel, FMemoryStack &Mem )
{
//    if (pA->l > pC->l) {
//       // ^- FIXME: isn't this the wrong way around? Note also that the function works fine
//       //    with either order... this is just a question of efficiency in ShTrA_YY. Need to look
//       //    up which version is better (factor large la first or factor small la first).
//       std::swap(pA, pC);
//       std::swap(StrideA, StrideC);
//       std::swap(pOutAxyz, pOutCxyz);
//    }

   uint
      lc = pC->l, la = pA->l,
      TotalCo;
   double
      *pDataR, *pDataR_xyz, *pR1, *pFinal;
   Int2e2c_EvalCoShY(pDataR, TotalCo, pA, pC, -1.0 * Prefactor, la + lc + 1, LaplaceOrder, pKernel, Mem);

   // factor out derivative components.
   Mem.Alloc(pDataR_xyz, nCartY(la+lc) * TotalCo * 3);
   // and now SlmY(la) and SlmY(lb).
   Mem.Alloc(pR1, nCartY(la)*(2*lc+1) * TotalCo * 3);
   Mem.Alloc(pFinal, (2*la+1)*(2*lc+1) * TotalCo * 3);

   // now: nCartY(la+lc+1) x nCoA x nCoC
   ShTrA_YY(pDataR_xyz, pDataR, 1, (la + lc + 1), TotalCo);

   // now: nCartY(la+lc) x 3 x nCoA x nCoC
   ShTrA_YY(pR1, pDataR_xyz, lc, (la + lc), TotalCo*3);
   ShTrA_YY(pFinal, pR1, la, la, (2*lc + 1)*TotalCo*3);
   // now: (2*la+1) x (2*lc+1) x 3 x nCoA x nCoC

//    IrPrintMatrixGen(std::cout, pDataR_xyz, nCartY(la+lc+1), 1, TotalCo, nCartY(la+lc+1), "2e2c1d nCartY(la+lc+1) x TotalCo");
//    IrPrintMatrixGen(std::cout, pFinal, pA->nCo, 1, 3*pC->nCo, pA->nCo, "2e2c1d output");
   size_t
      iXyzOff = (2*la+1)*(2*lc+1);
   if (pOutAxyz) {
      for (uint ixyz = 0; ixyz < 3; ++ ixyz)
         // note: this stores only one of the components (deriv wrt. A. Other would be -1 * the input derivative).
         // could this this my simply multiplying *pFinal with -1. and then going through Scatter2e2c again.
         Scatter2e2c(&pOutAxyz[StrideDeriv * ixyz], StrideA, StrideC, &pFinal[ixyz * iXyzOff], la, lc, 3, pA->nCo, pC->nCo, Add);
   }
   if (pOutCxyz) {
      for (uint i = 0; i < 3*(2*la+1)*(2*lc+1)*TotalCo; ++ i)
         pFinal[i] *= -1.;
      for (uint ixyz = 0; ixyz < 3; ++ ixyz)
         // note: this stores only one of the components (deriv wrt. A. Other would be -1 * the input derivative).
         // could this this my simply multiplying *pFinal with -1. and then going through Scatter2e2c again.
         Scatter2e2c(&pOutCxyz[StrideDeriv * ixyz], StrideA, StrideC, &pFinal[ixyz * iXyzOff], la, lc, 3, pA->nCo, pC->nCo, Add);
   }

   Mem.Free(pDataR);
}




void EvalInt2e2c( double *pOut, size_t StrideA, size_t StrideC,
    FRawShell const *pA, FRawShell const *pC, double Prefactor, bool Add,
    FIntegralKernel const *pKernel, FMemoryStack &Mem )
{
   if (1) {
      return EvalInt2e2c_LaplaceC(pOut, StrideA, StrideC, pA, pC,
         Prefactor, Add, 0, pKernel, Mem);
   } else {
      assert(!Add);
      double ExpB = 0., CoB = 1., CenB[3] = {0., 0., 0.};
      FRawShell
         ShB(0, &ExpB, 1, &CoB, 1, &CenB[0], 0);
      size_t Strides3[3] = {StrideA, 1, StrideC};
      return EvalInt2e3c(pOut, &Strides3[0], pA, &ShB, pC, 1, Prefactor, pKernel, Mem);
   }
}




} // namespace ir
