diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index dbedfa7c49..575071dee7 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -99,25 +99,25 @@ const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 
 // s,sp,c,spc,lc
 
-template<typename vtype> using iSinglet                   = iScalar<iScalar<iScalar<vtype> > >;
-template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iScalar<vtype>, Ns> >;
-template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
-template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
-template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ;
-template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
-template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
-template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
-template<typename vtype> using iSpinColourVector          = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
-template<typename vtype> using iHalfSpinVector            = iScalar<iVector<iScalar<vtype>, Nhs> >;
-template<typename vtype> using iHalfSpinColourVector      = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
-    template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
-
-
-template<typename vtype> using iGparityFlavourVector                = iVector<iScalar<iScalar<vtype> >, Ngp>;
-template<typename vtype> using iGparitySpinColourVector       = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
-template<typename vtype> using iGparityHalfSpinColourVector   = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
-template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
+template<typename vtype> using iSinglet                     = iScalar<iScalar<iScalar<vtype> > >;
+template<typename vtype> using iSpinMatrix                  = iScalar<iMatrix<iScalar<vtype>, Ns> >;
+template<typename vtype> using iColourMatrix                = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
+template<typename vtype> using iSpinColourMatrix            = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
+template<typename vtype> using iLorentzColourMatrix         = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
+template<typename vtype> using iLorentzComplex              = iVector<iScalar<iScalar<vtype> >, Nd > ;
+template<typename vtype> using iDoubleStoredColourMatrix    = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
+template<typename vtype> using iSpinVector                  = iScalar<iVector<iScalar<vtype>, Ns> >;
+template<typename vtype> using iColourVector                = iScalar<iScalar<iVector<vtype, Nc> > >;
+template<typename vtype> using iSpinColourVector            = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
+template<typename vtype> using iHalfSpinVector              = iScalar<iVector<iScalar<vtype>, Nhs> >;
+template<typename vtype> using iHalfSpinColourVector        = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
+template<typename vtype> using iSpinColourSpinColourMatrix  = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
+
+
+template<typename vtype> using iGparityFlavourVector        = iVector<iScalar<iScalar<vtype> >, Ngp>;
+template<typename vtype> using iGparitySpinColourVector     = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
+template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparityFlavourMatrix        = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 
 // Spin matrix
 typedef iSpinMatrix<Complex  >          SpinMatrix;
diff --git a/Grid/qcd/action/fermion/StaggeredImpl.h b/Grid/qcd/action/fermion/StaggeredImpl.h
index f44d12f4f8..01c516b7b3 100644
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -163,7 +163,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
 
   inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
     GaugeLinkField link(mat.Grid());
-    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+    link = outerProduct(Btilde,A); 
     PokeIndex<LorentzIndex>(mat,link,mu);
   }   
       
diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h
index e98e9b879d..a55f8d4a71 100644
--- a/Grid/qcd/smearing/HISQSmearing.h
+++ b/Grid/qcd/smearing/HISQSmearing.h
@@ -27,7 +27,7 @@ directory
 *************************************************************************************/
 /*
     @file HISQSmearing.h
-    @brief Declares classes related to HISQ smearing 
+    @brief Declares classes related to the HISQ action 
 */
 
 
@@ -35,42 +35,19 @@ directory
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
-
-
 NAMESPACE_BEGIN(Grid);
 
 
-// TODO: find a way to fold this into the stencil header. need to access grid to get
-// Nd, since you don't want to inherit from QCD.h
-/*!  @brief append arbitrary shift path to shifts */
-template<typename... Args>
-void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) {
-    Coordinate shift(Nd,0);
-    generalShift(shift, dir, args...); 
-    // push_back creates an element at the end of shifts and
-    // assigns the data in the argument to it.
-    shifts.push_back(shift);
-}
-
-
-/*!  @brief figure out the stencil index from mu and nu */
-accelerator_inline int stencilIndex(int mu, int nu) {
-    // Nshifts depends on how you built the stencil
-    int Nshifts = 6;
-    return Nshifts*nu + Nd*Nshifts*mu;
-}
-
-
-/*!  @brief structure holding the link treatment */
-struct HISQSmearingParameters{
-    HISQSmearingParameters(){}
-    Real c_1;               // 1 link
-    Real c_naik;            // Naik term
-    Real c_3;               // 3 link
-    Real c_5;               // 5 link
-    Real c_7;               // 7 link
-    Real c_lp;              // 5 link Lepage
-    HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
+/*!  @brief structure holding the link treatment for a given smear */
+template<class floatT>
+struct SmearingParameters{
+    floatT c_1;               // 1 link
+    floatT c_naik;            // Naik term
+    floatT c_3;               // 3 link
+    floatT c_5;               // 5 link
+    floatT c_7;               // 7 link
+    floatT c_lp;              // Lepage
+    SmearingParameters(floatT c1, floatT cnaik, floatT c3, floatT c5, floatT c7, floatT clp) 
         : c_1(c1),
           c_naik(cnaik),
           c_3(c3),
@@ -80,252 +57,572 @@ struct HISQSmearingParameters{
 };
 
 
-/*!  @brief create fat links from link variables */
+// There are 6 quarks in nature, and 3 never need a Naik epsilon
+int const GRID_MAX_NAIK = 3;
+
+
+/*!  @brief structure holding all input parameters related to the HISQ action */
+template<class floatT>
+struct HISQParameters{
+    // Structure from QOP/QDP 
+    int n_naiks;
+    std::array<floatT,GRID_MAX_NAIK> eps_naiks;
+    floatT fat7_c1  ; floatT fat7_c3  ; floatT fat7_c5  ; floatT fat7_c7  ; floatT fat7_clp;
+    floatT asqtad_c1; floatT asqtad_c3; floatT asqtad_c5; floatT asqtad_c7; floatT asqtad_clp; floatT asqtad_cnaik;
+    floatT diff_c1  ; floatT diff_cnaik;
+    HISQParameters(int n_naiks_in, std::array<floatT,GRID_MAX_NAIK> eps_naiks_in, 
+                   floatT fat7_one_link  , floatT fat7_three_staple  , floatT fat7_five_staple  , floatT fat7_seven_staple  , floatT fat7_lepage, 
+                   floatT asqtad_one_link, floatT asqtad_three_staple, floatT asqtad_five_staple, floatT asqtad_seven_staple, floatT asqtad_lepage,
+                   floatT asqtad_naik    , floatT difference_one_link, floatT difference_naik) 
+        : n_naiks(n_naiks_in),
+          eps_naiks(eps_naiks_in), 
+          fat7_c1(fat7_one_link),
+          fat7_c3(fat7_three_staple),
+          fat7_c5(fat7_five_staple),
+          fat7_c7(fat7_seven_staple),
+          fat7_clp(fat7_lepage),
+          asqtad_c1(asqtad_one_link),
+          asqtad_c3(asqtad_three_staple),
+          asqtad_c5(asqtad_five_staple),
+          asqtad_c7(asqtad_seven_staple),
+          asqtad_clp(asqtad_lepage),
+          asqtad_cnaik(asqtad_naik),
+          diff_c1(difference_one_link),
+          diff_cnaik(difference_naik) {}
+};
+
+
+/*!  @brief Sometimes in the U(3) projection we use SVD cuts; here we collect related parameters */
+template<class floatT>
+struct HISQReunitSVDParameters{
+    // Structure from QOP/QDP 
+    bool   allow_svd;
+    bool   svd_only;
+    floatT svd_rel_error;
+    floatT svd_abs_error;
+    floatT force_filter;
+    HISQReunitSVDParameters(bool allow, bool only, floatT rel, floatT abs, floatT filter)
+        : allow_svd(allow),
+          svd_only(only),
+          svd_rel_error(rel),
+          svd_abs_error(abs),
+          force_filter(filter) {}
+};
+
+
+/*!  @brief Get the link U_mu(x).  */
+template<class link> accelerator_inline
+auto getLink(const link& __restrict__ U, GeneralStencilEntry* x, int mu) {
+    return coalescedReadGeneralPermute(U[x->_offset](mu), x->_permute, Nd); 
+}
+#define setLink coalescedWrite 
+
+
+/*! @brief Figure out the stencil index from mu and nu. */
+accelerator_inline 
+int HISQStencilIndex(int mu, int nu, int rho=0, int sig=0, std::string kind="3STAPLE") {
+    int res;
+    if (kind=="3STAPLE") 
+        res = 5*(nu + Nd*mu);
+    else if (kind=="5STAPLE")
+        res = 17*(rho + Nd*nu + Nd*Nd*mu);
+    else if (kind=="7STAPLE")
+        // seems correct
+        res = 46*(sig + Nd*rho + Nd*Nd*nu + Nd*Nd*Nd*mu);
+    else 
+        Grid_error("Unknown staple kind",kind);
+    return res;
+}
+
+
+/*! @brief Create various stencils needed for HISQ calculations */ 
+inline
+std::vector<Coordinate> createHISQStencil(std::string kind="3STAPLE") {
+    std::vector<Coordinate> shifts;
+    // We allow nu=mu and rho=nu, rho=mu to make indexing easier, but these
+    // entries will not be used.
+    if (kind=="3STAPLE") {
+        for(int mu=0;mu<Nd;mu++)
+        for(int nu=0;nu<Nd;nu++) {
+            appendShift<Nd>(shifts,mu);
+            appendShift<Nd>(shifts,nu);
+            appendShift<Nd>(shifts,shiftSignal::NO_SHIFT);
+            appendShift<Nd>(shifts,mu,Back(nu));
+            appendShift<Nd>(shifts,Back(nu));
+        }
+    } else if (kind=="5STAPLE") {
+        for(int mu =0;mu <Nd;mu++)
+        for(int nu =0;nu <Nd;nu++) 
+        for(int rho=0;rho<Nd;rho++) {
+            appendShift<Nd>(shifts,nu,Back(rho));
+            appendShift<Nd>(shifts,nu);
+            appendShift<Nd>(shifts,Back(rho));
+            appendShift<Nd>(shifts,shiftSignal::NO_SHIFT);
+            appendShift<Nd>(shifts,rho);
+            appendShift<Nd>(shifts,Back(nu),Back(rho));
+            appendShift<Nd>(shifts,Back(nu));
+            appendShift<Nd>(shifts,Back(nu),rho);
+            appendShift<Nd>(shifts,mu,nu,Back(rho));
+            appendShift<Nd>(shifts,mu,nu);
+            appendShift<Nd>(shifts,mu,Back(rho));
+            appendShift<Nd>(shifts,mu);
+            appendShift<Nd>(shifts,mu,rho);
+            appendShift<Nd>(shifts,mu,Back(nu),Back(rho));
+            appendShift<Nd>(shifts,mu,Back(nu));
+            appendShift<Nd>(shifts,mu,Back(nu),rho);
+            appendShift<Nd>(shifts,nu,rho);
+        }
+    } else if (kind=="7STAPLE") {
+        for(int mu =0;mu <Nd;mu++)
+        for(int nu =0;nu <Nd;nu++) 
+        for(int rho=0;rho<Nd;rho++)
+        for(int sig=0;sig<Nd;sig++) {
+            appendShift<Nd>(shifts,shiftSignal::NO_SHIFT);
+            appendShift<Nd>(shifts,mu);
+            appendShift<Nd>(shifts,mu,nu);
+            appendShift<Nd>(shifts,mu,nu,rho);
+            appendShift<Nd>(shifts,mu,nu,rho,Back(sig));
+            appendShift<Nd>(shifts,mu,nu,Back(rho));
+            appendShift<Nd>(shifts,mu,nu,Back(rho),Back(sig));
+            appendShift<Nd>(shifts,mu,nu,Back(sig));
+            appendShift<Nd>(shifts,mu,Back(nu));
+            appendShift<Nd>(shifts,mu,Back(nu),rho);
+            appendShift<Nd>(shifts,mu,Back(nu),rho,sig);
+            appendShift<Nd>(shifts,mu,Back(nu),rho,Back(sig));
+            appendShift<Nd>(shifts,mu,Back(nu),Back(rho));
+            appendShift<Nd>(shifts,mu,Back(nu),Back(rho),sig);
+            appendShift<Nd>(shifts,mu,Back(nu),Back(rho),Back(sig));
+            appendShift<Nd>(shifts,mu,Back(nu),sig);
+            appendShift<Nd>(shifts,mu,Back(nu),Back(sig));
+            appendShift<Nd>(shifts,mu,rho);
+            appendShift<Nd>(shifts,mu,rho,sig);
+            appendShift<Nd>(shifts,mu,rho,Back(sig));
+            appendShift<Nd>(shifts,mu,Back(rho));
+            appendShift<Nd>(shifts,mu,Back(rho),sig);
+            appendShift<Nd>(shifts,mu,Back(rho),Back(sig));
+            appendShift<Nd>(shifts,mu,sig);
+            appendShift<Nd>(shifts,mu,Back(sig));
+            appendShift<Nd>(shifts,nu);
+            appendShift<Nd>(shifts,nu,rho);
+            appendShift<Nd>(shifts,nu,rho,sig);
+            appendShift<Nd>(shifts,nu,rho,Back(sig));
+            appendShift<Nd>(shifts,nu,Back(rho));
+            appendShift<Nd>(shifts,nu,Back(rho),sig);
+            appendShift<Nd>(shifts,nu,Back(rho),Back(sig));
+            appendShift<Nd>(shifts,rho);
+            appendShift<Nd>(shifts,rho,Back(nu));
+            appendShift<Nd>(shifts,rho,sig);
+            appendShift<Nd>(shifts,rho,Back(sig));
+            appendShift<Nd>(shifts,Back(nu));
+            appendShift<Nd>(shifts,Back(nu),rho);
+            appendShift<Nd>(shifts,Back(nu),rho,sig);
+            appendShift<Nd>(shifts,Back(nu),rho,Back(sig));
+            appendShift<Nd>(shifts,Back(nu),Back(rho));
+            appendShift<Nd>(shifts,Back(nu),Back(rho),sig);
+            appendShift<Nd>(shifts,Back(nu),Back(rho),Back(sig));
+            appendShift<Nd>(shifts,Back(rho));
+            appendShift<Nd>(shifts,Back(rho),sig);
+            appendShift<Nd>(shifts,Back(rho),Back(sig));
+        }
+    } else { 
+        Grid_error("Unknown staple kind",kind);
+    }
+    return shifts;
+}
+
+
+/*! @brief Retrieve 3-link stencil entries. */
+template<class acc> accelerator_inline
+std::tuple<GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*> 
+get3StaplePoints(acc sView, int sIndex, int site) {
+    GeneralStencilEntry* x_p_mu      = sView.GetEntry(sIndex+0,site);
+    GeneralStencilEntry* x_p_nu      = sView.GetEntry(sIndex+1,site);
+    GeneralStencilEntry* x           = sView.GetEntry(sIndex+2,site);
+    GeneralStencilEntry* x_p_mu_m_nu = sView.GetEntry(sIndex+3,site);
+    GeneralStencilEntry* x_m_nu      = sView.GetEntry(sIndex+4,site);
+    return { x_p_mu, x_p_nu, x, x_p_mu_m_nu, x_m_nu };
+}
+
+
+/*! @brief Retrieve 5-link stencil entries. */
+template<class acc> accelerator_inline
+std::tuple<GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*>
+get5StaplePoints(acc sView, int sIndex, int site) {
+    GeneralStencilEntry* x_p_nu_m_rho      = sView.GetEntry(sIndex+0 ,site);
+    GeneralStencilEntry* x_p_nu            = sView.GetEntry(sIndex+1 ,site);
+    GeneralStencilEntry* x_m_rho           = sView.GetEntry(sIndex+2 ,site);
+    GeneralStencilEntry* x                 = sView.GetEntry(sIndex+3 ,site);
+    GeneralStencilEntry* x_p_rho           = sView.GetEntry(sIndex+4 ,site);
+    GeneralStencilEntry* x_m_nu_m_rho      = sView.GetEntry(sIndex+5 ,site);
+    GeneralStencilEntry* x_m_nu            = sView.GetEntry(sIndex+6 ,site);
+    GeneralStencilEntry* x_m_nu_p_rho      = sView.GetEntry(sIndex+7 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu_m_rho = sView.GetEntry(sIndex+8 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu       = sView.GetEntry(sIndex+9 ,site);
+    GeneralStencilEntry* x_p_mu_m_rho      = sView.GetEntry(sIndex+10,site);
+    GeneralStencilEntry* x_p_mu            = sView.GetEntry(sIndex+11,site);
+    GeneralStencilEntry* x_p_mu_p_rho      = sView.GetEntry(sIndex+12,site);
+    GeneralStencilEntry* x_p_mu_m_nu_m_rho = sView.GetEntry(sIndex+13,site);
+    GeneralStencilEntry* x_p_mu_m_nu       = sView.GetEntry(sIndex+14,site);
+    GeneralStencilEntry* x_p_mu_m_nu_p_rho = sView.GetEntry(sIndex+15,site);
+    GeneralStencilEntry* x_p_nu_p_rho      = sView.GetEntry(sIndex+16,site);
+    return {x_p_nu_m_rho     , x_p_nu           , x_m_rho          , 
+            x                , x_p_rho          , x_m_nu_m_rho     , 
+            x_m_nu           , x_m_nu_p_rho     , x_p_mu_p_nu_m_rho, 
+            x_p_mu_p_nu      , x_p_mu_m_rho     , x_p_mu           ,
+            x_p_mu_p_rho     , x_p_mu_m_nu_m_rho, x_p_mu_m_nu      , 
+            x_p_mu_m_nu_p_rho, x_p_nu_p_rho}; 
+}
+
+
+/*! @brief Retrieve 7-link stencil entries. */
+template<class acc> accelerator_inline
+std::tuple<GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,GeneralStencilEntry*,
+           GeneralStencilEntry*,GeneralStencilEntry*>
+get7StaplePoints(acc sView, int sIndex, int site) {
+    GeneralStencilEntry* x                       = sView.GetEntry(sIndex+0 ,site); 
+    GeneralStencilEntry* x_p_mu                  = sView.GetEntry(sIndex+1 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu             = sView.GetEntry(sIndex+2 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu_p_rho       = sView.GetEntry(sIndex+3 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu_p_rho_m_sig = sView.GetEntry(sIndex+4 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu_m_rho       = sView.GetEntry(sIndex+5 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu_m_rho_m_sig = sView.GetEntry(sIndex+6 ,site);
+    GeneralStencilEntry* x_p_mu_p_nu_m_sig       = sView.GetEntry(sIndex+7 ,site);
+    GeneralStencilEntry* x_p_mu_m_nu             = sView.GetEntry(sIndex+8 ,site);
+    GeneralStencilEntry* x_p_mu_m_nu_p_rho       = sView.GetEntry(sIndex+9 ,site);
+    GeneralStencilEntry* x_p_mu_m_nu_p_rho_p_sig = sView.GetEntry(sIndex+10,site);
+    GeneralStencilEntry* x_p_mu_m_nu_p_rho_m_sig = sView.GetEntry(sIndex+11,site);
+    GeneralStencilEntry* x_p_mu_m_nu_m_rho       = sView.GetEntry(sIndex+12,site);
+    GeneralStencilEntry* x_p_mu_m_nu_m_rho_p_sig = sView.GetEntry(sIndex+13,site);
+    GeneralStencilEntry* x_p_mu_m_nu_m_rho_m_sig = sView.GetEntry(sIndex+14,site);
+    GeneralStencilEntry* x_p_mu_m_nu_p_sig       = sView.GetEntry(sIndex+15,site);
+    GeneralStencilEntry* x_p_mu_m_nu_m_sig       = sView.GetEntry(sIndex+16,site);
+    GeneralStencilEntry* x_p_mu_p_rho            = sView.GetEntry(sIndex+17,site);
+    GeneralStencilEntry* x_p_mu_p_rho_p_sig      = sView.GetEntry(sIndex+18,site);
+    GeneralStencilEntry* x_p_mu_p_rho_m_sig      = sView.GetEntry(sIndex+19,site);
+    GeneralStencilEntry* x_p_mu_m_rho            = sView.GetEntry(sIndex+20,site);
+    GeneralStencilEntry* x_p_mu_m_rho_p_sig      = sView.GetEntry(sIndex+21,site);
+    GeneralStencilEntry* x_p_mu_m_rho_m_sig      = sView.GetEntry(sIndex+22,site);
+    GeneralStencilEntry* x_p_mu_p_sig            = sView.GetEntry(sIndex+23,site); 
+    GeneralStencilEntry* x_p_mu_m_sig            = sView.GetEntry(sIndex+24,site);
+    GeneralStencilEntry* x_p_nu                  = sView.GetEntry(sIndex+25,site);
+    GeneralStencilEntry* x_p_nu_p_rho            = sView.GetEntry(sIndex+26,site);
+    GeneralStencilEntry* x_p_nu_p_rho_p_sig      = sView.GetEntry(sIndex+27,site);
+    GeneralStencilEntry* x_p_nu_p_rho_m_sig      = sView.GetEntry(sIndex+28,site);
+    GeneralStencilEntry* x_p_nu_m_rho            = sView.GetEntry(sIndex+29,site);
+    GeneralStencilEntry* x_p_nu_m_rho_p_sig      = sView.GetEntry(sIndex+30,site);
+    GeneralStencilEntry* x_p_nu_m_rho_m_sig      = sView.GetEntry(sIndex+31,site);
+    GeneralStencilEntry* x_p_rho                 = sView.GetEntry(sIndex+32,site);
+    GeneralStencilEntry* x_p_rho_m_nu            = sView.GetEntry(sIndex+33,site);
+    GeneralStencilEntry* x_p_rho_p_sig           = sView.GetEntry(sIndex+34,site);
+    GeneralStencilEntry* x_p_rho_m_sig           = sView.GetEntry(sIndex+35,site);
+    GeneralStencilEntry* x_m_nu                  = sView.GetEntry(sIndex+36,site);
+    GeneralStencilEntry* x_m_nu_p_rho            = sView.GetEntry(sIndex+37,site);
+    GeneralStencilEntry* x_m_nu_p_rho_p_sig      = sView.GetEntry(sIndex+38,site);
+    GeneralStencilEntry* x_m_nu_p_rho_m_sig      = sView.GetEntry(sIndex+39,site);
+    GeneralStencilEntry* x_m_nu_m_rho            = sView.GetEntry(sIndex+40,site);
+    GeneralStencilEntry* x_m_nu_m_rho_p_sig      = sView.GetEntry(sIndex+41,site);
+    GeneralStencilEntry* x_m_nu_m_rho_m_sig      = sView.GetEntry(sIndex+42,site);
+    GeneralStencilEntry* x_m_rho                 = sView.GetEntry(sIndex+43,site);
+    GeneralStencilEntry* x_m_rho_p_sig           = sView.GetEntry(sIndex+44,site);
+    GeneralStencilEntry* x_m_rho_m_sig           = sView.GetEntry(sIndex+45,site);
+    return {x                      , x_p_mu                 , x_p_mu_p_nu            , x_p_mu_p_nu_p_rho      ,
+            x_p_mu_p_nu_p_rho_m_sig, x_p_mu_p_nu_m_rho      , x_p_mu_p_nu_m_rho_m_sig, x_p_mu_p_nu_m_sig      ,
+            x_p_mu_m_nu            , x_p_mu_m_nu_p_rho      , x_p_mu_m_nu_p_rho_p_sig, x_p_mu_m_nu_p_rho_m_sig,
+            x_p_mu_m_nu_m_rho      , x_p_mu_m_nu_m_rho_p_sig, x_p_mu_m_nu_m_rho_m_sig, x_p_mu_m_nu_p_sig      ,
+            x_p_mu_m_nu_m_sig      , x_p_mu_p_rho           , x_p_mu_p_rho_p_sig     , x_p_mu_p_rho_m_sig     , 
+            x_p_mu_m_rho           , x_p_mu_m_rho_p_sig     , x_p_mu_m_rho_m_sig     , x_p_mu_p_sig           , 
+            x_p_mu_m_sig           , x_p_nu                 , x_p_nu_p_rho           , x_p_nu_p_rho_p_sig     , 
+            x_p_nu_p_rho_m_sig     , x_p_nu_m_rho           , x_p_nu_m_rho_p_sig     , x_p_nu_m_rho_m_sig     , 
+            x_p_rho                , x_p_rho_m_nu           , x_p_rho_p_sig          , x_p_rho_m_sig          , 
+            x_m_nu                 , x_m_nu_p_rho           , x_m_nu_p_rho_p_sig     , x_m_nu_p_rho_m_sig     , 
+            x_m_nu_m_rho           , x_m_nu_m_rho_p_sig     , x_m_nu_m_rho_m_sig     , x_m_rho                , 
+            x_m_rho_p_sig          , x_m_rho_m_sig};
+}
+
+
+/*!  @brief Allows for ASQTAD-like smearings. */
 template<class Gimpl> 
 class Smear_HISQ : public Gimpl {
+public:
 
-private:
     GridCartesian* const _grid;
-    HISQSmearingParameters _linkTreatment;
-
-public:
 
+    // Sort out the Gimpl. This handles BCs and part of the precision. 
     INHERIT_GIMPL_TYPES(Gimpl);
-    typedef typename Gimpl::GaugeField     GF;
-    typedef typename Gimpl::GaugeLinkField LF;
-    typedef typename Gimpl::ComplexField   CF;
+    typedef typename Gimpl::GaugeField      GF;
+    typedef typename Gimpl::GaugeLinkField  LF;
+    typedef typename Gimpl::ComplexField    CF;
+    typedef typename Gimpl::Scalar          ComplexScalar;
+    typedef decltype(real(ComplexScalar())) RealScalar;
+    typedef iColourMatrix<ComplexScalar>    ComplexColourMatrix;
+
+    RealScalar _Scut; // Cutoff for U(3) projection eigenvalues, set at initialization
+    int _HaloDepth=1; 
+
+    SmearingParameters<RealScalar> _linkTreatment;
+
+    void initialize() {
+        if (sizeof(RealScalar)==4) {
+            _Scut=1e-5; // Maybe should be higher? e.g. 1e-4
+        } else if (sizeof(RealScalar)==8) { 
+            _Scut=1e-8;
+        } else {
+            Grid_error("HISQ smearing only implemented for single and double");
+        }
+        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
+        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
+    }
 
-    // Don't allow default values here.
-    Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) 
+    Smear_HISQ(GridCartesian* grid, RealScalar c1, RealScalar cnaik, RealScalar c3, RealScalar c5, RealScalar c7, RealScalar clp) 
         : _grid(grid), 
           _linkTreatment(c1,cnaik,c3,c5,c7,clp) {
-        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
-        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
+        initialize();
     }
 
-    // Allow to pass a pointer to a C-style, double array for MILC convenience
+    // Allow to pass a pointer to a C-style array for MILC convenience
     Smear_HISQ(GridCartesian* grid, double* coeff) 
         : _grid(grid), 
           _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
-        assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
-        assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
+        initialize();
+    }
+    Smear_HISQ(GridCartesian* grid, float* coeff) 
+        : _grid(grid), 
+          _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
+        initialize();
     }
 
     ~Smear_HISQ() {}
 
-    // Intent: OUT--u_smr, u_naik
-    //          IN--u_thin
+
+    // Intent: OUT--U_3link (sum of left and right 3-staples attached to U)
+    //              U_fat (accmulates the fat smearing)
+    //          IN--U (thin links)
+    //              gStencil (3-link stencil)
+    //              mu
+    void threeLinkStaple(GF& U_fat, GF& U_3link, GF& U, GeneralLocalStencil gStencil, int mu) const {
+
+        SmearingParameters<RealScalar> lt = this->_linkTreatment;
+        autoView(U_v       , U       , AcceleratorRead);
+        autoView(U_fat_v   , U_fat   , AcceleratorWrite);
+        autoView(U_3link_v , U_3link , AcceleratorWrite);
+        auto gStencil_v = gStencil.View(AcceleratorRead); 
+        typedef decltype(getLink(U_v,gStencil_v.GetEntry(0,0),0)) U3matrix;
+        int Nsites = U_v.size();
+
+        accelerator_for(site,Nsites,Simd::Nsimd(),{
+            U3matrix res;
+            for(int nu=0;nu<Nd;nu++) {
+                if(nu==mu) continue;
+                int s = HISQStencilIndex(mu,nu);
+
+                auto [x_p_mu, x_p_nu, x, x_p_mu_m_nu, x_m_nu] = get3StaplePoints(gStencil_v,s,site);      
+
+                // When you're deciding whether to take an adjoint, the question is: how is the
+                // stored link oriented compared to the one you want? If I imagine myself traveling
+                // with the to-be-updated link, I have two possible, alternative 3-link paths I can
+                // take, one starting by going to the left, the other starting by going to the right.
+
+                //  "left" + "right"
+                res =       getLink(U_v,x,nu)       * getLink(U_v,x_p_nu,mu) * adj(getLink(U_v,x_p_mu,nu))
+                      + adj(getLink(U_v,x_m_nu,nu)) * getLink(U_v,x_m_nu,mu) *     getLink(U_v,x_p_mu_m_nu,nu);
+
+                // Save 3-link construct for later 
+                setLink(U_3link_v[x->_offset](nu), res);
+
+                // The index operator (x) returns the coalesced read on GPU. The view [] index returns 
+                // a reference to the vector object. The [x](mu) returns a reference to the densely 
+                // packed (contiguous in memory) mu-th element of the vector object. 
+                setLink(U_fat_v[x->_offset](mu), U_fat_v(x->_offset)(mu) + lt.c_3*res);
+            }
+        })
+        return;
+    }
+
+
+    // Intent: OUT--U_5link (sum of 5-staples attached to U) 
+    //              U_fat (accmulates the fat smearing)
+    //          IN--U (thin links)
+    //              U_3link (sum of left and right 3-staples attached to U)
+    //              gStencil (3-link stencil)
+    //              mu
+    //              updateFatLinks (in the force, you only want U_5link_v)
+    void fiveLinkStaple(GF& U_fat, GF& U_5linkA, GF& U_5linkB, GF& U_3link, 
+                        GF& U, GeneralLocalStencil gStencil, int mu) const {
+
+        SmearingParameters<RealScalar> lt = this->_linkTreatment;
+        autoView(U_v       , U       , AcceleratorRead);
+        autoView(U_fat_v   , U_fat   , AcceleratorWrite);
+        autoView(U_3link_v , U_3link , AcceleratorWrite);
+        autoView(U_5linkA_v, U_5linkA, AcceleratorWrite);
+        autoView(U_5linkB_v, U_5linkB, AcceleratorWrite);
+        auto gStencil_v = gStencil.View(AcceleratorRead); 
+        typedef decltype(getLink(U_v,gStencil_v.GetEntry(0,0),0)) U3matrix;
+        int Nsites = U_v.size();
+
+        accelerator_for(site,Nsites,Simd::Nsimd(),{
+            U3matrix res;
+            int sigmaIndex = 0;
+            for(int nu=0;nu<Nd;nu++) {
+                if(nu==mu) continue;
+                int s = HISQStencilIndex(mu,nu);
+                for(int rho=0;rho<Nd;rho++) {
+                    if (rho == mu || rho == nu) continue;
+
+                    auto [x_p_mu, x_p_nu, x, x_p_mu_m_nu, x_m_nu] = get3StaplePoints(gStencil_v,s,site);      
+
+                    res =       getLink(U_v,x,nu )      * getLink(U_3link_v,x_p_nu,rho) * adj(getLink(U_v,x_p_mu,nu))
+                          + adj(getLink(U_v,x_m_nu,nu)) * getLink(U_3link_v,x_m_nu,rho) *     getLink(U_v,x_p_mu_m_nu,nu);
+
+                    // Counting 3-link staples: there are three planes attached to the to-be-updated link,
+                    // which corresponds to three (forward+backward) staples. For the 5-link staples, for
+                    // each plane, there are two remaining directions, so that there are six 5-link staples
+                    // altogether. That will not fit in a single GaugeField object, so we use two. You can
+                    // think of sigmaIndex and rho together as being the labels that pick out a particular
+                    // 5-link staple. They therefore should not be interpreted as directions.
+                    if(sigmaIndex<3)
+                        setLink(U_5linkA_v[x->_offset](rho), res);
+                    else
+                        setLink(U_5linkB_v[x->_offset](rho), res);
+
+                    setLink(U_fat_v[x->_offset](mu), U_fat_v(x->_offset)(mu) + lt.c_5*res);
+                    sigmaIndex++;
+                }
+            }
+        })
+        return;
+    }
+
+
+    // Intent: OUT--U_fat (accmulates the fat smearing)
+    //          IN--U (thin links)
+    //              U_5link (sum of 5-staples attached to U)
+    //              gStencil (3-link stencil)
+    //              mu
+    void sevenLinkStaple(GF& U_fat, GF& U_5linkA, GF& U_5linkB, GF& U, GeneralLocalStencil gStencil, int mu) const {
+
+        SmearingParameters<RealScalar> lt = this->_linkTreatment;
+        autoView(U_v       , U       , AcceleratorRead);
+        autoView(U_fat_v   , U_fat   , AcceleratorWrite);
+        autoView(U_5linkA_v, U_5linkA, AcceleratorWrite);
+        autoView(U_5linkB_v, U_5linkB, AcceleratorWrite);
+        auto gStencil_v = gStencil.View(AcceleratorRead); 
+        typedef decltype(getLink(U_v,gStencil_v.GetEntry(0,0),0)) U3matrix;
+        int Nsites = U_v.size();
+
+        accelerator_for(site,Nsites,Simd::Nsimd(),{ 
+            U3matrix res;
+            int sigmaIndex = 0;
+            for(int nu=0;nu<Nd;nu++) {
+                if(nu==mu) continue;
+                int s = HISQStencilIndex(mu,nu);
+                for(int rho=0;rho<Nd;rho++) {
+                    if (rho == mu || rho == nu) continue;
+    
+                    auto [x_p_mu, x_p_nu, x, x_p_mu_m_nu, x_m_nu] = get3StaplePoints(gStencil_v,s,site);      
+    
+                    if(sigmaIndex<3)
+                        res =       getLink(U_v,x,nu)       * getLink(U_5linkB_v,x_p_nu,rho) * adj(getLink(U_v,x_p_mu,nu))
+                              + adj(getLink(U_v,x_m_nu,nu)) * getLink(U_5linkB_v,x_m_nu,rho) *     getLink(U_v,x_p_mu_m_nu,nu);
+                    else
+                        res =       getLink(U_v,x,nu)       * getLink(U_5linkA_v,x_p_nu,rho) * adj(getLink(U_v,x_p_mu,nu))
+                              + adj(getLink(U_v,x_m_nu,nu)) * getLink(U_5linkA_v,x_m_nu,rho) *     getLink(U_v,x_p_mu_m_nu,nu);
+                    
+                    setLink(U_fat_v[x->_offset](mu), U_fat_v(x->_offset)(mu) + lt.c_7*res);
+                    sigmaIndex++;
+                }
+            }
+        })
+        return;
+    }
+
+
+    // Intent: OUT--u_smr (smeared links) 
+    //              u_naik (Naik links)
+    //          IN--u_thin (thin links)
     void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
 
-        HISQSmearingParameters lt = this->_linkTreatment;
+        SmearingParameters<RealScalar> lt = this->_linkTreatment;
         auto grid = this->_grid;
 
         // Create a padded cell of extra padding depth=1 and fill the padding.
-        int depth = 1;
-        PaddedCell Ghost(depth,grid);
+        PaddedCell Ghost(_HaloDepth,grid);
         GF Ughost = Ghost.Exchange(u_thin);
 
-        // This is where auxiliary N-link fields and the final smear will be stored. 
-        GF Ughost_fat(Ughost.Grid());
-        GF Ughost_3link(Ughost.Grid());
-        GF Ughost_5linkA(Ughost.Grid());
-        GF Ughost_5linkB(Ughost.Grid());
+        // This is where auxiliary N-link fields and the final smear will be stored. As
+        // implemented, this uses about 25% more memory than necessary. 
+        GF U_fat(Ughost.Grid());
+        GF U_3link(Ughost.Grid());
+        GF U_5linkA(Ughost.Grid());
+        GF U_5linkB(Ughost.Grid());
 
-        // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier,
-        // but these entries will not be used. 
-        std::vector<Coordinate> shifts;
-        for(int mu=0;mu<Nd;mu++)
-        for(int nu=0;nu<Nd;nu++) {
-            appendShift(shifts,mu);
-            appendShift(shifts,nu);
-            appendShift(shifts,shiftSignal::NO_SHIFT);
-            appendShift(shifts,mu,Back(nu));
-            appendShift(shifts,Back(nu));
-            appendShift(shifts,Back(mu));
-        }
+        // mu-nu plane stencil.
+        std::vector<Coordinate> shifts = createHISQStencil("3STAPLE");
 
         // A GeneralLocalStencil has two indices: a site and stencil index 
         GeneralLocalStencil gStencil(Ughost.Grid(),shifts);
 
-        // This is where contributions from the smearing get added together
-        Ughost_fat=Zero();
-
-        // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik.
+        // Store sum of 3-, 5-, 7-link contributions in U_fat 
+        U_fat=Zero();
         for(int mu=0;mu<Nd;mu++) {
+            U_3link=Zero(); U_5linkA=Zero(); U_5linkB=Zero();
+            if((lt.c_7!=0) || (lt.c_5!=0) || (lt.c_3!=0))
+                threeLinkStaple(U_fat,                     U_3link, Ughost, gStencil, mu);
+            if((lt.c_7!=0) || (lt.c_5!=0)) 
+                fiveLinkStaple( U_fat, U_5linkA, U_5linkB, U_3link, Ughost, gStencil, mu);
+            if( lt.c_7!=0 ) 
+                sevenLinkStaple(U_fat, U_5linkA, U_5linkB,          Ughost, gStencil, mu);
+        }
 
-            // TODO: This approach is slightly memory inefficient. It uses 25% extra memory 
-            Ughost_3link =Zero();
-            Ughost_5linkA=Zero();
-            Ughost_5linkB=Zero();
-
-            // Create the accessors
-            autoView(U_v       , Ughost       , AcceleratorRead);
-            autoView(U_fat_v   , Ughost_fat   , AcceleratorWrite);
-            autoView(U_3link_v , Ughost_3link , AcceleratorWrite);
-            autoView(U_5linkA_v, Ughost_5linkA, AcceleratorWrite);
-            autoView(U_5linkB_v, Ughost_5linkB, AcceleratorWrite);
-
-            // We infer some types that will be needed in the calculation.
-            typedef decltype(gStencil.GetEntry(0,0)) stencilElement;
-            typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix;
-
-            int Nsites = U_v.size();
-            auto gStencil_v = gStencil.View(AcceleratorRead); 
-
-            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs
-                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
-                U3matrix U0, U1, U2, U3, U4, U5, W;
-                for(int nu=0;nu<Nd;nu++) {
-                    if(nu==mu) continue;
-                    int s = stencilIndex(mu,nu);
-
-                    // The stencil gives us support points in the mu-nu plane that we will use to
-                    // grab the links we need.
-                    SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
-                    SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
-                    SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
-                    SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
-                    SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
-                    SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu      = SE5->_offset;
-
-                    // When you're deciding whether to take an adjoint, the question is: how is the
-                    // stored link oriented compared to the one you want? If I imagine myself travelling
-                    // with the to-be-updated link, I have two possible, alternative 3-link paths I can
-                    // take, one starting by going to the left, the other starting by going to the right.
-                    U0 = coalescedReadGeneralPermute(U_v[x_p_mu     ](nu),SE0->_permute,Nd);
-                    U1 = coalescedReadGeneralPermute(U_v[x_p_nu     ](mu),SE1->_permute,Nd);
-                    U2 = coalescedReadGeneralPermute(U_v[x          ](nu),SE2->_permute,Nd);
-                    U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
-                    U4 = coalescedReadGeneralPermute(U_v[x_m_nu     ](mu),SE4->_permute,Nd);
-                    U5 = coalescedReadGeneralPermute(U_v[x_m_nu     ](nu),SE4->_permute,Nd);
-
-                    //  "left"          "right"
-                    W = U2*U1*adj(U0) + adj(U5)*U4*U3;
-
-                    // Save 3-link construct for later and add to smeared field.
-                    coalescedWrite(U_3link_v[x](nu), W);
-
-                    // The index operator (x) returns the coalesced read on GPU. The view [] index returns 
-                    // a reference to the vector object. The [x](mu) returns a reference to the densely 
-                    // packed (contiguous in memory) mu-th element of the vector object. On CPU, 
-                    // coalescedRead/Write is the identity mapping assigning vector object to vector object.
-                    // But on GPU it's non-trivial and maps scalar object to vector object and vice versa.
-                    coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W);
-                }
-            })
-
-            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link 
-                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
-                U3matrix U0, U1, U2, U3, U4, U5, W;
-                int sigmaIndex = 0;
-                for(int nu=0;nu<Nd;nu++) {
-                    if(nu==mu) continue;
-                    int s = stencilIndex(mu,nu);
-                    for(int rho=0;rho<Nd;rho++) {
-                        if (rho == mu || rho == nu) continue;
-
-                        SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
-                        SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
-                        SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
-                        SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
-                        SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
-
-                        U0 = coalescedReadGeneralPermute(      U_v[x_p_mu     ](nu ),SE0->_permute,Nd);
-                        U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu     ](rho),SE1->_permute,Nd);
-                        U2 = coalescedReadGeneralPermute(      U_v[x          ](nu ),SE2->_permute,Nd);
-                        U3 = coalescedReadGeneralPermute(      U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd);
-                        U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu     ](rho),SE4->_permute,Nd);
-                        U5 = coalescedReadGeneralPermute(      U_v[x_m_nu     ](nu ),SE4->_permute,Nd);
-
-                        W  = U2*U1*adj(U0) + adj(U5)*U4*U3;
-
-                        if(sigmaIndex<3) {
-                            coalescedWrite(U_5linkA_v[x](rho), W);
-                        } else {
-                            coalescedWrite(U_5linkB_v[x](rho), W);
-                        }    
-
-                        coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W);
-                        sigmaIndex++;
-                    }
-                }
-            })
-
-            accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link
-                stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
-                U3matrix U0, U1, U2, U3, U4, U5, W;
-                int sigmaIndex = 0;
-                for(int nu=0;nu<Nd;nu++) {
-                    if(nu==mu) continue;
-                    int s = stencilIndex(mu,nu);
-                    for(int rho=0;rho<Nd;rho++) {
-                        if (rho == mu || rho == nu) continue;
-
-                        SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset;
-                        SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset;
-                        SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset;
-                        SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
-                        SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset;
-
-                        U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd);
-                        if(sigmaIndex<3) {
-                            U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd);
-                        } else {
-                            U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd);
-                        }  
-                        U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd);
-                        U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
-                        if(sigmaIndex<3) {
-                            U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd);
-                        } else {
-                            U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd);
-                        }  
-                        U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd);
-
-                        W  = U2*U1*adj(U0) + adj(U5)*U4*U3;
-
-                        coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W);
-                        sigmaIndex++;
-                    }
-                }
-            })
-
-        } // end mu loop
-
-        // c1, c3, c5, c7 construct contributions
-        u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin;
+        // Add 1-link contribution 
+        u_smr = Ghost.Extract(U_fat) + lt.c_1*u_thin;
 
         // Load up U and V std::vectors to access thin and smeared links.
         std::vector<LF> U(Nd, grid);
         std::vector<LF> V(Nd, grid);
         std::vector<LF> Vnaik(Nd, grid);
         for (int mu = 0; mu < Nd; mu++) {
-            U[mu] = PeekIndex<LorentzIndex>(u_thin, mu);
-            V[mu] = PeekIndex<LorentzIndex>(u_smr, mu);
+            U[mu]     = PeekIndex<LorentzIndex>(u_thin, mu);
+            V[mu]     = PeekIndex<LorentzIndex>(u_smr, mu);
+            Vnaik[mu] = Zero();
         }
 
         for(int mu=0;mu<Nd;mu++) {
 
-            // Naik
-            Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu,
-                                    Gimpl::CovShiftForward(U[mu],mu,
-                                      Gimpl::CovShiftIdentityForward(U[mu],mu)));
-
-            // LePage
-            for (int nu_h=1;nu_h<Nd;nu_h++) {
-                int nu=(mu+nu_h)%Nd;
-                                // nu, nu, mu, Back(nu), Back(nu)
-                V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu,
-                                          Gimpl::CovShiftForward(U[nu],nu,
-                                            Gimpl::CovShiftForward(U[mu],mu,
-                                              Gimpl::CovShiftBackward(U[nu],nu,
-                                                Gimpl::CovShiftIdentityBackward(U[nu],nu)))))
-                                // Back(nu), Back(nu), mu, nu, nu
-                              + lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu,
-                                          Gimpl::CovShiftBackward(U[nu],nu,
-                                            Gimpl::CovShiftForward(U[mu],mu,
+            if(lt.c_naik!=0) { // Naik
+                Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu,
+                                        Gimpl::CovShiftForward(U[mu],mu,
+                                          Gimpl::CovShiftIdentityForward(U[mu],mu)));
+            }
+
+            if(lt.c_lp!=0) { // LePage
+                for (int nu=0;nu<Nd;nu++) {
+                    if(mu==nu) continue; 
+                    V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu,
                                               Gimpl::CovShiftForward(U[nu],nu,
-                                                Gimpl::CovShiftIdentityForward(U[nu],nu)))));
+                                                Gimpl::CovShiftForward(U[mu],mu,
+                                                  Gimpl::CovShiftBackward(U[nu],nu,
+                                                    Gimpl::CovShiftIdentityBackward(U[nu],nu)))))
+                                  + lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu,
+                                              Gimpl::CovShiftBackward(U[nu],nu,
+                                                Gimpl::CovShiftForward(U[mu],mu,
+                                                  Gimpl::CovShiftForward(U[nu],nu,
+                                                    Gimpl::CovShiftIdentityForward(U[nu],nu)))));
+                }
             }
         }
 
@@ -337,52 +634,1022 @@ class Smear_HISQ : public Gimpl {
     };
 
 
-    // Intent: OUT--u_proj
-    //          IN--u_mu
+    // Intent: OUT--u_proj (U3-projected links)
+    //          IN--u_mu (to-be-projected links)
     void projectU3(GF& u_proj, GF& u_mu) const {
 
-        auto grid = this->_grid;
+        // Open up the views
+        autoView(uproj_v, u_proj, AcceleratorWrite);
+        autoView(umu_v  , u_mu  , AcceleratorRead);
 
-        LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid);
-        CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid), 
-           u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid);
+        // Make sure everyone is using the same Grid
+        conformable(u_proj,u_mu);
 
         // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8)
+        accelerator_for(ss,umu_v.size(),Simd::Nsimd(),{
+#ifdef GRID_SIMT
+            { int blane=acceleratorSIMTlane(Simd::Nsimd());//
+#else
+            for(int blane=0;blane<Simd::Nsimd();blane++) {
+#endif
+                RealScalar g1, g2, g0;
+                ComplexColourMatrix V;
+                auto Vmu = extractLane(blane,umu_v[ss]);
+                for (int mu = 0; mu < Nd; mu++) {
+                    V()     = Vmu(mu);
+                    auto Q  = adj(V)*V;
+                    RealScalar c0 =        real(trace(Q))()()();
+                    RealScalar c1 = (1/2.)*real(trace(Q*Q))()()();
+                    RealScalar c2 = (1/3.)*real(trace(Q*Q*Q))()()();
+                    RealScalar S  = (1/3.)*c1-(1/18.)*c0*c0;
+                    if (abs(S)<_Scut) {
+                        g0 = (1/3.)*c0; 
+                        g1 = g0; 
+                        g2 = g1;
+                    } else {
+                        RealScalar R     = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
+                        RealScalar theta = acos(R*pow(S,-1.5));
+                        g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
+                        g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta          );
+                        g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
+                    }
+                    RealScalar u   = sqrt(g0) + sqrt(g1) + sqrt(g2);
+                    RealScalar v   = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
+                    RealScalar w   = sqrt(g0*g1*g2);
+                    RealScalar den = w*(u*v-w);
+                    RealScalar f0  = (-w*(u*u+v)+u*v*v)/den;
+                    RealScalar f1  = (-w-u*u*u+2.*u*v)/den;
+                    RealScalar f2  = u/den;
+
+                    auto res = V*(f0 + f1*Q + f2*Q*Q);
+
+                    insertLane(blane,uproj_v[ss](mu),res());
+                }
+            } 
+        });
+    };
+};
+
+
+
+/*!  @brief compute force from link variables */
+template<class Gimpl> 
+class Force_HISQ : public Gimpl {
+public:
+
+    GridCartesian* const   _grid;
+    GridRedBlackCartesian* _gridRB;
+
+    // Sort out the Gimpl. This handles BCs and part of the precision. 
+    INHERIT_GIMPL_TYPES(Gimpl);
+    typedef typename Gimpl::FermionField   FF;
+    typedef typename Gimpl::GaugeField     GF;
+    typedef typename Gimpl::GaugeLinkField LF;
+    typedef typename Gimpl::ComplexField   CF;
+    typedef typename Gimpl::Scalar ComplexScalar;
+    typedef decltype(real(ComplexScalar())) RealScalar;
+    typedef iColourMatrix<ComplexScalar> ComplexColourMatrix;
+
+    RealScalar _Scut=-1; // Cutoff for U(3) projection eigenvalues, set at initialization
+    int _HaloDepth=1;    // Depth of padded cell 
+
+    HISQParameters<Real> _linkParams;
+    HISQReunitSVDParameters<Real> _reunitParams;
+    GF _Umu, _Vmu, _Wmu;
+
+    void initialize() {
+        if (sizeof(RealScalar)==4) {
+            _Scut=1e-5; // Maybe should be higher? e.g. 1e-4
+        } else if (sizeof(RealScalar)==8) { 
+            _Scut=1e-8;
+        } else {
+            Grid_error("HISQ force only implemented for single and double");
+        }
+        _gridRB = SpaceTimeGrid::makeFourDimRedBlackGrid(_grid);
+        assert(Nc == 3 && "HISQ force currently implemented only for Nc==3");
+        assert(Nd == 4 && "HISQ force only defined for Nd==4");
+    }
+
+    Force_HISQ(GridCartesian* grid, HISQParameters<Real> linkParams, GF Wmu, GF Vmu, GF Umu, 
+               HISQReunitSVDParameters<Real> reunitParams) 
+        : _grid(grid), 
+          _linkParams(linkParams),
+          _Wmu(Wmu),
+          _Vmu(Vmu),
+          _Umu(Umu),
+          _reunitParams(reunitParams) {
+        initialize();
+    }
+
+    ~Force_HISQ() {}
+
+
+    // Intent: OUT--u_deriv (dW/dV slotted into force)
+    //          IN--u_mu (fat links), 
+    //              u_force (slot derivative into this force), 
+    //              delta (force cutoff)
+    // Follow MILC 10.1103/PhysRevD.82.074501
+    void projU3Deriv(GF& u_deriv, GF& u_mu, GF& u_force, RealScalar const delta=5e-5) {
+
+        conformable(u_force,u_mu);
+        conformable(u_deriv,u_mu);
+
+        autoView(uderiv_v, u_deriv, AcceleratorWrite);
+        autoView(umu_v   , u_mu   , AcceleratorRead);
+        autoView(uforce_v, u_force, AcceleratorRead);
+
+        // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8)
+        accelerator_for(ss,umu_v.size(),Simd::Nsimd(),{
+#ifdef GRID_SIMT
+            { int blane=acceleratorSIMTlane(Simd::Nsimd());//
+#else
+            for(int blane=0;blane<Simd::Nsimd();blane++) {
+#endif
+                RealScalar g1, g2, g0;
+                ComplexColourMatrix V, force;
+                auto Vmu     = extractLane(blane,umu_v[ss]);
+                auto forcemu = extractLane(blane,uforce_v[ss]);
+                for (int mu = 0; mu < Nd; mu++) {
+                    V()     = Vmu(mu);
+                    auto Q  = adj(V)*V;
+                    RealScalar c0 =        real(trace(Q))()()();
+                    RealScalar c1 = (1/2.)*real(trace(Q*Q))()()();
+                    RealScalar c2 = (1/3.)*real(trace(Q*Q*Q))()()();
+                    RealScalar S  = (1/3.)*c1-(1/18.)*c0*c0;
+                    if (abs(S)<_Scut) {
+                        g0 = (1/3.)*c0; 
+                        g1 = g0; 
+                        g2 = g1;
+                    } else {
+                        RealScalar R     = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
+                        RealScalar theta = acos(R*pow(S,-1.5));
+                        g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
+                        g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta          );
+                        g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
+                    }
+
+                    if (g0 < delta || g1 < delta || g2 < delta) {
+                        // force filter eq (C23)
+                        g0 += delta;
+                        g1 += delta;
+                        g2 += delta;
+                        Q   = Q + delta;
+                    }
+//                if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD }
+
+                    RealScalar u   = sqrt(g0) + sqrt(g1) + sqrt(g2);
+                    RealScalar v   = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
+                    RealScalar w   = sqrt(g0*g1*g2);
+                    RealScalar den = w*(u*v-w);
+                    RealScalar f0  = (-w*(u*u+v)+u*v*v)/den;
+                    RealScalar f1  = (-w-u*u*u+2.*u*v)/den;
+                    RealScalar f2  = u/den;
+
+                    auto Qinvsq = f0 + f1*Q + f2*Q*Q;
+
+                    force() = forcemu(mu);
+                    auto forcedag = adj(force);
+
+                    RealScalar u2, u3, u4, u5, u6, u7, u8, v2 ,v3, v4, v5, v6, w2, w3, w4, w5;
+
+                    u2 = u*u; u3 = u2*u; u4 = u3*u; u5 = u4*u; u6 = u5*u; u7 = u6*u; u8 = u7*u;
+                    v2 = v*v; v3 = v2*v; v4 = v3*v; v5 = v4*v; v6 = v5*v;
+                    w2 = w*w; w3 = w2*w; w4 = w3*w; w5 = w4*w;
+        
+                    // eq (C10)
+                    auto d = 2*w3*(u*v-w)*(u*v-w)*(u*v-w);
+        
+                    // eq (C11)
+                    auto C00 = ( -w3*u6 + 3*v*w3*u4 + 3*v4*w*u4 - v6*u3 - 4*w4*u3 - 12*v3*w2*u3 + 16*v2*w3*u2 
+                                 + 3*v5*w*u2 - 8*v*w4*u - 3*v4*w2*u + w5 + v3*w3 )/d;
+                    auto C01 = ( -w2*u7 - v2*w*u6 + v4*u5 + 6*v*w2*u5 - 5*w3*u4 - v3*w*u4 - 2*v5*u3 - 6*v2*w2*u3 
+                                 + 10*v*w3*u2 + 6*v4*w*u2 - 3*w4*u - 6*v3*w2*u + 2*v2*w3 )/d;
+                    auto C02 = ( w2*u5 + v2*w*u4 - v4*u3 - 4*v*w2*u3 + 4*w3*u2 +3*v3*w*u2 - 3*v2*w2*u + v*w3 )/d;
+                    auto C11 = ( -w*u8 - v2*u7 + 7*v*w*u6 + 4*v3*u5 - 5*w2*u5 - 16*v2*w*u4 - 4*v4*u3 + 16*v*w2*u3 
+                                 - 3*w3*u2 + 12*v3*w*u2 - 12*v2*w2*u + 3*v*w3 )/d;
+                    auto C12 = ( w*u6 + v2*u5 - 5*v*w*u4 - 2*v3*u3 + 4*w2*u3 + 6*v2*w*u2 - 6*v*w2*u + w3 )/d;
+                    auto C22 = ( -w*u4 - v2*u3 + 3*v*w*u2 - 3*w2*u )/d;
+        
+                    // These are all used in the loop over color entries, and we want to avoid recomputing
+                    // these products, which should be broadcast to all sites, 3*3*3*3=81 times. 
+                    auto Vdag   = adj(V);
+                    auto VVdag  = V*Vdag;
+                    auto VQ     = V*Q;   
+                    auto VQ2    = VQ*Q;
+                    auto VQVdag = VQ*Vdag;
+                    auto QVdag  = Q*Vdag;
+                    auto Q2Vdag = Q*QVdag;
+        
+                    // eqs (C17-C19)
+                    auto PVdag  = ( C00 + C01*Q + C02*Q*Q )*Vdag;
+                    auto RVdag  = ( C01 + C11*Q + C12*Q*Q )*Vdag;
+                    auto SVdag  = ( C02 + C12*Q + C22*Q*Q )*Vdag;
+        
+                    // eqs (C20) and (C21)
+                    ComplexColourMatrix res = Zero();
+                    for (int k = 0; k < 3; k++)
+                    for (int l = 0; l < 3; l++)
+                    for (int i = 0; i < 3; i++)
+                    for (int j = 0; j < 3; j++) {
+        
+                        ComplexScalar deriv = 0.; // dWij/dVkl
+                
+                        if (k == i) deriv += Qinvsq()()(l,j);
+                        if (l == j) deriv += f1*VVdag()()(i,k)+f2*VQVdag()()(i,k);
+
+                        deriv += f2*VVdag()()(i,k)*Q()()(l,j) + V()()(i,j)*PVdag()()(l,k) 
+                                 + VQ()()(i,j)*RVdag()()(l,k) + VQ2()()(i,j)*SVdag()()(l,k);
+        
+                        res()()(l,k) = res()()(l,k) + deriv*force()()(j,i);
+                
+                        // dWij^+/dVkl
+                        deriv = (f1*Vdag()()(i,k)+f2*QVdag()()(i,k))*Vdag()()(l,j) 
+                                + f2*Vdag()()(i,k)*QVdag()()(l,j) + Vdag()()(i,j)*PVdag()()(l,k) 
+                                + QVdag()()(i,j)*RVdag()()(l,k)+Q2Vdag()()(i,j)*SVdag()()(l,k);
+                
+                        res()()(l,k) = res()()(l,k) + deriv*forcedag()()(j,i);
+                  	}
+        
+                    insertLane(blane,uderiv_v[ss](mu),res());
+                }
+            } 
+        });
+    };
+
+
+    // vecx (contains |X> and |Y>)
+    // l (rat approx and Naik index)
+    // sep (separation between |X> and |Y>)
+    GF outerProductHISQ(std::vector<FF>& vecx, std::vector<Real> vecdt, std::vector<int> n_orders_naik, int n_naiks, int sep) {
+        
+        auto grid   = this->_grid;
+        auto gridRB = this->_gridRB;
+
+        GF XY(grid), XY_l(grid);
+        FF X(grid), Y(grid), RB(gridRB);
+        LF XYnu(grid), YXnu(grid);
+
+        XY = Zero();
+
+        // These four lines control the loop over rational approximation contributions. As explained in force(), 
+        // l indexes over both Naik epsilon and rational approximation order.
+        int l = 0;
+        for (int inaik = 0; inaik < n_naiks; inaik++) {
+            int rat_order = n_orders_naik[inaik];
+            for (int i=0; i<rat_order; i++) {
+
+                X = Zero(); Y = Zero(); 
+                
+                RB = Zero(); 
+                pickCheckerboard(Even,RB,vecx[l]);
+                setCheckerboard(X,RB);
+                RB = Zero();
+                pickCheckerboard(Odd ,RB,vecx[l]);
+                setCheckerboard(Y,RB);
+        
+                XY_l = Zero(); XYnu = Zero(); YXnu = Zero(); 
+                for (int nu = 0; nu < Nd; nu++) {
+                    YXnu = outerProduct( Cshift(Y,nu,sep) ,X);
+                    XYnu = outerProduct( Cshift(X,nu,sep) ,Y);
+                    PokeIndex<LorentzIndex>(XY_l,(YXnu-XYnu),nu);
+                }
+                XY += vecdt[l]*XY_l; 
+                l++;
+            }
+        }   
+        return XY;
+    }
+
+
+    // Intent: OUT--Fghost (accumulates 3-link derivative contribution)
+    //          IN--Ughost (thin links)
+    //              XYCghost (outer product)
+    //              gStencil3 (3-link stencil)
+    //              c3
+    //              mu
+    void threeLinkDeriv(GF& Fghost, GF& Ughost, GF& XYCghost, GeneralLocalStencil gStencil3, Real c3, int mu) const {
+        
+        autoView(U_v , Ughost , AcceleratorRead);
+        autoView(XY_v, XYCghost, AcceleratorRead);
+        autoView(F_v , Fghost , AcceleratorWrite);
+        int Nsites = U_v.size();
+        auto gStencil3_v = gStencil3.View(AcceleratorRead);
+        typedef decltype(getLink(U_v,gStencil3.GetEntry(0,0),0)) U3matrix;
+
+        accelerator_for(site,Nsites,Simd::Nsimd(),{
+            U3matrix res;
+            for(int nu=0;nu<Nd;nu++) {
+                if(nu==mu) continue;
+                int s = HISQStencilIndex(mu,nu);
+
+                auto [x_p_mu, x_p_nu, x, x_p_mu_m_nu, x_m_nu] = get3StaplePoints(gStencil3_v,s,site);      
+
+                res =   adj(getLink(XY_v,x,nu))*    getLink(U_v ,x_p_nu,mu) *adj(getLink(U_v ,x_p_mu,nu))
+                      +     getLink(U_v ,x,nu) *adj(getLink(XY_v,x_p_nu,mu))*adj(getLink(U_v ,x_p_mu,nu))
+                      +     getLink(U_v ,x,nu) *    getLink(U_v ,x_p_nu,mu) *    getLink(XY_v,x_p_mu,nu)
+
+                      +     getLink(XY_v,x_m_nu,nu) *    getLink(U_v ,x_m_nu,mu) *    getLink(U_v ,x_p_mu_m_nu,nu)
+                      + adj(getLink(U_v ,x_m_nu,nu))*adj(getLink(XY_v,x_m_nu,mu))*    getLink(U_v ,x_p_mu_m_nu,nu)
+                      + adj(getLink(U_v ,x_m_nu,nu))*    getLink(U_v ,x_m_nu,mu) *adj(getLink(XY_v,x_p_mu_m_nu,nu));
+
+                setLink(F_v[x->_offset](mu), F_v(x->_offset)(mu) + c3*adj(res));
+            }              
+        })
+    }
+
+
+    // Intent: OUT--Fghost (accumulates 5-link derivative contribution)
+    //          IN--Ughost (thin links)
+    //              XYCghost (outer product)
+    //              gStencil5 (5-link stencil)
+    //              c5
+    //              mu
+    template<int term>
+    void fiveLinkDeriv(GF& Fghost, GF& Ughost, GF& XYCghost, GeneralLocalStencil gStencil5, Real c5, int mu) const {
+        
+        autoView(U_v , Ughost , AcceleratorRead);
+        autoView(XY_v, XYCghost, AcceleratorRead);
+        autoView(F_v , Fghost , AcceleratorWrite);
+        int Nsites = U_v.size();
+        auto gStencil5_v = gStencil5.View(AcceleratorRead);
+        typedef decltype(getLink(U_v,gStencil5.GetEntry(0,0),0)) U3matrix;
+
+        accelerator_for(site,Nsites,Simd::Nsimd(),{ 
+            U3matrix res; 
+            for(int nu=0;nu<Nd;nu++) {
+                if(nu==mu) continue;
+                for(int rho=0;rho<Nd;rho++) {
+                    if (rho == mu || rho == nu) continue;
+                    int s = HISQStencilIndex(mu,nu,rho,0,"5STAPLE");
+
+                    auto [x_p_nu_m_rho     , x_p_nu           , x_m_rho          , 
+                          x                , x_p_rho          , x_m_nu_m_rho     , 
+                          x_m_nu           , x_m_nu_p_rho     , x_p_mu_p_nu_m_rho, 
+                          x_p_mu_p_nu      , x_p_mu_m_rho     , x_p_mu           ,
+                          x_p_mu_p_rho     , x_p_mu_m_nu_m_rho, x_p_mu_m_nu      , 
+                          x_p_mu_m_nu_p_rho, x_p_nu_p_rho     ] = get5StaplePoints(gStencil5_v,s,site);
+
+                    res = Zero();
+
+                    // The idea behind the constexpr syntax is to reduce compile times. These seem to grow
+                    // with increasing kernel size. The template parameter term lets the user choose which
+                    // part of the kernel to compile, and hence constexpr is evaluated at compile time.
+if constexpr(term==0) {
+                    res += (      getLink(U_v ,x_p_mu           ,rho)
+                            *     getLink(U_v ,x_p_mu_p_rho     ,nu ) 
+                            * adj(getLink(U_v ,x_p_mu_p_nu      ,rho))
+
+                            + adj(getLink(U_v ,x_p_mu_m_rho     ,rho))
+                            *     getLink(U_v ,x_p_mu_m_rho     ,nu )
+                            *     getLink(U_v ,x_p_mu_p_nu_m_rho,rho)
+                    )*adj(getLink(U_v,x_p_nu,mu))*getLink(XY_v,x,nu);
+
+                    res += (      getLink(U_v ,x_p_mu           ,rho) 
+                            * adj(getLink(U_v ,x_p_rho          ,mu )) 
+                            * adj(getLink(U_v ,x_m_nu_p_rho     ,nu ))
+                            *     getLink(XY_v,x_m_nu           ,rho)
+
+                            + adj(getLink(U_v ,x_p_mu_m_rho     ,rho))
+                            * adj(getLink(U_v ,x_m_rho          ,mu ))
+                            * adj(getLink(U_v ,x_m_nu_m_rho     ,nu ))
+                            * adj(getLink(XY_v,x_m_nu_m_rho     ,rho))
+                    )*getLink(U_v,x_m_nu,nu);
+
+                    res += (      getLink(U_v ,x_p_mu           ,rho) 
+                            * adj(getLink(U_v ,x_p_mu_m_nu_p_rho,nu ))
+                            * adj(getLink(U_v ,x_p_mu_m_nu      ,rho))
+                            + adj(getLink(U_v ,x_p_mu_m_rho     ,rho))
+                            * adj(getLink(U_v ,x_p_mu_m_nu_m_rho,nu ))
+                            *     getLink(U_v ,x_p_mu_m_nu_m_rho,rho)
+                    )*adj(getLink(U_v,x_m_nu,mu))*adj(getLink(XY_v,x_m_nu,nu));
+
+                    res += (      getLink(U_v ,x_p_mu           ,rho) 
+                            * adj(getLink(U_v ,x_p_rho          ,mu ))
+                            *     getLink(U_v ,x_p_rho          ,nu )
+                            *     getLink(XY_v,x_p_nu           ,rho) 
+                            + adj(getLink(U_v ,x_p_mu_m_rho     ,rho))
+                            * adj(getLink(U_v ,x_m_rho          ,mu ))
+                            *     getLink(U_v ,x_m_rho          ,nu )
+                            * adj(getLink(XY_v,x_p_nu_m_rho     ,rho))
+                    )*adj(getLink(U_v,x,nu));
+
+                    res += (      getLink(XY_v,x_p_mu_m_nu      ,nu ) 
+                            * adj(getLink(U_v ,x_m_nu           ,mu ))
+                            *     getLink(U_v ,x_m_nu           ,rho)
+                            +     getLink(U_v ,x_p_mu           ,rho)
+                            * adj(getLink(U_v ,x_p_mu_m_nu_p_rho,nu ))
+                            *     getLink(XY_v,x_m_nu_p_rho     ,mu )
+                    )*getLink(U_v,x_m_nu_p_rho,nu)*adj(getLink(U_v,x,rho));
+}
+if constexpr(term==1) {
+                    res += (      getLink(U_v ,x_p_mu           ,nu ) 
+                            * adj(getLink(XY_v,x_p_mu_p_nu      ,rho))
+                            * adj(getLink(U_v ,x_p_mu_p_rho     ,nu ))
+                            + adj(getLink(U_v ,x_p_mu_m_nu      ,nu ))
+                            * adj(getLink(XY_v,x_p_mu_m_nu      ,rho))
+                            *     getLink(U_v ,x_p_mu_m_nu_p_rho,nu )
+                    )*adj(getLink(U_v,x_p_rho,mu))*adj(getLink(U_v,x,rho));
+
+                    res += (      getLink(U_v ,x_p_mu           ,rho) 
+                            *     getLink(U_v ,x_p_mu_p_rho     ,nu )
+                            *     getLink(XY_v,x_p_nu_p_rho     ,mu )
+                            + adj(getLink(XY_v,x_p_mu           ,nu ))
+                            * adj(getLink(U_v ,x_p_nu           ,mu ))
+                            *     getLink(U_v ,x_p_nu           ,rho)
+                    )*adj(getLink(U_v,x_p_rho,nu))*adj(getLink(U_v,x,rho));
+
+                    res += (  adj(getLink(XY_v,x_p_mu           ,nu ))
+                            * adj(getLink(U_v ,x_p_nu           ,mu ))
+                            * adj(getLink(U_v ,x_p_nu_m_rho     ,rho))
+                            + adj(getLink(U_v ,x_p_mu_m_rho     ,rho))
+                            *     getLink(U_v ,x_p_mu_m_rho     ,nu )
+                            *     getLink(XY_v,x_p_nu_m_rho     ,mu )
+                    )*adj(getLink(U_v,x_m_rho,nu))*getLink(U_v,x_m_rho,rho);
+
+                    res += (      getLink(U_v ,x_p_mu           ,nu )
+                            *     getLink(XY_v,x_p_mu_p_nu_m_rho,rho) 
+                            * adj(getLink(U_v ,x_p_mu_m_rho     ,nu ))
+                            + adj(getLink(U_v ,x_p_mu_m_nu      ,nu ))
+                            *     getLink(XY_v,x_p_mu_m_nu_m_rho,rho)
+                            *     getLink(U_v ,x_p_mu_m_nu_m_rho,nu )
+                    )*adj(getLink(U_v,x_m_rho,mu))*getLink(U_v,x_m_rho,rho);
+
+                    res += (      getLink(XY_v,x_p_mu_m_nu      ,nu )
+                            * adj(getLink(U_v ,x_m_nu           ,mu )) 
+                            * adj(getLink(U_v ,x_m_nu_m_rho     ,rho))
+                            + adj(getLink(U_v ,x_p_mu_m_rho     ,rho))
+                            * adj(getLink(U_v ,x_p_mu_m_nu_m_rho,nu ))
+                            *     getLink(XY_v,x_m_nu_m_rho     ,mu )
+                    )*getLink(U_v,x_m_nu_m_rho,nu)*getLink(U_v,x_m_rho,rho);
+}
+                    setLink(F_v[x->_offset](mu), F_v(x->_offset)(mu) + c5*res);
+                }
+            }
+        })
+    }
+
+
+    template<int term>
+    void sevenLinkDeriv(GF& Fghost, GF& Ughost, GF& XYCghost, GeneralLocalStencil gStencil7, Real c7, int mu) const {
+        
+        autoView(U_v , Ughost , AcceleratorRead);
+        autoView(XY_v, XYCghost, AcceleratorRead);
+        autoView(F_v , Fghost , AcceleratorWrite);
+        int Nsites = U_v.size();
+        auto gStencil7_v = gStencil7.View(AcceleratorRead);
+        typedef decltype(getLink(U_v,gStencil7.GetEntry(0,0),0)) U3matrix;
+
+        accelerator_for(site,Nsites,Simd::Nsimd(),{
+            U3matrix res, U1; 
+            for(int nu=0;nu<Nd;nu++) {
+                if(nu==mu) continue;
+                for(int rho=0;rho<Nd;rho++) {
+                    if (rho == mu || rho == nu) continue;
+                    for(int sig=0;sig<Nd;sig++) {
+                        if (sig == mu || sig == nu || sig == rho) continue;
+                        int s = HISQStencilIndex(mu,nu,rho,sig,"7STAPLE");
+
+                        auto [x                      , x_p_mu                 , x_p_mu_p_nu            , x_p_mu_p_nu_p_rho      ,
+                              x_p_mu_p_nu_p_rho_m_sig, x_p_mu_p_nu_m_rho      , x_p_mu_p_nu_m_rho_m_sig, x_p_mu_p_nu_m_sig      ,
+                              x_p_mu_m_nu            , x_p_mu_m_nu_p_rho      , x_p_mu_m_nu_p_rho_p_sig, x_p_mu_m_nu_p_rho_m_sig,
+                              x_p_mu_m_nu_m_rho      , x_p_mu_m_nu_m_rho_p_sig, x_p_mu_m_nu_m_rho_m_sig, x_p_mu_m_nu_p_sig      ,
+                              x_p_mu_m_nu_m_sig      , x_p_mu_p_rho           , x_p_mu_p_rho_p_sig     , x_p_mu_p_rho_m_sig     , 
+                              x_p_mu_m_rho           , x_p_mu_m_rho_p_sig     , x_p_mu_m_rho_m_sig     , x_p_mu_p_sig           , 
+                              x_p_mu_m_sig           , x_p_nu                 , x_p_nu_p_rho           , x_p_nu_p_rho_p_sig     , 
+                              x_p_nu_p_rho_m_sig     , x_p_nu_m_rho           , x_p_nu_m_rho_p_sig     , x_p_nu_m_rho_m_sig     , 
+                              x_p_rho                , x_p_rho_m_nu           , x_p_rho_p_sig          , x_p_rho_m_sig          , 
+                              x_m_nu                 , x_m_nu_p_rho           , x_m_nu_p_rho_p_sig     , x_m_nu_p_rho_m_sig     , 
+                              x_m_nu_m_rho           , x_m_nu_m_rho_p_sig     , x_m_nu_m_rho_m_sig     , x_m_rho                , 
+                              x_m_rho_p_sig          , x_m_rho_m_sig
+                        ] = get7StaplePoints(gStencil7_v,s,site);
+
+if constexpr(term==0) {
+                        res = adj(getLink(XY_v,x_p_mu,nu))*adj(getLink(U_v,x_p_nu,mu))
+                              *(getLink(U_v,x_p_nu,rho)
+                                *(   getLink(U_v,x_p_nu_p_rho      ,sig)
+                                *adj(getLink(U_v,x_p_rho_p_sig     ,nu ))
+                                *adj(getLink(U_v,x_p_rho           ,sig))
+                                +adj(getLink(U_v,x_p_nu_p_rho_m_sig,sig))
+                                *adj(getLink(U_v,x_p_rho_m_sig     ,nu ))
+                                *    getLink(U_v,x_p_rho_m_sig     ,sig)
+                                 )*adj(getLink(U_v,x,rho))
+
+                                +adj(getLink(U_v,x_p_nu_m_rho,rho))
+                                *(   getLink(U_v,x_p_nu_m_rho      ,sig)
+                                *adj(getLink(U_v,x_m_rho_p_sig     ,nu ))
+                                *adj(getLink(U_v,x_m_rho           ,sig))
+                                +adj(getLink(U_v,x_p_nu_m_rho_m_sig,sig))
+                                *adj(getLink(U_v,x_m_rho_m_sig     ,nu ))
+                                *    getLink(U_v,x_m_rho_m_sig     ,sig)
+                                 )*getLink(U_v,x_m_rho,rho)
+                               ); 
+} 
+if constexpr(term==1) {
+                        res = getLink(XY_v,x_p_mu_m_nu,nu)*adj(getLink(U_v,x_m_nu,mu))
+                              *(getLink(U_v,x_m_nu,rho)
+                                *(   getLink(U_v,x_m_nu_p_rho      ,sig)
+                                *    getLink(U_v,x_m_nu_p_rho_p_sig,nu )
+                                *adj(getLink(U_v,x_p_rho           ,sig))
+                                +adj(getLink(U_v,x_m_nu_p_rho_m_sig,sig))
+                                *    getLink(U_v,x_m_nu_p_rho_m_sig,nu ) 
+                                *    getLink(U_v,x_p_rho_m_sig     ,sig)
+                                 )*adj(getLink(U_v,x,rho))
+
+                                +adj(getLink(U_v,x_m_nu_m_rho,rho))
+                                *(   getLink(U_v,x_m_nu_m_rho      ,sig)
+                                *    getLink(U_v,x_m_nu_m_rho_p_sig,nu ) 
+                                *adj(getLink(U_v,x_m_rho           ,sig))
+                                +adj(getLink(U_v,x_m_nu_m_rho_m_sig,sig))
+                                *    getLink(U_v,x_m_nu_m_rho_m_sig,nu ) 
+                                *    getLink(U_v,x_m_rho_m_sig     ,sig)
+                                 )*getLink(U_v,x_m_rho,rho)
+                               );
+}
+if constexpr(term==2) {
+                        U1 = adj(getLink(U_v,x_p_nu,mu));
+
+                        res = (     getLink(U_v ,x_p_mu           ,sig)
+                               *adj(getLink(XY_v,x_p_mu_p_sig     ,nu ))
+                               *adj(getLink(U_v ,x_p_mu_p_nu      ,sig))
+                               +adj(getLink(U_v ,x_p_mu_m_sig     ,sig))
+                               *adj(getLink(XY_v,x_p_mu_m_sig     ,nu ))
+                               *    getLink(U_v ,x_p_mu_p_nu_m_sig,sig)
+                              )*U1*getLink(U_v,x_p_nu,rho)*adj(getLink(U_v,x_p_rho,nu))*adj(getLink(U_v,x,rho))
+
+                            + (     getLink(U_v ,x_p_mu           ,sig) 
+                               *adj(getLink(XY_v,x_p_mu_p_sig     ,nu ))
+                               *adj(getLink(U_v ,x_p_mu_p_nu      ,sig))
+                               +adj(getLink(U_v ,x_p_mu_m_sig     ,sig))
+                               *adj(getLink(XY_v,x_p_mu_m_sig     ,nu ))
+                               *    getLink(U_v ,x_p_mu_p_nu_m_sig,sig)
+                              )*U1*adj(getLink(U_v,x_p_nu_m_rho,rho))*adj(getLink(U_v,x_m_rho,nu))*getLink(U_v,x_m_rho,rho); 
+}
+if constexpr(term==3) {
+                        U1 = adj(getLink(U_v,x_m_nu,mu));
+                        
+                        res = (     getLink(U_v ,x_p_mu            ,sig) 
+                               *    getLink(XY_v,x_p_mu_m_nu_p_sig ,nu )
+                               *adj(getLink(U_v ,x_p_mu_m_nu       ,sig))
+                               +adj(getLink(U_v ,x_p_mu_m_sig      ,sig))
+                               *    getLink(XY_v,x_p_mu_m_nu_m_sig ,nu )
+                               *    getLink(U_v ,x_p_mu_m_nu_m_sig ,sig)
+                              )*U1*getLink(U_v,x_m_nu,rho)*getLink(U_v,x_m_nu_p_rho,nu)*adj(getLink(U_v,x,rho))
+
+                            + (     getLink(U_v ,x_p_mu            ,sig) 
+                               *    getLink(XY_v,x_p_mu_m_nu_p_sig ,nu )
+                               *adj(getLink(U_v ,x_p_mu_m_nu       ,sig))
+                               +adj(getLink(U_v ,x_p_mu_m_sig      ,sig))
+                               *    getLink(XY_v,x_p_mu_m_nu_m_sig ,nu )
+                               *    getLink(U_v ,x_p_mu_m_nu_m_sig ,sig)
+                              )*U1*adj(getLink(U_v,x_m_nu_m_rho,rho))*getLink(U_v,x_m_nu_m_rho,nu)*getLink(U_v,x_m_rho,rho);
+}
+if constexpr(term==4) {
+                        res = (getLink(U_v,x_p_mu,rho)
+                                *(     getLink(U_v ,x_p_mu_p_rho           ,sig)
+                                  *adj(getLink(XY_v,x_p_mu_p_rho_p_sig     ,nu ))
+                                  *adj(getLink(U_v ,x_p_mu_p_nu_p_rho      ,sig))
+                                  +adj(getLink(U_v ,x_p_mu_p_rho_m_sig     ,sig))
+                                  *adj(getLink(XY_v,x_p_mu_p_rho_m_sig     ,nu ))
+                                  *    getLink(U_v ,x_p_mu_p_nu_p_rho_m_sig,sig) 
+                                 )*adj(getLink(U_v,x_p_mu_p_nu,rho))
+
+                                +adj(getLink(U_v,x_p_mu_m_rho,rho))
+                                *(     getLink(U_v ,x_p_mu_m_rho           ,sig)
+                                  *adj(getLink(XY_v,x_p_mu_m_rho_p_sig     ,nu ))
+                                  *adj(getLink(U_v ,x_p_mu_p_nu_m_rho      ,sig))
+                                  +adj(getLink(U_v ,x_p_mu_m_rho_m_sig     ,sig))
+                                  *adj(getLink(XY_v,x_p_mu_m_rho_m_sig     ,nu ))
+                                  *    getLink(U_v ,x_p_mu_p_nu_m_rho_m_sig,sig)
+                                 )*getLink(U_v,x_p_mu_p_nu_m_rho,rho)
+                              )*adj(getLink(U_v,x_p_nu,mu))*adj(getLink(U_v,x,nu));
+}
+if constexpr(term==5) {
+                        res = (getLink(U_v,x_p_mu,rho)
+                               *(     getLink(U_v ,x_p_mu_p_rho           ,sig)
+                                 *    getLink(XY_v,x_p_mu_m_nu_p_rho_p_sig,nu )
+                                 *adj(getLink(U_v ,x_p_mu_m_nu_p_rho      ,sig))
+                                 +adj(getLink(U_v ,x_p_mu_p_rho_m_sig     ,sig))
+                                 *    getLink(XY_v,x_p_mu_m_nu_p_rho_m_sig,nu )
+                                 *    getLink(U_v ,x_p_mu_m_nu_p_rho_m_sig,sig) 
+                                )*adj(getLink(U_v,x_p_mu_m_nu,rho))
+
+                              + adj(getLink(U_v,x_p_mu_m_rho,rho))
+                               *(     getLink(U_v ,x_p_mu_m_rho           ,sig)
+                                 *    getLink(XY_v,x_p_mu_m_nu_m_rho_p_sig,nu )
+                                 *adj(getLink(U_v ,x_p_mu_m_nu_m_rho      ,sig))
+                                 +adj(getLink(U_v ,x_p_mu_m_rho_m_sig     ,sig))
+                                 *    getLink(XY_v,x_p_mu_m_nu_m_rho_m_sig,nu )
+                                 *    getLink(U_v ,x_p_mu_m_nu_m_rho_m_sig,sig)
+                                )* getLink(U_v,x_p_mu_m_nu_m_rho,rho)
+                              )*adj(getLink(U_v,x_m_nu,mu))*getLink(U_v,x_m_nu,nu);
+}
+if constexpr(term==6) {
+                        res = getLink(U_v,x_p_mu,nu)
+                              *(getLink(U_v,x_p_mu_p_nu,rho)
+                                *(    getLink(U_v ,x_p_mu_p_nu_p_rho      ,sig)
+                                 *    getLink(XY_v,x_p_nu_p_rho_p_sig     ,mu )
+                                 *adj(getLink(U_v ,x_p_nu_p_rho           ,sig))
+                                 +adj(getLink(U_v ,x_p_mu_p_nu_p_rho_m_sig,sig))
+                                 *    getLink(XY_v,x_p_nu_p_rho_m_sig     ,mu )
+                                 *    getLink(U_v ,x_p_nu_p_rho_m_sig     ,sig)
+                                )*adj(getLink(U_v ,x_p_nu                 ,rho))
+
+                                +adj(getLink(U_v,x_p_mu_p_nu_m_rho,rho))
+                                *(adj(getLink(U_v ,x_p_mu_p_nu_m_rho_m_sig,sig))
+                                 *    getLink(XY_v,x_p_nu_m_rho_m_sig     ,mu )
+                                 *    getLink(U_v ,x_p_nu_m_rho_m_sig     ,sig)
+                                 +    getLink(U_v ,x_p_mu_p_nu_m_rho      ,sig)
+                                 *    getLink(XY_v,x_p_nu_m_rho_p_sig     ,mu )
+                                 *adj(getLink(U_v ,x_p_nu_m_rho           ,sig))
+                                )*getLink(U_v,x_p_nu_m_rho,rho)
+                              )*adj(getLink(U_v,x,nu));
+}
+if constexpr(term==7) {
+                        res = adj(getLink(U_v,x_p_mu_m_nu,nu))
+                              *(getLink(U_v,x_p_mu_m_nu,rho)
+                                *(    getLink(U_v ,x_p_mu_m_nu_p_rho      ,sig)
+                                 *    getLink(XY_v,x_m_nu_p_rho_p_sig     ,mu )
+                                 *adj(getLink(U_v ,x_m_nu_p_rho           ,sig))
+                                 +adj(getLink(U_v ,x_p_mu_m_nu_p_rho_m_sig,sig))
+                                 *    getLink(XY_v,x_m_nu_p_rho_m_sig     ,mu )
+                                 *    getLink(U_v ,x_m_nu_p_rho_m_sig     ,sig)
+                                )*adj(getLink(U_v,x_m_nu,rho))
+
+                                +adj(getLink(U_v,x_p_mu_m_nu_m_rho,rho))
+                                *(    getLink(U_v ,x_p_mu_m_nu_m_rho      ,sig)
+                                 *    getLink(XY_v,x_m_nu_m_rho_p_sig     ,mu )
+                                 *adj(getLink(U_v ,x_m_nu_m_rho           ,sig))
+                                 +adj(getLink(U_v ,x_p_mu_m_nu_m_rho_m_sig,sig))
+                                 *    getLink(XY_v,x_m_nu_m_rho_m_sig     ,mu )
+                                 *    getLink(U_v ,x_m_nu_m_rho_m_sig     ,sig)
+                                )*getLink(U_v,x_m_nu_m_rho,rho)
+                              )*getLink(U_v,x_m_nu,nu);
+}
+if constexpr(term==8) {
+                        res = getLink(U_v,x_p_mu,nu)*adj(getLink(U_v,x_p_nu,mu))
+                              *(getLink(U_v,x_p_nu,rho)
+                                *(    getLink(U_v ,x_p_nu_p_rho           ,sig)
+                                 *    getLink(XY_v,x_p_rho_p_sig          ,nu )
+                                 *adj(getLink(U_v ,x_p_rho                ,sig))
+                                 +adj(getLink(U_v ,x_p_nu_p_rho_m_sig     ,sig))
+                                 *    getLink(XY_v,x_p_rho_m_sig          ,nu )
+                                 *    getLink(U_v ,x_p_rho_m_sig          ,sig)
+                                )*adj(getLink(U_v,x,rho))
+
+                                +adj(getLink(U_v,x_p_nu_m_rho,rho))
+                                *(    getLink(U_v ,x_p_nu_m_rho           ,sig)
+                                 *    getLink(XY_v,x_m_rho_p_sig          ,nu )
+                                 *adj(getLink(U_v ,x_m_rho                ,sig))
+                                 +adj(getLink(U_v ,x_p_nu_m_rho_m_sig     ,sig))
+                                 *    getLink(XY_v,x_m_rho_m_sig          ,nu )
+                                 *    getLink(U_v ,x_m_rho_m_sig          ,sig)
+                                )*getLink(U_v,x_m_rho,rho)
+                              );
+}
+if constexpr(term==9) {
+                        res = adj(getLink(U_v,x_p_mu_m_nu,nu))*adj(getLink(U_v,x_m_nu,mu))
+                              *(getLink(U_v,x_m_nu,rho)
+                                *(    getLink(U_v ,x_m_nu_p_rho           ,sig)
+                                 *adj(getLink(XY_v,x_m_nu_p_rho_p_sig     ,nu ))
+                                 *adj(getLink(U_v ,x_p_rho                ,sig))
+                                 +adj(getLink(U_v ,x_m_nu_p_rho_m_sig     ,sig))
+                                 *adj(getLink(XY_v,x_m_nu_p_rho_m_sig     ,nu ))
+                                 *    getLink(U_v ,x_p_rho_m_sig          ,sig)
+                                )*adj(getLink(U_v,x,rho))
+
+                                +adj(getLink(U_v,x_m_nu_m_rho,rho))
+                                *(    getLink(U_v ,x_m_nu_m_rho           ,sig)
+                                 *adj(getLink(XY_v,x_m_nu_m_rho_p_sig     ,nu ))
+                                 *adj(getLink(U_v ,x_m_rho                ,sig))
+                                 +adj(getLink(U_v ,x_m_nu_m_rho_m_sig     ,sig))
+                                 *adj(getLink(XY_v,x_m_nu_m_rho_m_sig     ,nu ))
+                                 *    getLink(U_v ,x_m_rho_m_sig          ,sig)
+                                )*getLink(U_v,x_m_rho,rho)
+                              );
+}
+if constexpr(term==10) {
+                        U1 = adj(getLink(U_v,x_p_nu,mu));
+                        
+                        res = (     getLink(U_v,x_p_mu            ,sig) 
+                               *    getLink(U_v,x_p_mu_p_sig      ,nu )
+                               *adj(getLink(U_v,x_p_mu_p_nu       ,sig))
+                               +adj(getLink(U_v,x_p_mu_m_sig      ,sig)) 
+                               *    getLink(U_v,x_p_mu_m_sig      ,nu )
+                               *    getLink(U_v,x_p_mu_p_nu_m_sig ,sig)
+                              )*U1*getLink(U_v,x_p_nu,rho)*getLink(XY_v,x_p_rho,nu)*adj(getLink(U_v,x,rho))
+
+                              +(adj(getLink(U_v,x_p_mu_m_sig      ,sig)) 
+                               *    getLink(U_v,x_p_mu_m_sig      ,nu )
+                               *    getLink(U_v,x_p_mu_p_nu_m_sig ,sig)
+                               +    getLink(U_v,x_p_mu            ,sig)
+                               *    getLink(U_v,x_p_mu_p_sig      ,nu )
+                               *adj(getLink(U_v,x_p_mu_p_nu       ,sig))
+                              )*U1*adj(getLink(U_v,x_p_nu_m_rho,rho))*getLink(XY_v,x_m_rho,nu)*getLink(U_v,x_m_rho,rho);
+}
+if constexpr(term==11) {
+                        U1 = adj(getLink(U_v,x_m_nu,mu));
+                        
+                        res = (     getLink(U_v,x_p_mu            ,sig) 
+                               *adj(getLink(U_v,x_p_mu_m_nu_p_sig ,nu ))
+                               *adj(getLink(U_v,x_p_mu_m_nu       ,sig))
+                               +adj(getLink(U_v,x_p_mu_m_sig      ,sig))
+                               *adj(getLink(U_v,x_p_mu_m_nu_m_sig ,nu ))
+                               *    getLink(U_v,x_p_mu_m_nu_m_sig ,sig)
+                              )*U1*getLink(U_v,x_m_nu,rho)*adj(getLink(XY_v,x_m_nu_p_rho,nu))*adj(getLink(U_v,x,rho))
+
+                             +(     getLink(U_v,x_p_mu            ,sig)
+                               *adj(getLink(U_v,x_p_mu_m_nu_p_sig ,nu ))
+                               *adj(getLink(U_v,x_p_mu_m_nu       ,sig))
+                               +adj(getLink(U_v,x_p_mu_m_sig      ,sig))
+                               *adj(getLink(U_v,x_p_mu_m_nu_m_sig ,nu ))
+                               *    getLink(U_v,x_p_mu_m_nu_m_sig ,sig)
+                              )*U1*adj(getLink(U_v,x_m_nu_m_rho,rho))*adj(getLink(XY_v,x_m_nu_m_rho,nu))*getLink(U_v,x_m_rho,rho);
+}
+if constexpr(term==12) {
+                        res = (getLink(U_v,x_p_mu,rho)
+                               *(    getLink(U_v,x_p_mu_p_rho           ,sig)
+                                *    getLink(U_v,x_p_mu_p_rho_p_sig     ,nu ) 
+                                *adj(getLink(U_v,x_p_mu_p_nu_p_rho      ,sig))
+                                +adj(getLink(U_v,x_p_mu_p_rho_m_sig     ,sig))
+                                *    getLink(U_v,x_p_mu_p_rho_m_sig     ,nu )
+                                *    getLink(U_v,x_p_mu_p_nu_p_rho_m_sig,sig) 
+                               )*adj(getLink(U_v,x_p_mu_p_nu,rho))
+
+                               +adj(getLink(U_v,x_p_mu_m_rho,rho))
+                               *(    getLink(U_v,x_p_mu_m_rho           ,sig)
+                                *    getLink(U_v,x_p_mu_m_rho_p_sig     ,nu )
+                                *adj(getLink(U_v,x_p_mu_p_nu_m_rho      ,sig))
+                                +adj(getLink(U_v,x_p_mu_m_rho_m_sig     ,sig))
+                                *    getLink(U_v,x_p_mu_m_rho_m_sig     ,nu )
+                                *    getLink(U_v,x_p_mu_p_nu_m_rho_m_sig,sig)
+                               )*getLink(U_v,x_p_mu_p_nu_m_rho,rho)
+                             )*adj(getLink(U_v,x_p_nu,mu))*getLink(XY_v,x,nu);
+}
+if constexpr(term==13) {
+                        res = (adj(getLink(U_v,x_p_mu_m_rho,rho))
+                               *(    getLink(U_v,x_p_mu_m_rho           ,sig)
+                                *adj(getLink(U_v,x_p_mu_m_nu_m_rho_p_sig,nu ))
+                                *adj(getLink(U_v,x_p_mu_m_nu_m_rho      ,sig))
+                                +adj(getLink(U_v,x_p_mu_m_rho_m_sig     ,sig))
+                                *adj(getLink(U_v,x_p_mu_m_nu_m_rho_m_sig,nu ))
+                                *    getLink(U_v,x_p_mu_m_nu_m_rho_m_sig,sig) 
+                               )*getLink(U_v,x_p_mu_m_nu_m_rho,rho)
+
+                               +getLink(U_v,x_p_mu,rho)
+                               *(adj(getLink(U_v,x_p_mu_p_rho_m_sig     ,sig))
+                                *adj(getLink(U_v,x_p_mu_m_nu_p_rho_m_sig,nu ))
+                                *    getLink(U_v,x_p_mu_m_nu_p_rho_m_sig,sig)
+                                +    getLink(U_v,x_p_mu_p_rho           ,sig)
+                                *adj(getLink(U_v,x_p_mu_m_nu_p_rho_p_sig,nu ))
+                                *adj(getLink(U_v,x_p_mu_m_nu_p_rho      ,sig))
+                               )*adj(getLink(U_v,x_p_mu_m_nu,rho))
+                             )*adj(getLink(U_v,x_m_nu,mu))*adj(getLink(XY_v,x_m_nu,nu));
+} 
+                        setLink(F_v[x->_offset](mu), F_v(x->_offset)(mu) + c7*res);
+                    }
+                }
+            }
+        })
+    
+    }
+
+
+    GF naikLinkDeriv(std::vector<Real> vecdt, std::vector<FF>& vecx, std::vector<int> n_orders_naik, int n_naiks, Real cnaik) {
+        auto grid = this->_grid;
+        GF temp(grid);
+        std::vector<LF> Wv(Nd, grid);
+        std::vector<LF> XYdag(Nd, grid);
+        std::vector<LF> ddW(Nd, grid);
+        temp = outerProductHISQ(vecx, vecdt, n_orders_naik, n_naiks, 3);
+        for (int mu = 0; mu < Nd; mu++) {
+            Wv[mu]    = PeekIndex<LorentzIndex>(_Wmu, mu);
+            XYdag[mu] = PeekIndex<LorentzIndex>(temp, mu);
+            ddW[mu]   = Zero();
+        }
+        for (int mu = 0; mu < Nd; mu++) {
+            ddW[mu] =    Cshift(   Wv[mu],mu, 1)*Cshift(   Wv[mu],mu, 2)*       XYdag[mu]
+                       + Cshift(   Wv[mu],mu, 1)*Cshift(XYdag[mu],mu,-1)*Cshift(   Wv[mu],mu,-1)
+                       + Cshift(XYdag[mu],mu,-2)*Cshift(   Wv[mu],mu,-2)*Cshift(   Wv[mu],mu,-1); 
+        }
+        for (int mu = 0; mu < Nd; mu++) {
+            PokeIndex<LorentzIndex>(temp, ddW[mu], mu);
+        }
+        return cnaik*temp;
+    } 
+
+    
+    GF lepageLinkDeriv(GF& XY, Real clp) {
+
+        auto grid = this->_grid;
+        GF temp(grid);
+        std::vector<LF> Wv(Nd, grid);
+        std::vector<LF> XYdag(Nd, grid);
+        std::vector<LF> ddW(Nd, grid);
+
+        for (int mu = 0; mu < Nd; mu++) {
+            Wv[mu]    = PeekIndex<LorentzIndex>(_Wmu, mu);
+            ddW[mu]   = Zero();
+            XYdag[mu] = adj(PeekIndex<LorentzIndex>(XY, mu));
+        }
+
+        for (int mu = 0; mu < Nd; mu++) 
+        for (int nu = 0; nu < Nd; nu++) {
+            if(mu==nu) continue;
+
+            // (forward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(Wv[mu],mu,
+                                            Gimpl::CovShiftForward(Wv[nu],nu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftForward(Wv[mu],mu,
+                                                  Gimpl::CovShiftIdentityBackward(XYdag[nu],nu))))));
+            // (backward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(Wv[mu],mu,
+                                            Gimpl::CovShiftBackward(Wv[nu],nu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftForward(Wv[mu],mu,
+                                                  Gimpl::CovShiftIdentityForward(XYdag[nu],nu))))));
+            // (forward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftForward(Wv[nu],nu,
+                                            Gimpl::CovShiftForward(Wv[mu],mu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftBackward(XYdag[nu],nu,
+                                                  Gimpl::CovShiftIdentityBackward(Wv[mu],mu))))));
+            // (backward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(Wv[nu],nu,
+                                            Gimpl::CovShiftForward(Wv[mu],mu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftForward(XYdag[nu],nu,
+                                                  Gimpl::CovShiftIdentityBackward(Wv[mu],mu))))));
+            // (forward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftForward(Wv[nu],nu,
+                                            Gimpl::CovShiftForward(Wv[nu],nu,
+                                              Gimpl::CovShiftForward(XYdag[mu],mu,
+                                                Gimpl::CovShiftBackward(Wv[nu],nu,
+                                                  Gimpl::CovShiftIdentityBackward(Wv[nu],nu))))));
+            // (backward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(Wv[nu],nu,
+                                            Gimpl::CovShiftBackward(Wv[nu],nu,
+                                              Gimpl::CovShiftForward(XYdag[mu],mu,
+                                                Gimpl::CovShiftForward(Wv[nu],nu,
+                                                  Gimpl::CovShiftIdentityForward(Wv[nu],nu))))));
+            // (forward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(Wv[mu],mu,
+                                            Gimpl::CovShiftForward(XYdag[nu],nu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftForward(Wv[mu],mu,
+                                                  Gimpl::CovShiftIdentityBackward(Wv[nu],nu))))));
+            // (backward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(Wv[mu],mu,
+                                            Gimpl::CovShiftBackward(XYdag[nu],nu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftForward(Wv[mu],mu,
+                                                  Gimpl::CovShiftIdentityForward(Wv[nu],nu))))));
+            // (forward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftForward(XYdag[nu],nu,
+                                            Gimpl::CovShiftForward(Wv[mu],mu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftBackward(Wv[nu],nu,
+                                                  Gimpl::CovShiftIdentityBackward(Wv[mu],mu))))));
+            // (backward)
+            ddW[mu] = ddW[mu] + adj(Gimpl::CovShiftBackward(XYdag[nu],nu,
+                                            Gimpl::CovShiftForward(Wv[mu],mu,
+                                              Gimpl::CovShiftForward(Wv[mu],mu,
+                                                Gimpl::CovShiftForward(Wv[nu],nu,
+                                                  Gimpl::CovShiftIdentityBackward(Wv[mu],mu))))));
+        }
+
         for (int mu = 0; mu < Nd; mu++) {
-            V  = PeekIndex<LorentzIndex>(u_mu, mu);
-            Q  = adj(V)*V;
-            c0 =        real(trace(Q));
-            c1 = (1/2.)*real(trace(Q*Q));
-            c2 = (1/3.)*real(trace(Q*Q*Q));
-            S  = (1/3.)*c1-(1/18.)*c0*c0;
-            if (norm2(S)<1e-28) {
-                g0 = (1/3.)*c0; g1 = g0; g2 = g1;
-            } else {
-                R     = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
-                theta = acos(R*pow(S,-1.5));
-                g0    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
-                g1    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta          );
-                g2    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
+            PokeIndex<LorentzIndex>(temp, ddW[mu], mu);
+        }
+
+        return clp*temp;
+    }
+
+
+    // We are calculating the force using the rational approximation. The goal is that we can approximate
+    // (Mdag M)^(-nf/4) = alpha_0 + sum_l alpha_l/(M^dag M + beta_l). Hence the index l runs over the
+    // introduce a different "Naik epsilon" for each M. Hence, we can think of the total application
+    // of this operator as having an index inaik, running over the different Naik epsilons; for each inaik
+    // there is a possibly different order_inaik, then the operator has an index l running up to order_inaik.
+    // All terms with inaik=0 correspond to epsilon_Naik = 0.
+    //
+    // Intent: OUT--u_force
+    //          IN--vecdt: Monte Carlo separation vector times alpha_{inaik,0}. 
+    //              vecx: A vector of fermion fields coming from the MILC code. It is organized so that 
+    //                    |X_l> = (Mdag M + beta_l)^-1 |Phi> is on even sites, |Y_l>=D|X_l> is on odd sites.
+    //                    All the |X_l> for i=0 come first in memory, followed by all the |X_l> with
+    //                    i=1 in memory, and so on.
+    //              n_orders_naik: Indexed by unique naik epsilon.
+    void force(GF& u_force, std::vector<Real> vecdt, std::vector<FF>& vecx, std::vector<int> n_orders_naik) {
+
+        HISQParameters<Real> hp = this->_linkParams;
+        auto grid = this->_grid;
+
+        GF XY(grid);    // outer product field
+        GF temp(grid);  // used to accumulate N-link force contributions and projU3Deriv 
+
+        u_force = Zero();
+
+        if(hp.asqtad_cnaik!=0) u_force += naikLinkDeriv(vecdt, vecx, n_orders_naik, hp.n_naiks, hp.asqtad_cnaik); 
+
+        XY = outerProductHISQ(vecx, vecdt, n_orders_naik, hp.n_naiks, 1);   
+        u_force += hp.asqtad_c1*XY;
+
+        if(hp.asqtad_clp!=0) u_force += lepageLinkDeriv(XY, hp.asqtad_clp); 
+
+
+        // ---------------------------------- N-LINK DERIVATIVES (ASQTAD)
+
+        PaddedCell Ghost(_HaloDepth,grid);
+
+        temp = Zero();
+
+        GF UWghost  = Ghost.Exchange(_Wmu);    // Plays role of U or W 
+        GF XYCghost = Ghost.Exchange(XY);      // Plays role of XY or chain rule = dW/dV*dX/dW
+        GF Fghost   = Ghost.Exchange(temp);
+
+        std::vector<Coordinate> shifts3 = createHISQStencil("3STAPLE");
+        std::vector<Coordinate> shifts5 = createHISQStencil("5STAPLE");
+        std::vector<Coordinate> shifts7 = createHISQStencil("7STAPLE");
+        GeneralLocalStencil gStencil3(UWghost.Grid(),shifts3);
+        GeneralLocalStencil gStencil5(UWghost.Grid(),shifts5);
+        GeneralLocalStencil gStencil7(UWghost.Grid(),shifts7);
+
+        for(int mu=0;mu<Nd;mu++) {
+            if(hp.asqtad_c3!=0) { 
+                threeLinkDeriv(    Fghost, UWghost, XYCghost, gStencil3, hp.asqtad_c3, mu);
+            }
+            if(hp.asqtad_c5!=0) {
+                fiveLinkDeriv<0>(  Fghost, UWghost, XYCghost, gStencil5, hp.asqtad_c5, mu); 
+                fiveLinkDeriv<1>(  Fghost, UWghost, XYCghost, gStencil5, hp.asqtad_c5, mu);
             }
-//            if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD }
-            u     = sqrt(g0) + sqrt(g1) + sqrt(g2);
-            v     = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
-            w     = sqrt(g0*g1*g2);
-            den   = w*(u*v-w);
-            f0    = (-w*(u*u+v)+u*v*v)/den;
-            f1    = (-w-u*u*u+2.*u*v)/den;
-            f2    = u/den;
-            id_3  = 1.;
-
-            sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q;
-
-            PokeIndex<LorentzIndex>(u_proj, V*sqrtQinv, mu);
+            if(hp.asqtad_c7!=0) {
+                sevenLinkDeriv<0>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<1>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<2>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<3>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<4>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<5>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<6>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<7>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<8>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<9>( Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<10>(Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<11>(Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<12>(Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu); 
+                sevenLinkDeriv<13>(Fghost, UWghost, XYCghost, gStencil7, hp.asqtad_c7, mu);
+            } 
         }
-    };
 
+        u_force += Ghost.Extract(Fghost);
+
+        // ------------------------------------------- U3 PROJ DERIVATIVE 
+
+        projU3Deriv(temp, _Vmu, u_force, 5e-5);
+        u_force = hp.fat7_c1*temp;
+
+        // ------------------------------------ N-LINK DERIVATIVES (FAT7) 
+
+        UWghost  = Ghost.Exchange(_Umu);
+        XYCghost = Ghost.Exchange(temp); 
+        Fghost   = Zero(); 
+
+        for(int mu=0;mu<Nd;mu++) {
+            if(hp.fat7_c3!=0) { 
+                threeLinkDeriv(    Fghost, UWghost, XYCghost, gStencil3, hp.fat7_c3, mu);
+            }
+            if(hp.fat7_c5!=0) {
+                fiveLinkDeriv<0>(  Fghost, UWghost, XYCghost, gStencil5, hp.fat7_c5, mu); 
+                fiveLinkDeriv<1>(  Fghost, UWghost, XYCghost, gStencil5, hp.fat7_c5, mu);
+            }
+            if(hp.fat7_c7!=0) {
+                sevenLinkDeriv<0>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<1>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<2>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<3>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<4>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<5>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<6>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<7>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<8>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<9>( Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<10>(Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<11>(Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<12>(Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu); 
+                sevenLinkDeriv<13>(Fghost, UWghost, XYCghost, gStencil7, hp.fat7_c7, mu);
+            } 
+        } // end mu loop
+
+        u_force += Ghost.Extract(Fghost);
+
+        // Close the loop: Multiply on the left by U_mu(x)
+        LF force_mu(grid);
+        for (int mu = 0; mu < Nd; mu++) {
+            force_mu = PeekIndex<LorentzIndex>(_Umu, mu) * PeekIndex<LorentzIndex>(u_force, mu);
+            PokeIndex<LorentzIndex>(u_force, force_mu, mu);
+        }
+    }
 
-//    void derivative(const GaugeField& Gauge) const {
-//    };
 };
 
 
diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h
index 66d25bc4b5..c2bc9111f4 100644
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -156,12 +156,11 @@ class GeneralLocalStencil : public GeneralLocalStencilView {
 class shiftSignal {
 public:
     enum {
-        BACKWARD_CONST = 16,
+        BACKWARD_CONST = GRID_MAX_LATTICE_DIMENSION + 2,
         NO_SHIFT       = -1
     };
 };
 
-// TODO: put a check somewhere that BACKWARD_CONST > Nd!
 
 /*!  @brief signals that you want to go backwards in direction dir */
 inline int Back(const int dir) {
@@ -170,6 +169,7 @@ inline int Back(const int dir) {
     return dir + shiftSignal::BACKWARD_CONST;
 }
 
+
 /*!  @brief shift one unit in direction dir */
 template<typename... Args>
 void generalShift(Coordinate& shift, int dir) {
@@ -183,6 +183,7 @@ void generalShift(Coordinate& shift, int dir) {
     }
 }
 
+
 /*!  @brief follow a path of directions, shifting one unit in each direction */
 template<typename... Args>
 void generalShift(Coordinate& shift, int dir, Args... args) {
@@ -198,5 +199,16 @@ void generalShift(Coordinate& shift, int dir, Args... args) {
 }
 
 
+/*!  @brief append arbitrary shift path to shifts of dimension d */
+template<int d, typename... Args>
+void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) {
+    Coordinate shift(d,0);
+    generalShift(shift, dir, args...); 
+    // push_back creates an element at the end of shift and
+    // assigns the data in the argument to it.
+    shifts.push_back(shift);
+}
+
+
 NAMESPACE_END(Grid);
 
diff --git a/Makefile.am b/Makefile.am
index 9addcbf5bb..656ce475ac 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,3 +1,5 @@
+# part of the GNU Automake system, which simplifies the generation of `Makefile.in` files.
+
 # additional include paths necessary to compile the C++ library
 SUBDIRS = Grid  benchmarks tests examples HMC
 
diff --git a/tests/forces/Makefile.am b/tests/forces/Makefile.am
index 481efd3f55..f46e5005f7 100644
--- a/tests/forces/Makefile.am
+++ b/tests/forces/Makefile.am
@@ -12,3 +12,4 @@ check: tests
 	./Test_dwf_gpforce
 	./Test_mobius_force
 	./Test_zmobius_force
+	./Test_HISQ_force
diff --git a/tests/forces/Test_HISQ_force.cc b/tests/forces/Test_HISQ_force.cc
new file mode 100644
index 0000000000..3d06c509b5
--- /dev/null
+++ b/tests/forces/Test_HISQ_force.cc
@@ -0,0 +1,278 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./tests/smearing/Test_HISQ_force.cc
+
+Copyright (C) 2024
+
+Author: D. A. Clarke <clarke.davida@gmail.com> 
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*
+    @file Test_HISQ_force.cc
+    @brief test of the HISQ fermion force 
+*/
+
+
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+#include <Grid/qcd/smearing/HISQSmearing.h>
+using namespace Grid;
+
+
+//#define USE_DOUBLE true 
+#define USE_DOUBLE true 
+
+#if USE_DOUBLE 
+    #define PREC double
+    typedef LatticeGaugeFieldD LGF;
+    typedef StaggeredImplD GIMPL;
+    typedef vComplexD COMP;
+#else
+    #define PREC float
+    typedef LatticeGaugeFieldF LGF;
+    typedef StaggeredImplF GIMPL;
+    typedef vComplexF COMP;
+#endif 
+
+typedef typename GIMPL::FermionField FF;
+
+
+
+
+// This is a sort of contrived test situation. The goal is to make sure the fermion force
+// code is stable against future changes and get an idea how the HISQ force interface works.
+bool testForce(GridCartesian& GRID, LGF Umu, LGF Ucontrol, std::string testKind,
+               Real fat7_c1  , Real fat7_c3  , Real fat7_c5  , Real fat7_c7  , Real cnaik,
+               Real asqtad_c1, Real asqtad_c3, Real asqtad_c5, Real asqtad_c7, Real asqtad_clp) {
+
+    LGF Vmu(&GRID), Wmu(&GRID), Nmu(&GRID), Umom(&GRID);
+
+    // The n_orders_naik array is indexed according to the unique Naik epsilon values. By convention, 
+    // index 0 always corresponds to zero Naik epsilon. n_orders_naik[0] is the sum of the RHMC 
+    // molecular dynamics orders of all the pseudofermion fermions with zero Naik epsilon at the 
+    // head of the rational function parameter file. n_orders_naik[1] is the sum of orders for the 
+    // first nonzero Naik epsilon. The sum of all the n_orders_naik elements equals the size of the 
+    // vecx and vecdt arrays. Elements of those arrays are grouped according to n_orders_naik, so 
+    // the first n_orders_naik[0] elements of vecx and epsv correspond to the pseudofermions with 
+    // nonzero Naik epsilon. That group lumps together terms for each of the zero-epsilon 
+    // pseudofermions.
+    int n_naiks = 1; // Just a charm 
+    std::array<Real,GRID_MAX_NAIK> eps_naik = {0,0,0}; 
+    std::vector<int> n_orders_naik = {1,1};  
+    std::vector<Real> vecdt = {0.1,0.1};  
+
+    HISQParameters<Real> hisq_param(n_naiks  , eps_naik ,
+                                    fat7_c1  , fat7_c3  , fat7_c5  , fat7_c7  , 0.,
+                                    asqtad_c1, asqtad_c3, asqtad_c5, asqtad_c7, asqtad_clp,
+                                    cnaik    , 0.       , 0.);
+    HISQReunitSVDParameters<Real> hisq_reunit_svd(false, false, 1, 1, 1);
+
+    Smear_HISQ<GIMPL> fat7(&GRID,fat7_c1,0.,fat7_c3,fat7_c5,fat7_c7,0.);
+    fat7.smear(Vmu,Nmu,Umu); // Populate fat7 and Naik links Vmu and Nmu
+    fat7.projectU3(Wmu,Vmu); // Populate U(3) projection Wmu
+    Force_HISQ<GIMPL> hisq_force(&GRID, hisq_param, Wmu, Vmu, Umu, hisq_reunit_svd);
+
+    std::vector<int> seeds({1,2,3,4});
+    GridParallelRNG pRNG(&GRID);  pRNG.SeedFixedIntegers(seeds);
+
+    // Construct a vecx. Eventually it would be nice if this generated the correct distribution,
+    // but for now use Gaussian random variables as a placeholder.
+    std::vector<FF> vecx;
+    int l = 0;
+    for (int inaik = 0; inaik < hisq_param.n_naiks; inaik++) {
+        int rat_order = n_orders_naik[inaik];
+        FF PHI(&GRID);
+        PHI = Zero();
+        for (int i=0; i<rat_order; i++) {
+            gaussian(pRNG,PHI);
+            vecx.push_back(PHI); 
+        }
+    }
+
+    Grid_log("");
+    Grid_log("    TEST "+testKind);
+    Grid_log("");
+
+    hisq_force.force(Umom,vecdt,vecx,n_orders_naik);
+    LGF diff(&GRID);
+    diff = Ucontrol-Umom;
+    auto absDiff = norm2(diff)/norm2(Ucontrol);
+    if (absDiff < 1e-30) {
+        Grid_pass(" |Umu-Umom|/|Umu| = ",absDiff);
+        return true;
+    } else {
+        Grid_error(" |Umu-Umom|/|Umu| = ",absDiff);
+        return false;
+    }
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.XY.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.3link.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.lp.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.naik.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.5link.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.7link.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.level2.control");
+//    NerscIO::writeConfiguration(Umom,"nersc.l8t4b3360.Umom.level12.control");
+//    return true;
+}
+
+
+// Intent: IN--Umu: thin links
+bool testddUProj(GridCartesian& GRID, LGF Umu, LGF Ucontrol) {
+
+    LGF Uforce(&GRID);
+
+    int n_naiks = 1;
+    std::array<Real,GRID_MAX_NAIK> eps_naik = {0,0,0}; 
+
+    Real fat7_c1  = 1/8.;                  Real asqtad_c1  = 1.;
+    Real fat7_c3  = 1/16.;                 Real asqtad_c3  = 1/16.;
+    Real fat7_c5  = 1/64.;                 Real asqtad_c5  = 1/64.;
+    Real fat7_c7  = 1/384.;                Real asqtad_c7  = 1/384.;
+    Real cnaik    = -1/24.+eps_naik[0]/8;  Real asqtad_clp = -1/8.;
+
+
+    LGF Vmu(&GRID), Wmu(&GRID), Nmu(&GRID);
+    Smear_HISQ<GIMPL> fat7(&GRID,fat7_c1,0.,fat7_c3,fat7_c5,fat7_c7,0.);
+
+    fat7.smear(Vmu,Nmu,Umu); // Populate fat7 and Naik links Vmu and Nmu
+    fat7.projectU3(Wmu,Vmu); // Populate U(3) projection Wmu
+
+    HISQParameters<Real> hisq_param(n_naiks  , eps_naik ,
+                                    fat7_c1  , fat7_c3  , fat7_c5  , fat7_c7  , 0.,
+                                    asqtad_c1, asqtad_c3, asqtad_c5, asqtad_c7, asqtad_clp,
+                                    cnaik    , 0.       , 0.);
+
+    HISQReunitSVDParameters<Real> hisq_reunit_svd(false, false, 1, 1, 1);
+
+    Force_HISQ<GIMPL> hisq_force(&GRID, hisq_param, Wmu, Vmu, Umu, hisq_reunit_svd);
+
+    Grid_log("");
+    Grid_log("    TEST DERIVATIVE U3 PROJECTION");
+    Grid_log("");
+
+    LGF diff(&GRID);
+    hisq_force.projU3Deriv(Uforce, Umu, Umu, 5e-5);
+    diff = Ucontrol-Uforce;
+    auto absDiff = norm2(diff)/norm2(Ucontrol);
+    if (absDiff < 1e-30) {
+        Grid_pass(" |Umu-Usmr|/|Umu| = ",absDiff);
+        return true;
+    } else {
+        Grid_error(" |Umu-Usmr|/|Umu| = ",absDiff);
+        return false;
+    }
+//    NerscIO::writeConfiguration(Uforce,"nersc.l8t4b3360.ddVU3");
+}
+
+
+int main (int argc, char** argv) {
+
+    // Params for the test.
+    int Ns = 8;
+    int Nt = 4;
+    Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt;
+    std::string conf_in  = "nersc.l8t4b3360";
+    int threads          = GridThread::GetThreads();
+
+    // Initialize the Grid
+    Grid_init(&argc,&argv);
+    Coordinate simd_layout = GridDefaultSimd(Nd,COMP::Nsimd());
+    Coordinate mpi_layout  = GridDefaultMpi();
+    Grid_log("mpi     = ",mpi_layout);
+    Grid_log("simd    = ",simd_layout);
+    Grid_log("latt    = ",latt_size);
+    Grid_log("threads = ",threads);
+    GridCartesian GRID(latt_size,simd_layout,mpi_layout);
+
+    LGF Umu(&GRID), Ucontrol(&GRID);
+
+#if USE_DOUBLE // NerscIO is hard-coded to double.
+
+    // Read the configuration into Umu
+    FieldMetaData header;
+    NerscIO::readConfiguration(Umu, header, conf_in);
+
+    bool pass=true;
+
+    // Check derivative of projection
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.ddVU3.control");
+    pass *= testddUProj(GRID,Umu,Ucontrol);
+
+    // Check the 1-link (outer product) 
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.XY.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "1-link",
+                      1, 0, 0, 0, 0,
+                      1, 0, 0, 0, 0 );
+
+    // Check the 3-link
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.3link.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "3-link",
+                      1, 0, 0, 0, 0,
+                      1, 1, 0, 0, 0 );
+
+    // Check the LePage-link 
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.lp.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "LePage",
+                      1, 0, 0, 0, 0,
+                      1, 0, 0, 0, 1 );
+
+    // Check the Naik-link 
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.naik.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "Naik",
+                      1, 0, 0, 0, 1,
+                      1, 0, 0, 0, 0 );
+
+    // Check the 5-link
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.5link.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "5-link",
+                      1, 0, 0, 0, 0,
+                      1, 0, 1, 0, 0 );
+
+    // Check the 7-link
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.7link.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "7-link",
+                      1, 0, 0, 0, 0,
+                      1, 0, 0, 1, 0 );
+
+    // Check level 2 smearing 
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.level2.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "level 2",
+                      1, 0, 0, 0, -1/24.,
+                      1, -1/16., 1/64., -1/384., -1/8. );
+
+    // Check level 1+2 smearing 
+    NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.Umom.level12.control");
+    pass *= testForce(GRID, Umu, Ucontrol, "level 1+2",
+                      1/8., -1/16., 1/64., -1/384., -1/24.,
+                      1   , -1/16., 1/64., -1/384., -1/8. );
+
+
+    if(pass){
+        Grid_pass("All tests passed.");
+    } else {
+        Grid_error("At least one test failed.");
+    }
+
+#endif
+
+    Grid_finalize();
+}
\ No newline at end of file
diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc
index 04d5b165db..4d9e38a343 100644
--- a/tests/smearing/Test_fatLinks.cc
+++ b/tests/smearing/Test_fatLinks.cc
@@ -38,24 +38,26 @@ directory
 using namespace Grid;
 
 
-/*!  @brief parameter file to easily adjust Nloop */
-struct ConfParameters: Serializable {
-    GRID_SERIALIZABLE_CLASS_MEMBERS(
-        ConfParameters,
-        int, benchmark, 
-        int, Nloop);
-
-    template <class ReaderClass>
-    ConfParameters(Reader<ReaderClass>& Reader){
-        read(Reader, "parameters", *this);
-    }
-};
-
+//#define USE_DOUBLE true 
+#define USE_DOUBLE true 
 
-bool testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, 
-               LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) {
-    Smear_HISQ<PeriodicGimplD> hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp);
-    LatticeGaugeFieldD diff(&GRID), Uproj(&GRID);
+#if USE_DOUBLE 
+    #define PREC double
+    typedef LatticeGaugeFieldD LGF;
+    typedef PeriodicGimplD GIMPL;
+    typedef vComplexD COMP;
+#else
+    #define PREC float
+    typedef LatticeGaugeFieldF LGF;
+    typedef PeriodicGimplF GIMPL;
+    typedef vComplexF COMP;
+#endif 
+
+
+bool testSmear(GridCartesian& GRID, LGF Umu, LGF Usmr, LGF Unaik, 
+               LGF Ucontrol, PREC c1, PREC cnaik, PREC c3, PREC c5, PREC c7, PREC clp) {
+    Smear_HISQ<GIMPL> hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp);
+    LGF diff(&GRID), Uproj(&GRID);
     hisq_fat.smear(Usmr, Unaik, Umu);
     bool result;
     if (cnaik < 1e-30) { // Testing anything but Naik term
@@ -85,6 +87,16 @@ bool testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD U
 }
 
 
+void hotStartSmear(GridCartesian& GRID) {
+    LGF Uproj(&GRID), Uhot(&GRID);
+    GridParallelRNG pRNG(&GRID); pRNG.SeedFixedIntegers(std::vector<int>({111,222,333,444}));
+    SU<Nc>::HotConfiguration(pRNG,Uhot);
+    Smear_HISQ<GIMPL> hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,-1/8.);
+    hisq_fat.projectU3(Uproj,Uhot);
+    Grid_log("norm2(Uproj) = ",norm2(Uproj)/(Nc*Nd*GRID.gSites()));
+}
+
+
 int main (int argc, char** argv) {
 
     // Params for the test.
@@ -94,11 +106,15 @@ int main (int argc, char** argv) {
     std::string conf_in  = "nersc.l8t4b3360";
     int threads          = GridThread::GetThreads();
 
-    typedef LatticeGaugeFieldD LGF;
+    if (sizeof(PREC)==4) {
+        Grid_log("Run in single precision.");
+    } else { 
+        Grid_log("Run in double precision.");
+    }
 
     // Initialize the Grid
     Grid_init(&argc,&argv);
-    Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+    Coordinate simd_layout = GridDefaultSimd(Nd,COMP::Nsimd());
     Coordinate mpi_layout  = GridDefaultMpi();
     Grid_log("mpi     = ",mpi_layout);
     Grid_log("simd    = ",simd_layout);
@@ -106,12 +122,10 @@ int main (int argc, char** argv) {
     Grid_log("threads = ",threads);
     GridCartesian GRID(latt_size,simd_layout,mpi_layout);
 
-    XmlReader Reader("fatParams.xml",false,"grid");
-    ConfParameters param(Reader);
-    if(param.benchmark) Grid_log("  Nloop = ",param.Nloop);
-
     LGF Umu(&GRID), Usmr(&GRID), Unaik(&GRID), Ucontrol(&GRID);
 
+#if USE_DOUBLE // NerscIO is hard-coded to double.
+
     // Read the configuration into Umu
     FieldMetaData header;
     NerscIO::readConfiguration(Umu, header, conf_in);
@@ -136,46 +150,17 @@ int main (int argc, char** argv) {
         Grid_error("At least one test failed.");
     }
 
+#endif 
+
+    // Does a small hot start cause an issue?
+    hotStartSmear(GRID);
+    latt_size[0]=16; latt_size[1]=16; latt_size[2]=16; latt_size[3]=16;
+    GridCartesian SYMM(latt_size,simd_layout,mpi_layout);
+    hotStartSmear(SYMM);
+
     // Test a C-style instantiation 
     double path_coeff[6] = {1, 2, 3, 4, 5, 6};
-    Smear_HISQ<PeriodicGimplD> hisq_fat_Cstyle(&GRID,path_coeff);
-
-    if (param.benchmark) {
-
-        autoView(U_v, Umu, CpuRead); // Gauge accessor
-
-        // Read in lattice sequentially, Nloop times 
-        double lookupTime = 0.; 
-        for(int i=0;i<param.Nloop;i++) {
-            double start = usecond();
-            for(int ss=0;ss<U_v.size();ss++)
-                for(int mu=0;mu<Nd;mu++) {
-                    auto U1 = U_v[ss](mu);
-            }
-            double stop  = usecond();
-        	lookupTime += stop-start; // microseconds
-        }
-        Grid_log("Time to lookup: ",lookupTime,"[ms]");
-
-        // Raise a matrix to the power nmat, for each link. 
-        auto U1 = U_v[0](0);
-        for(int nmat=1;nmat<8;nmat++) {
-            double multTime = 0.; 
-            for(int i=0;i<param.Nloop;i++) {
-                double start=usecond();
-                for(int ss=0;ss<U_v.size();ss++)
-                    for(int mu=0;mu<Nd;mu++) {
-                        auto U2 = U1;
-                        for(int j=1;j<nmat;j++) {
-                            U2 *= U1;
-                        }
-                }
-                double stop=usecond();
-                multTime += stop-start;
-            }
-            Grid_log("Time to multiply ",nmat," matrices: ",multTime," [ms]");
-        }
-    }
+    Smear_HISQ<GIMPL> hisq_fat_Cstyle(&GRID,path_coeff);
 
     Grid_finalize();
 }
\ No newline at end of file