From 77cef47d91bde7f298b84d2c0419561dfa0ffdaf Mon Sep 17 00:00:00 2001 From: Jeremy Sauer Date: Mon, 24 Mar 2025 20:15:37 -0600 Subject: [PATCH 1/5] Implemented capability of non-unity valued dx/d_zeta and dy/d_zeta metric tensor terms to accommodate terrain following coordinate frame. --- SRC/GRID/CUDA/cuda_gridDevice.cu | 24 +++++++++++++++++++----- SRC/GRID/CUDA/cuda_gridDevice_cu.h | 4 +++- SRC/GRID/grid.c | 29 ++++++++++++++++++++++++++--- SRC/GRID/grid.h | 9 +++++++++ 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/SRC/GRID/CUDA/cuda_gridDevice.cu b/SRC/GRID/CUDA/cuda_gridDevice.cu index ed963e1..92629aa 100644 --- a/SRC/GRID/CUDA/cuda_gridDevice.cu +++ b/SRC/GRID/CUDA/cuda_gridDevice.cu @@ -49,6 +49,8 @@ float *yPos_d; // Cell-center position in y (meters) float *zPos_d; // Cell-center position in z (meters) float *topoPos_d; //Topography elevation (z in meters) at the cell center position in x and y. +float *J13_d; // dx/d_zeta +float *J23_d; // dy/d_zeta float *J31_d; // dz/d_xi float *J32_d; // dz/d_eta float *J33_d; // dz/d_zeta @@ -101,6 +103,8 @@ extern "C" int cuda_gridDeviceSetup(){ Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); /* Allocate the GRID arrays */ /* Coordinate Arrays */ + fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d); + fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &xPos_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &yPos_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &zPos_d); @@ -119,6 +123,8 @@ extern "C" int cuda_gridDeviceSetup(){ cudaMemcpy(yPos_d, yPos, Nelems*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(zPos_d, zPos, Nelems*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(topoPos_d, topoPos, ((Nxp+2*Nh)*(Nyp+2*Nh))*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(J13_d, J13, Nelems*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(J23_d, J23, Nelems*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(J31_d, J31, Nelems*sizeof(float), 
cudaMemcpyHostToDevice); cudaMemcpy(J32_d, J32, Nelems*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(J33_d, J33, Nelems*sizeof(float), cudaMemcpyHostToDevice); @@ -134,7 +140,7 @@ extern "C" int cuda_gridDeviceSetup(){ printf("Calling cudaDevice_calculateJacobians...\n"); printf("grid = {%d, %d, %d}\n",grid.x, grid.y, grid.z); printf("tBlock = {%d, %d, %d}\n",tBlock.x, tBlock.y, tBlock.z); - cudaDevice_calculateJacobians<<>>(J31_d, J32_d, J33_d, + cudaDevice_calculateJacobians<<>>(J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d, invD_Jac_d, xPos_d, yPos_d, zPos_d); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); @@ -145,6 +151,8 @@ extern "C" int cuda_gridDeviceSetup(){ /* cudaMemcpy the GPU-computed GRID arrays from Device Host*/ /* Coordinate Arrays */ + cudaMemcpy(J13, J13_d, Nelems*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(J23, J23_d, Nelems*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(J31, J31_d, Nelems*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(J32, J32_d, Nelems*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(J33, J33_d, Nelems*sizeof(float), cudaMemcpyDeviceToHost); @@ -166,6 +174,8 @@ extern "C" int cuda_gridDeviceCleanup(){ /* Free any GRID module arrays */ /* metric tensor fields */ + cudaFree(J13_d); + cudaFree(J23_d); cudaFree(J31_d); cudaFree(J32_d); cudaFree(J33_d); @@ -185,7 +195,7 @@ extern "C" int cuda_gridDeviceCleanup(){ /*----->>>>> __global__ void cudaDevice_calculateJacobians(); -------------------------------------------------- This is the cuda version of the calculateJacobians routine from the GRID module */ -__global__ void cudaDevice_calculateJacobians(float *J31_d, float *J32_d, float *J33_d, +__global__ void cudaDevice_calculateJacobians(float *J13_d, float *J23_d, float *J31_d, float *J32_d, float *J33_d, float *D_Jac_d, float *invD_Jac_d, float *xPos_d, float *yPos_d, float *zPos_d){ int i,j,k; @@ -252,14 +262,18 @@ __global__ void cudaDevice_calculateJacobians(float *J31_d, float 
*J32_d, float }//finite check */ #endif /*Set the elements*/ - /*d(x,y,z)/d_zetaa*/ + /*dx/d_zeta*/ + J13_d[ijk] = (T[1]*T[5] - T[2]*T[4])*invD_Jac_d[ijk]; + /*dy/d_zeta*/ + J23_d[ijk] = -(T[0]*T[5] - T[2]*T[3])*invD_Jac_d[ijk]; + /*dz/d_(xi, eta, zeta)*/ J31_d[ijk] = (T[3]*T[7] - T[4]*T[6])*invD_Jac_d[ijk]; J32_d[ijk] = (T[0]*T[7] - T[1]*T[6])*invD_Jac_d[ijk]; J33_d[ijk] = (T[0]*T[4] - T[1]*T[3])*invD_Jac_d[ijk]; #ifdef DEBUG if((i==64)&&(j==64)&&(k==64)){ - printf("At (%d, %d, %d):\n\t\t J31=%f, J32=%f, J33=%f,\n\t\t D_Jac=%f, invD_Jac=%f\n", - i,j,k, J31_d[ijk],J32_d[ijk],J33_d[ijk], + printf("At (%d, %d, %d)::\n\t\t J13=%f, J23=%f,\n\t\t J31=%f, J32=%f, J33=%f,\n\t\t D_Jac=%f, invD_Jac=%f\n", + i,j,k, J13_d[ijk],J23_d[ijk],J31_d[ijk],J32_d[ijk],J33_d[ijk], D_Jac_d[ijk],invD_Jac_d[ijk]); } #endif diff --git a/SRC/GRID/CUDA/cuda_gridDevice_cu.h b/SRC/GRID/CUDA/cuda_gridDevice_cu.h index a6fea11..253f2f0 100644 --- a/SRC/GRID/CUDA/cuda_gridDevice_cu.h +++ b/SRC/GRID/CUDA/cuda_gridDevice_cu.h @@ -46,6 +46,8 @@ extern float *yPos_d; /* Cell-center position in y (meters) */ extern float *zPos_d; /* Cell-center position in z (meters) */ extern float *topoPos_d; /*Topography elevation (z in meters) at the cell center position in x and y. 
*/ +extern float *J13_d; // dx/d_zeta +extern float *J23_d; // dy/d_zeta extern float *J31_d; // dz/d_xi extern float *J32_d; // dz/d_eta extern float *J33_d; // dz/d_zeta @@ -70,7 +72,7 @@ extern "C" int cuda_gridDeviceCleanup(); /*----->>>>> __global__ void cudaDevice_calculateJacobians(); -------------------------------------------------- * This is the cuda version of the calculateJacobians routine from the GRID module * */ -__global__ void cudaDevice_calculateJacobians(float *J31_d, float *J32_d, float *J33_d, +__global__ void cudaDevice_calculateJacobians(float *J13_d, float *J23_d, float *J31_d, float *J32_d, float *J33_d, float *D_Jac_d, float *invD_Jac_d, float *xPos_d, float *yPos_d, float *zPos_d); diff --git a/SRC/GRID/grid.c b/SRC/GRID/grid.c index 68a25e3..319ea2a 100644 --- a/SRC/GRID/grid.c +++ b/SRC/GRID/grid.c @@ -54,6 +54,15 @@ float *zPos; /* Cell-center position in z (meters) */ float *topoPosGlobal; /*Topography elevation (z in meters) at the cell center position in x and y. (Global domain) */ float *topoPos; /*Topography elevation (z in meters) at the cell center position in x and y. 
(per-rank domain) */ +//float *J11; // dx/d_xi -- assumed = 1.0 +//float *J12; // dx/d_eta -- assumed = 1.0 + +//float *J21; // dy/d_xi -- assumed = 1.0 +//float *J22; // dy/d_eta -- assumed = 1.0 + +float *J13; // dx/d_zeta +float *J23; // dy/d_zeta + float *J31; // dz/d_xi float *J32; // dz/d_eta float *J33; // dz/d_zeta @@ -240,6 +249,8 @@ int gridInit(){ topoPos = memAllocateFloat2DField(Nxp, Nyp, Nh, "topoPos"); topoPosGlobal = memAllocateFloat2DField(Nx, Ny, 0, "topoPos"); /* Metric Tensors Fields */ + J13 = memAllocateFloat3DField(Nxp, Nyp, Nzp, Nh, "J13"); + J23 = memAllocateFloat3DField(Nxp, Nyp, Nzp, Nh, "J23"); J31 = memAllocateFloat3DField(Nxp, Nyp, Nzp, Nh, "J31"); J32 = memAllocateFloat3DField(Nxp, Nyp, Nzp, Nh, "J32"); J33 = memAllocateFloat3DField(Nxp, Nyp, Nzp, Nh, "J33"); @@ -266,6 +277,8 @@ int gridInit(){ //#if 1 errorCode = ioRegisterVar("D_Jac", "float", 4, dims4d, D_Jac); errorCode = ioRegisterVar("invD_Jac", "float", 4, dims4d, invD_Jac); + errorCode = ioRegisterVar("J13", "float", 4, dims4d, J13); + errorCode = ioRegisterVar("J23", "float", 4, dims4d, J23); errorCode = ioRegisterVar("J31", "float", 4, dims4d, J31); errorCode = ioRegisterVar("J32", "float", 4, dims4d, J32); errorCode = ioRegisterVar("J33", "float", 4, dims4d, J33); @@ -665,7 +678,11 @@ int calculateJacobians(){ /* Set the determinant and inverse determinant for this ijk location */ D_Jac[ijk] = determinant; invD_Jac[ijk] = 1.0/determinant; - //d(x,y,z)/d_zeta + //dx/d_zeta + J13[ijk] = (T[0][1]*T[1][2] - T[0][2]*T[1][1])*invD_Jac[ijk]; + //dy/d_zeta + J23[ijk] = -(T[0][0]*T[1][2] - T[0][2]*T[1][0])*invD_Jac[ijk]; + //dz/d_(xi, eta, zeta) J31[ijk] = (T[1][0]*T[2][1] - T[1][1]*T[2][0])*invD_Jac[ijk]; J32[ijk] = -(T[0][0]*T[2][1] - T[0][1]*T[2][0])*invD_Jac[ijk]; J33[ijk] = (T[0][0]*T[1][1] - T[0][1]*T[1][0])*invD_Jac[ijk]; @@ -685,7 +702,7 @@ int calculateJacobians(){ printf("mpi_rank_world--%d/%d coordHalos, calculateJacaobians()-Boundaries!\n",mpi_rank_world, 
mpi_size_world); fflush(stdout); #endif -/* TODO technically these only work for periodic domain. Since we currently use strictly constant horizontal and vertical spacing they work for now. */ +/* TODO technically these only work for horizontally iconstant resolution domain. */ /*lower i-index halos*/ for(iBnd=0; iBnd < 6; iBnd++){ #ifdef DEBUG @@ -755,7 +772,11 @@ int calculateJacobians(){ /* Set the determinant and inverse determinant for this ijk location */ D_Jac[ijkDest] = D_Jac[ijkTarget]; invD_Jac[ijkDest] = invD_Jac[ijkTarget]; - //d(x,y,z)/d_zeta + //dx/d_zeta + J13[ijkDest] = J13[ijkTarget]; + //dy/d_zeta + J23[ijkDest] = J23[ijkTarget]; + //dz/d_(xi, eta, zeta) J31[ijkDest] = J31[ijkTarget]; J32[ijkDest] = J32[ijkTarget]; J33[ijkDest] = J33[ijkTarget]; @@ -789,6 +810,8 @@ int calculateJacobians(){ printf("(xPos,yPos,zPos)[%d,%d,%d] = (%f,%f,%f)\n",i,j,k+1,xPos[ijkp1],yPos[ijkp1],zPos[ijkp1]); printf("D_Jac[%d] = %f\n",ijk,D_Jac[ijk]); printf("invD_Jac[%d] = %f\n",ijk,invD_Jac[ijk]); + printf("J13[%d] = %f\n",ijk,J13[ijk]); + printf("J23[%d] = %f\n",ijk,J23[ijk]); printf("J31[%d] = %f\n",ijk,J31[ijk]); printf("J32[%d] = %f\n",ijk,J32[ijk]); printf("J33[%d] = %f\n",ijk,J33[ijk]); diff --git a/SRC/GRID/grid.h b/SRC/GRID/grid.h index 7324260..3f6a28a 100644 --- a/SRC/GRID/grid.h +++ b/SRC/GRID/grid.h @@ -49,6 +49,15 @@ extern float *zPos; /* Cell-center position in z (meters) */ extern float *topoPos; /*Topography elevation (z in meters) at the cell center position in x and y. */ extern float *topoPosGlobal; /*Topography elevation (z in meters) at the cell center position in x and y. 
(Global domain) */ +//extern float *J11; // dx/d_xi -- assumed = 1.0 +//extern float *J12; // dx/d_eta -- assumed = 1.0 + +//extern float *J21; // dy/d_xi -- assumed = 1.0 +//extern float *J22; // dy/d_eta -- assumed = 1.0 + +extern float *J13; // dx/d_zeta +extern float *J23; // dy/d_zeta + extern float *J31; // dz/d_xi extern float *J32; // dz/d_eta extern float *J33; // dz/d_zeta From 7e72ef4a0cd537f65d80706ee6a4c31b6d5dc5cd Mon Sep 17 00:00:00 2001 From: Jeremy Sauer Date: Mon, 24 Mar 2025 21:43:12 -0600 Subject: [PATCH 2/5] Implemented the use of dx/d_zeta and dy/d_zeta metric tensor terms in the formulation of cell face velocities (for advection terms) and calculation of pressure gradient force components. --- SRC/GRID/CUDA/cuda_gridDevice.cu | 4 ++-- SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu | 5 +++-- SRC/HYDRO_CORE/CUDA/cuda_advectionDevice_cu.h | 1 + SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu | 11 ++++++----- SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h | 2 +- SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu | 10 ++++++---- SRC/HYDRO_CORE/CUDA/cuda_pressureDevice_cu.h | 4 ++-- 7 files changed, 21 insertions(+), 16 deletions(-) diff --git a/SRC/GRID/CUDA/cuda_gridDevice.cu b/SRC/GRID/CUDA/cuda_gridDevice.cu index 92629aa..4a779ed 100644 --- a/SRC/GRID/CUDA/cuda_gridDevice.cu +++ b/SRC/GRID/CUDA/cuda_gridDevice.cu @@ -103,13 +103,13 @@ extern "C" int cuda_gridDeviceSetup(){ Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); /* Allocate the GRID arrays */ /* Coordinate Arrays */ - fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &xPos_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &yPos_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &zPos_d); fecuda_DeviceMalloc(((Nxp+2*Nh)*(Nyp+2*Nh))*sizeof(float), &topoPos_d); /* Metric Tensors Fields */ + fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d); + fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d); 
fecuda_DeviceMalloc(Nelems*sizeof(float), &J31_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &J32_d); fecuda_DeviceMalloc(Nelems*sizeof(float), &J33_d); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu index b22b470..f2e642d 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu @@ -52,6 +52,7 @@ extern "C" int cuda_advectionDeviceCleanup(){ * This device function calculates the cell face velocities to prepare for use in the chosen advection scheme */ __device__ void cudaDevice_calcFaceVelocities(float* hydroFlds_d, float* hydroFaceVels_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int i,j,k,ijk; int im1jk,ijm1k,ijkm1; @@ -97,8 +98,8 @@ __device__ void cudaDevice_calcFaceVelocities(float* hydroFlds_d, float* hydroFa ( (D_Jac_d[ijk]/rho[ijk])*(v[ijk] + w[ijk]*J32_d[ijk]) +(D_Jac_d[ijm1k]/rho[ijm1k])*(v[ijm1k] + w[ijm1k]*J32_d[ijm1k])); w_cf[ijk] = 0.5*dZi_d* - ( (D_Jac_d[ijk]/rho[ijk])*(w[ijk]*J33_d[ijk]) - +(D_Jac_d[ijkm1]/rho[ijkm1])*(w[ijkm1]*J33_d[ijkm1])); + ( (D_Jac_d[ijk]/rho[ijk])*(u[ijk]*J13_d[ijk] + v[ijk]*J23_d[ijk] + w[ijk]*J33_d[ijk]) + +(D_Jac_d[ijkm1]/rho[ijkm1])*(u[ijkm1]*J13_d[ijkm1] + v[ijkm1]*J23_d[ijkm1] + w[ijkm1]*J33_d[ijkm1])); //Ensure ground and ceiling face vertical velocity component is set to 0 if((k==kMin_d)||((k>=kMax_d))){ diff --git a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice_cu.h index 8fcfff9..7e4f400 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice_cu.h @@ -40,6 +40,7 @@ extern "C" int cuda_advectionDeviceCleanup(); * This device function calculates the cell face velocities to prepare for use in the chosen advection scheme */ __device__ void cudaDevice_calcFaceVelocities(float* hydroFlds_d, float* hydroFaceVels_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* 
J33_d, float* D_Jac_d); /*----->>>>> __device__ void cudaDevice_UpstreamDivAdvFlux(); -------------------------------------------------- diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu index 229f66d..9a2e8d6 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu @@ -418,7 +418,7 @@ extern "C" int cuda_hydroCoreDeviceBuildFrhs(float simTime, int simTime_it, int invOblen_d, z0m_d, z0t_d, qFlux_d, qskin_d, sea_mask_d, hydroRhoInv_d, hydroKappaM_d, sgstkeScalars_d, sgstke_ls_d, dedxi_d, moistScalars_d, moistTauFlds_d, moistScalarsFrhs_d, - J31_d, J32_d, J33_d, D_Jac_d); + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d); gpuErrchk( cudaGetLastError() ); #ifdef TIMERS_LEVEL2 stopSynchReportDestroyEvent(&startE, &stopE, &elapsedTime); @@ -898,7 +898,7 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim float* hydroRhoInv_d, float* hydroKappaM_d, float* sgstkeScalars_d, float* sgstke_ls_d, float* dedxi_d, float* moistScalars_d, float* moistTauFlds_d, float* moistScalarsFrhs_d, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int fldStride; float inv_pr; int iFld; @@ -908,7 +908,8 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim //### ADVECTION ###// /* Calculate the cell-face velocity components to prepare for advection in later phases of build_Frhs*/ cudaDevice_calcFaceVelocities(hydroFlds_d, hydroFaceVels_d, - J31_d, J32_d, J33_d, D_Jac_d); + J13_d, J23_d, + J31_d, J32_d, J33_d, D_Jac_d); //### MOLECULAR DIFFUSION ###// if((diffusionSelector_d > 0) && ((physics_oneRKonly_d==0) || (timeStage==numRKstages))){ // calculate NuGrad fields for(iFld=1; iFld < Nhydro_d; iFld++){ //NOTE: core progrnotic variables excluding rho, so (u,v,w,theta) @@ -933,11 +934,11 @@ __global__ void 
cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim cudaDevice_calcPressureGradientForceMoist(&hydroFldsFrhs_d[fldStride*U_INDX], &hydroFldsFrhs_d[fldStride*V_INDX], &hydroFldsFrhs_d[fldStride*W_INDX], &hydroFlds_d[fldStride*RHO_INDX], &hydroPres_d[0], &moistScalars_d[0], - J31_d, J32_d, J33_d); + J13_d, J23_d, J31_d, J32_d, J33_d); }else{ // dry pressure gradient force cudaDevice_calcPressureGradientForce(&hydroFldsFrhs_d[fldStride*U_INDX], &hydroFldsFrhs_d[fldStride*V_INDX], &hydroFldsFrhs_d[fldStride*W_INDX], &hydroPres_d[0], - J31_d, J32_d, J33_d); + J13_d, J23_d, J31_d, J32_d, J33_d); } // end if (moistureSelector_d > 0)&&(moistureNvars_d > 0) } //end if pgfSelector_d > 0 diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h index b6b450a..07e68ff 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h @@ -145,7 +145,7 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim float* sgstkeScalars_d, float* sgstke_ls_d, float* dedxi_d, float* moistScalars_d, float* moistTauFlds_d, float* moistScalarsFrhs_d, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); __global__ void cudaDevice_hydroCoreUnitTestComplete(float simTime, int simTime_it, float dt, int timeStage, int numRKstages, float* hydroFlds,float* hydroFldsFrhs, float* hydroFaceVels, float* hydroBaseStateFlds, float* hydroTauFlds, diff --git a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu index 660e5fe..c02a720 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu @@ -150,7 +150,7 @@ __device__ void cudaDevice_calcPerturbationPressureMoist(float* pres, float* rho * This is the cuda version of the calcPressureGradientForce routine from the HYDRO_CORE module */ __device__ 
void cudaDevice_calcPressureGradientForce(float* Frhs_u, float* Frhs_v, float* Frhs_w, float* pres, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k,ijk,iStride,jStride,kStride; int ip1jk,im1jk,ijp1k,ijm1k,ijkp1,ijkm1; @@ -174,8 +174,10 @@ __device__ void cudaDevice_calcPressureGradientForce(float* Frhs_u, float* Frhs_ ijm1k = i*iStride + (j-1)*jStride + k*kStride; ijkm1 = i*iStride + j*jStride + (k-1)*kStride; - Frhs_u[ijk] = Frhs_u[ijk]-0.5*dXi_d*(pres[ip1jk] - pres[im1jk]); - Frhs_v[ijk] = Frhs_v[ijk]-0.5*dYi_d*(pres[ijp1k] - pres[ijm1k]); + Frhs_u[ijk] = Frhs_u[ijk]-0.5*( dXi_d*(pres[ip1jk] - pres[im1jk]) + +dZi_d*J13_d[ijk]*(pres[ijkp1] - pres[ijkm1]) ); + Frhs_v[ijk] = Frhs_v[ijk]-0.5*( dYi_d*(pres[ijp1k] - pres[ijm1k]) + +dZi_d*J23_d[ijk]*(pres[ijkp1] - pres[ijkm1]) ); Frhs_w[ijk] = Frhs_w[ijk]-0.5*( dXi_d*J31_d[ijk]*(pres[ip1jk] - pres[im1jk]) +dYi_d*J32_d[ijk]*(pres[ijp1k] - pres[ijm1k]) +dZi_d*J33_d[ijk]*(pres[ijkp1] - pres[ijkm1]) ); @@ -186,7 +188,7 @@ __device__ void cudaDevice_calcPressureGradientForce(float* Frhs_u, float* Frhs_ */ __device__ void cudaDevice_calcPressureGradientForceMoist(float* Frhs_u, float* Frhs_v, float* Frhs_w, float* rho, float* pres, float* moistScalars, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k,ijk,iStride,jStride,kStride,fldStride; int ip1jk,im1jk,ijp1k,ijm1k,ijkp1,ijkm1; diff --git a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice_cu.h index 6eef11f..5c428bd 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice_cu.h @@ -40,12 +40,12 @@ __device__ void cudaDevice_calcPerturbationPressureMoist(float* pres, float* rho * This is the cuda version of the calcPressureGradientForce routine from the HYDRO_CORE module */ __device__ void cudaDevice_calcPressureGradientForce(float* 
Frhs_u, float* Frhs_v, float* Frhs_w, float* pres, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); /*----->>>>> __device__ void cudaDevice_calcPressureGradientForceMoist(); ----------------------------------------- */ __device__ void cudaDevice_calcPressureGradientForceMoist(float* Frhs_u, float* Frhs_v, float* Frhs_w, float* rho, float* pres, float* moistScalars, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); #endif // _PRESSURE_CUDADEV_CU_H From 1b489161edcfc57d4af6b4db84145cb630cf46d4 Mon Sep 17 00:00:00 2001 From: Jeremy Sauer Date: Tue, 25 Mar 2025 00:51:05 -0600 Subject: [PATCH 3/5] Implemented dx/d_zeta and dy/d_zeta for terrain following coordinate in the molecular diffusion term submodule. --- SRC/GRID/grid.c | 4 +- SRC/GRID/grid.h | 4 +- SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu | 4 +- .../CUDA/cuda_molecularDiffDevice.cu | 77 +++++++++++++------ .../CUDA/cuda_molecularDiffDevice_cu.h | 8 +- 5 files changed, 65 insertions(+), 32 deletions(-) diff --git a/SRC/GRID/grid.c b/SRC/GRID/grid.c index 319ea2a..2ef195a 100644 --- a/SRC/GRID/grid.c +++ b/SRC/GRID/grid.c @@ -55,9 +55,9 @@ float *topoPosGlobal; /*Topography elevation (z in meters) at the cell center po float *topoPos; /*Topography elevation (z in meters) at the cell center position in x and y. 
(per-rank domain) */ //float *J11; // dx/d_xi -- assumed = 1.0 -//float *J12; // dx/d_eta -- assumed = 1.0 +//float *J12; // dx/d_eta -- assumed = 0.0 -//float *J21; // dy/d_xi -- assumed = 1.0 +//float *J21; // dy/d_xi -- assumed = 0.0 //float *J22; // dy/d_eta -- assumed = 1.0 float *J13; // dx/d_zeta diff --git a/SRC/GRID/grid.h b/SRC/GRID/grid.h index 3f6a28a..adc9444 100644 --- a/SRC/GRID/grid.h +++ b/SRC/GRID/grid.h @@ -50,9 +50,9 @@ extern float *topoPos; /*Topography elevation (z in meters) at the cell center p extern float *topoPosGlobal; /*Topography elevation (z in meters) at the cell center position in x and y. (Global domain) */ //extern float *J11; // dx/d_xi -- assumed = 1.0 -//extern float *J12; // dx/d_eta -- assumed = 1.0 +//extern float *J12; // dx/d_eta -- assumed = 0.0 -//extern float *J21; // dy/d_xi -- assumed = 1.0 +//extern float *J21; // dy/d_xi -- assumed = 0.0 //extern float *J22; // dy/d_eta -- assumed = 1.0 extern float *J13; // dx/d_zeta diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu index 9a2e8d6..bc08aff 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu @@ -486,7 +486,7 @@ extern "C" int cuda_hydroCoreDeviceBuildFrhs(float simTime, int simTime_it, int if (diffusionSelector == 1){ cudaDevice_hydroCoreUnitTestCompleteMolecularDiffusion<<>>(hydroFlds_d, hydroFldsFrhs_d, hydroNuGradXFlds_d,hydroNuGradYFlds_d,hydroNuGradZFlds_d, - J31_d, J32_d, J33_d, D_Jac_d, invD_Jac_d); // call to div of nugrad + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d, invD_Jac_d); // call to div of nugrad } // endif diffusionSelector == 1 //Auxiliary scalar mixing (diffusion) from SGS-turbulence @@ -923,7 +923,7 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim &hydroNuGradYFlds_d[fldStride*(iFld-1)], &hydroNuGradZFlds_d[fldStride*(iFld-1)], inv_pr, - J31_d, J32_d, J33_d, D_Jac_d); + J13_d, J23_d, J31_d, J32_d, J33_d, 
D_Jac_d); } // end for (iFld=1; iFld < Nhydro_d; iFld++){ } // end if diffusionSelector_d > 0 diff --git a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu index c62bfef..e5bae06 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu @@ -61,7 +61,7 @@ extern "C" int cuda_molecularDiffDeviceCleanup(){ */ __global__ void cudaDevice_hydroCoreUnitTestCompleteMolecularDiffusion(float* hydroFlds, float* hydroFldsFrhs, float* hydroNuGradXFlds_d, float* hydroNuGradYFlds_d, float* hydroNuGradZFlds_d, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d, float* invD_Jac_d){ // calculate divergence of NuGrad int i,j,k; int iFld,fldStride; @@ -81,7 +81,7 @@ __global__ void cudaDevice_hydroCoreUnitTestCompleteMolecularDiffusion(float* hy &hydroNuGradXFlds_d[fldStride*(iFld-1)], &hydroNuGradYFlds_d[fldStride*(iFld-1)], &hydroNuGradZFlds_d[fldStride*(iFld-1)],iFld, - J31_d, J32_d, J33_d, D_Jac_d, invD_Jac_d); + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d, invD_Jac_d); }//for iFld }//end if in the range of non-halo cells @@ -92,7 +92,7 @@ __global__ void cudaDevice_hydroCoreUnitTestCompleteMolecularDiffusion(float* hy * of an arbitrary field. 
*/ __device__ void cudaDevice_diffusionDriver(float* fld, float* NuGradX, float* NuGradY,float* NuGradZ, float inv_pr, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int i,j,k,ijk; @@ -134,7 +134,7 @@ __device__ void cudaDevice_diffusionDriver(float* fld, float* NuGradX, float* Nu &fld[ijk], &fld[im1jk], &fld[ijm1k], &fld[ijkm1], &fld[ip1jk], &fld[ijp1k], &fld[ijkp1], &fld[im1jm1k], &fld[im1jkm1], &fld[ijm1km1], &fld[im1jp1k], &fld[im1jkp1],&fld[ijm1kp1], &fld[ip1jkm1], &fld[ip1jm1k], &fld[ijp1km1],inv_pr, - J31_d, J32_d, J33_d, D_Jac_d); + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d); }//end if in the range of non-halo cells }//end diffusionDriver() @@ -147,13 +147,14 @@ __device__ void cudaDevice_calcConstNuGrad(float* NuGradX, float* NuGradY, float float* sFld_im1jm1k, float* sFld_im1jkm1, float* sFld_ijm1km1, float* sFld_im1jp1k, float* sFld_im1jkp1, float* sFld_ijm1kp1, float* sFld_ip1jkm1, float* sFld_ip1jm1k, float* sFld_ijp1km1, float inv_pr, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int i,j,k,ijk; int im1jk,ijm1k,ijkm1; int im1jkm1,ijm1km1; int iStride,jStride,kStride; float Txx,Tyy,Tzz; + float Tzx,Tzy; float Txz,Tyz; i = (blockIdx.x)*blockDim.x + threadIdx.x; j = (blockIdx.y)*blockDim.y + threadIdx.y; @@ -170,26 +171,55 @@ __device__ void cudaDevice_calcConstNuGrad(float* NuGradX, float* NuGradY, float Txx = nu_0_d*inv_pr*( dXi_d*(*sFld_ijk-(*sFld_im1jk)) //dphi_dXi @ i-1/2 + +0.5*(J13_d[ijk]+J13_d[im1jk]) //Txx part-->dphi_dZeta + *0.25*dZi_d*( (*sFld_ijkp1)+(*sFld_im1jkp1) //dphi_dZeta @i-1/2 + -(*sFld_ijkm1)-(*sFld_im1jkm1)) ); - + //Tyx = 0 since this would involve a factor of J12 which is dx/d_eta = 0 + + Tzx = nu_0_d*inv_pr*( 0.5*(J13_d[ijk]+J13_d[im1jk]) + *0.5*((J13_d[ijk]+J13_d[im1jk]) //Tzx part-->dphi_dXI + *dXi_d*(*sFld_ijk-(*sFld_im1jk)) //dphi_dXi @ i-1/2 + 
+(J23_d[ijk]+J23_d[im1jk]) //Tzx part-->dphi_dEta + *0.25*dYi_d*( (*sFld_ijp1k)+(*sFld_im1jp1k) //dphi_dEta @i-1/2 + -(*sFld_ijm1k)-(*sFld_im1jm1k)) + +(J33_d[ijk]+J33_d[im1jk]) //Tzx part-->dphi_dZeta + *0.25*dZi_d*( (*sFld_ijkp1)+(*sFld_im1jkp1) //dphi_dZeta @i-1/2 + -(*sFld_ijkm1)-(*sFld_im1jkm1)) + )); + //Txy = 0 since this would involve a factor of J12 which is dx/d_eta = 0 + Tyy = nu_0_d*inv_pr*( - dYi_d*(*sFld_ijk-(*sFld_ijm1k)) //dphi_dEta @ j-1/2 + dYi_d*(*sFld_ijk-(*sFld_ijm1k)) //dphi_dEta @ j-1/2 + +0.5*(J23_d[ijk]+J23_d[ijm1k]) //Tzy part-->dphi_dZeta + *0.25*dZi_d*( (*sFld_ijkp1)+(*sFld_ijm1kp1) //dphi_dZeta @j-1/2 + -(*sFld_ijkm1)-(*sFld_ijm1km1)) ); + Tzy = nu_0_d*inv_pr*( 0.5*(J23_d[ijk]+J23_d[ijm1k]) + *0.5*((J13_d[ijk]+J13_d[ijm1k]) //Tzy part-->dphi_dXI + *0.25*dXi_d*( (*sFld_ip1jk)+(*sFld_ip1jm1k) //dphi_dXi @j-1/2 + -(*sFld_im1jk)-(*sFld_im1jm1k)) + +(J23_d[ijk]+J23_d[ijm1k]) //Tzy part-->dphi_dEta + *dYi_d*(*sFld_ijk-(*sFld_ijm1k)) //dphi_dEta @ j-1/2 + +(J33_d[ijk]+J33_d[ijm1k]) //Tzy part-->dphi_dZeta + *0.25*dZi_d*( (*sFld_ijkp1)+(*sFld_ijm1kp1) //dphi_dZeta @j-1/2 + -(*sFld_ijkm1)-(*sFld_ijm1km1)) + )); + Txz = nu_0_d*inv_pr*( 0.5*(J31_d[ijk]+J31_d[ijkm1]) - *0.5*( - +(J31_d[ijk]+J31_d[ijkm1]) //Txz part-->dphi_dXi - *0.25*dXi_d*( (*sFld_ip1jk)+(*sFld_ip1jkm1) //dphi_dXi @k-1/2 - -(*sFld_im1jk)-(*sFld_im1jkm1)) - +(J32_d[ijk]+J32_d[ijkm1]) //Txz part-->dphi_dEta - *0.25*dYi_d*( (*sFld_ijp1k)+(*sFld_ijp1km1) //dphi_dEta @k-1/2 - -(*sFld_ijm1k)-(*sFld_ijm1km1)) - +(J33_d[ijk]+J33_d[ijkm1]) //Txz part-->dphi_dZeta + *0.5*((J31_d[ijk]+J31_d[ijkm1]) //Txz part-->dphi_dXi + *0.25*dXi_d*( (*sFld_ip1jk)+(*sFld_ip1jkm1) //dphi_dXi @k-1/2 + -(*sFld_im1jk)-(*sFld_im1jkm1)) + +(J32_d[ijk]+J32_d[ijkm1]) //Txz part-->dphi_dEta + *0.25*dYi_d*( (*sFld_ijp1k)+(*sFld_ijp1km1) //dphi_dEta @k-1/2 + -(*sFld_ijm1k)-(*sFld_ijm1km1)) + +(J33_d[ijk]+J33_d[ijkm1]) //Txz part-->dphi_dZeta *dZi_d*(*sFld_ijk-(*sFld_ijkm1)) //dphi_dZeta @ k-1/2 )); + Tyz = 
nu_0_d*inv_pr*( 0.5*(J32_d[ijk]+J32_d[ijkm1]) - *0.5*( - +(J31_d[ijk]+J31_d[ijkm1]) //Tyz part-->dphi_dXi + *0.5*((J31_d[ijk]+J31_d[ijkm1]) //Tyz part-->dphi_dXi *0.25*dXi_d*( (*sFld_ip1jk)+(*sFld_ip1jkm1) //dphi_dXi @k-1/2 -(*sFld_im1jk)-(*sFld_im1jkm1)) +(J32_d[ijk]+J32_d[ijkm1]) //Tyz part-->dphi_dEta @@ -199,8 +229,7 @@ __device__ void cudaDevice_calcConstNuGrad(float* NuGradX, float* NuGradY, float *dZi_d*(*sFld_ijk-(*sFld_ijkm1)) //dphi_dZeta @ k-1/2 )); Tzz = nu_0_d*inv_pr*( 0.5*(J33_d[ijk]+J33_d[ijkm1]) - *0.5*( - +(J31_d[ijk]+J31_d[ijkm1]) //Tzz part-->dphi_dXi + *0.5*((J31_d[ijk]+J31_d[ijkm1]) //Tzz part-->dphi_dXi *0.25*dXi_d*( (*sFld_ip1jk)+(*sFld_ip1jkm1) //dphi_dXi @k-1/2 -(*sFld_im1jk)-(*sFld_im1jkm1)) +(J32_d[ijk]+J32_d[ijkm1]) //Tzz part-->dphi_dEta @@ -210,12 +239,14 @@ __device__ void cudaDevice_calcConstNuGrad(float* NuGradX, float* NuGradY, float *dZi_d*(*sFld_ijk-(*sFld_ijkm1)) //dphi_dZeta @ k-1/2 )); Txx = Txx*D_Jac_d[ijk]; + Tzx = Tzx*0.25*(D_Jac_d[ijk]+D_Jac_d[im1jk]+D_Jac_d[ijkm1]+D_Jac_d[im1jkm1]); Tyy = Tyy*D_Jac_d[ijk]; + Tzy = Tzy*0.25*(D_Jac_d[ijk]+D_Jac_d[ijm1k]+D_Jac_d[ijkm1]+D_Jac_d[ijm1km1]); Txz = Txz*0.25*(D_Jac_d[ijk]+D_Jac_d[im1jk]+D_Jac_d[ijkm1]+D_Jac_d[im1jkm1]); Tyz = Tyz*0.25*(D_Jac_d[ijk]+D_Jac_d[ijm1k]+D_Jac_d[ijkm1]+D_Jac_d[ijm1km1]); Tzz = Tzz*D_Jac_d[ijk]; - *NuGradX = Txx; - *NuGradY = Tyy; + *NuGradX = Txx+Tzx; + *NuGradY = Tyy+Tzy; *NuGradZ = Txz+Tyz+Tzz; } // end cudaDevice_calcConstNuGrad() @@ -224,7 +255,7 @@ __device__ void cudaDevice_calcConstNuGrad(float* NuGradX, float* NuGradY, float * This is the cuda version of taking the divergence of nu_0 times the gradient of a field */ __device__ void cudaDevice_calcDivNuGrad(float* scalarFrhs, float* rho, float* NuGradX, float* NuGradY, float* NuGradZ, int iFld, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d, float* invD_Jac_d){ int i,j,k,ijk; @@ -247,7 +278,9 @@ __device__ void 
cudaDevice_calcDivNuGrad(float* scalarFrhs, float* rho, float* N ijkp1 = i*iStride + j*jStride + (k+1)*kStride; Frhs_tmp = rho[ijk]*invD_Jac_d[ijk]*D_Jac_d[ijk] *( (NuGradX[ip1jk]-(NuGradX[im1jk]))*dXi_d*0.5 + +(NuGradX[ijkp1]-(NuGradX[ijkm1]))*dZi_d*0.5*J13_d[ijk] +(NuGradY[ijp1k]-(NuGradY[ijm1k]))*dYi_d*0.5 + +(NuGradY[ijkp1]-(NuGradY[ijkm1]))*dZi_d*0.5*J23_d[ijk] +(NuGradZ[ip1jk]-(NuGradZ[im1jk]))*dXi_d*0.5*J31_d[ijk] +(NuGradZ[ijp1k]-(NuGradZ[ijm1k]))*dYi_d*0.5*J32_d[ijk] +(NuGradZ[ijkp1]-(NuGradZ[ijkm1]))*dZi_d*0.5*J33_d[ijk] diff --git a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice_cu.h index 49c54a5..770884f 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice_cu.h @@ -45,7 +45,7 @@ extern "C" int cuda_molecularDiffDeviceCleanup(); */ __global__ void cudaDevice_hydroCoreUnitTestCompleteMolecularDiffusion(float* hydroFlds, float* hydroFldsFrhs, float* hydroNuGradXFlds_d, float* hydroNuGradYFlds_d, float* hydroNuGradZFlds_d, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d, float* invD_Jac_d); /*----->>>>> __device__ void cudaDevice_diffusionDriver(); -------------------------------------------------- @@ -53,7 +53,7 @@ __global__ void cudaDevice_hydroCoreUnitTestCompleteMolecularDiffusion(float* hy * of an arbitrary field. 
*/ __device__ void cudaDevice_diffusionDriver(float* fld, float* NuGradX, float* NuGradY,float* NuGradZ, float inv_pr, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); @@ -66,14 +66,14 @@ __device__ void cudaDevice_calcConstNuGrad(float* NuGradX, float* NuGradY, float float* sFld_im1jm1k, float* sFld_im1jkm1, float* sFld_ijm1km1, float* sFld_im1jp1k, float* sFld_im1jkp1, float* sFld_ijm1kp1, float* sFld_ip1jkm1, float* sFld_ip1jm1k, float* sFld_ijp1km1, float inv_pr, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); /*----->>>>> __device__ void cudaDevice_calcDivNuGrad(); -------------------------------------------------- * This is the cuda version of taking the divergence of nu_0 times the gradient of a field */ __device__ void cudaDevice_calcDivNuGrad(float* scalarFrhs, float* rho, float* NuGradX, float* NuGradY, float* NuGradZ, int iFld, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d, float* invD_Jac_d); #endif // _MOLDIFF_CUDADEV_CU_H From 27cd876f41e372cff7840c2b7d6d1ffa56721267 Mon Sep 17 00:00:00 2001 From: Jeremy Sauer Date: Tue, 25 Mar 2025 12:27:34 -0600 Subject: [PATCH 4/5] Implemented dx/d_zeta and dy/d_zeta metric tensor terms allowing terrain-following coordinate in sgsTKE prognostic equation and sgs Turbulence closure terms.
--- SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu | 28 ++--- SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h | 2 +- SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu | 106 +++++++++++++----- SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice_cu.h | 18 ++- SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu | 44 +++++--- SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice_cu.h | 8 +- 6 files changed, 130 insertions(+), 76 deletions(-) diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu index bc08aff..d0936e8 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu @@ -432,7 +432,7 @@ extern "C" int cuda_hydroCoreDeviceBuildFrhs(float simTime, int simTime_it, int cudaDevice_hydroCoreUnitTestComplete<<>>(simTime, simTime_it, dt, timeStage, numRKstages, hydroFlds_d, hydroFldsFrhs_d, hydroFaceVels_d, hydroBaseStateFlds_d, hydroTauFlds_d, sgstkeScalars_d, sgstkeScalarsFrhs_d, moistScalars_d, moistScalarsFrhs_d, moistTauFlds_d, - J31_d, J32_d, J33_d, invD_Jac_d, zPos_d); + J13_d, J23_d, J31_d, J32_d, J33_d, invD_Jac_d, zPos_d); /*Calculate the Frhs contributions for the advection and SGS-mixing terms on Auxiliary scalar fields*/ if(NhydroAuxScalars > 0){ @@ -449,12 +449,12 @@ extern "C" int cuda_hydroCoreDeviceBuildFrhs(float simTime, int simTime_it, int for (iFld = 0; iFld < NhydroAuxScalars; iFld++){ cudaDevice_TausScalar<<>>(iFld, hydroRhoInv_d, hydroFlds_d, hydroKappaM_d, sgstke_ls_d, hydroAuxScalars_d, AuxScalarsTauFlds_d, - J31_d, J32_d, J33_d, D_Jac_d); // compute taus + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d); // compute taus gpuErrchk( cudaGetLastError() ); gpuErrchk( cudaDeviceSynchronize() ); cudaDevice_SGSforcing<<>>(iFld, AuxScalarsTauFlds_d, hydroAuxScalarsFrhs_d, - J31_d, J32_d, J33_d); // compute/add SGS forcing + J13_d, J23_d, J31_d, J32_d, J33_d); // compute/add SGS forcing gpuErrchk( cudaGetLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } //end for iFld @@ -468,7 +468,7 @@ extern "C" int 
cuda_hydroCoreDeviceBuildFrhs(float simTime, int simTime_it, int cudaDevice_hydroCoreUnitTestCompleteSGSTKE<<>>(hydroFlds_d, hydroRhoInv_d, hydroTauFlds_d, hydroKappaM_d, dedxi_d, sgstke_ls_d, sgstkeScalars_d, sgstkeScalarsFrhs_d, canopy_lad_d, - J31_d, J32_d, J33_d, D_Jac_d); //call to prognostic TKE equation + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d); //call to prognostic TKE equation if (canopySelector==1){ // canopy drag term to forcing of momentum cudaDevice_hydroCoreUnitTestCompleteCanopy<<>>(hydroFlds_d, hydroRhoInv_d, canopy_lad_d, hydroFldsFrhs_d); } @@ -703,7 +703,7 @@ __global__ void cudaDevice_hydroCoreUnitTestComplete(float simTime, int simTime_ float* hydroTauFlds, float* sgstkeScalars, float* sgstkeScalarsFrhs, float* moistScalars, float* moistScalarsFrhs, float* moistTauFlds, - float* J31_d, float* J32_d, float* J33_d, float* invD_Jac_d, float* zPos_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* invD_Jac_d, float* zPos_d){ int i,j,k,ijk; int iFld,fldStride; @@ -871,15 +871,15 @@ __global__ void cudaDevice_hydroCoreUnitTestComplete(float simTime, int simTime_ &hydroFldsFrhs[fldStride*V_INDX], &hydroFldsFrhs[fldStride*W_INDX], &hydroFldsFrhs[fldStride*THETA_INDX], - &hydroTauFlds[fldStride*0], &hydroTauFlds[fldStride*1], &hydroTauFlds[fldStride*2], - &hydroTauFlds[fldStride*3], &hydroTauFlds[fldStride*4], &hydroTauFlds[fldStride*5], - &hydroTauFlds[fldStride*6], &hydroTauFlds[fldStride*7], &hydroTauFlds[fldStride*8], - J31_d, J32_d, J33_d); + &hydroTauFlds[fldStride*0], &hydroTauFlds[fldStride*1], &hydroTauFlds[fldStride*2], + &hydroTauFlds[fldStride*3], &hydroTauFlds[fldStride*4], &hydroTauFlds[fldStride*5], + &hydroTauFlds[fldStride*6], &hydroTauFlds[fldStride*7], &hydroTauFlds[fldStride*8], + J13_d, J23_d, J31_d, J32_d, J33_d); if ((moistureSelector_d > 0) && (moistureSGSturb_d > 0)){ for(iFld=0; iFld < moistureNvars_d; iFld++){ // loop over moisture equations 
cudaDevice_hydroCoreCalcTurbMixingScalar(&moistScalarsFrhs[fldStride*iFld], &moistTauFlds[fldStride*(3*iFld+0)], &moistTauFlds[fldStride*(3*iFld+1)], &moistTauFlds[fldStride*(3*iFld+2)], - J31_d, J32_d, J33_d); + J13_d, J23_d, J31_d, J32_d, J33_d); } } } //end if turbulenceSelector_d > 0 @@ -951,13 +951,13 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim &hydroTauFlds_d[fldStride*0], &hydroTauFlds_d[fldStride*1], &hydroTauFlds_d[fldStride*2], &hydroTauFlds_d[fldStride*3], &hydroTauFlds_d[fldStride*4], &hydroTauFlds_d[fldStride*5], &hydroTauFlds_d[fldStride*6], &hydroTauFlds_d[fldStride*7], &hydroTauFlds_d[fldStride*8], - J31_d, J32_d, J33_d, + J13_d, J23_d, J31_d, J32_d, J33_d, &hydroRhoInv_d[0]); if ((moistureSelector_d > 0) && (moistureSGSturb_d > 0)){ for(iFld=0; iFld < moistureNvars_d; iFld++){ // loop over moisture equations cudaDevice_GradScalarToFaces(&moistScalars_d[fldStride*iFld], &hydroRhoInv_d[0], &moistTauFlds_d[fldStride*(3*iFld+0)], &moistTauFlds_d[fldStride*(3*iFld+1)], &moistTauFlds_d[fldStride*(3*iFld+2)], - J31_d, J32_d, J33_d); + J13_d, J23_d, J31_d, J32_d, J33_d); } } if(TKESelector_d > 0){ // Lilly SGSTKE based Km @@ -975,7 +975,7 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim &sgstkeScalars_d[fldStride*iFld], &sgstke_ls_d[fldStride*iFld], &hydroKappaM_d[0], D_Jac_d); // calculate Km cudaDevice_GradScalar(&sgstkeScalars_d[fldStride*iFld], &hydroRhoInv_d[0], &dedxi_d[fldStride*(iFld*3+0)], &dedxi_d[fldStride*(iFld*3+1)], &dedxi_d[fldStride*(iFld*3+2)], - J31_d, J32_d, J33_d); // calculate SGSTKE spatial gradients + J13_d, J23_d, J31_d, J32_d, J33_d); // calculate SGSTKE spatial gradients } cudaDevice_hydroCoreCalcTaus_PrognosticTKE_DeviatoricTerm( &hydroTauFlds_d[fldStride*0], &hydroTauFlds_d[fldStride*1], &hydroTauFlds_d[fldStride*2], @@ -984,7 +984,7 @@ __global__ void cudaDevice_hydroCoreCalcFaceVelocities(float simTime, int simTim &hydroFlds_d[fldStride*RHO_INDX], 
&hydroKappaM_d[0], &sgstke_ls_d[0], &hydroFlds_d[fldStride*U_INDX], &hydroFlds_d[fldStride*V_INDX], &hydroFlds_d[fldStride*W_INDX], &sgstkeScalars_d[0], - J31_d, J32_d, J33_d, D_Jac_d); // calculate tau_ij, tau_thj (sub-grid length scale and TKE from TKE_0) + J13_d, J23_d, J31_d, J32_d, J33_d, D_Jac_d); // calculate tau_ij, tau_thj (sub-grid length scale and TKE from TKE_0) }else{ // Samagorinsky Km cudaDevice_hydroCoreCalcEddyDiff(&hydroTauFlds_d[fldStride*0], &hydroTauFlds_d[fldStride*1], &hydroTauFlds_d[fldStride*2], &hydroTauFlds_d[fldStride*3], &hydroTauFlds_d[fldStride*4], &hydroTauFlds_d[fldStride*5], diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h index 07e68ff..c7b7150 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice_cu.h @@ -151,7 +151,7 @@ __global__ void cudaDevice_hydroCoreUnitTestComplete(float simTime, int simTime_ float* hydroFaceVels, float* hydroBaseStateFlds, float* hydroTauFlds, float* sgstkeScalars, float* sgstkeScalarsFrhs, float* moistScalars, float* moistScalarsFrhs, float* moistTauFlds, - float* J31_d, float* J32_d, float* J33_d, float* invD_Jac_d, float* zPos_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* invD_Jac_d, float* zPos_d); /*----->>>>> __device__ void cudaDevice_SetRhoInv(); -------------------------------------------------- * This is the cuda version of the SetRhoInv routine from the HYDRO_CORE module */ diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu index 39ded87..77a449e 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu @@ -61,7 +61,7 @@ __device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f float* S11, float* S21, float* S31, float* S32, float* S22, float* S33, float* STH1, float* STH2, float* STH3, - float* J31_d, float* J32_d, float* J33_d, + 
float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* rhoInv){ int i,j,k,ijk; int im1jk,ijm1k,ijkm1; @@ -94,22 +94,28 @@ __device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f /*S11 = 1/2*(du/dx+du/dx) @ (i-1/2,j,k), assuming A-grid */ S11[ijk] = ( - dXi_d*( u[ijk]*rhoInv[ijk] //du_dxi - -u[im1jk]*rhoInv[im1jk]) + dXi_d*( u[ijk]*rhoInv[ijk] //du_dxi + -u[im1jk]*rhoInv[im1jk]) + +J13_d[ijk] + *dZi_d*( u[ijk]*rhoInv[ijk] //du_dzeta + -u[ijkm1]*rhoInv[ijkm1]) ); //Done with S11 = 1/2*2*(du_dx) /*S22 = 1/2*(dv/dy+dv/dy) @ (i,j-1/2,k), assuming A-grid */ S22[ijk] = ( dYi_d*( v[ijk]*rhoInv[ijk] //dv_deta - -v[ijm1k]*rhoInv[ijm1k]) + -v[ijm1k]*rhoInv[ijm1k]) + +J23_d[ijk] + *dZi_d*( v[ijk]*rhoInv[ijk] //du_dzeta + -v[ijkm1]*rhoInv[ijkm1]) ); //Done with S22 = 1/2*2*(dv_dy) /*S33 = 1/2*(dw/dz+dw/dz) @ (i,j,k-1/2), assuming A-grid */ S33[ijk] = ( - J31_d[ijk]*dXi_d*( w[ijk]*rhoInv[ijk] //dw_dxi - -w[im1jk]*rhoInv[im1jk]) - +J32_d[ijk]*dYi_d*( w[ijk]*rhoInv[ijk] //dw_deta - -w[ijm1k]*rhoInv[ijm1k]) - +J33_d[ijk]*dZi_d*( w[ijk]*rhoInv[ijk] //dw_dzeta - -w[ijkm1]*rhoInv[ijkm1]) + J31_d[ijk]*dXi_d*( w[ijk]*rhoInv[ijk] //dw_dxi + -w[im1jk]*rhoInv[im1jk]) + +J32_d[ijk]*dYi_d*( w[ijk]*rhoInv[ijk] //dw_deta + -w[ijm1k]*rhoInv[ijm1k]) + +J33_d[ijk]*dZi_d*( w[ijk]*rhoInv[ijk] //dw_dzeta + -w[ijkm1]*rhoInv[ijkm1]) ); //Done with S33 = 1/2*2*(dw_dz) /*S21 = 1/2*(dv/dx+du/dy) @ (i-1/2,j-1/2,k), assuming A-grid */ S21[ijk] = 0.5*( @@ -117,10 +123,26 @@ __device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f -v[im1jk]*rhoInv[im1jk]) +dXi_d*( v[ijm1k]*rhoInv[ijm1k] //dv_dxi @i-1/2,j-1 -v[im1jm1k]*rhoInv[im1jm1k])) - +0.5*( dYi_d*( u[ijk]*rhoInv[ijk] //du_deta + +0.25*(J13_d[ijk]*dZi_d*( v[ijk]*rhoInv[ijk] //dv_dzeta @i,j,k-1/2 + -v[ijkm1]*rhoInv[ijkm1]) + +J13_d[ijk]*dZi_d*( v[im1jk]*rhoInv[im1jk] //dv_dzeta @i-1,j,k-1/2 + -v[im1jkm1]*rhoInv[im1jkm1]) + +J13_d[ijk]*dZi_d*( v[ijm1k]*rhoInv[ijm1k] //dv_dzeta 
@i,j-1,k-1/2 + -v[ijm1km1]*rhoInv[ijm1km1]) + +J13_d[ijk]*dZi_d*( v[im1jm1k]*rhoInv[im1jm1k] //dv_dzeta @i-1,j-1,k-1/2 + -v[im1jm1km1]*rhoInv[im1jm1km1])) + +0.5*( dYi_d*( u[ijk]*rhoInv[ijk] //du_deta @i,j-1/2 -u[ijm1k]*rhoInv[ijm1k]) - +dYi_d*( u[im1jk]*rhoInv[im1jk] //du_deta + +dYi_d*( u[im1jk]*rhoInv[im1jk] //du_deta @i-1,j-1/2 -u[im1jm1k]*rhoInv[im1jm1k])) + +0.25*(J23_d[ijk]*dZi_d*( u[ijk]*rhoInv[ijk] //du_dzeta + -u[ijkm1]*rhoInv[ijkm1]) + +J23_d[ijk]*dZi_d*( u[im1jk]*rhoInv[im1jk] //du_deta + -u[im1jkm1]*rhoInv[im1jkm1]) + +J23_d[ijk]*dZi_d*( u[ijm1k]*rhoInv[ijm1k] //du_deta + -u[ijm1km1]*rhoInv[ijm1km1]) + +J23_d[ijk]*dZi_d*( u[im1jm1k]*rhoInv[im1jm1k] //du_deta + -u[im1jm1km1]*rhoInv[im1jm1km1])) ); //Done with S21 = 1/2*(dv_dx + du_dy) /*S31 = 1/2*(dw/dx+du/dz) @ (i-1/2,j,k-1/2), assuming A-grid */ S31[ijk] = 0.5*( @@ -128,7 +150,10 @@ __device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f -w[im1jk]*rhoInv[im1jk]) +dXi_d*( w[ijkm1]*rhoInv[ijkm1] //dw_dxi @i-1/2,j,k-1 -w[im1jkm1]*rhoInv[im1jkm1])) - + +0.5*( J13_d[ijk]*dZi_d*( w[ijk]*rhoInv[ijk] //dw_dzeta @i,j,k-1/2 + -w[ijkm1]*rhoInv[ijkm1]) + +J13_d[ijk]*dZi_d*( w[im1jk]*rhoInv[im1jk] //dw_dzeta @i-1,j,k-1/2 + -w[im1jkm1]*rhoInv[im1jkm1])) +0.5*( J31_d[ijk]*dXi_d*( u[ijk]*rhoInv[ijk] //du_dxi @i-1/2,j,k -u[im1jk]*rhoInv[im1jk]) +J31_d[ijk]*dXi_d*( u[ijkm1]*rhoInv[ijkm1] //du_dxi @i-1/2,j,k-1 @@ -152,7 +177,10 @@ __device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f -w[ijm1k]*rhoInv[ijm1k]) +dYi_d*( w[ijkm1]*rhoInv[ijkm1] //dw_deta @i,j-1/2,k-1 -w[ijm1km1]*rhoInv[ijm1km1])) - + +0.5*( J23_d[ijk]*dZi_d*( w[ijk]*rhoInv[ijk] //dw_dzeta @i,j,k-1/2 + -w[ijkm1]*rhoInv[ijkm1]) + +J23_d[ijk]*dZi_d*( w[ijm1k]*rhoInv[ijm1k] //dw_dzeta @i,j-1,k-1/2 + -w[ijm1km1]*rhoInv[ijm1km1])) +0.25*( J31_d[ijk]*dXi_d*( v[ijk]*rhoInv[ijk] //dv_dxi @i-1/2,j,k -v[im1jk]*rhoInv[im1jk]) +J31_d[ijk]*dXi_d*( v[ijm1k]*rhoInv[ijm1k] //dv_dxi @i-1/2,j-1,k @@ -165,22 +193,26 @@ 
__device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f -v[ijm1k]*rhoInv[ijm1k]) +J32_d[ijk]*dYi_d*( v[ijkm1]*rhoInv[ijkm1] //dv_deta @i,j-1/2,k-1 -v[ijm1km1]*rhoInv[ijm1km1])) - +0.5*(J33_d[ijk]*dZi_d*( v[ijk]*rhoInv[ijk] //dv_dzeta @i,j,k-1/2 - -v[ijkm1]*rhoInv[ijkm1]) - +J33_d[ijk]*dZi_d*( v[ijm1k]*rhoInv[ijm1k] //dv_dzeta @i,j-1,k-1/2 - -v[ijm1km1]*rhoInv[ijm1km1])) + +0.5*( J33_d[ijk]*dZi_d*( v[ijk]*rhoInv[ijk] //dv_dzeta @i,j,k-1/2 + -v[ijkm1]*rhoInv[ijkm1]) + +J33_d[ijk]*dZi_d*( v[ijm1k]*rhoInv[ijm1k] //dv_dzeta @i,j-1,k-1/2 + -v[ijm1km1]*rhoInv[ijm1km1])) ); //Done with S32 = 1/2*(dw_dy + dv_dz) /*STH1 = (dTH/dx) @ (i-1/2), assuming A-grid */ STH1[ijk] = ( (dXi_d*( theta[ijk]*rhoInv[ijk] //dTH_dxi @i-1/2,j,k -theta[im1jk]*rhoInv[im1jk])) + +(J13_d[ijk]*dZi_d*( theta[ijk]*rhoInv[ijk] //dTH_dzeta @i,j,k-1/2 + -theta[ijkm1]*rhoInv[ijkm1])) ); //Done with STH1 = (dTH/dx) /*STH2 = (dTH/dy) @ (i,j-1/2,k), assuming A-grid */ STH2[ijk] = ( (dYi_d*( theta[ijk]*rhoInv[ijk] //dTH_deta @i,j-1/2,k -theta[ijm1k]*rhoInv[ijm1k])) + +(J23_d[ijk]*dZi_d*( theta[ijk]*rhoInv[ijk] //dTH_dzeta @i,j,k-1/2 + -theta[ijkm1]*rhoInv[ijkm1])) ); //Done with STH2 = (dTH/dy) /*STH3 = (dTH/dz) @ (i,j,k-1/2), assuming A-grid */ @@ -311,7 +343,7 @@ __device__ void cudaDevice_hydroCoreCalcTaus_PrognosticTKE_DeviatoricTerm( float* STH1, float* STH2, float* STH3, float* rho, float* Km, float* sgstke_ls, float* u, float* v, float* w, float* sgstke, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int i,j,k,ijk; int im1jk,ijm1k,ijkm1; int iStride,jStride,kStride; @@ -343,8 +375,12 @@ __device__ void cudaDevice_hydroCoreCalcTaus_PrognosticTKE_DeviatoricTerm( deviatoricTerm = (2.0/3.0)*rho[ijk]*( Km[ijk]*( dXi_d*( u[ijk]/rho[ijk] //du_dxi -u[im1jk]/rho[im1jk]) + +J13_d[ijk]*dZi_d*( u[ijk]/rho[ijk] //du_dzeta + -u[ijkm1]/rho[ijkm1]) +dYi_d*( v[ijk]/rho[ijk] //dv_deta 
-v[ijm1k]/rho[ijm1k]) + +J23_d[ijk]*dZi_d*( v[ijk]/rho[ijk] //dv_dzeta + -v[ijkm1]/rho[ijkm1]) +J31_d[ijk]*dXi_d*( w[ijk]/rho[ijk] //dw_dxi -w[im1jk]/rho[im1jk]) +J32_d[ijk]*dYi_d*( w[ijk]/rho[ijk] //dw_deta @@ -364,13 +400,13 @@ __device__ void cudaDevice_hydroCoreCalcTaus_PrognosticTKE_DeviatoricTerm( } //end if in the range of non-halo cells -} //cudaDevice_hydroCoreCalcTausi_PrognosticTKE_DeviatoricTerm() +} //cudaDevice_hydroCoreCalcTaus_PrognosticTKE_DeviatoricTerm() /*----->>>>> __device__ void cudaDevice_GradScalarToFaces(); -------------------------------------------------- * This cuda kernel calculates the spatial gradient of a scalar field: 1delta, gradient located at the cell face */ __device__ void cudaDevice_GradScalarToFaces(float* scalar, float* rhoInv, float* dSdx, float* dSdy, float* dSdz, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k,ijk; int im1jk,ijm1k,ijkm1; @@ -389,9 +425,11 @@ __device__ void cudaDevice_GradScalarToFaces(float* scalar, float* rhoInv, float if((i >= iMin_d-1)&&(i < iMax_d+1) && (j >= jMin_d-1)&&(j < jMax_d+1) && (k >= kMin_d-1)&&(k < kMax_d+1)){ - dSdx[ijk] = (dXi_d*(scalar[ijk]*rhoInv[ijk]-scalar[im1jk]*rhoInv[im1jk])); + dSdx[ijk] = (dXi_d*(scalar[ijk]*rhoInv[ijk]-scalar[im1jk]*rhoInv[im1jk]) + +J13_d[ijk]*dZi_d*(scalar[ijk]*rhoInv[ijk]-scalar[ijkm1]*rhoInv[ijkm1])); - dSdy[ijk] = (dYi_d*(scalar[ijk]*rhoInv[ijk]-scalar[ijm1k]*rhoInv[ijm1k])); + dSdy[ijk] = (dYi_d*(scalar[ijk]*rhoInv[ijk]-scalar[ijm1k]*rhoInv[ijm1k]) + +J23_d[ijk]*dZi_d*(scalar[ijk]*rhoInv[ijk]-scalar[ijkm1]*rhoInv[ijkm1])); dSdz[ijk] = (J31_d[ijk]*dXi_d*(scalar[ijk]*rhoInv[ijk]-scalar[im1jk]*rhoInv[im1jk]) +J32_d[ijk]*dYi_d*(scalar[ijk]*rhoInv[ijk]-scalar[ijm1k]*rhoInv[ijm1k]) @@ -448,7 +486,7 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixing(float* uFrhs, float* vFrhs, f float* T11, float* T21, float* T31, float* T32, float* T22, float* T33, float* TH1, float* TH2, float* 
TH3, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k,ijk; int ip1jk,ijp1k,ijkp1; int iStride,jStride,kStride; @@ -476,7 +514,9 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixing(float* uFrhs, float* vFrhs, f /*uFrhs = -d_dxj( -2*(Kappa_m*T1j) ), j=1,2,3 @ (i,j,k) cell-center, assuming A-grid */ uFrhs[ijk] = uFrhs[ijk] - ( dXi_d*(T11[ip1jk] - T11[ijk]) + +J13_d[ijk]*dZi_d*(T11[ijkp1] - T11[ijk]) +dYi_d*(T21[ijp1k] - T21[ijk]) + +J23_d[ijk]*dZi_d*(T21[ijkp1] - T21[ijk]) +J31_d[ijk]*dXi_d*(T31[ip1jk] - T31[ijk]) +J32_d[ijk]*dYi_d*(T31[ijp1k] - T31[ijk]) +J33_d[ijk]*dZi_d*(T31[ijkp1] - T31[ijk]) @@ -484,7 +524,9 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixing(float* uFrhs, float* vFrhs, f /*vFrhs = -d_dxj( -2*(Kappa_m*T2j) ), j=1,2,3 @ (i,j,k) cell-center, assuming A-grid */ vFrhs[ijk] = vFrhs[ijk] - ( dXi_d*(T21[ip1jk] - T21[ijk]) + +J13_d[ijk]*dZi_d*(T21[ijkp1] - T21[ijk]) +dYi_d*(T22[ijp1k] - T22[ijk]) + +J23_d[ijk]*dZi_d*(T22[ijkp1] - T22[ijk]) +J31_d[ijk]*dXi_d*(T32[ip1jk] - T32[ijk]) +J32_d[ijk]*dYi_d*(T32[ijp1k] - T32[ijk]) +J33_d[ijk]*dZi_d*(T32[ijkp1] - T32[ijk]) @@ -492,14 +534,18 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixing(float* uFrhs, float* vFrhs, f /*wFrhs = -d_dxj( -2*(Kappa_m*dT3j_dxj)), j=1,2,3 @ (i,j,k) cell-center, assuming A-grid */ wFrhs[ijk] = wFrhs[ijk] - ( dXi_d*(T31[ip1jk] - T31[ijk]) + +J13_d[ijk]*dZi_d*(T31[ijkp1] - T31[ijk]) +dYi_d*(T32[ijp1k] - T32[ijk]) + +J23_d[ijk]*dZi_d*(T32[ijkp1] - T32[ijk]) +J31_d[ijk]*dXi_d*(T33[ip1jk] - T33[ijk]) +J32_d[ijk]*dYi_d*(T33[ijp1k] - T33[ijk]) +J33_d[ijk]*dZi_d*(T33[ijkp1] - T33[ijk]) ); //Done with wFrhs = -2*(d_dx(Kappa_m*T31) + d_dy(Kappa_m*T32) + d_dz(Kappa_m*T33)) thetaFrhs[ijk] = thetaFrhs[ijk] - ( dXi_d*(TH1[ip1jk] - TH1[ijk]) + +J13_d[ijk]*dZi_d*(TH1[ijkp1] - TH1[ijk]) +dYi_d*(TH2[ijp1k] - TH2[ijk]) + +J23_d[ijk]*dZi_d*(TH2[ijkp1] - TH2[ijk]) +J31_d[ijk]*dXi_d*(TH3[ip1jk] - TH3[ijk]) 
+J32_d[ijk]*dYi_d*(TH3[ijp1k] - TH3[ijk]) +J33_d[ijk]*dZi_d*(TH3[ijkp1] - TH3[ijk]) @@ -512,7 +558,7 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixing(float* uFrhs, float* vFrhs, f * This is the cuda version of calculating forcing terms from subgrid-scale mixing of a scalar field */ __device__ void cudaDevice_hydroCoreCalcTurbMixingScalar(float* mFrhs, float* M1, float* M2, float* M3, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k,ijk; int ip1jk,ijp1k,ijkp1; @@ -536,7 +582,9 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixingScalar(float* mFrhs, float* M1 mFrhs[ijk] = mFrhs[ijk] - ( dXi_d*(M1[ip1jk] - M1[ijk]) + +J13_d[ijk]*dZi_d*(M1[ijkp1] - M1[ijk]) +dYi_d*(M2[ijp1k] - M2[ijk]) + +J23_d[ijk]*dZi_d*(M2[ijkp1] - M2[ijk]) +J31_d[ijk]*dXi_d*(M3[ip1jk] - M3[ijk]) +J32_d[ijk]*dYi_d*(M3[ijp1k] - M3[ijk]) +J33_d[ijk]*dZi_d*(M3[ijkp1] - M3[ijk]) @@ -551,7 +599,7 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixingScalar(float* mFrhs, float* M1 */ __global__ void cudaDevice_TausScalar(int iFld, float* hydroRhoInv_d, float* hydroFlds_d, float* hydroKappaM_d, float* sgstke_ls_d, float* Scalars_d, float* ScalarsTauFlds_d, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int i,j,k; int fldStride; @@ -568,7 +616,7 @@ __global__ void cudaDevice_TausScalar(int iFld, float* hydroRhoInv_d, float* hyd cudaDevice_GradScalarToFaces(&Scalars_d[fldStride*iFld], &hydroRhoInv_d[0], &ScalarsTauFlds_d[fldStride*0], &ScalarsTauFlds_d[fldStride*1], &ScalarsTauFlds_d[fldStride*2], - &J31_d[0], &J32_d[0], &J33_d[0]); + &J13_d[0], &J23_d[0], &J31_d[0], &J32_d[0], &J33_d[0]); cudaDevice_hydroCoreCalcTausScalar(&ScalarsTauFlds_d[fldStride*0], &ScalarsTauFlds_d[fldStride*1], &ScalarsTauFlds_d[fldStride*2], &hydroFlds_d[fldStride*RHO_INDX], &hydroKappaM_d[0], &sgstke_ls_d[0], &D_Jac_d[0]); // calculate tau_sj @@ 
-581,7 +629,7 @@ __global__ void cudaDevice_TausScalar(int iFld, float* hydroRhoInv_d, float* hyd * CUDA kernel for calculating SGS-Forcing (divergence of SGS-Tau fields) for a given "scalar" field. */ __global__ void cudaDevice_SGSforcing(int iFld, float* ScalarsTauFlds_d, float* ScalarsFrhs_d, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k; int fldStride; @@ -598,7 +646,7 @@ __global__ void cudaDevice_SGSforcing(int iFld, float* ScalarsTauFlds_d, float* cudaDevice_hydroCoreCalcTurbMixingScalar(&ScalarsFrhs_d[fldStride*iFld], &ScalarsTauFlds_d[fldStride*0], &ScalarsTauFlds_d[fldStride*1], &ScalarsTauFlds_d[fldStride*2], - &J31_d[0], &J32_d[0], &J33_d[0]); + &J13_d[0], &J23_d[0], &J31_d[0], &J32_d[0], &J33_d[0]); }//end if in the range of non-halo cells } // end cudaDevice_SGSforcing() diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice_cu.h index f9c36a9..25bc0a2 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice_cu.h @@ -60,7 +60,7 @@ __device__ void cudaDevice_hydroCoreCalcStrainRateElements(float* u, float* v, f float* S11, float* S21, float* S31, float* S32, float* S22, float* S33, float* STH1, float* STH2, float* STH3, - float* J31_d, float* J32_d, float* J33_d, + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* rhoInv); /*----->>>>> __device__ void cudaDevice_hydroCoreCalcEddyDiff(); --------------------------------------------- @@ -89,13 +89,13 @@ __device__ void cudaDevice_hydroCoreCalcTaus_PrognosticTKE_DeviatoricTerm( float* STH1, float* STH2, float* STH3, float* rho, float* Km, float* sgstke_ls, float* u, float* v, float* w, float* sgstke, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); /*----->>>>> __device__ void cudaDevice_GradScalarToFaces(); 
-------------------------------------------------- * This cuda kernel calculates the spatial gradient of a scalar field: 1delta, gradient located at the cell face */ __device__ void cudaDevice_GradScalarToFaces(float* scalar, float* rhoInv, float* dSdx, float* dSdy, float* dSdz, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); /*----->>>>> __device__ void cudaDevice_hydroCoreCalcTausScalar(); --------------------------------------------- * This is the cuda version of calculating SGS stresses of a scalar field field for subgrid-scale mixing formulations @@ -110,29 +110,25 @@ __device__ void cudaDevice_hydroCoreCalcTurbMixing(float* uFrhs, float* vFrhs, f float* T11, float* T21, float* T31, float* T32, float* T22, float* T33, float* TH1, float* TH2, float* TH3, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); /*----->>>>> __device__ void cudaDevice_hydroCoreCalcTurbMixingScalar(); --------------------------------------------- * This is the cuda version of calculating forcing terms from subgrid-scale mixing of a scalar field */ __device__ void cudaDevice_hydroCoreCalcTurbMixingScalar(float* mFrhs, float* M1, float* M2, float* M3, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); /*----->>>>> __global__ void cudaDevice_TausScalar() --------------------------------------------- * CUDA kernel for calculating SGS-Tau fields (intermediate calculation stress tensors) for a given "scalar" field. 
*/ __global__ void cudaDevice_TausScalar(int iFld, float* hydroRhoInv_d, float* hydroFlds_d, float* hydroKappaM_d, float* sgstke_ls_d, float* Scalars_d, float* ScalarsTauFlds_d, - float* J11_d, float* J12_d, float* J13_d, - float* J21_d, float* J22_d, float* J23_d, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); /*----->>>>> __global__ void cudaDevice_SGSforcing() --------------------------------------------- * CUDA kernel for calculating SGS-Forcing (divergence of SGS-Tau fields) for a given "scalar" field. */ __global__ void cudaDevice_SGSforcing(int iFld, float* ScalarsTauFlds_d, float* ScalarsFrhs_d, - float* J11_d, float* J12_d, float* J13_d, - float* J21_d, float* J22_d, float* J23_d, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); #endif // _SGSTURB_CUDADEV_CU_H diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu index 3889b4f..56fabfc 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu @@ -68,7 +68,7 @@ extern "C" int cuda_sgstkeDeviceCleanup(){ __global__ void cudaDevice_hydroCoreUnitTestCompleteSGSTKE(float* hydroFlds_d, float* hydroRhoInv_d, float* hydroTauFlds_d, float* hydroKappaM_d, float* dedxi_d, float* sgstke_ls_d, float* sgstkeScalars_d, float* sgstkeScalarsFrhs_d, float* canopy_lad_d, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d){ int fldStride,iFld; fldStride = (Nx_d+2*Nh_d)*(Ny_d+2*Nh_d)*(Nz_d+2*Nh_d); @@ -83,11 +83,11 @@ __global__ void cudaDevice_hydroCoreUnitTestCompleteSGSTKE(float* hydroFlds_d, f &hydroTauFlds_d[fldStride*4], &hydroTauFlds_d[fldStride*3], &hydroTauFlds_d[fldStride*5], &hydroFlds_d[fldStride*U_INDX], &hydroFlds_d[fldStride*V_INDX], 
&hydroFlds_d[fldStride*W_INDX], &hydroRhoInv_d[0], &sgstkeScalarsFrhs_d[fldStride*iFld], - J31_d, J32_d, J33_d); // shear production term + J13_d, J23_d, J31_d, J32_d, J33_d); // shear production term cudaDevice_sgstkeTurbTransport(&hydroKappaM_d[0], &dedxi_d[fldStride*(iFld*3+0)], &dedxi_d[fldStride*(iFld*3+1)], &dedxi_d[fldStride*(iFld*3+2)], &hydroFlds_d[fldStride*RHO_INDX], &sgstkeScalarsFrhs_d[fldStride*iFld], - J31_d, J32_d, J33_d); // turbulent transport term + J13_d, J23_d, J31_d, J32_d, J33_d); // turbulent transport term if(canopySelector_d==1){ // 1-eq SGSTKE with canopy model cudaDevice_canopySGSTKEtransfer(&hydroRhoInv_d[0], &hydroFlds_d[fldStride*U_INDX], &hydroFlds_d[fldStride*V_INDX], &hydroFlds_d[fldStride*W_INDX], &canopy_lad_d[0], @@ -97,7 +97,7 @@ __global__ void cudaDevice_hydroCoreUnitTestCompleteSGSTKE(float* hydroFlds_d, f cudaDevice_sgstkeTurbTransport(&hydroKappaM_d[0], &dedxi_d[fldStride*(iFld*3+0)], &dedxi_d[fldStride*(iFld*3+1)], &dedxi_d[fldStride*(iFld*3+2)], &hydroFlds_d[fldStride*RHO_INDX], &sgstkeScalarsFrhs_d[fldStride*iFld], - J31_d, J32_d, J33_d); // turbulent transport term + J13_d, J23_d, J31_d, J32_d, J33_d); // turbulent transport term cudaDevice_sgstkeDissip(&sgstkeScalars_d[fldStride*iFld], &hydroRhoInv_d[0], &sgstke_ls_d[fldStride*iFld], &sgstkeScalarsFrhs_d[fldStride*iFld], 0, D_Jac_d); // dissipation term cudaDevice_canopySGSTKEtransfer(&hydroRhoInv_d[0], &hydroFlds_d[fldStride*U_INDX], &hydroFlds_d[fldStride*V_INDX], @@ -255,7 +255,7 @@ __device__ void cudaDevice_sgstkeDissip(float* sgstke, float* rhoInv, float* sgs */ __device__ void cudaDevice_sgstkeShearProd(float* tau_11, float* tau_12, float* tau_13, float* tau_22, float* tau_23, float* tau_33, float* u, float* v, float* w, float* rhoInv, float* Frhs_sgstke, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ float term_11,term_12,term_13,term_21,term_22,term_23,term_31,term_32,term_33; float 
f_sgstke_shear; @@ -282,27 +282,33 @@ __device__ void cudaDevice_sgstkeShearProd(float* tau_11, float* tau_12, float* if((i >= iMin_d)&&(i < iMax_d) && (j >= jMin_d)&&(j < jMax_d) && (k >= kMin_d)&&(k < kMax_d)){ // term_11 = -tau_11*du/dx - term_11 = -tau_11[ijk]*0.5*(dXi_d*(u[ip1jk]*rhoInv[ip1jk]-u[im1jk]*rhoInv[im1jk])); + term_11 = -tau_11[ijk]*0.5*(dXi_d*(u[ip1jk]*rhoInv[ip1jk]-u[im1jk]*rhoInv[im1jk]) + +J13_d[ijk]*dZi_d*(u[ijkp1]*rhoInv[ijkp1]-u[ijkm1]*rhoInv[ijkm1])); // term_12 = -tau_12*du/dy - term_12 = -tau_12[ijk]*0.5*(dYi_d*(u[ijp1k]*rhoInv[ijp1k]-u[ijm1k]*rhoInv[ijm1k])); + term_12 = -tau_12[ijk]*0.5*(dYi_d*(u[ijp1k]*rhoInv[ijp1k]-u[ijm1k]*rhoInv[ijm1k]) + +J23_d[ijk]*dZi_d*(u[ijkp1]*rhoInv[ijkp1]-u[ijkm1]*rhoInv[ijkm1])); // term_13 = -tau_13*du/dz term_13 = -tau_13[ijk]*0.5*(J31_d[ijk]*dXi_d*(u[ip1jk]*rhoInv[ip1jk]-u[im1jk]*rhoInv[im1jk]) +J32_d[ijk]*dYi_d*(u[ijp1k]*rhoInv[ijp1k]-u[ijm1k]*rhoInv[ijm1k]) +J33_d[ijk]*dZi_d*(u[ijkp1]*rhoInv[ijkp1]-u[ijkm1]*rhoInv[ijkm1])); // term_21 = -tau_21*dv/dx - term_21 = -tau_12[ijk]*0.5*(dXi_d*(v[ip1jk]*rhoInv[ip1jk]-v[im1jk]*rhoInv[im1jk])); + term_21 = -tau_12[ijk]*0.5*(dXi_d*(v[ip1jk]*rhoInv[ip1jk]-v[im1jk]*rhoInv[im1jk]) + +J13_d[ijk]*dZi_d*(v[ijkp1]*rhoInv[ijkp1]-v[ijkm1]*rhoInv[ijkm1])); // term_22 = -tau_22*dv/dy - term_22 = -tau_22[ijk]*0.5*(dYi_d*(v[ijp1k]*rhoInv[ijp1k]-v[ijm1k]*rhoInv[ijm1k])); + term_22 = -tau_22[ijk]*0.5*(dYi_d*(v[ijp1k]*rhoInv[ijp1k]-v[ijm1k]*rhoInv[ijm1k]) + +J23_d[ijk]*dZi_d*(v[ijkp1]*rhoInv[ijkp1]-v[ijkm1]*rhoInv[ijkm1])); // term_23 = -tau_23*dv/dz term_23 = -tau_23[ijk]*0.5*(J31_d[ijk]*dXi_d*(v[ip1jk]*rhoInv[ip1jk]-v[im1jk]*rhoInv[im1jk]) +J32_d[ijk]*dYi_d*(v[ijp1k]*rhoInv[ijp1k]-v[ijm1k]*rhoInv[ijm1k]) +J33_d[ijk]*dZi_d*(v[ijkp1]*rhoInv[ijkp1]-v[ijkm1]*rhoInv[ijkm1])); // term_31 = -tau_31*dw/dx - term_31 = -tau_13[ijk]*0.5*(dXi_d*(w[ip1jk]*rhoInv[ip1jk]-w[im1jk]*rhoInv[im1jk])); + term_31 = -tau_13[ijk]*0.5*(dXi_d*(w[ip1jk]*rhoInv[ip1jk]-w[im1jk]*rhoInv[im1jk]) + 
+J13_d[ijk]*dZi_d*(w[ijkp1]*rhoInv[ijkp1]-w[ijkm1]*rhoInv[ijkm1])); // term_32 = -tau_32*dw/dy - term_32 = -tau_23[ijk]*0.5*(dYi_d*(w[ijp1k]*rhoInv[ijp1k]-w[ijm1k]*rhoInv[ijm1k])); + term_32 = -tau_23[ijk]*0.5*(dYi_d*(w[ijp1k]*rhoInv[ijp1k]-w[ijm1k]*rhoInv[ijm1k]) + +J23_d[ijk]*dZi_d*(w[ijkp1]*rhoInv[ijkp1]-w[ijkm1]*rhoInv[ijkm1])); // term_33 = -tau_33*dw/dz term_33 = -tau_33[ijk]*0.5*(J31_d[ijk]*dXi_d*(w[ip1jk]*rhoInv[ip1jk]-w[im1jk]*rhoInv[im1jk]) +J32_d[ijk]*dYi_d*(w[ijp1k]*rhoInv[ijp1k]-w[ijm1k]*rhoInv[ijm1k]) @@ -326,7 +332,7 @@ __device__ void cudaDevice_sgstkeShearProd(float* tau_11, float* tau_12, float* /*----->>>>> __device__ void cudaDevice_GradScalar(); -------------------------------------------------- */ __device__ void cudaDevice_GradScalar(float* scalar, float* rhoInv, float* dedx, float* dedy, float* dedz, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ int i,j,k,ijk; int im1jk,ijm1k,ijkm1; @@ -349,9 +355,11 @@ __device__ void cudaDevice_GradScalar(float* scalar, float* rhoInv, float* dedx, if((i >= iMin_d-1)&&(i < iMax_d+1) && (j >= jMin_d-1)&&(j < jMax_d+1) && (k >= kMin_d-1)&&(k < kMax_d+1)){ - dedx[ijk] = 0.5*(dXi_d*(scalar[ip1jk]*rhoInv[ip1jk]-scalar[im1jk]*rhoInv[im1jk])); + dedx[ijk] = 0.5*(dXi_d*(scalar[ip1jk]*rhoInv[ip1jk]-scalar[im1jk]*rhoInv[im1jk]) + +J13_d[ijk]*dZi_d*(scalar[ijkp1]*rhoInv[ijkp1]-scalar[ijkm1]*rhoInv[ijkm1])); - dedy[ijk] = 0.5*(dYi_d*(scalar[ijp1k]*rhoInv[ijp1k]-scalar[ijm1k]*rhoInv[ijm1k])); + dedy[ijk] = 0.5*(dYi_d*(scalar[ijp1k]*rhoInv[ijp1k]-scalar[ijm1k]*rhoInv[ijm1k]) + +J23_d[ijk]*dZi_d*(scalar[ijkp1]*rhoInv[ijkp1]-scalar[ijkm1]*rhoInv[ijkm1])); dedz[ijk] = 0.5*(J31_d[ijk]*dXi_d*(scalar[ip1jk]*rhoInv[ip1jk]-scalar[im1jk]*rhoInv[im1jk]) +J32_d[ijk]*dYi_d*(scalar[ijp1k]*rhoInv[ijp1k]-scalar[ijm1k]*rhoInv[ijm1k]) @@ -364,7 +372,7 @@ __device__ void cudaDevice_GradScalar(float* scalar, float* rhoInv, float* dedx, /*----->>>>> __device__ void 
cudaDevice_sgstkeTurbTransport(); -------------------------------------------------- */ __device__ void cudaDevice_sgstkeTurbTransport(float* Km, float* dedx, float* dedy, float* dedz, float* rho, float* Frhs_sgstke, - float* J31_d, float* J32_d, float* J33_d){ + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d){ float term_x; float term_y; @@ -393,9 +401,11 @@ __device__ void cudaDevice_sgstkeTurbTransport(float* Km, float* dedx, float* de if((i >= iMin_d)&&(i < iMax_d) && (j >= jMin_d)&&(j < jMax_d) && (k >= kMin_d)&&(k < kMax_d)){ // -2.0*Km*de/dxi // 2.0 * 0.5 factor of the 2dx derivative cancel out - term_x = -(dXi_d*(dedx[ip1jk]*Km[ip1jk]*rho[ip1jk]-dedx[im1jk]*Km[im1jk]*rho[im1jk])); + term_x = -(dXi_d*(dedx[ip1jk]*Km[ip1jk]*rho[ip1jk]-dedx[im1jk]*Km[im1jk]*rho[im1jk]) + +J13_d[ijk]*dZi_d*(dedz[ijkp1]*Km[ijkp1]*rho[ijkp1]-dedz[ijkm1]*Km[ijkm1]*rho[ijkm1])); - term_y = -(dYi_d*(dedy[ijp1k]*Km[ijp1k]*rho[ijp1k]-dedy[ijm1k]*Km[ijm1k]*rho[ijm1k])); + term_y = -(dYi_d*(dedy[ijp1k]*Km[ijp1k]*rho[ijp1k]-dedy[ijm1k]*Km[ijm1k]*rho[ijm1k]) + +J23_d[ijk]*dZi_d*(dedz[ijkp1]*Km[ijkp1]*rho[ijkp1]-dedz[ijkm1]*Km[ijkm1]*rho[ijkm1])); term_z = -(J31_d[ijk]*dXi_d*(dedz[ip1jk]*Km[ip1jk]*rho[ip1jk]-dedz[im1jk]*Km[im1jk]*rho[im1jk]) +J32_d[ijk]*dYi_d*(dedz[ijp1k]*Km[ijp1k]*rho[ijp1k]-dedz[ijm1k]*Km[ijm1k]*rho[ijm1k]) diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice_cu.h b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice_cu.h index fed518c..6d8e930 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice_cu.h +++ b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice_cu.h @@ -50,7 +50,7 @@ extern "C" int cuda_sgstkeDeviceCleanup(); __global__ void cudaDevice_hydroCoreUnitTestCompleteSGSTKE(float* hydroFlds_d, float* hydroRhoInv_d, float* hydroTauFlds_d, float* hydroKappaM_d, float* dedxi_d, float* sgstke_ls_d, float* sgstkeScalars_d, float* sgstkeScalarsFrhs_d, - float* J31_d, float* J32_d, float* J33_d, float* D_Jac_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* 
J33_d, float* D_Jac_d); /*----->>>>> __device__ void cudaDevice_sgstkeLengthScale(); -------------------------------------------------- * This cuda kernel calculates the sub-grid length scale @@ -72,19 +72,19 @@ __device__ void cudaDevice_sgstkeDissip(float* sgstke, float* rhoInv, float* sgs */ __device__ void cudaDevice_sgstkeShearProd(float* tau_11, float* tau_12, float* tau_13, float* tau_22, float* tau_23, float* tau_33, float* u, float* v, float* w, float* rhoInv, float* Frhs_sgstke, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); /*----->>>>> __device__ void cudaDevice_GradScalar(); -------------------------------------------------- * This cuda kernel calculates the spatial gradient of a scalar field: 2delta, gradient located at the cell center */ __device__ void cudaDevice_GradScalar(float* scalar, float* rhoInv, float* dedx, float* dedy, float* dedz, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); /*----->>>>> __device__ void cudaDevice_sgstkeTurbTransport(); -------------------------------------------------- * This cuda kernel calculates the Turbulent Transport term in the SGSTKE equation */ __device__ void cudaDevice_sgstkeTurbTransport(float* Km, float* dedx, float* dedy, float* dedz, float* rho, float* Frhs_sgstke, - float* J31_d, float* J32_d, float* J33_d); + float* J13_d, float* J23_d, float* J31_d, float* J32_d, float* J33_d); #endif // _SGSTKE_CUDADEV_CU_H From 40025d8749953c4a6c6b9c50bb7ac846bed3582e Mon Sep 17 00:00:00 2001 From: Jeremy Sauer Date: Tue, 25 Mar 2025 18:14:25 -0600 Subject: [PATCH 5/5] Fix sign on J32 element in (unused, only for sanity check) device-sided Jacobian calculation routine --- SRC/GRID/CUDA/cuda_gridDevice.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SRC/GRID/CUDA/cuda_gridDevice.cu b/SRC/GRID/CUDA/cuda_gridDevice.cu index 4a779ed..bfa1074 100644 --- 
a/SRC/GRID/CUDA/cuda_gridDevice.cu +++ b/SRC/GRID/CUDA/cuda_gridDevice.cu @@ -268,7 +268,7 @@ __global__ void cudaDevice_calculateJacobians(float *J13_d, float *J23_d, float J23_d[ijk] = -(T[0]*T[5] - T[2]*T[3])*invD_Jac_d[ijk]; /*dz/d_(xi, eta, zeta)*/ J31_d[ijk] = (T[3]*T[7] - T[4]*T[6])*invD_Jac_d[ijk]; - J32_d[ijk] = (T[0]*T[7] - T[1]*T[6])*invD_Jac_d[ijk]; + J32_d[ijk] = -(T[0]*T[7] - T[1]*T[6])*invD_Jac_d[ijk]; J33_d[ijk] = (T[0]*T[4] - T[1]*T[3])*invD_Jac_d[ijk]; #ifdef DEBUG if((i==64)&&(j==64)&&(k==64)){