diff --git a/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu b/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu
index 6c55981..7d37b55 100644
--- a/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu
+++ b/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu
@@ -102,12 +102,12 @@ extern "C" int cuda_GADDeviceSetup(){
     cudaMemcpyToSymbol(numgridCells_away_d, &numgridCells_away, sizeof(int));
 
     /*Device memory allocations and Host-to-Device memcopy for turbine arrays */
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineType_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRank_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefi_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefj_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefk_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineYawing_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineType_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRank_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefi_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefj_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefk_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineYawing_d);
     cudaMemcpy(GAD_turbineType_d, GAD_turbineType, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRank_d, GAD_turbineRank, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRefi_d, GAD_turbineRefi, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
@@ -115,13 +115,13 @@ extern "C" int cuda_GADDeviceSetup(){
     cudaMemcpy(GAD_turbineRefk_d, GAD_turbineRefk, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineYawing_d, GAD_turbineYawing, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefMag_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefDir_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Xcoords_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Ycoords_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_rotorTheta_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_yawError_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_anFactor_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefMag_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefDir_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Xcoords_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Ycoords_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_rotorTheta_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_yawError_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_anFactor_d);
     cudaMemcpy(GAD_turbineRefMag_d, GAD_turbineRefMag, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRefDir_d, GAD_turbineRefDir, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_Xcoords_d, GAD_Xcoords, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
@@ -136,10 +136,10 @@ extern "C" int cuda_GADDeviceSetup(){
     cudaMemcpy(GAD_yawError_d, GAD_yawError, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_anFactor_d, GAD_anFactor, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     
-    fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineUseries_d);
-    fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineVseries_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &u_sampAvg_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &v_sampAvg_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineUseries_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineVseries_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &u_sampAvg_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &v_sampAvg_d);
 
     //Initialize u_sampAvg & GAD_turbineUseries as constant (per-turbine) then send down to the device 
     tmp_vector = (float *) malloc(GADrefSeriesLength*sizeof(float));
@@ -171,40 +171,40 @@ extern "C" int cuda_GADDeviceSetup(){
     }
     free(tmp_vector);
 
-    fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_hubHeights_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_rotorD_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_nacelleD_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_hubHeights_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_rotorD_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_nacelleD_d);
     cudaMemcpy(GAD_hubHeights_d, GAD_hubHeights, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_rotorD_d, GAD_rotorD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_nacelleD_d, GAD_nacelleD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
    
      
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyTwist_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyChord_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyPitch_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyOmega_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyTwist_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyChord_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyPitch_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyOmega_d);
     cudaMemcpy(turbinePolyTwist_d, turbinePolyTwist, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyChord_d, turbinePolyChord, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyPitch_d, turbinePolyPitch, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyOmega_d, turbinePolyOmega, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), &rnorm_vect_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*alphaBounds*sizeof(float), &alpha_minmax_vect_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCl_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCd_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)), &rnorm_vect_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*alphaBounds), &alpha_minmax_vect_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCl_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCd_d);
 
     cudaMemcpy(rnorm_vect_d, rnorm_vect, GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(alpha_minmax_vect_d, alpha_minmax_vect, GADNumTurbineTypes*alphaBounds*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyCd_d, turbinePolyCd, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyCl_d, turbinePolyCl, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_turbineVolMask_d);
+    fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_turbineVolMask_d);
     cudaMemcpy(GAD_turbineVolMask_d, GAD_turbineVolMask, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
 
     if (GADoutputForces == 1){
-      fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceX_d);
-      fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceY_d);
-      fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceZ_d);
+      fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceX_d);
+      fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceY_d);
+      fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceZ_d);
       cudaMemcpy(GAD_forceX_d, GAD_forceX, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
       cudaMemcpy(GAD_forceY_d, GAD_forceY, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
       cudaMemcpy(GAD_forceZ_d, GAD_forceZ, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
diff --git a/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu b/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu
index 25c355e..1d02ee1 100644
--- a/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu
+++ b/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu
@@ -29,21 +29,21 @@ float *urban_heat_redis_d;                 /* Base Address of memory containing
 */
 extern "C" int cuda_urbanDeviceSetup(){
    int errorCode = CUDA_URBAN_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(urbanSelector_d, &urbanSelector, sizeof(int));
    cudaMemcpyToSymbol(cd_build_d, &cd_build, sizeof(float));
    cudaMemcpyToSymbol(ct_build_d, &ct_build, sizeof(float));
 
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &building_mask_d);
+   Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
+   fecuda_DeviceMalloc(Nelems, &building_mask_d);
    cudaMemcpy(building_mask_d, building_mask, Nelems*sizeof(float), cudaMemcpyHostToDevice);
 
    cudaMemcpyToSymbol(delta_aware_bdg_d, &delta_aware_bdg, sizeof(float));
 
    if(urban_heatRedis > 0){
      Nelems = (Nxp+2*Nh)*(Nyp+2*Nh);
-     fecuda_DeviceMalloc(Nelems*sizeof(float), &urban_heat_redis_d);
+     fecuda_DeviceMalloc(Nelems, &urban_heat_redis_d);
      cudaMemcpy(urban_heat_redis_d, urban_heat_redis, Nelems*sizeof(float), cudaMemcpyHostToDevice);
    }
 
diff --git a/SRC/FECUDA/fecuda_Device_cu.h b/SRC/FECUDA/fecuda_Device_cu.h
index ac1981d..6c64b3f 100644
--- a/SRC/FECUDA/fecuda_Device_cu.h
+++ b/SRC/FECUDA/fecuda_Device_cu.h
@@ -55,11 +55,6 @@ extern __constant__ int rankYid_d;
 */
 extern "C" int fecuda_DeviceSetup(int tBx, int tBy, int tBz);
 
-/*----->>>>> void fecuda_DeviceMallocInt();    -----------------------------------------------------------
-* Used to allocate device memory integer blocks and set the  host memory addresses of device memory pointers.
-*/
-extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d);
-
 /*----->>>>> int fecuda_SetBlocksPerGrid();   ------------------------------------------------------------------
  * Used to set the "dim3 grid" module variable that is passed to any device kernel 
  * to specify the number of blocks per grid in each dimenaion
diff --git a/SRC/FECUDA/fecuda_Utils.cu b/SRC/FECUDA/fecuda_Utils.cu
index 136c5f1..6bc9e65 100644
--- a/SRC/FECUDA/fecuda_Utils.cu
+++ b/SRC/FECUDA/fecuda_Utils.cu
@@ -180,19 +180,19 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers(){
 /*----->>>>> void fecuda_DeviceMalloc();    -----------------------------------------------------------
 * Used to allocate device memory float blocks and set the  host memory addresses of device memory pointers.
 */
-extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d) {
-    cudaMalloc((void**)memBlock_d,sizeof(float)*Nelems);
+extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d) {
+    cudaMalloc((void**)memBlock_d,(size_t)(sizeof(float))*Nelems);
     gpuErrchk( cudaPeekAtLastError() );
-    cudaMemset(*memBlock_d,'\0',sizeof(float)*Nelems);    
+    cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(float))*Nelems);    
     gpuErrchk( cudaPeekAtLastError() );
 #ifdef DEBUG
     printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d);
 #endif
 }
-extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d) {
-    cudaMalloc((void**)memBlock_d,sizeof(int)*Nelems);
+extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d) {
+    cudaMalloc((void**)memBlock_d,(size_t)(sizeof(int))*Nelems);
     gpuErrchk( cudaPeekAtLastError() );
-    cudaMemset(*memBlock_d,'\0',sizeof(int)*Nelems);
+    cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(int))*Nelems);
     gpuErrchk( cudaPeekAtLastError() );
 #ifdef DEBUG
     printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d);
diff --git a/SRC/FECUDA/fecuda_Utils_cu.h b/SRC/FECUDA/fecuda_Utils_cu.h
index 3f66b97..2ede109 100644
--- a/SRC/FECUDA/fecuda_Utils_cu.h
+++ b/SRC/FECUDA/fecuda_Utils_cu.h
@@ -41,7 +41,12 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers();
 /*----->>>>> void fecuda_DeviceMalloc();    -----------------------------------------------------------
 * Used to allocate device memory float blocks and set the  host memory addresses of device memory pointers.
 */
-extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d);
+extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d);
+
+/*----->>>>> void fecuda_DeviceMallocInt();    -----------------------------------------------------------
+* Used to allocate device memory integer blocks and set the  host memory addresses of device memory pointers.
+*/
+extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d);
 
 /*----->>>>> int fecuda_SendRecvWestEast(); -------------------------------------------------------------------
 Used to perform western/eastern device domain halo exchange for an arbitrary field.
diff --git a/SRC/GRID/CUDA/cuda_gridDevice.cu b/SRC/GRID/CUDA/cuda_gridDevice.cu
index 2ede7ff..c0abd8d 100644
--- a/SRC/GRID/CUDA/cuda_gridDevice.cu
+++ b/SRC/GRID/CUDA/cuda_gridDevice.cu
@@ -64,7 +64,7 @@ float *invD_Jac_d; //inverse Determinant of the Jacbian
 */
 extern "C" int cuda_gridDeviceSetup(){
    int errorCode = CUDA_GRID_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 #ifdef DEBUG 
    cudaEvent_t startE, stopE;
    float elapsedTime;
@@ -100,21 +100,21 @@ extern "C" int cuda_gridDeviceSetup(){
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/
 
    /*Set the full memory block number of elements for grid fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); 
+   Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); 
    /* Allocate the GRID arrays */
    /* Coordinate Arrays */
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &xPos_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &yPos_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &zPos_d);
-   fecuda_DeviceMalloc(((Nxp+2*Nh)*(Nyp+2*Nh))*sizeof(float), &topoPos_d);
+   fecuda_DeviceMalloc(Nelems, &xPos_d);
+   fecuda_DeviceMalloc(Nelems, &yPos_d);
+   fecuda_DeviceMalloc(Nelems, &zPos_d);
+   fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)), &topoPos_d);
    /* Metric Tensors Fields */
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J31_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J32_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J33_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &D_Jac_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &invD_Jac_d);
+   fecuda_DeviceMalloc(Nelems, &J13_d);
+   fecuda_DeviceMalloc(Nelems, &J23_d);
+   fecuda_DeviceMalloc(Nelems, &J31_d);
+   fecuda_DeviceMalloc(Nelems, &J32_d);
+   fecuda_DeviceMalloc(Nelems, &J33_d);
+   fecuda_DeviceMalloc(Nelems, &D_Jac_d);
+   fecuda_DeviceMalloc(Nelems, &invD_Jac_d);
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMalloc calls*/
 
    /* cudaMemcpy the GRID arrays from Host to Device*/
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu
index 51ed665..1749cd5 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu
@@ -66,22 +66,22 @@ extern "C" int cuda_BCsDeviceSetup(){
    /*Allocate arrays*/
    if(hydroBCs==1){ //Using LAD BCs
      if((rankYid == 0)||(rankYid == numProcsY-1)){
-       fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanes_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesNext_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesBuffer_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanes_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesNext_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesBuffer_d);
      }
      if((rankXid == 0)||(rankXid == numProcsX-1)){
-       fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanes_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesNext_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesBuffer_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanes_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesNext_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesBuffer_d);
      }
-     fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanes_d);
-     fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesNext_d);
-     fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesBuffer_d);
+     fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanes_d);
+     fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesNext_d);
+     fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesBuffer_d);
      if(surflayerSelector == 3){
-       fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanes_d);
-       fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesNext_d);
-       fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesBuffer_d);
+       fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanes_d);
+       fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesNext_d);
+       fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesBuffer_d);
      }
    }//end if hydroBCs == 1
   
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu
index 5215273..876fbd0 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu
@@ -24,13 +24,13 @@ float *hydroBaseStatePres_d;   /*Base Adress of memory containing the diagnostic
 */
 extern "C" int cuda_BaseStateDeviceSetup(){
    int errorCode = CUDA_BASESTATE_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    /*Set the full memory block number of elements for base-state fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
+   Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
    /* Allocate the Base State arrays on the device */
-   fecuda_DeviceMalloc(Nelems*2*sizeof(float), &hydroBaseStateFlds_d);  //Only rho and theta base-state variables
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroBaseStatePres_d);  //Only base-state pressure 
+   fecuda_DeviceMalloc(Nelems*2, &hydroBaseStateFlds_d);  //Only rho and theta base-state variables
+   fecuda_DeviceMalloc(Nelems, &hydroBaseStatePres_d);  //Only base-state pressure 
 
    /* Send the Base State arrays down to the device */
    cudaMemcpy(hydroBaseStateFlds_d, hydroBaseStateFlds, Nelems*2*sizeof(float), cudaMemcpyHostToDevice);
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu
index dfd26a9..c869c9f 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu
@@ -25,15 +25,15 @@ __constant__ float b_hyb_d;                      /*hybrid advection scheme param
 */
 extern "C" int cuda_advectionDeviceSetup(){
    int errorCode = CUDA_ADVECTION_SUCCESS;
-   int Nelems;
+   size_t Nelems;
    
    cudaMemcpyToSymbol(advectionSelector_d, &advectionSelector, sizeof(int));
    cudaMemcpyToSymbol(ceilingAdvectionBC_d, &ceilingAdvectionBC, sizeof(int));
    cudaMemcpyToSymbol(b_hyb_d, &b_hyb, sizeof(float));
 
    /*Set the full memory block number of elements for hydroCore fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*3*sizeof(float), &hydroFaceVels_d); /*Cell-face Velocities*/
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*widen each factor first: casting the finished int product cannot undo an overflow*/
+   fecuda_DeviceMalloc(Nelems*3, &hydroFaceVels_d); /*Cell-face Velocities; argument is an element count, the helper applies sizeof(float)*/
 
    return(errorCode);
 } //end cuda_advectionDeviceSetup()
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu
index 7de459d..21c1d96 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu
@@ -35,7 +35,7 @@ __device__ __constant__ float srcAuxScMassSpecValue_d[MAX_AUXSC_SRC]; /*Mass spe
 */
 extern "C" int cuda_auxScalarsDeviceSetup(){
    int errorCode = CUDA_AUXSCALARS_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(NhydroAuxScalars_d, &NhydroAuxScalars, sizeof(int));
    if (NhydroAuxScalars > 0){
@@ -51,11 +51,11 @@ extern "C" int cuda_auxScalarsDeviceSetup(){
    }//end if NydroAuxScalars > 0
 
    if (NhydroAuxScalars > 0){
-     Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-     fecuda_DeviceMalloc(Nelems*NhydroAuxScalars*sizeof(float), &hydroAuxScalars_d); /*Prognostic variable fields*/
-     fecuda_DeviceMalloc(Nelems*NhydroAuxScalars*sizeof(float), &hydroAuxScalarsFrhs_d); /*Prognostic variable Frhs*/
+     Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
+     fecuda_DeviceMalloc(Nelems*NhydroAuxScalars, &hydroAuxScalars_d); /*Prognostic variable fields*/
+     fecuda_DeviceMalloc(Nelems*NhydroAuxScalars, &hydroAuxScalarsFrhs_d); /*Prognostic variable Frhs*/
      if ((turbulenceSelector > 0) && (AuxScSGSturb > 0)){
-       fecuda_DeviceMalloc(Nelems*3*sizeof(float), &AuxScalarsTauFlds_d);
+       fecuda_DeviceMalloc(Nelems*3, &AuxScalarsTauFlds_d);
      }
    } // end if NhydroAuxScalars > 0
 
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu
index 7f30608..60a0f26 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu
@@ -26,9 +26,9 @@ float *canopy_lad_d;          /* Base Address of memory containing leaf area den
 */
 extern "C" int cuda_canopyDeviceSetup(){
    int errorCode = CUDA_CANOPY_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
+   Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
 
    cudaMemcpyToSymbol(canopySelector_d, &canopySelector, sizeof(int));
    cudaMemcpyToSymbol(canopySkinOpt_d, &canopySkinOpt, sizeof(int));
@@ -36,7 +36,7 @@ extern "C" int cuda_canopyDeviceSetup(){
    cudaMemcpyToSymbol(canopy_lf_d, &canopy_lf, sizeof(float));
 
    Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &canopy_lad_d);
+   fecuda_DeviceMalloc(Nelems, &canopy_lad_d);
    cudaMemcpy(canopy_lad_d, canopy_lad, Nelems*sizeof(float), cudaMemcpyHostToDevice);
 
    return(errorCode);
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu
index 81d9d56..c2133cb 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu
@@ -32,7 +32,8 @@ float* randcp_d;            /*Base address for pseudo-random numbers used for ce
 */
 extern "C" int cuda_cellpertDeviceSetup(){
    int errorCode = CUDA_CELLPERT_SUCCESS;
-   int Nelems1d_xy, Nelems1d;
+   int Nelems1d_xy;
+   size_t Nelems1d;
 
    /*Constants*/
    cudaMemcpyToSymbol(cellpertSelector_d, &cellpertSelector, sizeof(int));
@@ -45,8 +46,8 @@ extern "C" int cuda_cellpertDeviceSetup(){
    cudaMemcpyToSymbol(cellpert_ktop_d, &cellpert_ktop, sizeof(int));
 
    Nelems1d_xy = (Nx/cellpert_gppc+min(Nx%cellpert_gppc,1))*(2*cellpert_ndbc+min(Ny%cellpert_gppc,1)) + (Ny/cellpert_gppc-2*cellpert_ndbc)*(2*cellpert_ndbc+min(Nx%cellpert_gppc,1));
-   Nelems1d = Nelems1d_xy*(cellpert_ktop-cellpert_kbottom+1);
-   fecuda_DeviceMalloc(Nelems1d*sizeof(float), &randcp_d);
+   Nelems1d = (size_t)(Nelems1d_xy*(cellpert_ktop-cellpert_kbottom+1));
+   fecuda_DeviceMalloc(Nelems1d, &randcp_d);
 
    return(errorCode);
 } //end cuda_cellpertDeviceSetup()
@@ -94,7 +95,7 @@ extern "C" int cuda_hydroCoreDeviceBuildCPmethod(int simTime_it){
    n_tot = n_xy*(cellpert_ktop-cellpert_kbottom+1);
 
    curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
-   curandSetPseudoRandomGeneratorSeed(gen,simTime_it);
+   curandSetPseudoRandomGeneratorSeed(gen,(unsigned long long)simTime_it);
    curandGenerateUniform(gen,randcp_d,n_tot);
 
 #ifdef URBAN_EXT
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu
index d9c0e58..5c691ad 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu
@@ -90,7 +90,7 @@ __constant__ float L_v_d;            /* latent heat of vaporization (J/kg) */
 */
 extern "C" int cuda_hydroCoreDeviceSetup(){
    int errorCode = CUDA_HYDRO_CORE_SUCCESS;
-   int Nelems;
+   size_t Nelems;
  
    /*Synchronize the Device*/
    gpuErrchk( cudaDeviceSynchronize() );
@@ -133,11 +133,11 @@ extern "C" int cuda_hydroCoreDeviceSetup(){
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/
 
    /*Set the full memory block number of elements for hydroCore fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); 
+   Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); 
    /* Allocate the HYDRO_CORE arrays */
-   fecuda_DeviceMalloc(Nelems*Nhydro*sizeof(float), &hydroFlds_d); /*Prognostic variable fields*/ 
-   fecuda_DeviceMalloc(Nelems*Nhydro*sizeof(float), &hydroFldsFrhs_d); /*Prognostic variable field Frhs(s)*/ 
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroRhoInv_d); 
+   fecuda_DeviceMalloc(Nelems*(size_t)Nhydro, &hydroFlds_d); /*Prognostic variable fields*/ 
+   fecuda_DeviceMalloc(Nelems*(size_t)Nhydro, &hydroFldsFrhs_d); /*Prognostic variable field Frhs(s)*/ 
+   fecuda_DeviceMalloc(Nelems, &hydroRhoInv_d); 
    
    /*AUXILIARY SCALARS*/
    if(NhydroAuxScalars > 0){
@@ -1282,7 +1282,7 @@ extern "C" int cuda_hydroCoreInitFieldsDevice(){
      }
    }// end if surflayerSelector > 0
    if(NhydroAuxScalars > 0){ /*Copy any required host auxiliary sclar fields to the device */
-     cudaMemcpy(hydroAuxScalars_d, hydroAuxScalars, Nelems*NhydroAuxScalars*sizeof(float), cudaMemcpyHostToDevice);
+     cudaMemcpy(hydroAuxScalars_d, hydroAuxScalars, (size_t)(Nelems)*(size_t)(NhydroAuxScalars*sizeof(float)), cudaMemcpyHostToDevice);
    }// end if hydroAuxScalars > 0
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/
    gpuErrchk( cudaDeviceSynchronize() );
@@ -1333,9 +1333,9 @@ extern "C" int cuda_hydroCoreSynchFieldsFromDevice(){
      }
    }//endif surflayerSelector > 0
    if(NhydroAuxScalars > 0){
-     gpuErrchk( cudaMemcpy(hydroAuxScalars, hydroAuxScalars_d, Nelems*NhydroAuxScalars*sizeof(float), cudaMemcpyDeviceToHost) );
+     gpuErrchk( cudaMemcpy(hydroAuxScalars, hydroAuxScalars_d, (size_t)(Nelems)*(size_t)(NhydroAuxScalars*sizeof(float)), cudaMemcpyDeviceToHost) );
      if((hydroForcingWrite==1)||(hydroForcingLog==1)){
-       gpuErrchk( cudaMemcpy(hydroAuxScalarsFrhs, hydroAuxScalarsFrhs_d, Nelems*NhydroAuxScalars*sizeof(float), cudaMemcpyDeviceToHost) );
+       gpuErrchk( cudaMemcpy(hydroAuxScalarsFrhs, hydroAuxScalarsFrhs_d, (size_t)(Nelems)*(size_t)(NhydroAuxScalars*sizeof(float)), cudaMemcpyDeviceToHost) );
      } //endif we need to send up the Frhs
    } //end if NhydroAuxScalars > 0
    if(hydroSubGridWrite==1){
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu
index fcd076f..c0eb479 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu
@@ -42,7 +42,7 @@ float* lsf_meanPhiBlock_d;              /*Base address of work arrray for block
 */
 extern "C" int cuda_lsfDeviceSetup(){
    int errorCode = CUDA_LSF_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(lsfSelector_d, &lsfSelector, sizeof(int));
    cudaMemcpyToSymbol(lsf_w_surf_d, &lsf_w_surf, sizeof(float));
@@ -69,9 +69,9 @@ extern "C" int cuda_lsfDeviceSetup(){
        fflush(stdout);
      }
      cudaMemcpyToSymbol(lsf_numPhiVars_d, &lsf_numPhiVars, sizeof(float));
-     Nelems = (Nzp+2*Nh);
-     fecuda_DeviceMalloc(Nelems*lsf_numPhiVars*sizeof(float), &lsf_slabMeanPhiProfiles_d);
-     fecuda_DeviceMalloc(grid_red.x*grid_red.y*grid_red.z*sizeof(float), &lsf_meanPhiBlock_d);
+     Nelems = (size_t)(Nzp+2*Nh);
+     fecuda_DeviceMalloc(Nelems*(size_t)lsf_numPhiVars, &lsf_slabMeanPhiProfiles_d);
+     fecuda_DeviceMalloc((size_t)(grid_red.x*grid_red.y*grid_red.z), &lsf_meanPhiBlock_d);
    }
 
    return(errorCode);
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu
index ecd39a9..0d6588f 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu
@@ -35,7 +35,7 @@ float* fcond_d;                          /*Base address for f_cond array*/
 */
 extern "C" int cuda_moistureDeviceSetup(){
    int errorCode = CUDA_MOISTURE_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(moistureSelector_d, &moistureSelector, sizeof(int));
    if (moistureSelector > 0){
@@ -49,11 +49,11 @@ extern "C" int cuda_moistureDeviceSetup(){
      cudaMemcpyToSymbol(moistureCondBasePres_d, &moistureCondBasePres, sizeof(int));
      cudaMemcpyToSymbol(moistureMPcallTscale_d, &moistureMPcallTscale, sizeof(float));
 
-     Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-     fecuda_DeviceMalloc(Nelems*moistureNvars*sizeof(float), &moistScalars_d);
-     fecuda_DeviceMalloc(Nelems*moistureNvars*sizeof(float), &moistScalarsFrhs_d);
-     fecuda_DeviceMalloc(Nelems*moistureNvars*3*sizeof(float), &moistTauFlds_d);
-     fecuda_DeviceMalloc(Nelems*sizeof(float), &fcond_d);
+     Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
+     fecuda_DeviceMalloc(Nelems*moistureNvars, &moistScalars_d);
+     fecuda_DeviceMalloc(Nelems*moistureNvars, &moistScalarsFrhs_d);
+     fecuda_DeviceMalloc(Nelems*moistureNvars*3, &moistTauFlds_d);
+     fecuda_DeviceMalloc(Nelems, &fcond_d);
    }
 
    return(errorCode);
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu
index 18021c7..453bf16 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu
@@ -27,15 +27,15 @@ float* hydroNuGradZFlds_d;                 /* Base address for diffusion for nu*
 */
 extern "C" int cuda_molecularDiffDeviceSetup(){
    int errorCode = CUDA_MOLDIFF_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(diffusionSelector_d, &diffusionSelector, sizeof(int));
    cudaMemcpyToSymbol(nu_0_d, &nu_0, sizeof(float));
 
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*(Nhydro-1)*sizeof(float), &hydroNuGradXFlds_d); // all Nhydro except density 
-   fecuda_DeviceMalloc(Nelems*(Nhydro-1)*sizeof(float), &hydroNuGradYFlds_d);  
-   fecuda_DeviceMalloc(Nelems*(Nhydro-1)*sizeof(float), &hydroNuGradZFlds_d); 
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*widen each extent first so the product is formed in size_t, not int*/
+   fecuda_DeviceMalloc(Nelems*(Nhydro-1), &hydroNuGradXFlds_d); // all Nhydro except density 
+   fecuda_DeviceMalloc(Nelems*(Nhydro-1), &hydroNuGradYFlds_d);  
+   fecuda_DeviceMalloc(Nelems*(Nhydro-1), &hydroNuGradZFlds_d); 
 
    return(errorCode);
 } //end cuda_molecularDiffDeviceSetup()
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu
index 26f3eb7..db8e94f 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu
@@ -23,14 +23,14 @@ float *hydroPres_d;            /*Base Adress of memory containing the diagnostic
 */
 extern "C" int cuda_pressureDeviceSetup(){
    int errorCode = CUDA_PRESSURE_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    //Copy the pgfSelector constant to device constant-memory
    cudaMemcpyToSymbol(pgfSelector_d, &pgfSelector, sizeof(int));
 
    /*Set the full memory block number of elements for hydroCore fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroPres_d);
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*widen each extent first so the product is formed in size_t, not int*/
+   fecuda_DeviceMalloc(Nelems, &hydroPres_d);
 
    return(errorCode);
 } //end cuda_pressureDeviceSetup()
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu
index 77a449e..e5da676 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu
@@ -26,17 +26,17 @@ float* hydroKappaM_d;  /*Base address for KappaM (eddy diffusivity for momentum)
 */
 extern "C" int cuda_sgsTurbDeviceSetup(){
    int errorCode = CUDA_SGSTURB_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(turbulenceSelector_d, &turbulenceSelector, sizeof(int));
    cudaMemcpyToSymbol(TKESelector_d, &TKESelector, sizeof(int));
    cudaMemcpyToSymbol(c_s_d, &c_s, sizeof(float));
    cudaMemcpyToSymbol(c_k_d, &c_k, sizeof(float));
 
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*widen each extent first so the product is formed in size_t, not int*/
 
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroKappaM_d); 
-   fecuda_DeviceMalloc(Nelems*9*sizeof(float), &hydroTauFlds_d);
+   fecuda_DeviceMalloc(Nelems, &hydroKappaM_d); 
+   fecuda_DeviceMalloc(Nelems*9, &hydroTauFlds_d);
   
    /* Done */
    return(errorCode);
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu
index af6e39b..0ca8740 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu
@@ -31,13 +31,13 @@ float* dedxi_d; /*Base address for d(SGSTKE)/dxi field arrays*/
 */
 extern "C" int cuda_sgstkeDeviceSetup(){
    int errorCode = CUDA_SGSTKE_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*TKESelector*sizeof(float), &sgstkeScalars_d);
-   fecuda_DeviceMalloc(Nelems*TKESelector*sizeof(float), &sgstkeScalarsFrhs_d);
-   fecuda_DeviceMalloc(Nelems*TKESelector*sizeof(float), &sgstke_ls_d);
-   fecuda_DeviceMalloc(Nelems*TKESelector*3*sizeof(float), &dedxi_d);
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*widen each extent first so the product is formed in size_t, not int*/
+   fecuda_DeviceMalloc(Nelems*TKESelector, &sgstkeScalars_d);
+   fecuda_DeviceMalloc(Nelems*TKESelector, &sgstkeScalarsFrhs_d);
+   fecuda_DeviceMalloc(Nelems*TKESelector, &sgstke_ls_d);
+   fecuda_DeviceMalloc(Nelems*TKESelector*3, &dedxi_d);
 
    cudaMemcpyToSymbol(TKEAdvSelector_d, &TKEAdvSelector, sizeof(int));
    cudaMemcpyToSymbol(TKEAdvSelector_b_hyb_d, &TKEAdvSelector_b_hyb, sizeof(float));
diff --git a/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu
index 47f0218..dfe92cf 100644
--- a/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu
+++ b/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu
@@ -60,7 +60,7 @@ float *sea_mask_d;
 */
 extern "C" int cuda_surfaceLayerDeviceSetup(){
    int errorCode = CUDA_SURFLAYER_SUCCESS;
-   int Nelems2d;
+   size_t Nelems2d;
 
    cudaMemcpyToSymbol(surflayerSelector_d, &surflayerSelector, sizeof(int));
    cudaMemcpyToSymbol(surflayer_z0_d, &surflayer_z0, sizeof(float));
@@ -80,19 +80,19 @@ extern "C" int cuda_surfaceLayerDeviceSetup(){
    cudaMemcpyToSymbol(surflayer_ideal_qte_d, &surflayer_ideal_qte, sizeof(float));
    cudaMemcpyToSymbol(surflayer_ideal_qamp_d, &surflayer_ideal_qamp, sizeof(float));
 
-   Nelems2d = (Nxp+2*Nh)*(Nyp+2*Nh);  //2-d element count
+   Nelems2d = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh);  //2-d element count; each extent is widened to size_t before multiplying
 
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &cdFld_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &chFld_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &cqFld_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &fricVel_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &htFlux_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &qFlux_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &tskin_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &qskin_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &invOblen_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &z0m_d);
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &z0t_d);
+   fecuda_DeviceMalloc(Nelems2d, &cdFld_d);
+   fecuda_DeviceMalloc(Nelems2d, &chFld_d);
+   fecuda_DeviceMalloc(Nelems2d, &cqFld_d);
+   fecuda_DeviceMalloc(Nelems2d, &fricVel_d);
+   fecuda_DeviceMalloc(Nelems2d, &htFlux_d);
+   fecuda_DeviceMalloc(Nelems2d, &qFlux_d);
+   fecuda_DeviceMalloc(Nelems2d, &tskin_d);
+   fecuda_DeviceMalloc(Nelems2d, &qskin_d);
+   fecuda_DeviceMalloc(Nelems2d, &invOblen_d);
+   fecuda_DeviceMalloc(Nelems2d, &z0m_d);
+   fecuda_DeviceMalloc(Nelems2d, &z0t_d);
 
    // offshore
    cudaMemcpyToSymbol(surflayer_offshore_d, &surflayer_offshore, sizeof(int));
@@ -104,7 +104,7 @@ extern "C" int cuda_surfaceLayerDeviceSetup(){
    cudaMemcpyToSymbol(surflayer_offshore_theta_d, &surflayer_offshore_theta, sizeof(float));
    cudaMemcpyToSymbol(surflayer_offshore_visc_d, &surflayer_offshore_visc, sizeof(int));
 
-   fecuda_DeviceMalloc(Nelems2d*sizeof(float), &sea_mask_d);
+   fecuda_DeviceMalloc(Nelems2d, &sea_mask_d);
    if (surflayer_offshore > 0){
      cudaMemcpy(sea_mask_d, sea_mask, Nelems2d*sizeof(float), cudaMemcpyHostToDevice);
    }
@@ -460,7 +460,7 @@ __device__ void cudaDevice_SurfaceLayerMOSTdry(int ijk, float* u, float* v, floa
    tauyz = -cd_i*sqrtf(powf(*u/ *rho,2.0)+powf(*v/ *rho,2.0))*(*v);
    *tau31 = tauxz;
    *tau32 = tauyz;
-   *fricVel = powf(powf(tauxz,2.0)+powf(tauyz,2.0),0.25);
+   *fricVel = powf(powf(tauxz/(*rho),2.0)+powf(tauyz/(*rho),2.0),0.25);
    tauthz = (*htFlux)*(*rho);
    *tauTH3 = tauthz;
    *invOblen = -(kappa_d*accel_g_d*(*htFlux))/(powf((*fricVel),3.0)*th1);
@@ -573,7 +573,7 @@ __device__ void cudaDevice_SurfaceLayerMOSTmoist(int ijk, float* u, float* v, fl
    tauyz = -cd_i*sqrtf(powf(*u/ *rho,2.0)+powf(*v/ *rho,2.0))*(*v);
    *tau31 = tauxz;
    *tau32 = tauyz;
-   *fricVel = powf(powf(tauxz,2.0)+powf(tauyz,2.0),0.25);
+   *fricVel = powf(powf(tauxz/(*rho),2.0)+powf(tauyz/(*rho),2.0),0.25);
    tauthz = (*htFlux)*(*rho);
    *tauTH3 = tauthz;
    tauqz = (*qFlux)*(*rho); // specified qflux or delta-qv-based flux assumes qv units of g/kg
diff --git a/SRC/HYDRO_CORE/hydro_core.h b/SRC/HYDRO_CORE/hydro_core.h
index b24f239..38214e3 100644
--- a/SRC/HYDRO_CORE/hydro_core.h
+++ b/SRC/HYDRO_CORE/hydro_core.h
@@ -37,7 +37,7 @@
 #define THETA_INDX_BS         1
 
 #define MAX_HC_FLDNAME_LENGTH 256
-#define MAX_AUXSC_SRC         256
+#define MAX_AUXSC_SRC         1280 
 /*#################------------------- HYDRO_CORE module variable declarations ---------------------#################*/
 /* Parameters */
 extern int Nhydro;          /*Number of prognostic variable fields under hydro_core */
diff --git a/SRC/MEM_UTILS/mem_utils.c b/SRC/MEM_UTILS/mem_utils.c
index 2276b99..c0664d6 100644
--- a/SRC/MEM_UTILS/mem_utils.c
+++ b/SRC/MEM_UTILS/mem_utils.c
@@ -137,16 +137,21 @@ float * memAllocateFloat4DField(int Nfields, int iN, int jN, int kN, int halo_ex
   float *blockOfFields;
   void  *m_field;
   void  *memsetReturnVal;
+  size_t Nbytes;
 
+  Nbytes = (size_t)(Nfields)*(size_t)(iN+2*halo_extent)*(size_t)(jN+2*halo_extent)*(size_t)(kN+2*halo_extent)*sizeof(float); /*widen every factor so no partial product is formed in int*/
   if(posix_memalign(&m_field, ALIGN_SIZE, 
-                    (Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float))) {
-     fprintf(stderr, "Rank %d/%d memAllocateFloat4DField(%s): Memory Allocation of m_field failed!\n",
-             mpi_rank_world,mpi_size_world,fieldName);
+                    Nbytes)) {
+                    //(Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float))) {
+     fprintf(stderr, "Rank %d/%d memAllocateFloat4DField(%s): Memory Allocation of m_field with %zu bytes failed!\n",
+             mpi_rank_world,mpi_size_world,fieldName,Nbytes);
+     fflush(stdout);
      exit(1);
   } // if
   
   /*initialize the allocated space to zero everywhere*/
-  memsetReturnVal = memset(m_field,0,(Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float));
+  memsetReturnVal = memset(m_field,0,Nbytes);
+  //memsetReturnVal = memset(m_field,0,(Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float));
 
   blockOfFields = (float *) m_field;
 
diff --git a/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu b/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu
index 649956b..c91c473 100644
--- a/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu
+++ b/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu
@@ -52,7 +52,7 @@ float *timeFrhsTmp_d; /* Multistage time scheme variable fields Frhs 4-D array *
 */
 extern "C" int cuda_timeIntDeviceSetup(){
    int errorCode = CUDA_TIME_INTEGRATION_SUCCESS;
-   int Nelems;
+   size_t Nelems;
    int NtimeTotVars;
   
    /*Synchronize the Device*/
@@ -68,11 +68,11 @@ extern "C" int cuda_timeIntDeviceSetup(){
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/
 
    /*Set the full memory block number of elements for timeInt fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); 
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*widen each extent first so the product is formed in size_t, not int*/
    /* Allocate the TIME_INTEGRATION arrays */
    /*TIME_INTEGRATION/CUDA internal device arrays*/
    NtimeTotVars = 5 + TKESelector*turbulenceSelector + moistureNvars*moistureSelector + NhydroAuxScalars; 
-   fecuda_DeviceMalloc(NtimeTotVars*Nelems*sizeof(float), &timeFlds0_d);
+   fecuda_DeviceMalloc(NtimeTotVars*Nelems, &timeFlds0_d);
    
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMalloc calls*/
 
diff --git a/docs/publications.rst b/docs/publications.rst
index 86e5ab7..2897cc3 100644
--- a/docs/publications.rst
+++ b/docs/publications.rst
@@ -4,6 +4,12 @@
 Publications
 ************
 
+.. _bowen_pressurewaves_2025:
+
+| Lin, Yi, and Bowen Zhou. “A theory for the spurious pressure waves in compressible large
+|   eddy simulations of shallow cumulus clouds”. *Geophysical Research Letters*, 52 (2025)
+|   https://doi.org/10.1029/2025GL118786
+
 .. _urban_climate_2025:
 
 | Muñoz-Esparza, D., J. Sauer, P.A. Jimenez, J. Boehnert, D. Hahn, and M. Steiner.
diff --git a/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh b/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh
index b506b53..cc1f27e 100755
--- a/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh
+++ b/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh
@@ -12,10 +12,10 @@ export SRCDIR=${BASEDIR}/scripts/python_utilities/post-processing/
 
 hostname
 module load conda
-# Be to have created the following conda environment (on Casper) from 
-# the environment.yml file in this */scripts/batch_jobs/ directory
+# The conda environment activated below is the Casper one; on other platforms an equivalent can be created from
+# the environment.yml file in this repository's */scripts/batch_jobs/ directory
 # with---> conda env create -f environment.yml
-conda activate mpi4py-casper-oneapi-2024.2.1-openmpi-5.0.6 
+conda activate /glade/u/fehelp/casper/conda-envs/mpi4py-casper-oneapi-2024.2.1-openmpi-5.0.6 
 which python
 
 mpiexec python -u ${SRCDIR}/FEbinaryToNetCDF.py -f ${SRCDIR}/convert.json -a ${SRCDIR}/field_attributes.json
diff --git a/scripts/batch_jobs/fasteddy_pbs_script_casper.sh b/scripts/batch_jobs/fasteddy_pbs_script_casper.sh
index 7b76af4..f7d6b67 100755
--- a/scripts/batch_jobs/fasteddy_pbs_script_casper.sh
+++ b/scripts/batch_jobs/fasteddy_pbs_script_casper.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #PBS -A <ProjectAccount>
 #PBS -N FastEddy 
-#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB
+#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB:gpu_type=a100
 #PBS -l walltime=12:00:00
 #PBS -q casper
 #PBS -j oe
diff --git a/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh b/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh
index dd30309..20ce7e4 100755
--- a/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh
+++ b/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #PBS -A <ProjectAccount>
 #PBS -N FastEddy 
-#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB
+#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB:gpu_type=a100
 #PBS -l walltime=12:00:00
 #PBS -q main 
 #PBS -j oe
diff --git a/scripts/python_utilities/coupler/SimGrid.py b/scripts/python_utilities/coupler/SimGrid.py
index 31c3d16..3061ec7 100644
--- a/scripts/python_utilities/coupler/SimGrid.py
+++ b/scripts/python_utilities/coupler/SimGrid.py
@@ -187,22 +187,33 @@
     if(j%int(Ny/10)==0):
       print('{:d}% complete...'.format(10*int(j/int(Ny/10))))
     for i in range(Nx):
-      for k in range(Nz):
         zbot = data_topo[j,i]
-        zPos_uni = k*d_zeta + 0.5*d_zeta
-        zPos_str[k] = zDeform(zPos_uni,zbot,ztop,c1,fCoeff)
-        zarr[k,j,i] = zPos_str[k]
+        zPos_uni = np.linspace(0.5*d_zeta,(Nz-0.5)*d_zeta,Nz)
+        zPos_str = zDeform(zPos_uni,zbot,ztop,c1,fCoeff)
+        zarr[:,j,i] = zPos_str
         if (j==0) and (i==0):
-          if (k==0):
-            print('k,zPos_str,dz=',k,',',zPos_str[k],', -')
-          else:
-            print('k,zPos_str,dz=',k,',',zPos_str[k],',',zPos_str[k]-zPos_str[k-1])
-
+         for k in range(Nz):
+           if (k==0):
+             print('k,zPos_str,dz=',k,',',zPos_str[k],', -')
+           else:
+             print('k,zPos_str,dz=',k,',',zPos_str[k],',',zPos_str[k]-zPos_str[k-1])
 
 ind_topomin = np.where(data_topo==topoPos_min)
-z_lowTopo_v = zarr[:,ind_topomin[0],ind_topomin[1]]
+if isinstance(ind_topomin, tuple):
+   j_topomin = ind_topomin[0][0]
+   i_topomin = ind_topomin[1][0]
+else:
+   j_topomin = ind_topomin[0]
+   i_topomin = ind_topomin[1]
+z_lowTopo_v = zarr[:,j_topomin,i_topomin]
 ind_topomax = np.where(data_topo==topoPos_max)
-z_highTopo_v = zarr[:,ind_topomax[0],ind_topomax[1]]
+if isinstance(ind_topomax, tuple):
+  j_topomax = ind_topomax[0][0]
+  i_topomax = ind_topomax[1][0]
+else:
+  j_topomax = ind_topomax[0]
+  i_topomax = ind_topomax[1]
+z_highTopo_v = zarr[:,j_topomax,i_topomax]
 
 dz_lowTopo_v = z_lowTopo_v[1:Nz]-z_lowTopo_v[0:Nz-1]
 dz_highTopo_v = z_highTopo_v[1:Nz]-z_highTopo_v[0:Nz-1]
diff --git a/scripts/python_utilities/coupler/genicbcs.json b/scripts/python_utilities/coupler/genicbcs.json
index c4ee317..0ca9302 100644
--- a/scripts/python_utilities/coupler/genicbcs.json
+++ b/scripts/python_utilities/coupler/genicbcs.json
@@ -7,6 +7,6 @@
   "timeHour0": 17,
   "timeMinute0": 0,
   "timeSecond0": 0,
-  "secMax": 9000,
+  "secMax": 5401,
   "secInc": 300
 }
diff --git a/scripts/python_utilities/coupler/simgrid.json b/scripts/python_utilities/coupler/simgrid.json
index 39248d4..697270e 100644
--- a/scripts/python_utilities/coupler/simgrid.json
+++ b/scripts/python_utilities/coupler/simgrid.json
@@ -1,5 +1,5 @@
 {
-  "name_dom": "FTCollinsCO",
+  "name_dom": "FortCollinsCO",
   "FE_ref_GIS_nc": "geospec_file.nc",
   "FE_params_file": "FE_parameters_file.in",
   "center_lat": 40.5948,