diff --git a/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu b/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu index 6c55981..7d37b55 100644 --- a/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu +++ b/SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu @@ -102,12 +102,12 @@ extern "C" int cuda_GADDeviceSetup(){ cudaMemcpyToSymbol(numgridCells_away_d, &numgridCells_away, sizeof(int)); /*Device memory allocations and Host-to-Device memcopy for turbine arrays */ - fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineType_d); - fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRank_d); - fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefi_d); - fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefj_d); - fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefk_d); - fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineYawing_d); + fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineType_d); + fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRank_d); + fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefi_d); + fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefj_d); + fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefk_d); + fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineYawing_d); cudaMemcpy(GAD_turbineType_d, GAD_turbineType, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(GAD_turbineRank_d, GAD_turbineRank, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(GAD_turbineRefi_d, GAD_turbineRefi, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice); @@ -115,13 +115,13 @@ extern "C" int cuda_GADDeviceSetup(){ cudaMemcpy(GAD_turbineRefk_d, GAD_turbineRefk, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(GAD_turbineYawing_d, GAD_turbineYawing, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefMag_d); - 
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefDir_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Xcoords_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Ycoords_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_rotorTheta_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_yawError_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_anFactor_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefMag_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefDir_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Xcoords_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Ycoords_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_rotorTheta_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_yawError_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_anFactor_d); cudaMemcpy(GAD_turbineRefMag_d, GAD_turbineRefMag, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_turbineRefDir_d, GAD_turbineRefDir, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_Xcoords_d, GAD_Xcoords, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice); @@ -136,10 +136,10 @@ extern "C" int cuda_GADDeviceSetup(){ cudaMemcpy(GAD_yawError_d, GAD_yawError, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_anFactor_d, GAD_anFactor, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice); - fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineUseries_d); - fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineVseries_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &u_sampAvg_d); - fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &v_sampAvg_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineUseries_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineVseries_d); + 
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &u_sampAvg_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbines), &v_sampAvg_d); //Initialize u_sampAvg & GAD_turbineUseries as constant (per-turbine) then send down to the device tmp_vector = (float *) malloc(GADrefSeriesLength*sizeof(float)); @@ -171,40 +171,40 @@ extern "C" int cuda_GADDeviceSetup(){ } free(tmp_vector); - fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_hubHeights_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_rotorD_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_nacelleD_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_hubHeights_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_rotorD_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_nacelleD_d); cudaMemcpy(GAD_hubHeights_d, GAD_hubHeights, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_rotorD_d, GAD_rotorD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_nacelleD_d, GAD_nacelleD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice); - fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyTwist_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyChord_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyPitch_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyOmega_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyTwist_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyChord_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyPitch_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyOmega_d); cudaMemcpy(turbinePolyTwist_d, turbinePolyTwist, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice); 
cudaMemcpy(turbinePolyChord_d, turbinePolyChord, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(turbinePolyPitch_d, turbinePolyPitch, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(turbinePolyOmega_d, turbinePolyOmega, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice); - fecuda_DeviceMalloc(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), &rnorm_vect_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*alphaBounds*sizeof(float), &alpha_minmax_vect_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCl_d); - fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCd_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)), &rnorm_vect_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*alphaBounds), &alpha_minmax_vect_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCl_d); + fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCd_d); cudaMemcpy(rnorm_vect_d, rnorm_vect, GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(alpha_minmax_vect_d, alpha_minmax_vect, GADNumTurbineTypes*alphaBounds*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(turbinePolyCd_d, turbinePolyCd, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(turbinePolyCl_d, turbinePolyCl, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice); - fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_turbineVolMask_d); + fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_turbineVolMask_d); 
cudaMemcpy(GAD_turbineVolMask_d, GAD_turbineVolMask, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice); if (GADoutputForces == 1){ - fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceX_d); - fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceY_d); - fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceZ_d); + fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceX_d); + fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceY_d); + fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceZ_d); cudaMemcpy(GAD_forceX_d, GAD_forceX, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_forceY_d, GAD_forceY, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(GAD_forceZ_d, GAD_forceZ, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice); diff --git a/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu b/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu index 25c355e..1d02ee1 100644 --- a/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu +++ b/SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu @@ -29,21 +29,21 @@ float *urban_heat_redis_d; /* Base Address of memory containing */ extern "C" int cuda_urbanDeviceSetup(){ int errorCode = CUDA_URBAN_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(urbanSelector_d, &urbanSelector, sizeof(int)); cudaMemcpyToSymbol(cd_build_d, &cd_build, sizeof(float)); cudaMemcpyToSymbol(ct_build_d, &ct_build, sizeof(float)); - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*sizeof(float), &building_mask_d); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); + fecuda_DeviceMalloc(Nelems, &building_mask_d); cudaMemcpy(building_mask_d, building_mask, Nelems*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpyToSymbol(delta_aware_bdg_d, &delta_aware_bdg, sizeof(float)); if(urban_heatRedis > 
0){ Nelems = (Nxp+2*Nh)*(Nyp+2*Nh); - fecuda_DeviceMalloc(Nelems*sizeof(float), &urban_heat_redis_d); + fecuda_DeviceMalloc(Nelems, &urban_heat_redis_d); cudaMemcpy(urban_heat_redis_d, urban_heat_redis, Nelems*sizeof(float), cudaMemcpyHostToDevice); } diff --git a/SRC/FECUDA/fecuda_Device_cu.h b/SRC/FECUDA/fecuda_Device_cu.h index ac1981d..6c64b3f 100644 --- a/SRC/FECUDA/fecuda_Device_cu.h +++ b/SRC/FECUDA/fecuda_Device_cu.h @@ -55,11 +55,6 @@ extern __constant__ int rankYid_d; */ extern "C" int fecuda_DeviceSetup(int tBx, int tBy, int tBz); -/*----->>>>> void fecuda_DeviceMallocInt(); ----------------------------------------------------------- -* Used to allocate device memory integer blocks and set the host memory addresses of device memory pointers. -*/ -extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d); - /*----->>>>> int fecuda_SetBlocksPerGrid(); ------------------------------------------------------------------ * Used to set the "dim3 grid" module variable that is passed to any device kernel * to specify the number of blocks per grid in each dimenaion diff --git a/SRC/FECUDA/fecuda_Utils.cu b/SRC/FECUDA/fecuda_Utils.cu index 136c5f1..6bc9e65 100644 --- a/SRC/FECUDA/fecuda_Utils.cu +++ b/SRC/FECUDA/fecuda_Utils.cu @@ -180,19 +180,19 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers(){ /*----->>>>> void fecuda_DeviceMalloc(); ----------------------------------------------------------- * Used to allocate device memory float blocks and set the host memory addresses of device memory pointers. 
*/ -extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d) { - cudaMalloc((void**)memBlock_d,sizeof(float)*Nelems); +extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d) { + cudaMalloc((void**)memBlock_d,(size_t)(sizeof(float))*Nelems); gpuErrchk( cudaPeekAtLastError() ); - cudaMemset(*memBlock_d,'\0',sizeof(float)*Nelems); + cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(float))*Nelems); gpuErrchk( cudaPeekAtLastError() ); #ifdef DEBUG printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d); #endif } -extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d) { - cudaMalloc((void**)memBlock_d,sizeof(int)*Nelems); +extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d) { + cudaMalloc((void**)memBlock_d,(size_t)(sizeof(int))*Nelems); gpuErrchk( cudaPeekAtLastError() ); - cudaMemset(*memBlock_d,'\0',sizeof(int)*Nelems); + cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(int))*Nelems); gpuErrchk( cudaPeekAtLastError() ); #ifdef DEBUG printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d); diff --git a/SRC/FECUDA/fecuda_Utils_cu.h b/SRC/FECUDA/fecuda_Utils_cu.h index 3f66b97..2ede109 100644 --- a/SRC/FECUDA/fecuda_Utils_cu.h +++ b/SRC/FECUDA/fecuda_Utils_cu.h @@ -41,7 +41,12 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers(); /*----->>>>> void fecuda_DeviceMalloc(); ----------------------------------------------------------- * Used to allocate device memory float blocks and set the host memory addresses of device memory pointers. */ -extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d); +extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d); + +/*----->>>>> void fecuda_DeviceMallocInt(); ----------------------------------------------------------- +* Used to allocate device memory integer blocks and set the host memory addresses of device memory pointers. 
+*/ +extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d); /*----->>>>> int fecuda_SendRecvWestEast(); ------------------------------------------------------------------- Used to perform western/eastern device domain halo exchange for an arbitrary field. diff --git a/SRC/GRID/CUDA/cuda_gridDevice.cu b/SRC/GRID/CUDA/cuda_gridDevice.cu index 2ede7ff..c0abd8d 100644 --- a/SRC/GRID/CUDA/cuda_gridDevice.cu +++ b/SRC/GRID/CUDA/cuda_gridDevice.cu @@ -64,7 +64,7 @@ float *invD_Jac_d; //inverse Determinant of the Jacbian */ extern "C" int cuda_gridDeviceSetup(){ int errorCode = CUDA_GRID_SUCCESS; - int Nelems; + size_t Nelems; #ifdef DEBUG cudaEvent_t startE, stopE; float elapsedTime; @@ -100,21 +100,21 @@ extern "C" int cuda_gridDeviceSetup(){ gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/ /*Set the full memory block number of elements for grid fields*/ - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); /* Allocate the GRID arrays */ /* Coordinate Arrays */ - fecuda_DeviceMalloc(Nelems*sizeof(float), &xPos_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &yPos_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &zPos_d); - fecuda_DeviceMalloc(((Nxp+2*Nh)*(Nyp+2*Nh))*sizeof(float), &topoPos_d); + fecuda_DeviceMalloc(Nelems, &xPos_d); + fecuda_DeviceMalloc(Nelems, &yPos_d); + fecuda_DeviceMalloc(Nelems, &zPos_d); + fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)), &topoPos_d); /* Metric Tensors Fields */ - fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &J31_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &J32_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &J33_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &D_Jac_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &invD_Jac_d); + fecuda_DeviceMalloc(Nelems, &J13_d); + fecuda_DeviceMalloc(Nelems, &J23_d); + 
fecuda_DeviceMalloc(Nelems, &J31_d); + fecuda_DeviceMalloc(Nelems, &J32_d); + fecuda_DeviceMalloc(Nelems, &J33_d); + fecuda_DeviceMalloc(Nelems, &D_Jac_d); + fecuda_DeviceMalloc(Nelems, &invD_Jac_d); gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMalloc calls*/ /* cudaMemcpy the GRID arrays from Host to Device*/ diff --git a/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu index 51ed665..1749cd5 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu @@ -66,22 +66,22 @@ extern "C" int cuda_BCsDeviceSetup(){ /*Allocate arrays*/ if(hydroBCs==1){ //Using LAD BCs if((rankYid == 0)||(rankYid == numProcsY-1)){ - fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanes_d); - fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesNext_d); - fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesBuffer_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanes_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesNext_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesBuffer_d); } if((rankXid == 0)||(rankXid == numProcsX-1)){ - fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanes_d); - fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesNext_d); - fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesBuffer_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanes_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesNext_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesBuffer_d); } - fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanes_d); - fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), 
&XYBdyPlanesNext_d); - fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesBuffer_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanes_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesNext_d); + fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesBuffer_d); if(surflayerSelector == 3){ - fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanes_d); - fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesNext_d); - fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesBuffer_d); + fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanes_d); + fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesNext_d); + fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesBuffer_d); } }//end if hydroBCs == 1 diff --git a/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu index 5215273..876fbd0 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu @@ -24,13 +24,13 @@ float *hydroBaseStatePres_d; /*Base Adress of memory containing the diagnostic */ extern "C" int cuda_BaseStateDeviceSetup(){ int errorCode = CUDA_BASESTATE_SUCCESS; - int Nelems; + size_t Nelems; /*Set the full memory block number of elements for base-state fields*/ - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); /* Allocate the Base State arrays on the device */ - fecuda_DeviceMalloc(Nelems*2*sizeof(float), &hydroBaseStateFlds_d); //Only rho and theta base-state variables - fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroBaseStatePres_d); //Only base-state pressure + fecuda_DeviceMalloc(Nelems*2, &hydroBaseStateFlds_d); //Only rho and theta base-state variables + fecuda_DeviceMalloc(Nelems, 
&hydroBaseStatePres_d); //Only base-state pressure /* Send the Base State arrays down to the device */ cudaMemcpy(hydroBaseStateFlds_d, hydroBaseStateFlds, Nelems*2*sizeof(float), cudaMemcpyHostToDevice); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu index dfd26a9..c869c9f 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu @@ -25,15 +25,15 @@ __constant__ float b_hyb_d; /*hybrid advection scheme param */ extern "C" int cuda_advectionDeviceSetup(){ int errorCode = CUDA_ADVECTION_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(advectionSelector_d, &advectionSelector, sizeof(int)); cudaMemcpyToSymbol(ceilingAdvectionBC_d, &ceilingAdvectionBC, sizeof(int)); cudaMemcpyToSymbol(b_hyb_d, &b_hyb, sizeof(float)); /*Set the full memory block number of elements for hydroCore fields*/ - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*3*sizeof(float), &hydroFaceVels_d); /*Cell-face Velocities*/ + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); + fecuda_DeviceMalloc(Nelems*3, &hydroFaceVels_d); /*Cell-face Velocities*/ return(errorCode); } //end cuda_advectionDeviceSetup() diff --git a/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu index 7de459d..21c1d96 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_auxScalarsDevice.cu @@ -35,7 +35,7 @@ __device__ __constant__ float srcAuxScMassSpecValue_d[MAX_AUXSC_SRC]; /*Mass spe */ extern "C" int cuda_auxScalarsDeviceSetup(){ int errorCode = CUDA_AUXSCALARS_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(NhydroAuxScalars_d, &NhydroAuxScalars, sizeof(int)); if (NhydroAuxScalars > 0){ @@ -51,11 +51,11 @@ extern "C" int cuda_auxScalarsDeviceSetup(){ }//end if NydroAuxScalars > 0 if (NhydroAuxScalars > 0){ - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - 
fecuda_DeviceMalloc(Nelems*NhydroAuxScalars*sizeof(float), &hydroAuxScalars_d); /*Prognostic variable fields*/ - fecuda_DeviceMalloc(Nelems*NhydroAuxScalars*sizeof(float), &hydroAuxScalarsFrhs_d); /*Prognostic variable Frhs*/ + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); + fecuda_DeviceMalloc(Nelems*NhydroAuxScalars, &hydroAuxScalars_d); /*Prognostic variable fields*/ + fecuda_DeviceMalloc(Nelems*NhydroAuxScalars, &hydroAuxScalarsFrhs_d); /*Prognostic variable Frhs*/ if ((turbulenceSelector > 0) && (AuxScSGSturb > 0)){ - fecuda_DeviceMalloc(Nelems*3*sizeof(float), &AuxScalarsTauFlds_d); + fecuda_DeviceMalloc(Nelems*3, &AuxScalarsTauFlds_d); } } // end if NhydroAuxScalars > 0 diff --git a/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu index 7f30608..60a0f26 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_canopyDevice.cu @@ -26,9 +26,9 @@ float *canopy_lad_d; /* Base Address of memory containing leaf area den */ extern "C" int cuda_canopyDeviceSetup(){ int errorCode = CUDA_CANOPY_SUCCESS; - int Nelems; + size_t Nelems; - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); cudaMemcpyToSymbol(canopySelector_d, &canopySelector, sizeof(int)); cudaMemcpyToSymbol(canopySkinOpt_d, &canopySkinOpt, sizeof(int)); @@ -36,7 +36,7 @@ extern "C" int cuda_canopyDeviceSetup(){ cudaMemcpyToSymbol(canopy_lf_d, &canopy_lf, sizeof(float)); Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*sizeof(float), &canopy_lad_d); + fecuda_DeviceMalloc(Nelems, &canopy_lad_d); cudaMemcpy(canopy_lad_d, canopy_lad, Nelems*sizeof(float), cudaMemcpyHostToDevice); return(errorCode); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu index 81d9d56..c2133cb 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_cellpertDevice.cu @@ -32,7 +32,8 @@ float* randcp_d; /*Base address 
for pseudo-random numbers used for ce */ extern "C" int cuda_cellpertDeviceSetup(){ int errorCode = CUDA_CELLPERT_SUCCESS; - int Nelems1d_xy, Nelems1d; + int Nelems1d_xy; + size_t Nelems1d; /*Constants*/ cudaMemcpyToSymbol(cellpertSelector_d, &cellpertSelector, sizeof(int)); @@ -45,8 +46,8 @@ extern "C" int cuda_cellpertDeviceSetup(){ cudaMemcpyToSymbol(cellpert_ktop_d, &cellpert_ktop, sizeof(int)); Nelems1d_xy = (Nx/cellpert_gppc+min(Nx%cellpert_gppc,1))*(2*cellpert_ndbc+min(Ny%cellpert_gppc,1)) + (Ny/cellpert_gppc-2*cellpert_ndbc)*(2*cellpert_ndbc+min(Nx%cellpert_gppc,1)); - Nelems1d = Nelems1d_xy*(cellpert_ktop-cellpert_kbottom+1); - fecuda_DeviceMalloc(Nelems1d*sizeof(float), &randcp_d); + Nelems1d = (size_t)(Nelems1d_xy*(cellpert_ktop-cellpert_kbottom+1)); + fecuda_DeviceMalloc(Nelems1d, &randcp_d); return(errorCode); } //end cuda_cellpertDeviceSetup() @@ -94,7 +95,7 @@ extern "C" int cuda_hydroCoreDeviceBuildCPmethod(int simTime_it){ n_tot = n_xy*(cellpert_ktop-cellpert_kbottom+1); curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(gen,simTime_it); + curandSetPseudoRandomGeneratorSeed(gen,(unsigned long long)simTime_it); curandGenerateUniform(gen,randcp_d,n_tot); #ifdef URBAN_EXT diff --git a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu index d9c0e58..5c691ad 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_hydroCoreDevice.cu @@ -90,7 +90,7 @@ __constant__ float L_v_d; /* latent heat of vaporization (J/kg) */ */ extern "C" int cuda_hydroCoreDeviceSetup(){ int errorCode = CUDA_HYDRO_CORE_SUCCESS; - int Nelems; + size_t Nelems; /*Synchronize the Device*/ gpuErrchk( cudaDeviceSynchronize() ); @@ -133,11 +133,11 @@ extern "C" int cuda_hydroCoreDeviceSetup(){ gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/ /*Set the full memory block number of elements for hydroCore fields*/ - Nelems = 
(Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); /* Allocate the HYDRO_CORE arrays */ - fecuda_DeviceMalloc(Nelems*Nhydro*sizeof(float), &hydroFlds_d); /*Prognostic variable fields*/ - fecuda_DeviceMalloc(Nelems*Nhydro*sizeof(float), &hydroFldsFrhs_d); /*Prognostic variable field Frhs(s)*/ - fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroRhoInv_d); + fecuda_DeviceMalloc(Nelems*(size_t)Nhydro, &hydroFlds_d); /*Prognostic variable fields*/ + fecuda_DeviceMalloc(Nelems*(size_t)Nhydro, &hydroFldsFrhs_d); /*Prognostic variable field Frhs(s)*/ + fecuda_DeviceMalloc(Nelems, &hydroRhoInv_d); /*AUXILIARY SCALARS*/ if(NhydroAuxScalars > 0){ @@ -1282,7 +1282,7 @@ extern "C" int cuda_hydroCoreInitFieldsDevice(){ } }// end if surflayerSelector > 0 if(NhydroAuxScalars > 0){ /*Copy any required host auxiliary sclar fields to the device */ - cudaMemcpy(hydroAuxScalars_d, hydroAuxScalars, Nelems*NhydroAuxScalars*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(hydroAuxScalars_d, hydroAuxScalars, (size_t)(Nelems)*(size_t)(NhydroAuxScalars*sizeof(float)), cudaMemcpyHostToDevice); }// end if hydroAuxScalars > 0 gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/ gpuErrchk( cudaDeviceSynchronize() ); @@ -1333,9 +1333,9 @@ extern "C" int cuda_hydroCoreSynchFieldsFromDevice(){ } }//endif surflayerSelector > 0 if(NhydroAuxScalars > 0){ - gpuErrchk( cudaMemcpy(hydroAuxScalars, hydroAuxScalars_d, Nelems*NhydroAuxScalars*sizeof(float), cudaMemcpyDeviceToHost) ); + gpuErrchk( cudaMemcpy(hydroAuxScalars, hydroAuxScalars_d, (size_t)(Nelems)*(size_t)(NhydroAuxScalars*sizeof(float)), cudaMemcpyDeviceToHost) ); if((hydroForcingWrite==1)||(hydroForcingLog==1)){ - gpuErrchk( cudaMemcpy(hydroAuxScalarsFrhs, hydroAuxScalarsFrhs_d, Nelems*NhydroAuxScalars*sizeof(float), cudaMemcpyDeviceToHost) ); + gpuErrchk( cudaMemcpy(hydroAuxScalarsFrhs, hydroAuxScalarsFrhs_d, (size_t)(Nelems)*(size_t)(NhydroAuxScalars*sizeof(float)), 
cudaMemcpyDeviceToHost) ); } //endif we need to send up the Frhs } //end if NhydroAuxScalars > 0 if(hydroSubGridWrite==1){ diff --git a/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu index fcd076f..c0eb479 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_largeScaleForcingsDevice.cu @@ -42,7 +42,7 @@ float* lsf_meanPhiBlock_d; /*Base address of work arrray for block */ extern "C" int cuda_lsfDeviceSetup(){ int errorCode = CUDA_LSF_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(lsfSelector_d, &lsfSelector, sizeof(int)); cudaMemcpyToSymbol(lsf_w_surf_d, &lsf_w_surf, sizeof(float)); @@ -69,9 +69,9 @@ extern "C" int cuda_lsfDeviceSetup(){ fflush(stdout); } cudaMemcpyToSymbol(lsf_numPhiVars_d, &lsf_numPhiVars, sizeof(float)); - Nelems = (Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*lsf_numPhiVars*sizeof(float), &lsf_slabMeanPhiProfiles_d); - fecuda_DeviceMalloc(grid_red.x*grid_red.y*grid_red.z*sizeof(float), &lsf_meanPhiBlock_d); + Nelems = (size_t)(Nzp+2*Nh); + fecuda_DeviceMalloc(Nelems*(size_t)lsf_numPhiVars, &lsf_slabMeanPhiProfiles_d); + fecuda_DeviceMalloc((size_t)(grid_red.x*grid_red.y*grid_red.z), &lsf_meanPhiBlock_d); } return(errorCode); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu index ecd39a9..0d6588f 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_moistureDevice.cu @@ -35,7 +35,7 @@ float* fcond_d; /*Base address for f_cond array*/ */ extern "C" int cuda_moistureDeviceSetup(){ int errorCode = CUDA_MOISTURE_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(moistureSelector_d, &moistureSelector, sizeof(int)); if (moistureSelector > 0){ @@ -49,11 +49,11 @@ extern "C" int cuda_moistureDeviceSetup(){ cudaMemcpyToSymbol(moistureCondBasePres_d, &moistureCondBasePres, sizeof(int)); cudaMemcpyToSymbol(moistureMPcallTscale_d, &moistureMPcallTscale, 
sizeof(float)); - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*moistureNvars*sizeof(float), &moistScalars_d); - fecuda_DeviceMalloc(Nelems*moistureNvars*sizeof(float), &moistScalarsFrhs_d); - fecuda_DeviceMalloc(Nelems*moistureNvars*3*sizeof(float), &moistTauFlds_d); - fecuda_DeviceMalloc(Nelems*sizeof(float), &fcond_d); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); + fecuda_DeviceMalloc(Nelems*moistureNvars, &moistScalars_d); + fecuda_DeviceMalloc(Nelems*moistureNvars, &moistScalarsFrhs_d); + fecuda_DeviceMalloc(Nelems*moistureNvars*3, &moistTauFlds_d); + fecuda_DeviceMalloc(Nelems, &fcond_d); } return(errorCode); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu index 18021c7..453bf16 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_molecularDiffDevice.cu @@ -27,15 +27,15 @@ float* hydroNuGradZFlds_d; /* Base address for diffusion for nu* */ extern "C" int cuda_molecularDiffDeviceSetup(){ int errorCode = CUDA_MOLDIFF_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(diffusionSelector_d, &diffusionSelector, sizeof(int)); cudaMemcpyToSymbol(nu_0_d, &nu_0, sizeof(float)); - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*(Nhydro-1)*sizeof(float), &hydroNuGradXFlds_d); // all Nhydro except density - fecuda_DeviceMalloc(Nelems*(Nhydro-1)*sizeof(float), &hydroNuGradYFlds_d); - fecuda_DeviceMalloc(Nelems*(Nhydro-1)*sizeof(float), &hydroNuGradZFlds_d); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); + fecuda_DeviceMalloc(Nelems*(Nhydro-1), &hydroNuGradXFlds_d); // all Nhydro except density + fecuda_DeviceMalloc(Nelems*(Nhydro-1), &hydroNuGradYFlds_d); + fecuda_DeviceMalloc(Nelems*(Nhydro-1), &hydroNuGradZFlds_d); return(errorCode); } //end cuda_molecularDiffDeviceSetup() diff --git a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu index 26f3eb7..db8e94f 
100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_pressureDevice.cu @@ -23,14 +23,14 @@ float *hydroPres_d; /*Base Adress of memory containing the diagnostic */ extern "C" int cuda_pressureDeviceSetup(){ int errorCode = CUDA_PRESSURE_SUCCESS; - int Nelems; + size_t Nelems; //Copy the pgfSelector constant to device constant-memory cudaMemcpyToSymbol(pgfSelector_d, &pgfSelector, sizeof(int)); /*Set the full memory block number of elements for hydroCore fields*/ - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroPres_d); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); + fecuda_DeviceMalloc(Nelems, &hydroPres_d); return(errorCode); } //end cuda_pressureDeviceSetup() diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu index 77a449e..e5da676 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_sgsTurbDevice.cu @@ -26,17 +26,17 @@ float* hydroKappaM_d; /*Base address for KappaM (eddy diffusivity for momentum) */ extern "C" int cuda_sgsTurbDeviceSetup(){ int errorCode = CUDA_SGSTURB_SUCCESS; - int Nelems; + size_t Nelems; cudaMemcpyToSymbol(turbulenceSelector_d, &turbulenceSelector, sizeof(int)); cudaMemcpyToSymbol(TKESelector_d, &TKESelector, sizeof(int)); cudaMemcpyToSymbol(c_s_d, &c_s, sizeof(float)); cudaMemcpyToSymbol(c_k_d, &c_k, sizeof(float)); - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); + Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)); - fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroKappaM_d); - fecuda_DeviceMalloc(Nelems*9*sizeof(float), &hydroTauFlds_d); + fecuda_DeviceMalloc(Nelems, &hydroKappaM_d); + fecuda_DeviceMalloc(Nelems*9, &hydroTauFlds_d); /* Done */ return(errorCode); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu index af6e39b..0ca8740 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu +++ 
b/SRC/HYDRO_CORE/CUDA/cuda_sgstkeDevice.cu @@ -31,13 +31,13 @@ float* dedxi_d; /*Base address for d(SGSTKE)/dxi field arrays*/ */ extern "C" int cuda_sgstkeDeviceSetup(){ int errorCode = CUDA_SGSTKE_SUCCESS; - int Nelems; + size_t Nelems; - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); - fecuda_DeviceMalloc(Nelems*TKESelector*sizeof(float), &sgstkeScalars_d); - fecuda_DeviceMalloc(Nelems*TKESelector*sizeof(float), &sgstkeScalarsFrhs_d); - fecuda_DeviceMalloc(Nelems*TKESelector*sizeof(float), &sgstke_ls_d); - fecuda_DeviceMalloc(Nelems*TKESelector*3*sizeof(float), &dedxi_d); + Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /* cast each factor first: the product must not be formed in int */ + fecuda_DeviceMalloc(Nelems*TKESelector, &sgstkeScalars_d); + fecuda_DeviceMalloc(Nelems*TKESelector, &sgstkeScalarsFrhs_d); + fecuda_DeviceMalloc(Nelems*TKESelector, &sgstke_ls_d); + fecuda_DeviceMalloc(Nelems*TKESelector*3, &dedxi_d); cudaMemcpyToSymbol(TKEAdvSelector_d, &TKEAdvSelector, sizeof(int)); cudaMemcpyToSymbol(TKEAdvSelector_b_hyb_d, &TKEAdvSelector_b_hyb, sizeof(float)); diff --git a/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu b/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu index 47f0218..dfe92cf 100644 --- a/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu +++ b/SRC/HYDRO_CORE/CUDA/cuda_surfaceLayerDevice.cu @@ -60,7 +60,7 @@ float *sea_mask_d; */ extern "C" int cuda_surfaceLayerDeviceSetup(){ int errorCode = CUDA_SURFLAYER_SUCCESS; - int Nelems2d; + size_t Nelems2d; cudaMemcpyToSymbol(surflayerSelector_d, &surflayerSelector, sizeof(int)); cudaMemcpyToSymbol(surflayer_z0_d, &surflayer_z0, sizeof(float)); @@ -80,19 +80,19 @@ extern "C" int cuda_surfaceLayerDeviceSetup(){ cudaMemcpyToSymbol(surflayer_ideal_qte_d, &surflayer_ideal_qte, sizeof(float)); cudaMemcpyToSymbol(surflayer_ideal_qamp_d, &surflayer_ideal_qamp, sizeof(float)); - Nelems2d = (Nxp+2*Nh)*(Nyp+2*Nh); //2-d element count + Nelems2d = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh); //2-d element count, each factor cast first so the product is not formed in int - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &cdFld_d); -
fecuda_DeviceMalloc(Nelems2d*sizeof(float), &chFld_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &cqFld_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &fricVel_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &htFlux_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &qFlux_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &tskin_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &qskin_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &invOblen_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &z0m_d); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &z0t_d); + fecuda_DeviceMalloc(Nelems2d, &cdFld_d); + fecuda_DeviceMalloc(Nelems2d, &chFld_d); + fecuda_DeviceMalloc(Nelems2d, &cqFld_d); + fecuda_DeviceMalloc(Nelems2d, &fricVel_d); + fecuda_DeviceMalloc(Nelems2d, &htFlux_d); + fecuda_DeviceMalloc(Nelems2d, &qFlux_d); + fecuda_DeviceMalloc(Nelems2d, &tskin_d); + fecuda_DeviceMalloc(Nelems2d, &qskin_d); + fecuda_DeviceMalloc(Nelems2d, &invOblen_d); + fecuda_DeviceMalloc(Nelems2d, &z0m_d); + fecuda_DeviceMalloc(Nelems2d, &z0t_d); // offshore cudaMemcpyToSymbol(surflayer_offshore_d, &surflayer_offshore, sizeof(int)); @@ -104,7 +104,7 @@ extern "C" int cuda_surfaceLayerDeviceSetup(){ cudaMemcpyToSymbol(surflayer_offshore_theta_d, &surflayer_offshore_theta, sizeof(float)); cudaMemcpyToSymbol(surflayer_offshore_visc_d, &surflayer_offshore_visc, sizeof(int)); - fecuda_DeviceMalloc(Nelems2d*sizeof(float), &sea_mask_d); + fecuda_DeviceMalloc(Nelems2d, &sea_mask_d); if (surflayer_offshore > 0){ cudaMemcpy(sea_mask_d, sea_mask, Nelems2d*sizeof(float), cudaMemcpyHostToDevice); } @@ -460,7 +460,7 @@ __device__ void cudaDevice_SurfaceLayerMOSTdry(int ijk, float* u, float* v, floa tauyz = -cd_i*sqrtf(powf(*u/ *rho,2.0)+powf(*v/ *rho,2.0))*(*v); *tau31 = tauxz; *tau32 = tauyz; - *fricVel = powf(powf(tauxz,2.0)+powf(tauyz,2.0),0.25); + *fricVel = powf(powf(tauxz/(*rho),2.0)+powf(tauyz/(*rho),2.0),0.25); tauthz = (*htFlux)*(*rho); *tauTH3 = tauthz; *invOblen = 
-(kappa_d*accel_g_d*(*htFlux))/(powf((*fricVel),3.0)*th1); @@ -573,7 +573,7 @@ __device__ void cudaDevice_SurfaceLayerMOSTmoist(int ijk, float* u, float* v, fl tauyz = -cd_i*sqrtf(powf(*u/ *rho,2.0)+powf(*v/ *rho,2.0))*(*v); *tau31 = tauxz; *tau32 = tauyz; - *fricVel = powf(powf(tauxz,2.0)+powf(tauyz,2.0),0.25); + *fricVel = powf(powf(tauxz/(*rho),2.0)+powf(tauyz/(*rho),2.0),0.25); tauthz = (*htFlux)*(*rho); *tauTH3 = tauthz; tauqz = (*qFlux)*(*rho); // specified qflux or delta-qv-based flux assumes qv units of g/kg diff --git a/SRC/HYDRO_CORE/hydro_core.h b/SRC/HYDRO_CORE/hydro_core.h index b24f239..38214e3 100644 --- a/SRC/HYDRO_CORE/hydro_core.h +++ b/SRC/HYDRO_CORE/hydro_core.h @@ -37,7 +37,7 @@ #define THETA_INDX_BS 1 #define MAX_HC_FLDNAME_LENGTH 256 -#define MAX_AUXSC_SRC 256 +#define MAX_AUXSC_SRC 1280 /*#################------------------- HYDRO_CORE module variable declarations ---------------------#################*/ /* Parameters */ extern int Nhydro; /*Number of prognostic variable fields under hydro_core */ diff --git a/SRC/MEM_UTILS/mem_utils.c b/SRC/MEM_UTILS/mem_utils.c index 2276b99..c0664d6 100644 --- a/SRC/MEM_UTILS/mem_utils.c +++ b/SRC/MEM_UTILS/mem_utils.c @@ -137,16 +137,21 @@ float * memAllocateFloat4DField(int Nfields, int iN, int jN, int kN, int halo_ex float *blockOfFields; void *m_field; void *memsetReturnVal; + size_t Nbytes; + Nbytes = (size_t)(Nfields)*(size_t)(iN+2*halo_extent)*(size_t)(jN+2*halo_extent)*(size_t)(kN+2*halo_extent)*sizeof(float); /* cast each extent first: no partial product may be formed in int */ if(posix_memalign(&m_field, ALIGN_SIZE, - (Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float))) { - fprintf(stderr, "Rank %d/%d memAllocateFloat4DField(%s): Memory Allocation of m_field failed!\n", - mpi_rank_world,mpi_size_world,fieldName); + Nbytes)) { + //(Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float))) { + fprintf(stderr, "Rank %d/%d memAllocateFloat4DField(%s): Memory Allocation of m_field with %zu bytes failed!\n", +
mpi_rank_world,mpi_size_world,fieldName,Nbytes); + fflush(stdout); exit(1); } // if /*initialize the allocated space to zero everywhere*/ - memsetReturnVal = memset(m_field,0,(Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float)); + memsetReturnVal = memset(m_field,0,Nbytes); + //memsetReturnVal = memset(m_field,0,(Nfields)*(iN+2*halo_extent)*(jN+2*halo_extent)*(kN+2*halo_extent)*sizeof(float)); blockOfFields = (float *) m_field; diff --git a/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu b/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu index 649956b..c91c473 100644 --- a/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu +++ b/SRC/TIME_INTEGRATION/CUDA/cuda_timeIntDevice.cu @@ -52,7 +52,7 @@ float *timeFrhsTmp_d; /* Multistage time scheme variable fields Frhs 4-D array * */ extern "C" int cuda_timeIntDeviceSetup(){ int errorCode = CUDA_TIME_INTEGRATION_SUCCESS; - int Nelems; + size_t Nelems; int NtimeTotVars; /*Synchronize the Device*/ @@ -68,11 +68,11 @@ extern "C" int cuda_timeIntDeviceSetup(){ gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/ /*Set the full memory block number of elements for timeInt fields*/ - Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); + Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /* cast each factor first: the product must not be formed in int */ /* Allocate the TIME_INTEGRATION arrays */ /*TIME_INTEGRATION/CUDA internal device arrays*/ NtimeTotVars = 5 + TKESelector*turbulenceSelector + moistureNvars*moistureSelector + NhydroAuxScalars; - fecuda_DeviceMalloc(NtimeTotVars*Nelems*sizeof(float), &timeFlds0_d); + fecuda_DeviceMalloc(NtimeTotVars*Nelems, &timeFlds0_d); gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMalloc calls*/ diff --git a/docs/publications.rst b/docs/publications.rst index 86e5ab7..2897cc3 100644 --- a/docs/publications.rst +++ b/docs/publications.rst @@ -4,6 +4,12 @@ Publications ************ +.. _bowen_pressurewaves_2025: + +| Lin, Yi, and Bowen Zhou.
“A theory for the spurious pressure waves in compressible large +| eddy simulations of shallow cumulus clouds”. *Geophysical Research Letters*, 52 (2025) +| https://doi.org/10.1029/2025GL118786 + .. _urban_climate_2025: | Muñoz-Esparza, D., J. Sauer, P.A. Jimenez, J. Boehnert, D. Hahn, and M. Steiner. diff --git a/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh b/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh index b506b53..cc1f27e 100755 --- a/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh +++ b/scripts/batch_jobs/fasteddy_convert_pbs_script_casper.sh @@ -12,10 +12,10 @@ export SRCDIR=${BASEDIR}/scripts/python_utilities/post-processing/ hostname module load conda -# Be to have created the following conda environment (on Casper) from -# the environment.yml file in this */scripts/batch_jobs/ directory +# The following conda environment (on Casper) can be created (for other platforms) from +# the environment.yml file in this repository */scripts/batch_jobs/ directory # with---> conda env create -f environment.yml -conda activate mpi4py-casper-oneapi-2024.2.1-openmpi-5.0.6 +conda activate /glade/u/fehelp/casper/conda-envs/mpi4py-casper-oneapi-2024.2.1-openmpi-5.0.6 which python mpiexec python -u ${SRCDIR}/FEbinaryToNetCDF.py -f ${SRCDIR}/convert.json -a ${SRCDIR}/field_attributes.json diff --git a/scripts/batch_jobs/fasteddy_pbs_script_casper.sh b/scripts/batch_jobs/fasteddy_pbs_script_casper.sh index 7b76af4..f7d6b67 100755 --- a/scripts/batch_jobs/fasteddy_pbs_script_casper.sh +++ b/scripts/batch_jobs/fasteddy_pbs_script_casper.sh @@ -1,7 +1,7 @@ #!/bin/bash #PBS -A #PBS -N FastEddy -#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB +#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB:gpu_type=a100 #PBS -l walltime=12:00:00 #PBS -q casper #PBS -j oe diff --git a/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh b/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh index dd30309..20ce7e4 100755 --- 
a/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh +++ b/scripts/batch_jobs/fasteddy_pbs_script_derecho.sh @@ -1,7 +1,7 @@ #!/bin/bash #PBS -A #PBS -N FastEddy -#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB +#PBS -l select=1:ncpus=4:mpiprocs=4:ngpus=4:mem=100GB:gpu_type=a100 #PBS -l walltime=12:00:00 #PBS -q main #PBS -j oe diff --git a/scripts/python_utilities/coupler/SimGrid.py b/scripts/python_utilities/coupler/SimGrid.py index 31c3d16..3061ec7 100644 --- a/scripts/python_utilities/coupler/SimGrid.py +++ b/scripts/python_utilities/coupler/SimGrid.py @@ -187,22 +187,33 @@ if(j%int(Ny/10)==0): print('{:d}% complete...'.format(10*int(j/int(Ny/10)))) for i in range(Nx): - for k in range(Nz): zbot = data_topo[j,i] - zPos_uni = k*d_zeta + 0.5*d_zeta - zPos_str[k] = zDeform(zPos_uni,zbot,ztop,c1,fCoeff) - zarr[k,j,i] = zPos_str[k] + zPos_uni = np.linspace(0.5*d_zeta,(Nz-0.5)*d_zeta,Nz) + zPos_str = zDeform(zPos_uni,zbot,ztop,c1,fCoeff) + zarr[:,j,i] = zPos_str if (j==0) and (i==0): - if (k==0): - print('k,zPos_str,dz=',k,',',zPos_str[k],', -') - else: - print('k,zPos_str,dz=',k,',',zPos_str[k],',',zPos_str[k]-zPos_str[k-1]) - + for k in range(Nz): + if (k==0): + print('k,zPos_str,dz=',k,',',zPos_str[k],', -') + else: + print('k,zPos_str,dz=',k,',',zPos_str[k],',',zPos_str[k]-zPos_str[k-1]) ind_topomin = np.where(data_topo==topoPos_min) -z_lowTopo_v = zarr[:,ind_topomin[0],ind_topomin[1]] +if isinstance(ind_topomin, tuple): + j_topomin = ind_topomin[0][0] + i_topomin = ind_topomin[1][0] +else: + j_topomin = ind_topomin[0] + i_topomin = ind_topomin[1] +z_lowTopo_v = zarr[:,j_topomin,i_topomin] ind_topomax = np.where(data_topo==topoPos_max) -z_highTopo_v = zarr[:,ind_topomax[0],ind_topomax[1]] +if isinstance(ind_topomax, tuple): + j_topomax = ind_topomax[0][0] + i_topomax = ind_topomax[1][0] +else: + j_topomax = ind_topomax[0] + i_topomax = ind_topomax[1] +z_highTopo_v = zarr[:,j_topomax,i_topomax] dz_lowTopo_v = z_lowTopo_v[1:Nz]-z_lowTopo_v[0:Nz-1] 
dz_highTopo_v = z_highTopo_v[1:Nz]-z_highTopo_v[0:Nz-1] diff --git a/scripts/python_utilities/coupler/genicbcs.json b/scripts/python_utilities/coupler/genicbcs.json index c4ee317..0ca9302 100644 --- a/scripts/python_utilities/coupler/genicbcs.json +++ b/scripts/python_utilities/coupler/genicbcs.json @@ -7,6 +7,6 @@ "timeHour0": 17, "timeMinute0": 0, "timeSecond0": 0, - "secMax": 9000, + "secMax": 5401, "secInc": 300 } diff --git a/scripts/python_utilities/coupler/simgrid.json b/scripts/python_utilities/coupler/simgrid.json index 39248d4..697270e 100644 --- a/scripts/python_utilities/coupler/simgrid.json +++ b/scripts/python_utilities/coupler/simgrid.json @@ -1,5 +1,5 @@ { - "name_dom": "FTCollinsCO", + "name_dom": "FortCollinsCO", "FE_ref_GIS_nc": "geospec_file.nc", "FE_params_file": "FE_parameters_file.in", "center_lat": 40.5948,