diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97551690b..ae7942af0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,11 +170,21 @@ endif()

 ######################################################################
 # GPU support

-if (ENABLE_GPU)
-  add_definitions(-DPMC_USE_GPU)
-  enable_language(CUDA)
+# Compile the GPU tests whenever CUDA is detected
+find_package(CUDA)
+if (CUDA_FOUND)
+  enable_language(CUDA)
+  # With ENABLE_GPU, everything runs on the GPU
+  if (ENABLE_GPU)
+    # Define PMC_USE_GPU globally so every target, including the main
+    # executable, uses the GPU
+    add_definitions(-DPMC_USE_GPU)
+  endif()
 endif()
+if(NOT CUDA_FOUND AND ENABLE_GPU)
+  message(WARNING "No CUDA module found: GPU computation is not supported")
+endif()

 ######################################################################
 # debugging options
@@ -262,6 +272,15 @@ if (ENABLE_TESTS)
   add_test(test_MONARCH_1 ${CMAKE_BINARY_DIR}/test_run/monarch/test_monarch_1.sh ${MPI_TEST_FLAG})
   add_test(test_MONARCH_2 ${CMAKE_BINARY_DIR}/test_run/monarch/test_monarch_2.sh ${MPI_TEST_FLAG})

+  if (CUDA_FOUND AND NOT ENABLE_GPU)
+    # todo: new unit tests with GPU
+    add_test(test_rxn_arrhenius_mech_gpu ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius_gpu.sh ${MPI_TEST_FLAG})
+
+    # Relevant mechanism tests
+    add_test(test_chemistry_cb05cl_ae5_gpu ${CMAKE_BINARY_DIR}/test_run/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh ${MPI_TEST_FLAG})
+
+  endif()
+
 endif()

 add_test(test_additive_1 ${CMAKE_BINARY_DIR}/test_run/additive/test_additive_1.sh)
@@ -411,6 +430,7 @@ endif()
 # partmc library

 set(STD_C_FLAGS "-std=c99")
+set(STD_CUDA_FLAGS "-dc -arch=compute_70 -code=sm_70")

 if(${CMAKE_Fortran_COMPILER_ID} MATCHES "Intel")
@@ -491,14 +511,16 @@ set(SUB_MODELS_SRC ${SUB_MODELS_F_SRC} ${SUB_MODELS_C_SRC})

 set(CAMP_C_SRC src/camp_solver.c src/rxn_solver.c src/aero_phase_solver.c
-               src/aero_rep_solver.c src/sub_model_solver.c)
+               src/aero_rep_solver.c src/sub_model_solver.c
+               src/debug_and_stats/camp_debug_2.c)

 set_source_files_properties(${CAMP_C_SRC} PROPERTIES COMPILE_FLAGS
                             ${STD_C_FLAGS})

 set(CAMP_CXX_SRC "")

-if(ENABLE_GPU)
+if (CUDA_FOUND)
   set(CAMP_CUDA_SRC
       src/cuda/camp_gpu_solver.cu
       src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu
@@ -521,33 +543,38 @@ if(ENABLE_GPU)

   set_source_files_properties(${CAMP_CUDA_SRC} PROPERTIES COMPILE_FLAGS
                               ${STD_CUDA_FLAGS})
+  set_source_files_properties(${CAMP_CUDA_SRC} PROPERTIES LANGUAGE CUDA)
 else()
   set(CAMP_CUDA_SRC "")
 endif()

-add_library(partmclib src/aero_state.F90 src/integer_varray.F90
-            src/integer_rmap.F90 src/integer_rmap2.F90 src/aero_sorted.F90
-            src/aero_binned.F90 src/bin_grid.F90 src/constants.F90
-            src/scenario.F90 src/env_state.F90 src/aero_mode.F90
-            src/aero_dist.F90 src/aero_weight.F90 src/aero_weight_array.F90
-            src/coag_kernel_additive.F90 src/coag_kernel_sedi.F90
-            src/coag_kernel_constant.F90 src/coag_kernel_brown.F90
-            src/coag_kernel_zero.F90 src/coag_kernel_brown_free.F90
-            src/coag_kernel_brown_cont.F90 src/aero_data.F90 src/run_exact.F90
-            src/run_part.F90 src/util.F90 src/stats.F90 src/run_sect.F90 src/output.F90
-            src/mosaic.F90 src/gas_data.F90 src/gas_state.F90
-            src/coagulation.F90 src/exact_soln.F90 src/coagulation_dist.F90
-            src/coag_kernel.F90 src/spec_line.F90 src/spec_file.F90 src/rand.F90
-            src/aero_particle.F90 src/aero_particle_array.F90 src/mpi.F90
-            src/netcdf.F90 src/aero_info.F90 src/aero_info_array.F90
-            src/nucleate.F90 src/condense.F90 src/fractal.F90 src/chamber.F90
-            src/property.F90
-            src/chem_spec_data.F90
-            src/rxn_data.F90 src/camp_state.F90 src/mechanism_data.F90
-            src/camp_core.F90 src/camp_solver_data.F90 src/aero_rep_data.F90
-            src/aero_phase_data.F90 src/aero_rep_factory.F90 src/camp_interface.F90
-            src/rxn_factory.F90 src/sub_model_data.F90 src/sub_model_factory.F90
-            src/solver_stats.F90 src/camp_box_model_data.F90
+set(CAMP_F_SRC
+    src/aero_state.F90 src/integer_varray.F90
+    src/integer_rmap.F90 src/integer_rmap2.F90 src/aero_sorted.F90
+    src/aero_binned.F90 src/bin_grid.F90 src/constants.F90
+    src/scenario.F90 src/env_state.F90 src/aero_mode.F90
+    src/aero_dist.F90 src/aero_weight.F90 src/aero_weight_array.F90
+    src/coag_kernel_additive.F90 src/coag_kernel_sedi.F90
+    src/coag_kernel_constant.F90 src/coag_kernel_brown.F90
+    src/coag_kernel_zero.F90 src/coag_kernel_brown_free.F90
+    src/coag_kernel_brown_cont.F90 src/aero_data.F90 src/run_exact.F90
+    src/run_part.F90 src/util.F90 src/stats.F90 src/run_sect.F90 src/output.F90
+    src/mosaic.F90 src/gas_data.F90 src/gas_state.F90
+    src/coagulation.F90 src/exact_soln.F90 src/coagulation_dist.F90
+    src/coag_kernel.F90 src/spec_line.F90 src/spec_file.F90 src/rand.F90
+    src/aero_particle.F90 src/aero_particle_array.F90 src/mpi.F90
+    src/netcdf.F90 src/aero_info.F90 src/aero_info_array.F90
+    src/nucleate.F90 src/condense.F90 src/fractal.F90 src/chamber.F90
+    src/property.F90 src/chem_spec_data.F90
+    src/rxn_data.F90 src/camp_state.F90 src/mechanism_data.F90
+    src/camp_core.F90 src/camp_solver_data.F90 src/aero_rep_data.F90
+    src/aero_phase_data.F90 src/aero_rep_factory.F90 src/camp_interface.F90
+    src/rxn_factory.F90 src/sub_model_data.F90 src/sub_model_factory.F90
+    src/solver_stats.F90 src/camp_box_model_data.F90
+    )
+
+add_library(partmclib ${CAMP_F_SRC}
             ${CAMP_C_SRC} ${AEROSOL_REPS_SRC} ${SUB_MODELS_SRC} ${REACTIONS_SRC}
             ${SUNDIALS_SRC} ${GSL_SRC} ${C_SORT_SRC} ${CAMP_CUDA_SRC} ${CAMP_CXX_SRC} )

@@ -556,6 +583,24 @@ target_link_libraries(partmclib ${NETCDF_LIBS} ${SUNDIALS_LIBS}

 set_target_properties(partmclib PROPERTIES OUTPUT_NAME partmc)

+if (CUDA_FOUND AND NOT ENABLE_GPU)
+  add_library(partmclib_gpu ${CAMP_F_SRC}
+              ${CAMP_C_SRC} ${AEROSOL_REPS_SRC} ${SUB_MODELS_SRC} ${REACTIONS_SRC}
+              ${SUNDIALS_SRC} ${GSL_SRC} ${C_SORT_SRC} ${CAMP_CUDA_SRC} ${CAMP_CXX_SRC} )
+
+  target_compile_definitions(partmclib_gpu PRIVATE
+                             "PMC_USE_GPU=ON")
+
+  target_link_libraries(partmclib_gpu ${NETCDF_LIBS} ${SUNDIALS_LIBS}
+                        ${MOSAIC_LIB} ${GSL_LIBS} ${JSON_LIB})
+
+  set_target_properties(partmclib_gpu PROPERTIES OUTPUT_NAME partmc_gpu)
+
+  add_executable(partmc_gpu src/partmc.F90)
+
+  target_link_libraries(partmc_gpu partmclib_gpu)
+endif()
+
 ######################################################################
 # partmc executable
@@ -614,13 +659,21 @@ set_source_files_properties(${CB5_EBI_SOLVER} PROPERTIES COMPILE_FLAGS
   ${CB5_EBI_FLAGS})

 ######################################################################
-# test_chemistry_cb05cl_ae5_big
+# test_chemistry_cb05cl_ae5

-add_executable(test_chemistry_cb05cl_ae5_big
-               test/chemistry/cb05cl_ae5/test_cb05cl_ae5_big.F90
+add_executable(test_chemistry_cb05cl_ae5
+               test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
                ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
                test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
-target_link_libraries(test_chemistry_cb05cl_ae5_big partmclib)
+target_link_libraries(test_chemistry_cb05cl_ae5 partmclib)
+
+if (CUDA_FOUND AND NOT ENABLE_GPU)
+  add_executable(test_chemistry_cb05cl_ae5_gpu
+                 test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
+                 ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
+                 test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
+  target_link_libraries(test_chemistry_cb05cl_ae5_gpu partmclib_gpu)
+endif()

 ######################################################################
 # MONARCH interface
@@ -633,20 +686,19 @@ target_link_libraries(mock_monarch partmclib)
 if (ENABLE_TESTS)

 ######################################################################
-# test_chemistry_cb05cl_ae5
+# test_chemistry_cb05cl_ae5_big

-add_executable(test_chemistry_cb05cl_ae5
-               test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
-               ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
-               test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
-target_link_libraries(test_chemistry_cb05cl_ae5 partmclib)
+add_executable(test_chemistry_cb05cl_ae5_big
+               test/chemistry/cb05cl_ae5/test_cb05cl_ae5_big.F90
+               ${CB5_EBI_SOLVER} ${CB5_KPP_SOLVER}
+               test/chemistry/cb05cl_ae5/module_BSC_CHEM_DATA.F90)
+target_link_libraries(test_chemistry_cb05cl_ae5_big partmclib)

 ######################################################################
 # test_chemistry_eqsam_v03d

 set(EQSAM_V03D
-    test/chemistry/eqsam_v03d/eqsam_v03d.F90
-    )
+    test/chemistry/eqsam_v03d/eqsam_v03d.F90)

 add_executable(test_chemistry_eqsam_v03d
                test/chemistry/eqsam_v03d/test_eqsam_v03d.F90 ${EQSAM_V03D})
@@ -709,19 +761,26 @@ target_link_libraries(test_sub_model_ZSR_aerosol_water partmclib)
 # New unit tests (UNDER DEVELOPMENT)

 set(UNIT_TEST_SRC
-    test/unit_tests/unit_test_data.F90
-    test/unit_tests/unit_test_driver.F90)
+    test/unit_tests/unit_test_data.F90
+    test/unit_tests/unit_test_driver.F90)

 set(UNIT_TEST_RXN_ARRHENIUS_SRC
-    test/unit_tests/rxns/unit_test_rxn_arrhenius.F90 ${UNIT_TEST_SRC})
-set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_DEFINITIONS
-    "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
-    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
-set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS
-    ${STD_F_FLAGS})
+    test/unit_tests/rxns/unit_test_rxn_arrhenius.F90 ${UNIT_TEST_SRC})
+set_source_files_properties(${UNIT_TEST_RXN_ARRHENIUS_SRC} PROPERTIES COMPILE_FLAGS ${STD_F_FLAGS})

 add_executable(unit_test_rxn_arrhenius ${UNIT_TEST_RXN_ARRHENIUS_SRC})
+target_compile_definitions(unit_test_rxn_arrhenius PRIVATE
+    "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+    ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
 target_link_libraries(unit_test_rxn_arrhenius partmclib)

+if (CUDA_FOUND AND NOT ENABLE_GPU)
+  add_executable(unit_test_rxn_arrhenius_gpu ${UNIT_TEST_RXN_ARRHENIUS_SRC})
+  target_compile_definitions(unit_test_rxn_arrhenius_gpu PRIVATE
+      "UNIT_TEST_MODULE_=pmc_unit_test_rxn_arrhenius \
+      ;UNIT_TEST_TYPE_=unit_test_rxn_arrhenius_t()")
+  target_link_libraries(unit_test_rxn_arrhenius_gpu partmclib_gpu)
+endif()
+
 ######################################################################
 # test_camp_core
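Note on the build layout above: partmclib and partmclib_gpu (and the matching *_gpu test executables) are compiled from the same source lists; the only difference is that the GPU targets carry the PMC_USE_GPU definition, so the CPU/GPU split is resolved at compile time inside the sources. A minimal sketch of that dispatch pattern, with illustrative function names that are not part of this patch:

/* Sketch: compile-time CPU/GPU dispatch keyed on PMC_USE_GPU.
 * calc_deriv_cpu()/calc_deriv_gpu_path() are illustrative names. */
#include <stdio.h>

static void calc_deriv_cpu(void)      { printf("deriv on CPU\n"); }
static void calc_deriv_gpu_path(void) { printf("deriv on GPU\n"); }

void calc_deriv(void) {
#ifdef PMC_USE_GPU
  calc_deriv_gpu_path();  /* partmclib_gpu: built with PMC_USE_GPU defined */
#else
  calc_deriv_cpu();       /* partmclib: plain CPU build */
#endif
}

Because the macro is tested with #ifdef, the "PMC_USE_GPU=ON" value given to target_compile_definitions behaves the same as a bare definition.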
diff --git a/src/camp_common.h b/src/camp_common.h
index 8f6fd5049..68ecee258 100644
--- a/src/camp_common.h
+++ b/src/camp_common.h
@@ -59,6 +59,12 @@ typedef struct {
   int param_id;   // sub model Jacobian id
 } JacMap;

+/* GPU data structure (not yet used) */
+typedef struct {
+  size_t deriv_size;
+} DataGPU;
+
 /* Model data structure */
 typedef struct {
   int n_per_cell_state_var;  // number of state variables per grid cell
@@ -167,6 +173,30 @@ typedef struct {
                                 // for the current grid cell
   int n_sub_model_env_data;     // Number of sub model environmental parameters
                                 // from all sub models
+  // GPU data (TODO: guard with an #ifdef once a CUDA_FOUND define is wired up)
+  double *deriv_gpu_data;
+  double *deriv_cpu;
+  double *jac_gpu_data;
+  double *jac_cpu;
+  size_t deriv_size;
+  size_t jac_size;
+  size_t state_size;
+  size_t env_size;
+  size_t rate_constants_size;
+  size_t rate_constants_idx_size;
+  bool few_data;
+  bool implemented_all;
+  int *int_pointer;
+  int *int_pointer_gpu;
+  double *double_pointer;
+  double *double_pointer_gpu;
+  double *state_gpu;
+  double *env_gpu;
+  double *rate_constants_gpu;
+  int *rate_constants_idx_gpu;
+
 } ModelData;

 /* Solver data structure */
diff --git a/src/camp_debug.h b/src/camp_debug.h
index 58bac669d..5f86fe1e6 100644
--- a/src/camp_debug.h
+++ b/src/camp_debug.h
@@ -5,6 +5,8 @@
  * Header file with some debugging functions for use with camp_solver.c
  *
  */
+#ifndef CAMP_DEBUG_H
+#define CAMP_DEBUG_H
 #ifdef PMC_DEBUG
 #define PMC_DEBUG_SPEC_ 0
 #define PMC_DEBUG_PRINT(x) \
@@ -146,3 +148,5 @@ static void print_derivative(N_Vector deriv) {
     printf(" index: %d \n", i);
   }
 }
+
+#endif
\ No newline at end of file
diff --git a/src/camp_solver.c b/src/camp_solver.c
index 97bb01fa7..bdae8594b 100644
--- a/src/camp_solver.c
+++ b/src/camp_solver.c
@@ -337,8 +337,9 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
   sd->model_data.sub_model_env_idx[0] = 0;

 #ifdef PMC_USE_GPU
-  solver_new_gpu_cu(n_dep_var, n_state_var, n_rxn, n_rxn_int_param,
-                    n_rxn_float_param, n_rxn_env_param, n_cells);
+  solver_new_gpu_cu(&(sd->model_data), n_dep_var, n_state_var, n_rxn,
+                    n_rxn_int_param, n_rxn_float_param, n_rxn_env_param,
+                    n_cells);
 #endif

 #ifdef PMC_DEBUG
@@ -452,11 +453,6 @@ void solver_initialize(void *solver_data, double *abs_tol, double rel_tol,
   flag = CVodeSetDlsGuessHelper(sd->cvode_mem, guess_helper);
   check_flag_fail(&flag, "CVodeSetDlsGuessHelper", 1);

-// Allocate Jacobian on GPU
-#ifdef PMC_USE_GPU
-  allocate_jac_gpu(sd->model_data.n_per_cell_solver_jac_elem, n_cells);
-#endif
-
 // Set gpu rxn values
 #ifdef PMC_USE_GPU
   solver_set_rxn_data_gpu(&(sd->model_data));
@@ -969,7 +965,8 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,

 #ifdef PMC_USE_GPU
   // Calculate the Jacobian
-  rxn_calc_jac_gpu(md, J, time_step);
+  // TODO: Fix the Jacobian mapping with jac_map[i_map].rxn_id
+  // rxn_calc_jac_gpu(md, J, time_step);
 #endif

 #ifdef PMC_DEBUG
@@ -978,7 +975,6 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
 #endif

   // Solving on CPU only
-
   // Loop over the grid cells to calculate sub-model and rxn Jacobians
   for (int i_cell = 0; i_cell < n_cells; ++i_cell) {
     // Set the grid cell state pointers
@@ -1009,18 +1005,20 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
     clock_t start = clock();
 #endif

-#ifndef PMC_USE_GPU
+    // The GPU Jacobian is disabled for now, so calculate the full reaction
+    // Jacobian on the CPU in every build. TODO: restore the GPU/CPU split
+    // (rxn_calc_jac_specific_types) once the GPU Jacobian mapping is fixed.
     // Calculate the reaction Jacobian
     rxn_calc_jac(md, J_rxn_data, time_step);
     PMC_DEBUG_JAC(md->J_rxn, "reaction Jacobian");
-#else
-    // Add contributions from reactions not implemented on GPU
-    rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
-#endif

-//  rxn_calc_jac_specific_types(md, J_rxn_data, time_step);
 #ifdef PMC_DEBUG
     clock_t end = clock();
     sd->timeJac += (end - start);
@@ -1868,7 +1866,7 @@ void error_handler(int error_code, const char *module, const char *function,
  */
 void model_free(ModelData model_data) {
 #ifdef PMC_USE_GPU
-  // free_gpu_cu();
+  free_gpu_cu();
 #endif

 #ifdef PMC_USE_SUNDIALS
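Moving the GPU buffers into ModelData (and threading &(sd->model_data) through solver_new_gpu_cu above) is what lets several solver instances own independent device allocations. A minimal sketch of the difference, assuming a cut-down MiniModelData stand-in for ModelData: with per-instance pointers a one-cell and a multi-cell solver can coexist, while with the old .cu-file globals the second allocation would overwrite the first solver's device pointer.

/* Sketch: per-instance GPU buffers instead of file-scope globals. */
#include <cuda_runtime.h>

typedef struct {
  double *deriv_gpu_data;  /* device derivative buffer for this solver */
  size_t deriv_size;
} MiniModelData;

void mini_solver_new_gpu(MiniModelData *md, int n_dep_var, int n_cells) {
  md->deriv_size = (size_t)n_dep_var * n_cells * sizeof(double);
  cudaMalloc((void **)&md->deriv_gpu_data, md->deriv_size);
}

int main(void) {
  MiniModelData one_cell, multi_cell;
  mini_solver_new_gpu(&one_cell, 100, 1);    /* independent buffers:          */
  mini_solver_new_gpu(&multi_cell, 100, 64); /* no aliasing between solvers   */
  cudaFree(one_cell.deriv_gpu_data);
  cudaFree(multi_cell.deriv_gpu_data);
  return 0;
}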
diff --git a/src/camp_solver_data.F90 b/src/camp_solver_data.F90
index cb694c908..5bfb4c50a 100644
--- a/src/camp_solver_data.F90
+++ b/src/camp_solver_data.F90
@@ -888,7 +888,8 @@ subroutine solve(this, camp_state, t_initial, t_final, solver_stats)
     end if

     ! Reset the solver function timers
-    call this%reset_timers( )
+    ! Move this to initialization: the derivative should be printed only
+    ! once, not on every call to solve()
+    !call this%reset_timers( )
 #endif

     ! Run the solver
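One observation on the hunk that follows: counterDeriv, counterJac and the timers remain file-scope globals in camp_gpu_solver.cu, so they are shared by every solver instance even though the buffers now live in ModelData. The DataGPU struct added to camp_common.h (currently holding only deriv_size and otherwise unused) looks like a natural home for them; a hedged sketch of that extension, where every field beyond deriv_size is an illustrative addition rather than patch code:

/* Sketch: per-solver debug statistics folded into DataGPU. */
#include <stddef.h>
#include <time.h>

typedef struct {
  size_t deriv_size;
  int counterDeriv;   /* total calls to f() for this solver */
  int counterJac;     /* total calls to Jac() for this solver */
  clock_t timeDeriv;  /* accumulated compute time in f() */
  clock_t timeJac;    /* accumulated compute time in Jac() */
} DataGPU;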
diff --git a/src/cuda/camp_gpu_solver.cu b/src/cuda/camp_gpu_solver.cu
index d4d021709..990bbe7c7 100644
--- a/src/cuda/camp_gpu_solver.cu
+++ b/src/cuda/camp_gpu_solver.cu
@@ -27,30 +27,12 @@ extern "C" {
 int max_n_gpu_thread;
 int max_n_gpu_blocks;

-//General data
-double *deriv_gpu_data;
-double *deriv_cpu;
-double *jac_gpu_data;
-double *jac_cpu;
-size_t deriv_size;
-size_t jac_size;
-size_t state_size;
-size_t env_size;
-size_t rate_constants_size;
-size_t rate_constants_idx_size;
-
-bool few_data = 0;
-bool implemented_all = true;
-int *int_pointer;
-int *int_pointer_gpu;
-double *double_pointer;
-double *double_pointer_gpu;
-double *state_gpu;
-double *state_cpu;
-double *env_gpu;
-double *rate_constants_gpu;
-double *rate_constants_cpu;
-int *rate_constants_idx_gpu;
+// Debug info
+int counterDeriv;   // Total calls to f()
+int counterJac;     // Total calls to Jac()
+clock_t timeDeriv;  // Compute time for calls to f()
+clock_t timeJac;    // Compute time for calls to Jac()

 static void HandleError(cudaError_t err,
                         const char *file,
@@ -81,21 +63,24 @@ static void HandleError2(const char *file,
  * \param n_rxn_float_param Total number of floating-point reaction parameters
  * \param n_cells Number of grid cells to solve simultaneously
  */
-void solver_new_gpu_cu(int n_dep_var,
+void solver_new_gpu_cu(ModelData *model_data, int n_dep_var,
                        int n_state_var, int n_rxn,
                        int n_rxn_int_param, int n_rxn_float_param,
                        int n_rxn_env_param, int n_cells) {

-  //Lengths
-  state_size = n_state_var*n_cells * sizeof(double);
-  deriv_size = n_dep_var*n_cells * sizeof(double);
-  env_size = PMC_NUM_ENV_PARAM_*n_cells * sizeof(double); //Temp and pressure
-  rate_constants_size = n_rxn_env_param * n_cells * sizeof(double);
-  rate_constants_idx_size = (n_rxn+1) * sizeof(int);
+  //Lengths
+  model_data->state_size = n_state_var * n_cells * sizeof(double);
+  model_data->deriv_size = n_dep_var * n_cells * sizeof(double);
+  model_data->env_size = PMC_NUM_ENV_PARAM_ * n_cells * sizeof(double); //Temperature and pressure
+  model_data->rate_constants_size = n_rxn_env_param * n_cells * sizeof(double);
+  model_data->rate_constants_idx_size = (n_rxn+1) * sizeof(int);
+  model_data->few_data = 0;
+  model_data->implemented_all = true;
+  counterDeriv = 0;
+  counterJac = 0;

   //Detect whether we are working with only a few data values
   if (n_dep_var*n_cells < DATA_SIZE_LIMIT_OPT){
-    few_data = 1;
+    model_data->few_data = 1;
   }

   //Set the working GPU: there are 4 GPUs available on Power9; by default, GPU 0 is used
@@ -112,18 +97,17 @@ void solver_new_gpu_cu(int n_dep_var,
   int n_blocks = (n_rxn + max_n_gpu_thread - 1) / max_n_gpu_thread;

   //GPU allocation
-  cudaMalloc((void **) &deriv_gpu_data, deriv_size);
-  cudaMalloc((void **) &state_gpu, state_size);
-  cudaMalloc((void **) &env_gpu, env_size);
-  cudaMalloc((void **) &rate_constants_gpu, rate_constants_size);
-  cudaMalloc((void **) &rate_constants_idx_gpu, rate_constants_idx_size);
+  cudaMalloc((void **) &model_data->deriv_gpu_data, model_data->deriv_size);
+  cudaMalloc((void **) &model_data->state_gpu, model_data->state_size);
+  cudaMalloc((void **) &model_data->env_gpu, model_data->env_size);
+  cudaMalloc((void **) &model_data->rate_constants_gpu, model_data->rate_constants_size);
+  cudaMalloc((void **) &model_data->rate_constants_idx_gpu, model_data->rate_constants_idx_size);

   //GPU allocation of the few-data case on pinned memory
-  if(few_data){
+  if(model_data->few_data){
     //Note: auxiliary variables are needed because variables that were
     //initialized earlier cannot be pinned directly
-    cudaMallocHost((void**)&rate_constants_cpu, rate_constants_size);
-    cudaMallocHost((void**)&deriv_cpu, deriv_size);
+    cudaMallocHost((void**)&model_data->deriv_cpu, model_data->deriv_size);
   }

   // Warning if exceeding GPU limits
@@ -138,22 +122,6 @@ void solver_new_gpu_cu(int n_dep_var,

 }

-/** \brief Allocate Jacobian on GPU
-*
-* \param n_jac_elem Number of data elements on the jacobian
-* \param n_cells Number of cells to compute
-*/
-void allocate_jac_gpu(int n_jac_elem, int n_cells){
-
-  jac_size = n_jac_elem * n_cells * sizeof(double);
-  cudaMalloc((void **) &jac_gpu_data, jac_size);
-
-  if(few_data){
-    cudaMallocHost((void**)&jac_cpu, jac_size);
-  }
-
-}
-
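The few_data path above depends on pinned (page-locked) host memory: cudaMallocHost returns a buffer the driver can DMA into directly, which makes the small device-to-host copies in rxn_calc_deriv_gpu cheaper than staging through pageable memory. A self-contained sketch of the same pattern, with an illustrative size:

/* Sketch: pinned host buffer for a small device-to-host transfer. */
#include <cuda_runtime.h>

int main(void) {
  size_t deriv_size = 100 * sizeof(double);
  double *deriv_gpu, *deriv_pinned;

  cudaMalloc((void **)&deriv_gpu, deriv_size);
  cudaMallocHost((void **)&deriv_pinned, deriv_size);  /* pinned, not pageable */

  /* ... a kernel would fill deriv_gpu here ... */
  cudaMemcpy(deriv_pinned, deriv_gpu, deriv_size, cudaMemcpyDeviceToHost);

  cudaFreeHost(deriv_pinned);
  cudaFree(deriv_gpu);
  return 0;
}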
 /** \brief Set reaction data on the GPU's prepared structure. RXN data is
  * divided into two matrices, one for double data and one for int data.
  * The matrices are transposed to improve memory access on the GPU.
  *
  * \param model_data Pointer to the model data
  */
@@ -164,6 +132,7 @@ void allocate_jac_gpu(int n_jac_elem, int n_cells){
 void solver_set_rxn_data_gpu(ModelData *model_data) {

   int n_rxn = model_data->n_rxn;
+  int n_cells = model_data->n_cells;
   unsigned int int_max_length = 0;
   unsigned int double_max_length = 0;
@@ -226,7 +195,7 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
 #ifdef FAILURE_DETAIL
       printf("WARNING: Reaction type %d is not fully implemented on GPU. Computing on CPU...\n", rxn_type);
 #endif
-      implemented_all=false;
+      model_data->implemented_all=false;
     }

   //Get RXN lengths
@@ -242,8 +211,6 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {

   }

-  //todo checkout chem_mod after 99 merge, save the changed files of this branch (like gpu) copy into checkout chem_mod
-
   //Add a loop to find the largest int_max_length distance (ptrs[i] - ptrs[i-1])

   //Total lengths of rxn structure
@@ -252,15 +219,15 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {

   //Allocate int and double rxn data separately
   //Add -1 to avoid out-of-range access and to have a square matrix
-  int_pointer = (int *) malloc(rxn_int_length * sizeof(int));
-  memset(int_pointer, -1, rxn_int_length * sizeof(int));
+  model_data->int_pointer = (int *) malloc(rxn_int_length * sizeof(int));
+  memset(model_data->int_pointer, -1, rxn_int_length * sizeof(int));

   //Add 0 to avoid out-of-range access and to have a square matrix
-  double_pointer = (double*)calloc(rxn_double_length, sizeof(double));
+  model_data->double_pointer = (double*)calloc(rxn_double_length, sizeof(double));

   //GPU allocation
-  cudaMalloc((void **) &int_pointer_gpu, rxn_int_length * sizeof(int));
-  cudaMalloc((void **) &double_pointer_gpu, rxn_double_length * sizeof(double));
+  cudaMalloc((void **) &model_data->int_pointer_gpu, rxn_int_length * sizeof(int));
+  cudaMalloc((void **) &model_data->double_pointer_gpu, rxn_double_length * sizeof(double));

   //Update number of zeros added on each reaction
   for (int i_rxn = 0; i_rxn < n_rxn; i_rxn++)
@@ -284,11 +251,11 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
     int i_pos=rxn_position[i_rxn];//i_rxn;//rxn_position[i_rxn];//for bubblesort
     for (int j = 0; j < int_lengths[i_pos]; j++){
       int *rxn_int_data = &(model_data->rxn_int_data[model_data->rxn_int_indices[i_pos]]);
-      int_pointer[n_rxn*j + i_rxn] = rxn_int_data[j];
+      model_data->int_pointer[n_rxn*j + i_rxn] = rxn_int_data[j];
     }
     for (int j = 0; j < double_lengths[i_pos]; j++) {
       double *rxn_float_data = &(model_data->rxn_float_data[model_data->rxn_float_indices[i_pos]]);
-      double_pointer[n_rxn*j + i_rxn] = rxn_float_data[j];
+      model_data->double_pointer[n_rxn*j + i_rxn] = rxn_float_data[j];
     }
     //Reorder the rate indices
     //todo: update the main code so the rate_constants are read consecutively on the CPU
   }

   //Save data to GPU
-  HANDLE_ERROR(cudaMemcpy(int_pointer_gpu, int_pointer, rxn_int_length*sizeof(int), cudaMemcpyHostToDevice));
-  HANDLE_ERROR(cudaMemcpy(double_pointer_gpu, double_pointer, rxn_double_length*sizeof(double), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(model_data->int_pointer_gpu, model_data->int_pointer, rxn_int_length*sizeof(int), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(model_data->double_pointer_gpu, model_data->double_pointer, rxn_double_length*sizeof(double), cudaMemcpyHostToDevice));

   //Set rate_constants-idx
-  HANDLE_ERROR(cudaMemcpy(rate_constants_idx_gpu, rate_constants_idx_aux, rate_constants_idx_size, cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(model_data->rate_constants_idx_gpu, rate_constants_idx_aux, model_data->rate_constants_idx_size, cudaMemcpyHostToDevice));
+
+  //Allocate the Jacobian
+  model_data->jac_size = model_data->n_per_cell_solver_jac_elem * n_cells * sizeof(double);
+  cudaMalloc((void **) &model_data->jac_gpu_data, model_data->jac_size);
+
+  if(model_data->few_data){
+    cudaMallocHost((void**)&model_data->jac_cpu, model_data->jac_size);
+  }

 }
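The "reversed" layout built above, with element j of reaction i_rxn stored at n_rxn*j + i_rxn, is what keeps the kernels' global-memory reads coalesced: consecutive threads handle consecutive reactions, so each warp-wide load touches consecutive addresses. An illustrative kernel (not part of the patch) showing the access pattern:

/* Sketch: coalesced reads from the transposed reaction-data matrix. */
__global__ void read_coalesced(const double *rxn_double, double *out,
                               int n_rxn, int n_params) {
  int i_rxn = blockIdx.x * blockDim.x + threadIdx.x;
  if (i_rxn >= n_rxn) return;
  double acc = 0.0;
  for (int j = 0; j < n_params; j++)
    acc += rxn_double[n_rxn * j + i_rxn];  /* stride-1 across the warp */
  out[i_rxn] = acc;
}

With the more obvious i_rxn * n_params + j layout, the same loop would stride by n_params across the warp and the loads would serialize.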
@@ -310,11 +285,11 @@ void solver_set_rxn_data_gpu(ModelData *model_data) {
  * \param deriv_init Pointer to first value of derivative array
  * \param time_step Current time step being computed (s)
  * \param deriv_length_cell Derivative length for one cell
- * \param state_size_cell Derivative length for one cell
+ * \param state_size_cell State size for one cell
  * \param n_rxn Number of reactions to include
  * \param n_cells_gpu Number of cells to compute
  * \param int_pointer Pointer to integer reaction data
  * \param double_pointer Pointer to double reaction data
  * \param rate_constants_init Pointer to first value of reaction rates
  */
 __global__ void solveDerivative(double *state_init, double *deriv_init,
@@ -408,6 +383,14 @@ __global__ void solveDerivative(double *state_init, double *deriv_init,

 }

+static void print_derivative_3(N_Vector deriv) {
+  // printf(" deriv length: %d\n", NV_LENGTH_S(deriv));
+  for (int i = 0; i < NV_LENGTH_S(deriv); i++) {
+    printf(" deriv: % -le", NV_DATA_S(deriv)[i]);
+    printf(" index: %d \n", i);
+  }
+}
+
 /** \brief Calculate the time derivative \f$f(t,y)\f$ on GPU
  *
  * \param model_data Pointer to the model data
@@ -425,41 +408,49 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
   double *state = model_data->total_state;
   double *rate_constants = model_data->rxn_env_data;

+  //TODO: not working with the new Arrhenius test; it segfaults for some
+  //reason. To fix it, move deriv_size and the related fields into a GPU
+  //struct, so the multi-cell and one-cell cores each have their own struct
+  //and data (as global variables they were not picked up correctly and were
+  //not overwritten for some reason).
+
   //Faster; use for few values
-  if (few_data){
+  if (model_data->few_data){
     //Passing data as function parameters has a theoretical maximum of 4 KB of data
-    state_gpu= state;
-    rate_constants_gpu= rate_constants;
+    model_data->state_gpu= state;
+    model_data->rate_constants_gpu= rate_constants;
   }
   //Slower; use for large values
   else{
-    HANDLE_ERROR(cudaMemcpy(state_gpu, state, state_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->state_gpu, state, model_data->state_size, cudaMemcpyHostToDevice));
     //todo: rate_constants only change on each time step, not on each deriv iteration
-    HANDLE_ERROR(cudaMemcpy(rate_constants_gpu, rate_constants, rate_constants_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->rate_constants_gpu, rate_constants, model_data->rate_constants_size, cudaMemcpyHostToDevice));
  }

-  HANDLE_ERROR(cudaMemset(deriv_gpu_data, 0, deriv_size));
+  HANDLE_ERROR(cudaMemset(model_data->deriv_gpu_data, 0, model_data->deriv_size));

+  //TODO: launch this asynchronously and let the CPU compute other work in the meantime
   solveDerivative << < (n_blocks), max_n_gpu_thread >> >
-    (state_gpu, deriv_gpu_data, time_step, model_data->n_per_cell_dep_var,
-     model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
-     n_rxn, n_cells,
-     int_pointer_gpu, double_pointer_gpu, rate_constants_gpu, rate_constants_idx_gpu);
+    (model_data->state_gpu, model_data->deriv_gpu_data, time_step, model_data->n_per_cell_dep_var,
+     model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
+     n_rxn, n_cells,
+     model_data->int_pointer_gpu, model_data->double_pointer_gpu, model_data->rate_constants_gpu, model_data->rate_constants_idx_gpu);

   cudaDeviceSynchronize();  // Ensure the kernel has finished

   //Use pinned memory for few values
-  if (few_data){
-    HANDLE_ERROR(cudaMemcpy(deriv_cpu, deriv_gpu_data, deriv_size, cudaMemcpyDeviceToHost));
-    memcpy(deriv_data, deriv_cpu, deriv_size);
+  if (model_data->few_data){
+    HANDLE_ERROR(cudaMemcpy(model_data->deriv_cpu, model_data->deriv_gpu_data, model_data->deriv_size, cudaMemcpyDeviceToHost));
+    memcpy(deriv_data, model_data->deriv_cpu, model_data->deriv_size);
   }
   else {
-    HANDLE_ERROR(cudaMemcpy(deriv_data, deriv_gpu_data, deriv_size, cudaMemcpyDeviceToHost));
+    HANDLE_ERROR(cudaMemcpy(deriv_data, model_data->deriv_gpu_data, model_data->deriv_size, cudaMemcpyDeviceToHost));
   }

+  //if(counterDeriv==0) print_derivative_3(deriv);
-  //TODO Calculate on CPU non gpu-translated type reactions (HL & SIMPOL phase transfer) (or wait till v2.0 with C++)
+  //counterDeriv++;

+  //todo: add mock_monarch as a GPU test?
+  //todo: clean up the debug utilities; camp_debug.h needs a rework, and it
+  //would be better to print timeDeriv or the counters
 }

@@ -469,14 +460,13 @@ void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_ste
  *
  * \param jac_init Pointer to first value of jacobian array
  * \param time_step Current time step being computed (s)
  * \param jac_length_cell Jacobian length for one cell
- * \param state_size_cell jacobian length for one cell
+ * \param state_size_cell State size for one cell
  * \param n_rxn Number of reactions to include
  * \param n_cells_gpu Number of cells to compute
  * \param int_pointer Pointer to integer reaction data
  * \param double_pointer Pointer to double reaction data
  * \param rate_constants_init Pointer to first value of reaction rates
  */
-//TODO: fix jacGPU once matt let me modify rxn_solver and reduce jacobians (in v2.0 or before if needed)
 __global__ void solveJacobian(double *state_init, double *jac_init,
                               double time_step, int jac_length_cell,
                               int state_size_cell, int rate_constants_size_cell, int n_rxn,
@@ -562,15 +552,29 @@ __global__ void solveJacobian(double *state_init, double *jac_init,
   }
 }

+static void print_jacobian_3(SUNMatrix M) {
+  printf("\n NNZ JAC: %lld \n", SM_NNZ_S(M));
+  printf("DATA | INDEXVALS:\n");
+  for (int i = 0; i < SM_NNZ_S(M); i++) {
+    printf("% -le \n", (SM_DATA_S(M))[i]);
+    // printf("%lld \n", (SM_INDEXVALS_S(M))[i]);
+  }
+  printf("PTRS:\n");
+  for (int i = 0; i <= SM_NP_S(M); i++) {
+    //printf("%lld \n", (SM_INDEXPTRS_S(M))[i]);
+  }
+}
+
 /** \brief Calculate the Jacobian on GPU
  *
  * \param model_data Pointer to the model data
  * \param J Jacobian to be calculated
  * \param time_step Current model time step (s)
  */
 void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step) {

+  //TODO: Fix the Jacobian mapping with the jac_ids
+
   // Get a pointer to the jacobian data
   double *jac_data = SM_DATA_S(jac);
   int n_cells = model_data->n_cells;
@@ -581,56 +585,61 @@ void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step)
   double *rate_constants = model_data->rxn_env_data;

   //Faster; use for few values
-  if (few_data){
+  if (model_data->few_data){
     //Passing data as function parameters has a theoretical maximum of 4 KB of data
-    state_gpu= state;
-    rate_constants_gpu= rate_constants;
+    model_data->state_gpu= state;
+    model_data->rate_constants_gpu= rate_constants;
   }
   //Slower; use for large values
   else{
-    HANDLE_ERROR(cudaMemcpy(state_gpu, state, state_size, cudaMemcpyHostToDevice));
-    HANDLE_ERROR(cudaMemcpy(rate_constants_gpu, rate_constants, rate_constants_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->state_gpu, state, model_data->state_size, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(model_data->rate_constants_gpu, rate_constants, model_data->rate_constants_size, cudaMemcpyHostToDevice));
  }

-  HANDLE_ERROR(cudaMemset(jac_gpu_data, 0, jac_size));
+  HANDLE_ERROR(cudaMemset(model_data->jac_gpu_data, 0, model_data->jac_size));

   solveJacobian << < (n_blocks), max_n_gpu_thread >> >
-    (state_gpu, jac_gpu_data, time_step, model_data->n_per_cell_rxn_jac_elem,
+    (model_data->state_gpu, model_data->jac_gpu_data, time_step, model_data->n_per_cell_rxn_jac_elem,
      model_data->n_per_cell_state_var, model_data->n_rxn_env_data,
-     n_rxn, n_cells, int_pointer_gpu, double_pointer_gpu, rate_constants_gpu, rate_constants_idx_gpu);
+     n_rxn, n_cells, model_data->int_pointer_gpu, model_data->double_pointer_gpu, model_data->rate_constants_gpu, model_data->rate_constants_idx_gpu);

   cudaDeviceSynchronize();  // Ensure the kernel has finished

   //Use pinned memory for few values
-  if (few_data){
-    HANDLE_ERROR(cudaMemcpy(jac_cpu, jac_gpu_data, jac_size, cudaMemcpyDeviceToHost));
-    memcpy(jac_data, jac_cpu, jac_size);
+  if (model_data->few_data){
+    HANDLE_ERROR(cudaMemcpy(model_data->jac_cpu, model_data->jac_gpu_data, model_data->jac_size, cudaMemcpyDeviceToHost));
+    memcpy(jac_data, model_data->jac_cpu, model_data->jac_size);
   }
   else {
-    HANDLE_ERROR(cudaMemcpy(jac_data, jac_gpu_data, jac_size, cudaMemcpyDeviceToHost));
+    HANDLE_ERROR(cudaMemcpy(jac_data, model_data->jac_gpu_data, model_data->jac_size, cudaMemcpyDeviceToHost));
   }

+  if(counterJac==0) print_jacobian_3(jac);
+
+  counterJac++;
+
 }

 /** \brief Free GPU data structures */
 void free_gpu_cu() {
+// TODO: the device pointers now live in ModelData, which this void
+// signature cannot reach, so the frees below are disabled for now
+/*
+  HANDLE_ERROR(cudaFree(model_data->int_pointer_gpu));
+  HANDLE_ERROR(cudaFree(model_data->double_pointer_gpu));
+  HANDLE_ERROR(cudaFree(model_data->deriv_gpu_data));
+  //HANDLE_ERROR(cudaFree(model_data->jac_gpu_data));

-  HANDLE_ERROR(cudaFree(int_pointer_gpu));
-  HANDLE_ERROR(cudaFree(double_pointer_gpu));
-  HANDLE_ERROR(cudaFree(deriv_gpu_data));
-  HANDLE_ERROR(cudaFree(jac_gpu_data));
-
-  if(few_data){
+  if(model_data->few_data){
   }
   else{
-    HANDLE_ERROR(cudaFree(state_gpu));
-    HANDLE_ERROR(cudaFree(rate_constants_gpu));
-    HANDLE_ERROR(cudaFree(rate_constants_idx_gpu));
+    HANDLE_ERROR(cudaFree(model_data->state_gpu));
+    HANDLE_ERROR(cudaFree(model_data->rate_constants_gpu));
+    HANDLE_ERROR(cudaFree(model_data->rate_constants_idx_gpu));
   }
+*/
 }

 /* Auxiliary functions */
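free_gpu_cu() above stays commented out because the device pointers now live in ModelData, which its void signature cannot reach. Mirroring solver_new_gpu_cu and passing the model data through would let the frees return; a sketch of that suggested signature (a suggestion with a cut-down struct, not patch code):

/* Sketch: free per-instance GPU buffers through the model data. */
#include <cuda_runtime.h>

typedef struct {
  double *deriv_gpu_data;
  int *int_pointer_gpu;
} MiniModelData;

void free_gpu_cu_v2(MiniModelData *model_data) {
  cudaFree(model_data->deriv_gpu_data);
  cudaFree(model_data->int_pointer_gpu);
}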
"../debug_and_stats/camp_debug_2.h" //Value to consider data size too big -> Memory optimization will change below and under the limit #define DATA_SIZE_LIMIT_OPT 2000 @@ -24,9 +24,8 @@ //Force solving on CPU: Test option #define FORCE_CPU 0 -void solver_new_gpu_cu(int n_dep_var, int n_state_var, int n_rxn, +void solver_new_gpu_cu(ModelData *model_data, int n_dep_var, int n_state_var, int n_rxn, int n_rxn_int_param, int n_rxn_float_param, int n_rxn_env_param, int n_cells); -void allocate_jac_gpu(int n_jac_elem, int n_cells); void rxn_update_env_state_gpu(ModelData *model_data, double *env); void rxn_calc_deriv_gpu(ModelData *model_data, N_Vector deriv, realtype time_step); void rxn_calc_jac_gpu(ModelData *model_data, SUNMatrix jac, realtype time_step); diff --git a/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu b/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu index a944e0f6d..85220dcec 100644 --- a/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu +++ b/src/cuda/rxns_gpu/rxn_SIMPOL_phase_transfer.cu @@ -17,7 +17,6 @@ extern "C"{ #define TEMPERATURE_K_ env_data[0] #define PRESSURE_PA_ env_data[1] - //todo fix // Universal gas constant (J/mol/K) #define UNIV_GAS_CONST_ 8.314472 // Small number for ignoring low concentrations diff --git a/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu b/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu index 98fd0da15..4746ba708 100644 --- a/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu +++ b/src/cuda/rxns_gpu/rxn_aqueous_equilibrium.cu @@ -143,10 +143,11 @@ __device__ double rxn_gpu_aqueous_equilibrium_calc_overall_rate(int *rxn_data, */ #ifdef PMC_USE_SUNDIALS -#ifdef PMC_USE_GPU -__device__ -#endif -void rxn_gpu_aqueous_equilibrium_calc_deriv_contrib(double *rate_constants, double *state, +//TODO: Apply this to cpu reactions, merging cpu and gpu code +//#ifdef PMC_USE_GPU +//__device__ +//#endif +__device__ void rxn_gpu_aqueous_equilibrium_calc_deriv_contrib(double *rate_constants, double *state, double *deriv, void *rxn_data, double * double_pointer_gpu, double time_step, int n_rxn2) { int n_rxn=n_rxn2; diff --git a/src/cuda/rxns_gpu/rxn_arrhenius.cu b/src/cuda/rxns_gpu/rxn_arrhenius.cu index 73b177825..22504e4b5 100644 --- a/src/cuda/rxns_gpu/rxn_arrhenius.cu +++ b/src/cuda/rxns_gpu/rxn_arrhenius.cu @@ -247,28 +247,25 @@ __device__ void rxn_gpu_arrhenius_calc_jac_contrib(double *rate_constants, doubl // Calculate the reaction rate //double rate = RATE_CONSTANT_; - double rate = rate_constants[0]; - for (int i_spec=0; i_spec +#include + +/** \brief Print derivative array + * + * \param deriv Derivative array + */ +static void print_derivative_2(N_Vector deriv) { + // printf(" deriv length: %d\n", NV_LENGTH_S(deriv)); + for (int i = 0; i < NV_LENGTH_S(deriv); i++) { // NV_LENGTH_S(deriv) + printf(" deriv: % -le", NV_DATA_S(deriv)[i]); + printf(" index: %d \n", i); + } +} \ No newline at end of file diff --git a/src/debug_and_stats/camp_debug_2.h b/src/debug_and_stats/camp_debug_2.h new file mode 100644 index 000000000..71bf7a09e --- /dev/null +++ b/src/debug_and_stats/camp_debug_2.h @@ -0,0 +1,12 @@ +// +// Created by cguzman on 10/24/19. 
+// + +#ifndef CAMP_DEBUG_2_H +#define CAMP_DEBUG_2_H + +#include "../camp_common.h" + +static void print_derivative_2(N_Vector deriv); + +#endif // CAMP_DEBUG_2_H diff --git a/src/rxn_solver.c b/src/rxn_solver.c index 9984e85b8..6889ea301 100644 --- a/src/rxn_solver.c +++ b/src/rxn_solver.c @@ -510,6 +510,10 @@ void rxn_calc_jac_specific_types(ModelData *model_data, double *J_data, rxn_int_data, rxn_float_data, rxn_env_data, time_step); break; + case RXN_ARRHENIUS: // TODO delete arrhenius when it works + rxn_arrhenius_calc_jac_contrib(model_data, J_data, rxn_int_data, + rxn_float_data, rxn_env_data, time_step); + break; case RXN_CONDENSED_PHASE_ARRHENIUS: rxn_condensed_phase_arrhenius_calc_jac_contrib( model_data, J_data, rxn_int_data, rxn_float_data, rxn_env_data, diff --git a/test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh b/test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh new file mode 100755 index 000000000..b6e591b83 --- /dev/null +++ b/test/chemistry/cb05cl_ae5/test_chemistry_cb05cl_ae5_gpu.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# exit on error +set -e +# turn on command echoing +set -v +# make sure that the current directory is the one where this script is +cd ${0%/*} +# make the output directory if it doesn't exist +mkdir -p out + +((counter = 1)) +while [ true ] +do + echo Attempt $counter + +if ! ../../../test_chemistry_cb05cl_ae5_gpu; then + echo Failure "$counter" + if [ "$counter" -gt 1 ] + then + echo FAIL + exit 1 + fi + echo retrying... + else + echo PASS + exit 0 + fi + ((counter++)) +done diff --git a/test/monarch/pmc_monarch_interface.F90 b/test/monarch/pmc_monarch_interface.F90 index a61559d39..211258da8 100644 --- a/test/monarch/pmc_monarch_interface.F90 +++ b/test/monarch/pmc_monarch_interface.F90 @@ -390,7 +390,6 @@ subroutine integrate(this, start_time, time_step, i_start, i_end, j_start, & state_size_per_cell = this%camp_core%state_size_per_cell() end if - #ifdef PMC_DEBUG ! Evaluate the Jacobian during solving solver_stats%eval_Jac = .true. @@ -435,7 +434,6 @@ subroutine integrate(this, start_time, time_step, i_start, i_end, j_start, & call this%camp_core%solve(this%camp_state, & real(time_step, kind=dp), solver_stats = solver_stats) - #ifdef PMC_DEBUG ! Check the Jacobian evaluations call warn_assert_msg(611569150, solver_stats%Jac_eval_fails.eq.0,& @@ -444,8 +442,6 @@ subroutine integrate(this, start_time, time_step, i_start, i_end, j_start, & trim( to_string( start_time ) ) ) #endif - - ! Update the MONARCH tracer array with new species concentrations MONARCH_conc(i,j,k_flip,this%map_monarch_id(:)) = & this%camp_state%state_var(this%map_camp_id(:)) diff --git a/test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh b/test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh new file mode 100755 index 000000000..43e472eab --- /dev/null +++ b/test/unit_tests/input_files/run_rxn_arrhenius_gpu.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# exit on error +set -e +# turn on command echoing +set -v +# make sure that the current directory is the one where this script is +cd ${0%/*} +# make the output directory if it doesn't exist +mkdir -p out + +((counter = 1)) +while [ true ] +do + echo Attempt $counter + +if [[ $1 == "MPI" ]]; then + exec_str="mpirun -v -np 2 ../../../unit_test_rxn_arrhenius_gpu" +else + exec_str="../../../unit_test_rxn_arrhenius_gpu" +fi +if ! $exec_str; then + echo Failure "$counter" + if [ "$counter" -gt 5 ] + then + echo FAIL + exit 1 + fi + echo retrying... + else + echo PASS + exit 0 + fi + ((counter++)) +done