From cf4c23825a1aede77ea7723df55df89ef4dfde5d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 14 May 2020 00:07:50 -0400 Subject: [PATCH 1/4] A large overhaul of the HAN code. Among many other things: - Fix the segmentation logic for gather with MPI_IN_PLACE. - Fix MPI_IN_PLACE in allreduce and allgather. - Fix an imbalance bug in MPI_allgather - Accept more human readable configuration files. We can now specify the collective by name instead of a magic number, and the component we want to use also by name. - Add the capability to have optional arguments in the collective communication configuration file. Right now the capability exists for segment lengths, but is yet to be connected with the algorithms. Signed-off-by: Xi Luo Signed-off-by: George Bosilca --- ompi/mca/coll/base/coll_base_util.c | 257 +++- ompi/mca/coll/base/coll_base_util.h | 15 +- ompi/mca/coll/han/Makefile.am | 3 +- ompi/mca/coll/han/coll_han.h | 317 ++--- ompi/mca/coll/han/coll_han_allgather.c | 247 ++-- ompi/mca/coll/han/coll_han_allreduce.c | 201 ++- ompi/mca/coll/han/coll_han_bcast.c | 182 ++- ompi/mca/coll/han/coll_han_component.c | 253 +--- ompi/mca/coll/han/coll_han_dynamic.c | 1106 +++++++---------- ompi/mca/coll/han/coll_han_dynamic.h | 36 +- ompi/mca/coll/han/coll_han_dynamic_file.c | 460 +++---- ompi/mca/coll/han/coll_han_dynamic_file.h | 13 +- ompi/mca/coll/han/coll_han_gather.c | 291 +++-- ompi/mca/coll/han/coll_han_module.c | 116 +- ompi/mca/coll/han/coll_han_reduce.c | 78 +- ompi/mca/coll/han/coll_han_scatter.c | 120 +- ompi/mca/coll/han/coll_han_subcomms.c | 88 +- ompi/mca/coll/han/coll_han_topo.c | 69 +- ompi/mca/coll/han/coll_han_trigger.c | 12 +- ompi/mca/coll/han/coll_han_trigger.h | 18 +- ompi/mca/coll/han/coll_han_utils.c | 58 - .../coll/tuned/coll_tuned_decision_fixed.c | 3 +- ompi/mca/coll/tuned/coll_tuned_dynamic_file.c | 64 +- ompi/request/request.c | 2 +- 24 files changed, 1798 insertions(+), 2211 deletions(-) delete mode 100644
ompi/mca/coll/han/coll_han_utils.c diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 29b4a70caca..e6b1fde3d6e 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -29,6 +29,8 @@ #include "ompi/mca/topo/base/base.h" #include "ompi/mca/pml/pml.h" #include "coll_base_util.h" +#include "coll_base_functions.h" +#include int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, ompi_datatype_t* sdatatype, @@ -268,7 +270,7 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req, } else { scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm); } - + for (int i=0; icb.req_complete_cb = NULL; req->req_complete_cb_data = NULL; req->data.objs.objs[0] = NULL; @@ -309,35 +312,249 @@ OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, N /* File reading functions */ static void skiptonewline (FILE *fptr, int *fileline) { - do { - char val; - int rc; + char val; + int rc; + do { rc = fread(&val, 1, 1, fptr); - if (0 == rc) return; - if ((1 == rc)&&('\n' == val)) { + if (0 == rc) { + return; + } + if ('\n' == val) { (*fileline)++; return; - } + } } while (1); } -long ompi_coll_base_file_getnext (FILE *fptr, int *fileline) +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val) { + char trash; + int rc; + do { - long val; - int rc; - char trash; - - rc = fscanf(fptr, "%li", &val); - if (rc == EOF) return MYEOF; - if (1 == rc) return val; - /* in all other cases, 
skip to the end */ + rc = fscanf(fptr, "%li", val); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + return 0; + } + /* in all other cases, skip to the end of the token */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (rc == EOF) { + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val) +{ + char trash, token[32]; + int rc; + + *val = NULL; /* security in case we fail */ + do { + rc = fscanf(fptr, "%32s", token); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + if( '#' == token[0] ) { + skiptonewline(fptr, fileline); + continue; + } + *val = (char*)malloc(strlen(token) + 1); + strcpy(*val, token); + return 0; + } + /* in all other cases, skip to the end of the token */ + rc = fread(&trash, sizeof(char), 1, fptr); + if (rc == EOF) { + return -1; + } + if ('\n' == trash) (*fileline)++; + if ('#' == trash) { + skiptonewline (fptr, fileline); + } + } while (1); +} + +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val) +{ + char trash; + int rc; + + do { + rc = fscanf(fptr, "%" PRIsize_t, val); + if (rc == EOF) { + return -1; + } + if (1 == rc) { + return 0; + } + /* in all other cases, skip to the end of the token */ rc = fread(&trash, sizeof(char), 1, fptr); - if (rc == EOF) return MYEOF; + if (rc == EOF) { + return -1; + } if ('\n' == trash) (*fileline)++; if ('#' == trash) { skiptonewline (fptr, fileline); - } + } + } while (1); +} + +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected) +{ + char trash; + int rc; + + do { + rc = fread(&trash, sizeof(char), 1, fptr); + if (0 == rc) { /* hit the end of the file */ + return -1; + } + if ('\n' == trash) { + (*fileline)++; + continue; + } + if ('#' == trash) { + skiptonewline (fptr, fileline); + continue; + } + if( trash == expected ) + return 1; /* return true and eat the char */ + if( 
isblank(trash) ) /* skip all spaces if that's not what we were looking for */ + continue; + if( 0 != fseek(fptr, -1, SEEK_CUR) ) + return -1; + return 0; } while (1); } + +/** + * There are certainly simpler implementations of this function when performance + * is not a critical point. But, as this function is used during the collective + * configuration, and we can do this configuration once for each communicator, + * I would rather have a more complex but faster implementation. + * The approach here is to search for the largest common denominators, to create + * something similar to a dichotomic search. + */ +int mca_coll_base_name_to_colltype(const char* name) +{ + if( 'n' == name[0] ) { + if( 0 == strncmp(name, "neighbor_all", 12) ) { + if( 't' != name[12] ) { + if( 0 == strncmp(name+12, "gather", 6) ) { + if('\0' == name[18]) return NEIGHBOR_ALLGATHER; + if( 'v' == name[18]) return NEIGHBOR_ALLGATHERV; + } + } else { + if( 0 == strncmp(name+12, "toall", 5) ) { + if( '\0' == name[17] ) return NEIGHBOR_ALLTOALL; + if( 'v' == name[17] ) return NEIGHBOR_ALLTOALLV; + if( 'w' == name[17] ) return NEIGHBOR_ALLTOALLW; + } + } + } + return -1; + } + if( 'a' == name[0] ) { + if( 0 != strncmp(name, "all", 3) ) { + return -1; + } + if( 't' != name[3] ) { + if( 'r' == name[3] ) { + if( 0 == strcmp(name+3, "reduce") ) + return ALLREDUCE; + } else { + if( 0 == strncmp(name+3, "gather", 6) ) { + if( '\0' == name[9] ) return ALLGATHER; + if( 'v' == name[9] ) return ALLGATHERV; + } + } + } else { + if( 0 == strncmp(name+3, "toall", 5) ) { + if( '\0' == name[8] ) return ALLTOALL; + if( 'v' == name[8] ) return ALLTOALLV; + if( 'w' == name[8] ) return ALLTOALLW; + } + } + return -1; + } + if( 'r' > name[0] ) { + if( 'b' == name[0] ) { + if( 0 == strcmp(name, "barrier") ) + return BARRIER; + if( 0 == strcmp(name, "bcast") ) + return BCAST; + } else if( 'g'== name[0] ) { + if( 0 == strncmp(name, "gather", 6) ) { + if( '\0' == name[6] ) return GATHER; + if( 'v' == name[6] ) return
GATHERV; + } + } + if( 0 == strcmp(name, "exscan") ) + return EXSCAN; + return -1; + } + if( 's' > name[0] ) { + if( 0 == strncmp(name, "reduce", 6) ) { + if( '\0' == name[6] ) return REDUCE; + if( '_' == name[6] ) { + if( 0 == strncmp(name+7, "scatter", 7) ) { + if( '\0' == name[14] ) return REDUCESCATTER; + if( 0 == strcmp(name+14, "_block") ) return REDUCESCATTERBLOCK; + } + } + } + return -1; + } + if( 0 == strcmp(name, "scan") ) + return SCAN; + if( 0 == strcmp(name, "scatterv") ) + return SCATTERV; + if( 0 == strcmp(name, "scatter") ) + return SCATTER; + return -1; +} + +/* conversion table for all COLLTYPE_T values defined in ompi/mca/coll/base/coll_base_functions.h */ +static const char* colltype_translation_table[] = { + [ALLGATHER] = "allgather", + [ALLGATHERV] = "allgatherv", + [ALLREDUCE] = "allreduce", + [ALLTOALL] = "alltoall", + [ALLTOALLV] = "alltoallv", + [ALLTOALLW] = "alltoallw", + [BARRIER] = "barrier", + [BCAST] = "bcast", + [EXSCAN] = "exscan", + [GATHER] = "gather", + [GATHERV] = "gatherv", + [REDUCE] = "reduce", + [REDUCESCATTER] = "reduce_scatter", + [REDUCESCATTERBLOCK] = "reduce_scatter_block", + [SCAN] = "scan", + [SCATTER] = "scatter", + [SCATTERV] = "scatterv", + [NEIGHBOR_ALLGATHER] = "neighbor_allgather", + [NEIGHBOR_ALLGATHERV] = "neighbor_allgatherv", + [NEIGHBOR_ALLTOALL] = "neighbor_alltoall", + [NEIGHBOR_ALLTOALLV] = "neighbor_alltoallv", + [NEIGHBOR_ALLTOALLW] = "neighbor_alltoallw", + [COLLCOUNT] = NULL +}; + +const char* mca_coll_base_colltype_to_str(int collid) +{ + if( (collid < 0) || (collid >= COLLCOUNT) ) { + return NULL; + } + return colltype_translation_table[collid]; +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 239322b022c..e20ed6652cc 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. 
All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -178,8 +178,17 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, ompi_datatype_t * const rtypes[]); /* File reading function */ -#define MYEOF -999 -long ompi_coll_base_file_getnext(FILE *fptr, int *fileline); +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val); +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val); +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); +/* peek at the next valid token to see if it begins with the expected value. If yes + * eat the value, otherwise put it back into the file. + */ +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); + +/* Miscellaneous functions */ +const char* mca_coll_base_colltype_to_str(int collid); +int mca_coll_base_name_to_colltype(const char* name); END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am index 55892512e3b..61b40d97c51 100644 --- a/ompi/mca/coll/han/Makefile.am +++ b/ompi/mca/coll/han/Makefile.am @@ -26,8 +26,7 @@ coll_han_trigger.c \ coll_han_dynamic.c \ coll_han_dynamic_file.c \ coll_han_topo.c \ -coll_han_subcomms.c \ -coll_han_utils.c +coll_han_subcomms.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index 1af75ffec30..a7feefe082c 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -20,9 +20,7 @@ #include "opal/util/output.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_han_trigger.h" -#include
"ompi/mca/coll/han/coll_han_dynamic.h" - -BEGIN_C_DECLS +#include "ompi/mca/coll/han/coll_han_dynamic.h" /* * Today; @@ -33,131 +31,123 @@ BEGIN_C_DECLS #define COLL_HAN_LOW_MODULES 2 #define COLL_HAN_UP_MODULES 2 -typedef struct { - uint32_t umod; - uint32_t lmod; - uint32_t fs; - uint32_t ualg; - uint32_t us; -} selection; - -struct mca_bcast_argu_s { +struct mca_coll_han_bcast_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; void *buff; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; int root_low_rank; int root_up_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; }; -typedef struct mca_bcast_argu_s mca_bcast_argu_t; +typedef struct mca_coll_han_bcast_args_s mca_coll_han_bcast_args_t; -struct mca_reduce_argu_s { +struct mca_coll_han_reduce_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; void *sbuf; void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; - struct ompi_op_t *op; int root_low_rank; int root_up_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; }; -typedef struct mca_reduce_argu_s mca_reduce_argu_t; +typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t; -struct mca_allreduce_argu_s { +struct mca_coll_han_allreduce_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; - struct ompi_op_t *op; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; 
bool noop; - ompi_request_t *req; int *completed; }; -typedef struct mca_allreduce_argu_s mca_allreduce_argu_t; +typedef struct mca_coll_han_allreduce_args_s mca_coll_han_allreduce_args_t; -struct mca_scatter_argu_s { +struct mca_coll_han_scatter_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; void *sbuf_reorder_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; - ompi_request_t *req; }; -typedef struct mca_scatter_argu_s mca_scatter_argu_t; +typedef struct mca_coll_han_scatter_args_s mca_coll_han_scatter_args_t; -struct mca_gather_argu_s { +struct mca_coll_han_gather_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; - ompi_request_t *req; }; -typedef struct mca_gather_argu_s mca_gather_argu_t; +typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t; -struct mca_allgather_argu_s { +struct mca_coll_han_allgather_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int 
root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; bool is_mapbycore; int *topo; - ompi_request_t *req; }; -typedef struct mca_allgather_argu_s mca_allgather_argu_t; +typedef struct mca_coll_han_allgather_s mca_coll_han_allgather_t; /** * Structure to hold the han coll component. First it holds the @@ -184,7 +174,7 @@ typedef struct mca_coll_han_component_t { /* up level module for reduce */ uint32_t han_reduce_up_module; /* low level module for reduce */ - uint32_t han_reduce_low_module; + uint32_t han_reduce_low_module; /* segment size for allreduce */ uint32_t han_allreduce_segsize; /* up level module for allreduce */ @@ -203,21 +193,10 @@ typedef struct mca_coll_han_component_t { uint32_t han_scatter_up_module; /* low level module for scatter */ uint32_t han_scatter_low_module; - /* whether enable auto tune */ - uint32_t han_auto_tune; /* whether we need reproducible results * (but disables topological optimisations) */ uint32_t han_reproducible; - /* create a 3D array - * num_processes (n): 2 4 8 16 32 64 (6) - * num_core (c): 2 4 8 12 (4) - * message size (m): 1 - 4194304 (23) - */ - uint32_t han_auto_tune_n; - uint32_t han_auto_tune_c; - uint32_t han_auto_tune_m; - selection *han_auto_tuned; bool use_simple_algorithm[COLLCOUNT]; /* Dynamic configuration rules */ @@ -228,7 +207,7 @@ typedef struct mca_coll_han_component_t { mca_coll_han_dynamic_rules_t dynamic_rules; /* Dynamic rules from mca parameter */ COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; - int topo_level; + TOPO_LVL_T topo_level; /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ int max_dynamic_errors; @@ -240,7 +219,7 @@ typedef void (*previous_dummy_fn_t) (void); * Structure used to store what is necessary for the collective operations * routines in case of fallback. 
*/ -typedef struct collective_fallback_t { +typedef struct mca_coll_han_collective_fallback_s { union { mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_allgatherv_fn_t allgatherv; @@ -252,7 +231,7 @@ typedef struct collective_fallback_t { previous_dummy_fn_t dummy; } previous_routine; mca_coll_base_module_t *previous_module; -} collective_fallback_t; +} mca_coll_han_collective_fallback_t; /** Coll han module */ typedef struct mca_coll_han_module_t { @@ -271,7 +250,7 @@ typedef struct mca_coll_han_module_t { bool are_ppn_imbalanced; /* To be able to fallback when the cases are not supported */ - struct collective_fallback_t previous_routines[COLLCOUNT]; + struct mca_coll_han_collective_fallback_s previous_routines[COLLCOUNT]; /* To be able to fallback on reproducible algorithm */ mca_coll_base_module_reduce_fn_t reproducible_reduce; @@ -280,7 +259,7 @@ typedef struct mca_coll_han_module_t { mca_coll_base_module_t *reproducible_allreduce_module; /* Topological level of this communicator */ - int topologic_level; + TOPO_LVL_T topologic_level; /* Collective module storage for module choice */ mca_coll_han_collective_modules_storage_t modules_storage; @@ -334,19 +313,20 @@ int han_request_free(ompi_request_t ** request); /* Subcommunicator creation */ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); -void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); +void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); /* Gather topology information */ int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int num_topo_level); /* Utils */ -void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, - int *root_up_rank); -uint32_t han_auto_tuned_get_n(uint32_t n); -uint32_t han_auto_tuned_get_c(uint32_t c); -uint32_t han_auto_tuned_get_m(uint32_t 
m); +static inline void +mca_coll_han_get_ranks(int *vranks, int root, int low_size, + int *root_low_rank, int *root_up_rank) +{ + *root_up_rank = vranks[root] / low_size; + *root_low_rank = vranks[root] % low_size; +} -const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll); const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); /** Dynamic component choice */ @@ -356,7 +336,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); */ int mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module); + mca_coll_han_module_t *han_module); int mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, @@ -382,22 +362,13 @@ mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, /* Bcast */ int mca_coll_han_bcast_intra_simple(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, - int seg_count, struct ompi_datatype_t *dtype, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop); + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_bcast_t0_task(void *task_argu); -int mca_coll_han_bcast_t1_task(void *task_argu); /* Reduce */ int @@ -422,145 +393,75 @@ mca_coll_han_reduce_reproducible(const void *sbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - - -void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, int seg_count, struct ompi_datatype_t *dtype, - 
struct ompi_op_t *op, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop); - -int mca_coll_han_reduce_intra(const void *sbuf, +int mca_coll_han_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, ompi_op_t* op, int root, - struct ompi_communicator_t *comm, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_reduce_t0_task(void *task_argu); -int mca_coll_han_reduce_t1_task(void *task_argu); - /* Allreduce */ int mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, mca_coll_base_module_t *module); int mca_coll_han_allreduce_reproducible(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); -void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, - int seg_count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, - int cur_seg, - int w_rank, - int last_seg_count, - bool noop, ompi_request_t * req, int *completed); int mca_coll_han_allreduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t 
*dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_allreduce_t0_task(void *task_argu); -int mca_coll_han_allreduce_t1_task(void *task_argu); -int mca_coll_han_allreduce_t2_task(void *task_argu); -int mca_coll_han_allreduce_t3_task(void *task_argu); /* Scatter */ int mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_scatter_us_task(void *task_argu); -int mca_coll_han_scatter_ls_task(void *task_argu); -void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - void *sbuf_reorder_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req); - -/* Gather */ -int -mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_gather_lg_task(void *task_argu); -int mca_coll_han_gather_ug_task(void *task_argu); -void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req); + +/* Gather */ +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + 
void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); int mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); /* reordering after gather, for unordered ranks */ void ompi_coll_han_reorder_gather(const void *sbuf, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - int * topo); + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo); @@ -571,30 +472,12 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_allgather_lg_task(void *task_argu); -int mca_coll_han_allgather_uag_task(void *task_argu); -int mca_coll_han_allgather_lb_task(void *task_argu); -void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, - bool noop, bool is_mapbycore, int *topo, ompi_request_t * req); int mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t 
*rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); -END_C_DECLS #endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c index 50702d28ff9..d8d8cd5b55f 100644 --- a/ompi/mca/coll/han/coll_han_allgather.c +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -16,40 +16,45 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, - bool noop, - bool is_mapbycore, - int *topo, - ompi_request_t * req) +static int mca_coll_han_allgather_lb_task(void *task_args); +static int mca_coll_han_allgather_lg_task(void *task_args); +static int mca_coll_han_allgather_uag_task(void *task_args); + +static inline void +mca_coll_han_set_allgather_args(mca_coll_han_allgather_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, + bool noop, + bool is_mapbycore, + int *topo, + ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->is_mapbycore = is_mapbycore; - argu->topo = topo; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = sbuf; + 
args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->topo = topo; + args->req = req; } int @@ -60,44 +65,46 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int w_rank; - w_rank = ompi_comm_rank(comm); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; mca_coll_han_comm_create_new(comm, han_module); ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int low_rank = ompi_comm_rank(low_comm); + int w_rank = ompi_comm_rank(comm); + + /* Init topo */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced){ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather with this communicator. 
It need to fall back on another component\n")); + return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, + comm, han_module->previous_allgather_module); + } ompi_request_t *temp_request = NULL; /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; - - /* Init topo */ - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; int root_low_rank = 0; /* Create lg (lower level gather) task */ mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); /* Setup lg task arguments */ - mca_allgather_argu_t *lg_argu = malloc(sizeof(mca_allgather_argu_t)); - mac_coll_han_set_allgather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, + mca_coll_han_allgather_t *lg_args = malloc(sizeof(mca_coll_han_allgather_t)); + mca_coll_han_set_allgather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, rdtype, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, topo, temp_request); - /* Init lg task */ - init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_argu)); - /* Issure lg task */ + /* Init and issue lg task */ + init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_args)); issue_task(lg); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); @@ -105,48 +112,70 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, return OMPI_SUCCESS; } -/* lg: lower level (shared memory) gather task */ -int mca_coll_han_allgather_lg_task(void *task_argu) +/* lg: 
lower level gather task */ +int mca_coll_han_allgather_lg_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; + char *tmp_buf = NULL, *tmp_rbuf = NULL; + char *tmp_send = NULL; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", t->w_rank)); - OBJ_RELEASE(t->cur_task); /* If the process is one of the node leader */ - char *tmp_buf = NULL; - char *tmp_rbuf = NULL; + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (t->rdtype, &rlb, &rext); + if (MPI_IN_PLACE == t->sbuf) { + t->sdtype = t->rdtype; + t->scount = t->rcount; + } if (!t->noop) { int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; + if (MPI_IN_PLACE == t->sbuf) { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + ompi_datatype_copy_content_same_ddt(t->rdtype, t->rcount, tmp_rbuf, tmp_send); + } + } + /* Lower level (shared memory or intra-node) gather */ + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_gather(MPI_IN_PLACE, t->scount, t->sdtype, + tmp_rbuf, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + t->low_comm->c_coll->coll_gather(tmp_send, t->rcount, t->rdtype, + NULL, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } } - /* Shared memory gather */ - t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, - t->rdtype, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_gather_module); + else { + t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, + 
t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_gather_module); + } + t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; /* Create uag (upper level all-gather) task */ - mca_coll_task_t *uag = OBJ_NEW(mca_coll_task_t); - /* Setup uag task arguments */ - t->cur_task = uag; - /* Init uag task */ + mca_coll_task_t *uag = t->cur_task; + /* Init and issue uag task */ init_task(uag, mca_coll_han_allgather_uag_task, (void *) t); - /* Issure uag task */ issue_task(uag); return OMPI_SUCCESS; } /* uag: upper level (inter-node) all-gather task */ -int mca_coll_han_allgather_uag_task(void *task_argu) +int mca_coll_han_allgather_uag_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; - OBJ_RELEASE(t->cur_task); + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, @@ -213,21 +242,18 @@ int mca_coll_han_allgather_uag_task(void *task_argu) /* Create lb (low level broadcast) task */ - mca_coll_task_t *lb = OBJ_NEW(mca_coll_task_t); - /* Setup lb task arguments */ - t->cur_task = lb; - /* Init lb task */ + mca_coll_task_t *lb = t->cur_task; + /* Init and issue lb task */ init_task(lb, mca_coll_han_allgather_lb_task, (void *) t); - /* Issure lb task */ issue_task(lb); return OMPI_SUCCESS; } -/* lb: low level (shared-memory) broadcast task */ -int mca_coll_han_allgather_lb_task(void *task_argu) +/* lb: low level broadcast task */ +int mca_coll_han_allgather_lb_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n", t->w_rank)); OBJ_RELEASE(t->cur_task); @@ -246,11 +272,11 @@ int mca_coll_han_allgather_lb_task(void *task_argu) int mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, 
int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module){ + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module){ /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; @@ -264,12 +290,13 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, /* unbalanced case needs algo adaptation */ if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allgather with this communicator. It need to fall back on another component\n")); + "han cannot handle allgather with this communicator. It need to fall back on another component\n")); return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, - comm, han_module->previous_allgather_module); + rcount, rdtype, + comm, han_module->previous_allgather_module); } + int w_rank = ompi_comm_rank(comm); /* setup up/low coordinates */ int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); @@ -279,27 +306,55 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, /* allocate the intermediary buffer * to gather on leaders on the low sub communicator */ + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (rdtype, &rlb, &rext); char *tmp_buf = NULL; char *tmp_buf_start = NULL; + char *tmp_send = NULL; + if (MPI_IN_PLACE == sbuf) { + scount = rcount; + sdtype = rdtype; + } if (low_rank == root_low_rank) { ptrdiff_t rsize, rgap = 0; /* Compute the size to receive all the local data, including datatypes empty gaps */ rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); - // intermediary buffer on node leaders to gather on low comm + /* intermediary buffer on node leaders to gather on low comm */ tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; + if 
(MPI_IN_PLACE == sbuf) { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send); + } } + /* 1. low gather on node leaders into tmp_buf */ - low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, - tmp_buf_start, rcount, rdtype, root_low_rank, - low_comm, low_comm->c_coll->coll_gather_module); + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + low_comm->c_coll->coll_gather(MPI_IN_PLACE, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + low_comm->c_coll->coll_gather(tmp_send, rcount, rdtype, + NULL, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + } + else { + low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } /* 2. allgather between node leaders, from tmp_buf to reorder_buf */ if (low_rank == root_low_rank) { /* allocate buffer to store unordered result on node leaders - * * if the processes are mapped-by core, no need to reorder: - * * distribution of ranks on core first and node next, - * * in a increasing order for both patterns */ + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns. 
+ */ char *reorder_buf = NULL; char *reorder_buf_start = NULL; if (han_module->is_mapbycore) { @@ -307,7 +362,7 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, } else { if (0 == low_rank && 0 == up_rank) { // first rank displays message OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Allgather needs reordering: ", w_rank)); + "[%d]: Future Allgather needs reordering: ", up_rank)); } ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); @@ -332,8 +387,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, */ if (!han_module->is_mapbycore) { ompi_coll_han_reorder_gather(reorder_buf_start, - rbuf, rcount, rdtype, - comm, topo); + rbuf, rcount, rdtype, + comm, topo); free(reorder_buf); reorder_buf = NULL; } @@ -347,4 +402,4 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, return OMPI_SUCCESS; - } +} diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c index 6a4fd6038f7..00b50d9e714 100644 --- a/ompi/mca/coll/han/coll_han_allreduce.c +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -17,46 +17,52 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_allreduce_t0_task(void *task_args); +static int mca_coll_han_allreduce_t1_task(void *task_args); +static int mca_coll_han_allreduce_t2_task(void *task_args); +static int mca_coll_han_allreduce_t3_task(void *task_args); + /* Only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, - int seg_count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, - int cur_seg, - int w_rank, - int last_seg_count, - bool noop, ompi_request_t 
* req, int *completed) +static inline void +mca_coll_han_set_allreduce_args(mca_coll_han_allreduce_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, + int seg_count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, + int cur_seg, + int w_rank, + int last_seg_count, + bool noop, ompi_request_t * req, int *completed) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->rbuf = rbuf; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->op = op; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; - argu->req = req; - argu->completed = completed; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->req = req; + args->completed = completed; } -/* - * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: +/* + * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: * lr: lower level (shared-memory or intra-node) reduce, * ur: upper level (inter-node) reduce, * ub: upper level (inter-node) bcast, @@ -84,68 +90,31 @@ mca_coll_han_allreduce_intra(const void *sbuf, mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; if (! 
ompi_op_is_commute(op)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allreduce with this communicator." - "It need to fall back on another component\n")); + "han cannot handle allreduce with this operation." + "Fall back on another component\n")); return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, han_module->previous_allreduce_module); } - + ptrdiff_t extent, lb; ompi_datatype_get_extent(dtype, &lb, &extent); int w_rank; w_rank = ompi_comm_rank(comm); int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); + size_t dtype_size; + ompi_datatype_type_size(dtype, &dtype_size); /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; - /* Auto tune is enabled */ - if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { - uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); - uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); - uint32_t m = han_auto_tuned_get_m(typelng * count); - uint32_t id = - n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + - c * mca_coll_han_component.han_auto_tune_m + m + - mca_coll_han_component.han_auto_tune_n * mca_coll_han_component.han_auto_tune_c * - mca_coll_han_component.han_auto_tune_m; - uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; - uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; - uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; - /* ualg and us are only available when using ADAPT */ - /* - uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; - uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; - */ - /* Set up umod */ - up_comm = han_module->cached_up_comms[umod]; - /* Set up lmod */ - low_comm = han_module->cached_low_comms[lmod]; - /* Set up fs */ - 
COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); - /* Set up ualg and us, which is only available when using ADAPT */ - /* - if (umod == 1) { - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - } - */ - } else { - low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; - up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, typelng, - seg_count); - } + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, dtype_size, + seg_count); /* Determine number of elements sent per task. 
*/ OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, @@ -161,8 +130,8 @@ mca_coll_han_allreduce_intra(const void *sbuf, /* Setup up t0 task arguments */ int *completed = (int *) malloc(sizeof(int)); completed[0] = 0; - mca_allreduce_argu_t *t = malloc(sizeof(mca_allreduce_argu_t)); - mac_coll_han_set_allreduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, + mca_coll_han_allreduce_args_t *t = malloc(sizeof(mca_coll_han_allreduce_args_t)); + mca_coll_han_set_allreduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank, NULL, completed); @@ -208,35 +177,47 @@ mca_coll_han_allreduce_intra(const void *sbuf, init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); issue_task(t3); } - if (t->completed != NULL) { - free(t->completed); - t->completed = NULL; - } + free(t->completed); + t->completed = NULL; free(t); return OMPI_SUCCESS; } /* t0 task */ -int mca_coll_han_allreduce_t0_task(void *task_argu) +int mca_coll_han_allreduce_t0_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); OBJ_RELEASE(t->cur_task); ptrdiff_t extent, lb; ompi_datatype_get_extent(t->dtype, &lb, &extent); - t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, - t->op, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_reduce_module); + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + else { + t->low_comm->c_coll->coll_reduce((char *) 
t->rbuf, NULL, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } return OMPI_SUCCESS; } /* t1 task */ -int mca_coll_han_allreduce_t1_task(void *task_argu) +int mca_coll_han_allreduce_t1_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -270,16 +251,16 @@ int mca_coll_han_allreduce_t1_task(void *task_argu) } if (!t->noop) { - ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; } /* t2 task */ -int mca_coll_han_allreduce_t2_task(void *task_argu) +int mca_coll_han_allreduce_t2_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -336,9 +317,9 @@ int mca_coll_han_allreduce_t2_task(void *task_argu) } /* t3 task */ -int mca_coll_han_allreduce_t3_task(void *task_argu) +int mca_coll_han_allreduce_t3_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -441,9 +422,23 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, low_rank = 
ompi_comm_rank(low_comm); /* Low_comm reduce */ - ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + ret = low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + else { + ret = low_comm->c_coll->coll_reduce((char *)rbuf, NULL, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + } + else { + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, count, dtype, op, root_low_rank, low_comm, low_comm->c_coll->coll_reduce_module); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((30, cs->han_output, "HAN/ALLREDUCE: low comm reduce failed. " @@ -512,7 +507,7 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:allreduce_reproducible: " "fallback on %s\n", - components_name[fallback]); + available_components[fallback].component_name); } han_module->reproducible_allreduce_module = fallback_module; han_module->reproducible_allreduce = fallback_module->coll_allreduce; diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c index 6eebc3b7d38..0251ba16192 100644 --- a/ompi/mca/coll/han/coll_han_bcast.c +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -16,31 +16,35 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, - int seg_count, struct ompi_datatype_t *dtype, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) +static int mca_coll_han_bcast_t0_task(void *task_args); +static int mca_coll_han_bcast_t1_task(void *task_args); + +static inline void 
+mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t * cur_task, void *buff, + int seg_count, struct ompi_datatype_t *dtype, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) { - argu->cur_task = cur_task; - argu->buff = buff; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->root_low_rank = root_low_rank; - argu->root_up_rank = root_up_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; + args->cur_task = cur_task; + args->buff = buff; + args->seg_count = seg_count; + args->dtype = dtype; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; } -/* - * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: * ub: upper level (inter-node) bcast * lb: low level (shared-memory or intra-node) bcast. * Hence, in each iteration, there is a combination of collective operations which is called a task. 
@@ -58,13 +62,10 @@ mca_coll_han_bcast_intra(void *buff, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int seg_count = count, w_rank = ompi_comm_rank(comm); + ptrdiff_t extent, lb; + size_t dtype_size; /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ @@ -72,68 +73,34 @@ mca_coll_han_bcast_intra(void *buff, if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); + "han cannot handle bcast with this communicator. It need to fall back on another component\n")); return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); + comm, han_module->previous_bcast_module); } - ompi_datatype_type_size(dtype, &typelng); + ompi_datatype_get_extent(dtype, &lb, &extent); + ompi_datatype_type_size(dtype, &dtype_size); /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm; - ompi_communicator_t *up_comm; - /* Auto tune is enabled */ - if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { - uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); - uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); - uint32_t m = han_auto_tuned_get_m(typelng * count); - uint32_t id = - n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + - c * mca_coll_han_component.han_auto_tune_m + m; - uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; - uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; 
- uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; - /* ualg and us are only available when using ADAPT */ - /* - uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; - uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; - */ - /* Set up umod */ - up_comm = han_module->cached_up_comms[umod]; - /* Set up lmod */ - low_comm = han_module->cached_low_comms[lmod]; - /* Set up fs */ - COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); - /* Set up ualg and us, which is only available when using ADAPT */ - /* - if (umod == 1) { - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - } - */ + ompi_communicator_t *low_comm, *up_comm; - } else { - /* If auto tune is disabled, use MCA parameters */ - low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; - up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, typelng, - seg_count); - } + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, dtype_size, + seg_count); int num_segments = (count + seg_count - 1) / seg_count; OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, - "In HAN seg_count %d count %d num_seg %d\n", + "In HAN seg_count %d count %d num_seg %d\n", seg_count, count, num_segments)); int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); - int root_low_rank; - int root_up_rank; + int root_low_rank, root_up_rank; mca_coll_han_get_ranks(vranks, root, low_size, 
&root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, @@ -142,8 +109,8 @@ mca_coll_han_bcast_intra(void *buff, /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ - mca_bcast_argu_t *t = malloc(sizeof(mca_bcast_argu_t)); - mac_coll_han_set_bcast_argu(t, t0, (char *) buff, seg_count, dtype, + mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t)); + mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank); @@ -161,9 +128,7 @@ mca_coll_han_bcast_intra(void *buff, while (t->cur_seg <= t->num_segments - 2) { /* Create t1 task */ - mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); - /* Setup up t1 task arguments */ - t->cur_task = t1; + t->cur_task = t1 = OBJ_NEW(mca_coll_task_t); t->buff = (char *) t->buff + extent * seg_count; t->cur_seg = t->cur_seg + 1; /* Init the t1 task */ @@ -177,43 +142,40 @@ mca_coll_han_bcast_intra(void *buff, } /* t0 task: issue and wait for the upper level ibcast of segment 0 */ -int mca_coll_han_bcast_t0_task(void *task_argu) +int mca_coll_han_bcast_t0_task(void *task_args) { - mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); if (t->noop) { return OMPI_SUCCESS; - } else { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(t->dtype, &lb, &extent); - ompi_request_t *ibcast_req; - t->up_comm->c_coll->coll_ibcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, - t->up_comm, &ibcast_req, t->up_comm->c_coll->coll_ibcast_module); - ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); - 
return OMPI_SUCCESS; } + t->up_comm->c_coll->coll_bcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, t->up_comm->c_coll->coll_bcast_module); + return OMPI_SUCCESS; } -/* t1 task: +/* t1 task: * 1. issue the upper level ibcast of segment cur_seg + 1 * 2. issue the low level bcast of segment cur_seg * 3. wait for the completion of the ibcast */ -int mca_coll_han_bcast_t1_task(void *task_argu) +int mca_coll_han_bcast_t1_task(void *task_args) { - mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + ompi_request_t *ibcast_req = NULL; + ptrdiff_t extent, lb; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); - ptrdiff_t extent, lb; ompi_datatype_get_extent(t->dtype, &lb, &extent); - ompi_request_t *ibcast_req = NULL; - int tmp_count = t->seg_count; if (!t->noop) { if (t->cur_seg <= t->num_segments - 2 ) { - if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + int tmp_count = t->seg_count; + if (t->cur_seg == t->num_segments - 2) { tmp_count = t->last_seg_count; } t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count, @@ -227,8 +189,8 @@ int mca_coll_han_bcast_t1_task(void *task_argu) t->seg_count, t->dtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_bcast_module); - if (!t->noop && ibcast_req != NULL) { - ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); + if (NULL != ibcast_req) { + ompi_request_wait(&ibcast_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; @@ -242,20 +204,17 @@ mca_coll_han_bcast_intra_simple(void *buff, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int w_rank; - w_rank = ompi_comm_rank(comm); - /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; mca_coll_han_comm_create_new(comm, han_module); ompi_communicator_t *low_comm = 
han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + int w_rank = ompi_comm_rank(comm); int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); - int root_low_rank; - int root_up_rank; + int root_low_rank, root_up_rank; /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ @@ -263,30 +222,33 @@ mca_coll_han_bcast_intra_simple(void *buff, if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); + "han cannot handle bcast with this communicator. It need to fall back on another component\n")); return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); + comm, han_module->previous_bcast_module); } else { OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, - "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); + "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); } mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: root_low_rank %d root_up_rank %d\n", + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, root_up_rank)); if (low_rank == root_low_rank) { - up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm, up_comm->c_coll->coll_bcast_module); + up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, + up_comm, up_comm->c_coll->coll_bcast_module); /* To remove when han has better sub-module selection. For now switching to ibcast enables to make runs with libnbc. 
*/ //ompi_request_t req; - //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module); + //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, + // up_comm, &req, up_comm->c_coll->coll_ibcast_module); //ompi_request_wait(&req, MPI_STATUS_IGNORE); } - low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, + low_comm, low_comm->c_coll->coll_bcast_module); return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index cfb40c7da02..1a6912cc0ea 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -25,13 +25,24 @@ #include "coll_han.h" #include "coll_han_dynamic.h" #include "coll_han_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_util.h" /* * Public string showing the coll ompi_han component version number */ const char *mca_coll_han_component_version_string = - "Open MPI han collective MCA component version " OMPI_VERSION; - + "Open MPI HAN collective MCA component version " OMPI_VERSION; + +ompi_coll_han_components available_components[COMPONENTS_COUNT] = { + { SELF, "self", NULL }, + { BASIC, "basic", NULL }, + { LIBNBC, "libnbc", NULL }, + { TUNED, "tuned", NULL }, + { SM, "sm", NULL }, + { SHARED, "shared", NULL }, + { ADAPT, "adapt", NULL }, + { HAN, "han", NULL } +}; /* * Local functions @@ -46,35 +57,33 @@ static int han_register(void); */ mca_coll_han_component_t mca_coll_han_component = { - /* First, fill in the super */ - { - /* First, the mca_component_t struct containing meta - information about the component itself */ + /* First, the mca_component_t struct containing meta + information about the component itself */ - .collm_version = { - MCA_COLL_BASE_VERSION_2_0_0, + .collm_version = { + MCA_COLL_BASE_VERSION_2_0_0, - /* Component name and version */ - 
.mca_component_name = "han", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), + /* Component name and version */ + .mca_component_name = "han", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), - /* Component functions */ - .mca_open_component = han_open, - .mca_close_component = han_close, - .mca_register_component_params = han_register, - }, - .collm_data = { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE}, + /* Component functions */ + .mca_open_component = han_open, + .mca_close_component = han_close, + .mca_register_component_params = han_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE}, - /* Initialization / querying functions */ + /* Initialization / querying functions */ - .collm_init_query = mca_coll_han_init_query, - .collm_comm_query = mca_coll_han_comm_query, - }, + .collm_init_query = mca_coll_han_init_query, + .collm_comm_query = mca_coll_han_comm_query, + }, /* han-component specifc information */ @@ -87,27 +96,10 @@ mca_coll_han_component_t mca_coll_han_component = { */ static int han_open(void) { - int param; - mca_coll_han_component_t *cs = &mca_coll_han_component; - if (cs->han_auto_tune) { - cs->han_auto_tuned = - (selection *) malloc(2 * cs->han_auto_tune_n * cs->han_auto_tune_c * - cs->han_auto_tune_m * sizeof(selection)); - char *filename = "/home/dycz0fx/results/auto/auto_tuned.bin"; - FILE *file = fopen(filename, "r"); - fread(cs->han_auto_tuned, sizeof(selection), - 2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file); - fclose(file); - } + /* Get the global coll verbosity: it will be ours */ + mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; - /* - * Get the global coll verbosity: it will be ours - */ - cs->han_output = ompi_coll_base_framework.framework_output; - opal_output_verbose(1, 
cs->han_output, - "coll:han:component_open: done!"); - - cs->topo_level = GLOBAL_COMMUNICATOR; + mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; return mca_coll_han_init_dynamic_rules(); } @@ -117,11 +109,6 @@ static int han_open(void) */ static int han_close(void) { - mca_coll_han_component_t *cs = &mca_coll_han_component; - if (cs->han_auto_tune && cs->han_auto_tuned != NULL) { - free(cs->han_auto_tuned); - cs->han_auto_tuned = NULL; - } mca_coll_han_free_dynamic_rules(); return OMPI_SUCCESS; } @@ -154,57 +141,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) return "invalid topologic level"; } } -const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll) -{ - switch(coll) { - case ALLGATHER: - return "allgather"; - case ALLGATHERV: - return "allgatherv"; - case ALLREDUCE: - return "allreduce"; - case ALLTOALL: - return "alltoall"; - case ALLTOALLV: - return "alltoallv"; - case ALLTOALLW: - return "alltoallw"; - case BARRIER: - return "barrier"; - case BCAST: - return "bcast"; - case EXSCAN: - return "exscan"; - case GATHER: - return "gather"; - case GATHERV: - return "gatherv"; - case REDUCE: - return "reduce"; - case REDUCESCATTER: - return "reduce_scatter"; - case REDUCESCATTERBLOCK: - return "reduce_scatter_block"; - case SCAN: - return "scan"; - case SCATTER: - return "scatter"; - case SCATTERV: - return "scatterv"; - case NEIGHBOR_ALLGATHER: - return "neighbor_allgather"; - case NEIGHBOR_ALLGATHERV: - return "neighbor_allgatherv"; - case NEIGHBOR_ALLTOALL: - return "neighbor_alltoall"; - case NEIGHBOR_ALLTOALLV: - return "neighbor_alltoallv"; - case NEIGHBOR_ALLTOALLW: - return "neighbor_alltoallw"; - default: - return ""; - } -} + /* * Register MCA params @@ -215,15 +152,14 @@ static int han_register(void) mca_coll_han_component_t *cs = &mca_coll_han_component; /* Generated parameters name and description */ - char param_name[100] = ""; - char param_desc[300] = ""; + char param_name[128], param_desc[256]; int param_desc_size; 
COLLTYPE_T coll; TOPO_LVL_T topo_lvl; COMPONENT_T component; cs->han_priority = 0; - (void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", + (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); @@ -261,16 +197,14 @@ static int han_register(void) "up level module for allreduce, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reduce_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_up_module); cs->han_reduce_low_module = 0; (void) mca_base_component_var_register(c, "reduce_low_module", "low level module for allreduce, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reduce_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module); cs->han_allreduce_segsize = 524288; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", @@ -283,32 +217,28 @@ static int han_register(void) "up level module for allreduce, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allreduce_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_up_module); cs->han_allreduce_low_module = 0; (void) mca_base_component_var_register(c, "allreduce_low_module", "low level module for allreduce, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allreduce_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module); cs->han_allgather_up_module = 0; (void) mca_base_component_var_register(c, "allgather_up_module", "up level module for allgather, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allgather_up_module); + 
MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_up_module); cs->han_allgather_low_module = 0; (void) mca_base_component_var_register(c, "allgather_low_module", "low level module for allgather, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allgather_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module); cs->han_gather_up_module = 0; (void) mca_base_component_var_register(c, "gather_up_module", @@ -336,15 +266,7 @@ static int han_register(void) "low level module for scatter, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_scatter_low_module); - - cs->han_auto_tune = 0; - (void) mca_base_component_var_register(c, "auto_tune", - "whether enable auto tune, 0 disable, 1 enable, default 0", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module); cs->han_reproducible = 0; (void) mca_base_component_var_register(c, "reproducible", @@ -353,17 +275,15 @@ static int han_register(void) "0 disable 1 enable, default 0", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reproducible); - + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); /* Simple algorithms MCA parameters */ for(coll = 0 ; coll < COLLCOUNT ; coll++) { cs->use_simple_algorithm[coll] = false; if(is_simple_implemented(coll)) { - snprintf(param_name, 100, "use_simple_%s", - mca_coll_han_colltype_to_str(coll)); - snprintf(param_desc, 300, "whether to enable simple algo for %s", - mca_coll_han_colltype_to_str(coll)); + snprintf(param_name, sizeof(param_name), "use_simple_%s", + mca_coll_base_colltype_to_str(coll)); + snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s", + mca_coll_base_colltype_to_str(coll)); mca_base_component_var_register(c, param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, 
NULL, 0, 0, @@ -374,31 +294,28 @@ static int han_register(void) } /* Dynamic rules MCA parameters */ - /* TODO: Find a way to avoid unused entried */ memset(cs->mca_rules, 0, COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); - for(coll = 0 ; coll < COLLCOUNT ; coll++) { + for(coll = 0; coll < COLLCOUNT; coll++) { if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { continue; } /* * Default values - * Do not avoid to set correct default parameters */ cs->mca_rules[coll][INTRA_NODE] = TUNED; cs->mca_rules[coll][INTER_NODE] = BASIC; cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; - for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) { + for(topo_lvl = 0; topo_lvl < NB_TOPO_LVL; topo_lvl++) { - snprintf(param_name, 100, "%s_dynamic_%s_module", - mca_coll_han_colltype_to_str(coll), + snprintf(param_name, sizeof(param_name), "%s_dynamic_%s_module", + mca_coll_base_colltype_to_str(coll), mca_coll_han_topo_lvl_to_str(topo_lvl)); - param_desc_size = snprintf(param_desc, 300, - "Collective module to use for " - "collective %s on %s topological level: ", - mca_coll_han_colltype_to_str(coll), + param_desc_size = snprintf(param_desc, sizeof(param_desc), + "Collective module to use for %s on %s topological level: ", + mca_coll_base_colltype_to_str(coll), mca_coll_han_topo_lvl_to_str(topo_lvl)); /* * Exhaustive description: @@ -410,10 +327,10 @@ static int han_register(void) /* Han can only be used on the global communicator */ continue; } - param_desc_size += snprintf(param_desc+param_desc_size, 300, + param_desc_size += snprintf(param_desc+param_desc_size, sizeof(param_desc) - param_desc_size, "%d = %s; ", component, - components_name[component]); + available_components[component].component_name); } mca_base_component_var_register(c, param_name, param_desc, @@ -424,45 +341,11 @@ static int han_register(void) } } - /* - * TODO: remove the following lines when auto-tune is added back to the code - */ - cs->han_auto_tune = 0; - - cs->han_auto_tune_n = 5; - 
cs->han_auto_tune_c = 3; - cs->han_auto_tune_m = 21; -#if 0 - cs->han_auto_tune_n = 5; - (void) mca_base_component_var_register(c, "auto_tune_n", - "auto tune n", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_n); - - cs->han_auto_tune_c = 3; - (void) mca_base_component_var_register(c, "auto_tune_c", - "auto tune c", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_c); - - cs->han_auto_tune_m = 21; - (void) mca_base_component_var_register(c, "auto_tune_m", - "auto tune n", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_auto_tune_m); -#endif - /* Dynamic rules */ cs->use_dynamic_file_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "use_dynamic_file_rules", - "Switch used to decide if we use " - "dynamic module choice rules " - "defines by file", + "Enable the dynamic selection provided via the dynamic_rules_filename MCA", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -471,8 +354,7 @@ static int han_register(void) cs->dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dynamic_rules_filename", - "Filename of configuration file that " - "contains the dynamic module choice rules", + "Configuration file containing the dynamic selection rules", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -481,9 +363,7 @@ static int han_register(void) cs->dump_dynamic_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dump_dynamic_rules", - "Switch used to decide if we dump " - "dynamic rules provided by " - "configuration file", + "Switch used to decide if we dump dynamic rules provided by configuration file", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_6, 
MCA_BASE_VAR_SCOPE_READONLY, @@ -492,11 +372,8 @@ static int han_register(void) if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) && !cs->use_dynamic_file_rules) { opal_output_verbose(0, cs->han_output, - "coll:han:han_register " - "you asked for dynamic rules " - "but they are not activated. " - "Check coll_han_use_dynamic_file_rules " - "MCA parameter"); + "HAN: dynamic rules for collectives are not activated. " + "Check coll_han_use_dynamic_file_rules MCA parameter"); } cs->max_dynamic_errors = 10; diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c index 2cda40e34bf..d93cf26ad76 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.c +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -22,31 +22,29 @@ */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id) { - switch (coll_id){ - case ALLGATHER: - case ALLGATHERV: - case ALLREDUCE: - case BCAST: - case GATHER: - case REDUCE: - case SCATTER: - return true; - default: - return false; + switch (coll_id) { + case ALLGATHER: + case ALLGATHERV: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + case SCATTER: + return true; + default: + return false; } } -static COMPONENT_T -component_name_to_id(const char* name) +COMPONENT_T +mca_coll_han_component_name_to_id(const char* name) { - int i; - if(NULL == name) { return -1; } - for(i=SELF ; itopologic_level; mca_coll_base_module_t *han_base_module = (mca_coll_base_module_t *) han_module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + int nb_modules = 0; + mca_coll_base_avail_coll_t *item; + /* If the modules are get yet, return success */ if(han_module->storage_initialized) { return OMPI_SUCCESS; @@ -76,7 +75,7 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, mca_coll_base_avail_coll_t) { mca_coll_base_module_t *module = item->ac_module; const char *name = item->ac_component_name; - int id = component_name_to_id(name); + int id = mca_coll_han_component_name_to_id(name); if(id 
>= 0 && NULL != module && module != han_base_module) { /* @@ -85,16 +84,10 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, */ han_module->modules_storage.modules[id].module_handler = module; opal_output_verbose(80, mca_coll_han_component.han_output, - "coll:han:get_all_coll_modules " - "Han found module %s with id %d " - "for topological level %d (%s) " - "for communicator (%d/%s)\n", - name, - id, - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + "coll:han:get_all_coll_modules HAN found module %s with id %d " + "for topological level %d (%s) for communicator (%d/%s)\n", + name, id, topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); nb_modules++; } } @@ -109,16 +102,11 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, } opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_all_coll_modules " - "Han sub-communicator modules storage " - "for topological level %d (%s) " - "gets %d modules " + "coll:han:get_all_coll_modules HAN sub-communicator modules storage " + "for topological level %d (%s) gets %d modules " "for communicator (%d/%s)\n", - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - nb_modules, - comm->c_contextid, - comm->c_name); + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + nb_modules, comm->c_contextid, comm->c_name); assert(0 != nb_modules); @@ -133,15 +121,13 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, */ static const msg_size_rule_t* get_dynamic_rule(COLLTYPE_T collective, - int msg_size, - struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) + size_t msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { /* Indexes of the rule */ - int coll_idx; - int topo_idx; - int conf_idx; - int msg_size_idx; + int coll_idx, topo_idx; + int conf_idx, msg_size_idx; /* Aliases */ const mca_coll_han_dynamic_rules_t *dynamic_rules = NULL; @@ 
-157,107 +143,78 @@ get_dynamic_rule(COLLTYPE_T collective, /* Find the collective rule */ dynamic_rules = &(mca_coll_han_component.dynamic_rules); - for(coll_idx = dynamic_rules->nb_collectives-1 ; - coll_idx >= 0 ; coll_idx--) { + for(coll_idx = dynamic_rules->nb_collectives-1; + coll_idx >= 0; coll_idx--) { if(dynamic_rules->collective_rules[coll_idx].collective_id == collective) { coll_rule = &(dynamic_rules->collective_rules[coll_idx]); break; } } - if(coll_idx < 0) { - /* - * No dynamic rules for this collective - */ + if(coll_idx < 0) { /* No dynamic rules for this collective */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched for collective %d (%s) " + "coll:han:get_dynamic_rule HAN searched for collective %d (%s) " "but did not find any rule for this collective\n", - collective, - mca_coll_han_colltype_to_str(collective)); + collective, mca_coll_base_colltype_to_str(collective)); return NULL; } /* Find the topologic level rule */ - for(topo_idx = coll_rule->nb_topologic_levels-1 ; - topo_idx >= 0 ; topo_idx--) { + for(topo_idx = coll_rule->nb_topologic_levels-1; + topo_idx >= 0; topo_idx--) { if(coll_rule->topologic_rules[topo_idx].topologic_level == topo_lvl) { topo_rule = &(coll_rule->topologic_rules[topo_idx]); break; } } - if(topo_idx < 0) { - /* - * No topologic level rules for this collective - */ + if(topo_idx < 0) { /* No topologic level rules for this collective */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched for topologic level %d (%s) rule " + "coll:han:get_dynamic_rule HAN searched for topologic level %d (%s) rule " "for collective %d (%s) but did not find any rule\n", - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - collective, - mca_coll_han_colltype_to_str(collective)); + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + collective, mca_coll_base_colltype_to_str(collective)); return NULL; } /* Find the configuration 
rule */ - for(conf_idx = topo_rule->nb_rules-1 ; - conf_idx >= 0 ; conf_idx--) { + for(conf_idx = topo_rule->nb_rules-1; + conf_idx >= 0; conf_idx--) { if(topo_rule->configuration_rules[conf_idx].configuration_size <= comm_size) { conf_rule = &(topo_rule->configuration_rules[conf_idx]); break; } } if(conf_idx < 0) { - /* - * No corresponding configuration - * Should not happen with a correct file - */ - + /* No corresponding configuration. Should not happen with a correct file */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " "but did not manage to find anything. " "This is the result of an invalid configuration file: " "the first configuration size of each collective must be 1\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size); + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), comm_size); return NULL; } /* Find the message size rule */ - for(msg_size_idx = conf_rule->nb_msg_size-1 ; - msg_size_idx >= 0 ; msg_size_idx--) { + for(msg_size_idx = conf_rule->nb_msg_size-1; + msg_size_idx >= 0; msg_size_idx--) { if(conf_rule->msg_size_rules[msg_size_idx].msg_size <= msg_size) { msg_size_rule = &(conf_rule->msg_size_rules[msg_size_idx]); break; } } if(msg_size_idx < 0) { - /* - * No corresponding message size - * Should not happen with a correct file - */ + /* No corresponding message size. 
Should not happen with a correct file */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message " - "but did not manage to find anything. " + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message but did not manage to find anything. " "This is the result of an invalid configuration file: " "the first message size of each configuration must be 0\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size, - msg_size); + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size); return NULL; } @@ -268,29 +225,19 @@ get_dynamic_rule(COLLTYPE_T collective, * Module correctness is checked outside */ opal_output_verbose(80, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message. " - "Found a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message : component %d (%s)\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size, - msg_size, - msg_size_rule->collective_id, - mca_coll_han_colltype_to_str(msg_size_rule->collective_id), + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message. 
Found a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message : component %d (%s)\n", + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size, msg_size_rule->collective_id, + mca_coll_base_colltype_to_str(msg_size_rule->collective_id), msg_size_rule->topologic_level, mca_coll_han_topo_lvl_to_str(msg_size_rule->topologic_level), msg_size_rule->configuration_size, - msg_size_rule->msg_size, - component, - components_name[component]); + msg_size_rule->msg_size, component, available_components[component].component_name); return msg_size_rule; } @@ -300,14 +247,13 @@ get_dynamic_rule(COLLTYPE_T collective, * for a msg_size sized message on the comm communicator * following the dynamic rules */ -mca_coll_base_module_t * +static mca_coll_base_module_t* get_module(COLLTYPE_T coll_id, - int msg_size, + size_t msg_size, struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module) { const msg_size_rule_t *dynamic_rule; - mca_coll_base_module_t *sub_module = NULL; TOPO_LVL_T topo_lvl; COMPONENT_T mca_rule_component; @@ -323,37 +269,26 @@ get_module(COLLTYPE_T coll_id, han_module); if(NULL != dynamic_rule) { /* Use dynamic rule from file */ - sub_module = han_module->modules_storage - .modules[dynamic_rule->component] - .module_handler; - } else { + return han_module->modules_storage.modules[dynamic_rule->component].module_handler; + } + /* + * No dynamic rule from file + * Use rule from mca parameter + */ + if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { /* - * No dynamic rule from file - * Use rule from mca parameter + * Invalid MCA parameter value + * Warn the user and return NULL */ - if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { - /* - * Invalid MCA parameter value - * Warn the user and return NULL - */ - opal_output_verbose(0, mca_coll_han_component.han_output, - 
"coll:han:get_module " - "Invalid MCA parameter value %d " - "for collective %d (%s) " - "on topologic level %d (%s)\n", - mca_rule_component, - coll_id, - mca_coll_han_colltype_to_str(coll_id), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl)); - return NULL; - } - sub_module = han_module->modules_storage - .modules[mca_rule_component] - .module_handler; + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:get_module Invalid MCA parameter value %d " + "for collective %d (%s) on topologic level %d (%s)\n", + mca_rule_component, coll_id, + mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl)); + return NULL; } - - return sub_module; + return han_module->modules_storage.modules[mca_rule_component].module_handler; } @@ -365,38 +300,35 @@ get_module(COLLTYPE_T coll_id, */ int mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ - ompi_datatype_type_size(sdtype, &dtype_size); - msg_size = dtype_size * scount; - + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } sub_module = 
get_module(ALLGATHER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -408,26 +340,17 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgather_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - ALLGATHER, - mca_coll_han_colltype_to_str(ALLGATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHER: No module found for the sub-" - "communicator. " + "HAN/ALLGATHER: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - han_module - ->previous_allgather_module); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; } else if (NULL == sub_module->coll_allgather) { /* * No valid collective from dynamic rules @@ -435,62 +358,43 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, */ han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_allgather_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "coll:han:mca_coll_han_allgather_intra_dynamic HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - ALLGATHER, - mca_coll_han_colltype_to_str(ALLGATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHER: the module found for the sub-" - "communicator cannot handle the ALLGATHER operation. " - "Falling back to another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - han_module - ->previous_allgather_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + "HAN/ALLGATHER: the module found for the sub-communicator" + " cannot handle the ALLGATHER operation. 
Falling back to another component\n")); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_allgather is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_allgather_fn_t allgather; if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { allgather = mca_coll_han_allgather_intra_simple; } else { allgather = mca_coll_han_allgather_intra; } - - return allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - sub_module); + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgather = sub_module->coll_allgather; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allgather is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - sub_module); + return allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); } @@ -503,30 +407,25 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, const int *rcounts, - const int *displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *displs, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size, msg_size; - int rank; - int 
verbosity; - int comm_size; - int i; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgatherv_fn_t allgatherv; + int rank, verbosity = 0, comm_size, i; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size, msg_size = 0; /* Compute configuration information for dynamic rules */ comm_size = ompi_comm_size(comm); ompi_datatype_type_size(rdtype, &dtype_size); - msg_size = 0; - for(i = 0 ; i < comm_size ; i++) { + for(i = 0; i < comm_size; i++) { if(dtype_size * rcounts[i] > msg_size) { msg_size = dtype_size * rcounts[i]; } @@ -539,11 +438,7 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -555,26 +450,17 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHERV: No module found for the sub-" - "communicator. " + "HAN/ALLGATHERV: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; } else if (NULL == sub_module->coll_allgatherv) { /* * No valid collective from dynamic rules @@ -583,31 +469,24 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. 
" "Please check dynamic file/mca parameters\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/ALLGATHERV: the module found for the sub-" "communicator cannot handle the ALLGATHERV operation. " "Falling back to another component\n")); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + rbuf, rcounts, displs, + rdtype, comm, + han_module->previous_allgatherv_module); + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid @@ -616,36 +495,28 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, */ opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han used for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " + "HAN used for collective %d (%s) with topological level %d (%s) " + "on communicator (%d/%s) but this module cannot handle " "this collective on this topologic level\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); - return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, 
mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgatherv is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgatherv = sub_module->coll_allgatherv; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allgatherv is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - sub_module); + return allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + sub_module); } @@ -657,39 +528,32 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_allreduce_intra_dynamic(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allreduce_fn_t allreduce; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(ALLREDUCE, - msg_size, + dtype_size, comm, han_module); /* First 
errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -701,25 +565,17 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allreduce_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - ALLREDUCE, - mca_coll_han_colltype_to_str(ALLREDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLREDUCE: No module found for the sub-" - "communicator. " + "HAN/ALLREDUCE: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, - op, comm, - han_module - ->previous_allreduce_module); + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; } else if (NULL == sub_module->coll_allreduce) { /* * No valid collective from dynamic rules @@ -728,60 +584,49 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allreduce_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - ALLREDUCE, - mca_coll_han_colltype_to_str(ALLREDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/ALLREDUCE: the module found for the sub-" "communicator cannot handle the ALLREDUCE operation. 
" "Falling back to another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, - op, comm, - han_module - ->previous_allreduce_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* Reproducibility: fallback on reproducible algo */ if (mca_coll_han_component.han_reproducible) { - return mca_coll_han_allreduce_reproducible(sbuf, rbuf, count, dtype, op, - comm, module); + allreduce = mca_coll_han_allreduce_reproducible; + } else { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allreduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } } + sub_module = module; + } else { /* - * No fallback mechanism activated for this configuration + * If we get here: * sub_module is valid - * sub_module->coll_allreduce is valid and point to this function - * Call han topological collective algorithm + * sub_module->coll_allreduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective */ - mca_coll_base_module_allreduce_fn_t allreduce; - if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { - allreduce = mca_coll_han_allreduce_intra_simple; - } else { - allreduce = mca_coll_han_allreduce_intra; - } - return allreduce(sbuf, rbuf, count, dtype, - op, comm, module); + allreduce = sub_module->coll_allreduce; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allreduce is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the
collective - */ - return sub_module->coll_allreduce(sbuf, rbuf, count, dtype, - op, comm, sub_module); + return allreduce(sbuf, rbuf, count, dtype, + op, comm, sub_module); } @@ -793,38 +638,31 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, */ int mca_coll_han_bcast_intra_dynamic(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_bcast_fn_t bcast; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(BCAST, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -836,23 +674,17 @@ mca_coll_han_bcast_intra_dynamic(void *buff, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_bcast_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - BCAST, - mca_coll_han_colltype_to_str(BCAST), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/BCAST: No module found for the sub-" - "communicator. " + "HAN/BCAST: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, comm, - han_module->previous_bcast_module); + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; } else if (NULL == sub_module->coll_bcast) { /* * No valid collective from dynamic rules @@ -861,61 +693,44 @@ mca_coll_han_bcast_intra_dynamic(void *buff, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_bcast_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - BCAST, - mca_coll_han_colltype_to_str(BCAST), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/BCAST: the module found for the sub-" "communicator cannot handle the BCAST operation. 
" "Falling back to another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, comm, - han_module->previous_bcast_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_bcast is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_bcast_fn_t bcast; if(mca_coll_han_component.use_simple_algorithm[BCAST]) { bcast = mca_coll_han_bcast_intra_simple; } else { bcast = mca_coll_han_bcast_intra; } - return bcast(buff, - count, - dtype, - root, - comm, - module); + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_bcast is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + bcast = sub_module->coll_bcast; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_bcast is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_bcast(buff, - count, - dtype, - root, - comm, - sub_module); + return bcast(buff, count, dtype, + root, comm, sub_module); } @@ -927,39 +742,32 @@ mca_coll_han_bcast_intra_dynamic(void *buff, */ int mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int 
rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_gather_fn_t gather; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(sdtype, &dtype_size); - msg_size = dtype_size * scount; + dtype_size = dtype_size * scount; sub_module = get_module(GATHER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -971,26 +779,17 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_gather_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - GATHER, - mca_coll_han_colltype_to_str(GATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/GATHER: No module found for the sub-" - "communicator. " + "HAN/GATHER: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_gather_module); + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; } else if (NULL == sub_module->coll_gather) { /* * No valid collective from dynamic rules @@ -999,62 +798,45 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_gather_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - GATHER, - mca_coll_han_colltype_to_str(GATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/GATHER: the module found for the sub-" "communicator cannot handle the GATHER operation. 
" "Falling back to another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_gather_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_gather is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_gather_fn_t gather; if(mca_coll_han_component.use_simple_algorithm[GATHER]) { gather = mca_coll_han_gather_intra_simple; } else { gather = mca_coll_han_gather_intra; } - - - return gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_gather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + gather = sub_module->coll_gather; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_gather is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + return gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); } @@ -1066,40 +848,33 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_reduce_intra_dynamic(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(REDUCE, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -1111,25 +886,17 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_reduce_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - REDUCE, - mca_coll_han_colltype_to_str(REDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/REDUCE: No module found for the sub-" - "communicator. " + "HAN/REDUCE: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_reduce(sbuf, rbuf, count, dtype, - op, root, comm, - han_module - ->previous_reduce_module); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; } else if (NULL == sub_module->coll_reduce) { /* * No valid collective from dynamic rules @@ -1138,60 +905,51 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_reduce_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - REDUCE, - mca_coll_han_colltype_to_str(REDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/REDUCE: the module found for the sub-" "communicator cannot handle the REDUCE operation. 
" "Falling back to another component\n")); - return han_module->previous_reduce(sbuf, rbuf, count, dtype, - op, root, comm, - han_module - ->previous_reduce_module); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; } if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* Reproducibility: fallback on reproducible algo */ if (mca_coll_han_component.han_reproducible) { - return mca_coll_han_reduce_reproducible(sbuf, rbuf, count, dtype, op, - root, comm, module); + reduce = mca_coll_han_reduce_reproducible; + } else { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_reduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } } + sub_module = module; + } else { /* - * No fallback mechanism activated for this configuration + * If we get here: * sub_module is valid - * sub_module->coll_reduce is valid and point to this function - * Call han topological collective algorithm + * sub_module->coll_reduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective */ - mca_coll_base_module_reduce_fn_t reduce; - if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { - reduce = mca_coll_han_reduce_intra_simple; - } else { - reduce = mca_coll_han_reduce_intra; - } - return reduce(sbuf, rbuf, count, dtype, - op, root, comm, module); + reduce = sub_module->coll_reduce; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_reduce is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_reduce(sbuf, rbuf, count, dtype, - op, root, comm, sub_module); + return reduce(sbuf, rbuf, count, dtype, 
+ op, root, comm, sub_module); } @@ -1203,39 +961,32 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, */ int mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_scatter_fn_t scatter; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(rdtype, &dtype_size); - msg_size = dtype_size * rcount; + dtype_size = dtype_size * rcount; sub_module = get_module(SCATTER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -1247,26 +998,17 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_scatter_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - SCATTER, - mca_coll_han_colltype_to_str(SCATTER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/SCATTER: No module found for the sub-" - "communicator. " + "HAN/SCATTER: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_scatter_module); + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; } else if (NULL == sub_module->coll_scatter) { /* * No valid collective from dynamic rules @@ -1275,38 +1017,26 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_scatter_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - SCATTER, - mca_coll_han_colltype_to_str(SCATTER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/SCATTER: the module found for the sub-" "communicator cannot handle the SCATTER operation. 
" "Falling back to another component\n")); - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_scatter_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_scatter is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_scatter_fn_t scatter; scatter = mca_coll_han_scatter_intra; /* * TODO: Uncomment when scatter simple is merged @@ -1316,10 +1046,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * scatter = mca_coll_han_scatter_intra; * } */ - return scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + } else { + scatter = sub_module->coll_scatter; } /* @@ -1329,10 +1057,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * They points to the collective to use, according to the dynamic rules * Selector's job is done, call the collective */ - return sub_module->coll_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + return scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); } - - diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h index 979b292ba0f..0ccecb63ba3 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -1,5 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* * $COPYRIGHT$ @@ -27,9 +30,9 @@ * ################################################# * * Han dynamic rules allow the user to define the collective - * module to call depending the topological configuration of the + * module to call depending on the topological configuration of the * sub-communicators and the collective parameters. This mechanism - * can also be used to fallback the main collective on another module. + * can also be used to fallback to the main collective on another module. * The interface is described in coll_han_dynamic_file.h. * * ############################# @@ -39,7 +42,7 @@ * directly accesses the module on the communicator. This information is * stored in the collective structure of the communicator during the collective * module choice at the communicator initialization. When han needs this - * information for the first time, it identifies the modles by their name and + * information for the first time, it identifies the modules by their name and * stores them in its module structure. * Then, the modules are identified by their identifier. * @@ -69,7 +72,7 @@ * adds an indirection on the collective call: dynamic choice functions. These * functions do not implement any collective. First, they try to find a dynamic * rule from file for the given collective. If there is not any rule for the - * fiven configuration, MCA parameter defined rules are used. Once the module + * given configuration, MCA parameter defined rules are used. Once the module * to use is found, the correct collective implementation is called. * * This indirection is also used on the global communicator. 
This allows han @@ -92,11 +95,9 @@ * by increasing value, some of them will not be considered */ -BEGIN_C_DECLS - /* Dynamic rules support */ typedef enum COMPONENTS { - SELF=0, + SELF = 0, BASIC, LIBNBC, TUNED, @@ -107,18 +108,17 @@ typedef enum COMPONENTS { COMPONENTS_COUNT } COMPONENT_T; -static const char *components_name[]={"self", - "basic", - "libnbc", - "tuned", - "sm", - "shared", - "adapt", - "han"}; +typedef struct { + COMPONENT_T id; + char* component_name; + mca_coll_base_component_t* component; +} ompi_coll_han_components; + +extern ompi_coll_han_components available_components[COMPONENTS_COUNT]; /* Topologic levels */ typedef enum TOPO_LVL { - INTRA_NODE=0, + INTRA_NODE = 0, INTER_NODE, /* Identifies the global communicator as a topologic level */ GLOBAL_COMMUNICATOR, @@ -135,7 +135,7 @@ typedef struct msg_size_rule_s { int configuration_size; /* Message size of the rule */ - int msg_size; + size_t msg_size; /* Component to use on this specific configuration * and message size */ @@ -209,6 +209,6 @@ typedef struct mca_coll_han_collective_modules_storage_s { /* Tests if a dynamic collective is implemented */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); +COMPONENT_T mca_coll_han_component_name_to_id(const char* name); -END_C_DECLS #endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index d163071edc2..ff12a7652d0 100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -26,11 +26,14 @@ #include "ompi/mca/coll/base/coll_base_util.h" +#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) +#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval) +#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval) + static void check_dynamic_rules(void); /* Current file line for verbose message */ static int fileline = 1; -#define 
getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) int mca_coll_han_init_dynamic_rules(void) @@ -38,31 +41,31 @@ mca_coll_han_init_dynamic_rules(void) /* File management */ const char *fname; FILE *fptr = NULL; - int nb_entries = 0; + int nb_entries = 0, rc; /* Loop counters */ int i, j, k, l; /* Collective informations */ - int nb_coll; - COLLTYPE_T coll_id; + long nb_coll, coll_id; + char * coll_name = NULL; collective_rule_t *coll_rules; /* Topo informations */ - int nb_topo; - TOPO_LVL_T topo_lvl; + long nb_topo, topo_lvl; topologic_rule_t *topo_rules; /* Configuration informations */ - int nb_rules, conf_size; + long nb_rules, conf_size; configuration_rule_t *conf_rules; /* Message size informations */ - int nb_msg_size, msg_size; + long nb_msg_size; + size_t msg_size; msg_size_rule_t *msg_size_rules; /* Component informations */ - COMPONENT_T component; + long component; /* If the dynamic rules are not used, do not even read the file */ if(!mca_coll_han_component.use_dynamic_file_rules) { @@ -70,47 +73,31 @@ mca_coll_han_init_dynamic_rules(void) return OMPI_SUCCESS; } - fname = mca_coll_han_component.dynamic_rules_filename; - - if(NULL == fname) { + if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "coll_han_use_dynamic_file_rules is true but " - "coll_han_dynamic_rules_filename is not set: " - "coll han will use dynamic rules from mca " - "parameters and their default value\n"); + "coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but " + "coll_han_dynamic_rules_filename is not Rules from MCA parameters will be used instead\n"); mca_coll_han_component.dynamic_rules.nb_collectives = 0; return OMPI_SUCCESS; } - fptr = fopen(fname, "r"); - - if(NULL == fptr) { + if( NULL == (fptr = fopen(fname, "r")) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - 
"coll:han:mca_coll_han_init_dynamic_rules " - "cannot open dynamic file provided by " - "coll_han_dynamic_rules_filename=%s " - "please provide it with full path and " - "check file permissions. Rules from " - "MCA parameters will be used instead\n", + "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by " + "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and " + "check file permissions. Rules from MCA parameters will be used instead\n", fname); mca_coll_han_component.dynamic_rules.nb_collectives = 0; return OMPI_SUCCESS; } /* The first information of the file is the collective count */ - nb_coll = getnext(fptr); - - if(nb_coll <= 0) { + if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for collective count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for collective count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_coll); + fname, fileline, nb_coll); mca_coll_han_component.dynamic_rules.nb_collectives = 0; goto file_reading_error; } @@ -126,69 +113,68 @@ mca_coll_han_init_dynamic_rules(void) } /* Iterates on collective rules */ - for(i=0 ; i= COLLCOUNT) { + if( getnext_string(fptr, &coll_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "invalid collective id %d at line %d: the collective " - "must be at least %d and less than %d\n", - coll_id, - fileline, - ALLGATHER, - COLLCOUNT); - coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d." 
+ "The rest of the input file will be ignored.\n", + fileline); goto file_reading_error; } + coll_id = mca_coll_base_name_to_colltype(coll_name); + if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) { + /* maybe the file was in the old format and we read the collective index instead of the name. */ + char* endp; + coll_id = strtol(coll_name, &endp, 10); + if( '\0' != *endp ) { /* there is garbage in the input */ + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules invalid collective %s " + "at line %d: the collective must be at least %d and less than %d. " + "The rest of the input file will be ignored.\n", + coll_name, fileline, ALLGATHER, COLLCOUNT); + goto file_reading_error; + } + coll_name = (char*)mca_coll_base_colltype_to_str(coll_id); + } if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "read collective id %d at line %d " - "but this collective is not implemented yet. " - "This is not an error but this set of rules " - "will not be used\n", - fname, - coll_id, - fileline); + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "read collective id %ld at line %d but this collective is not implemented yet. 
" + "This is not an error but this set of rules will not be used\n", + fname, coll_id, fileline); } /* * The first information of a collective rule * is the number of topologic rules */ - nb_topo = getnext(fptr); - if(nb_topo < 0) { + if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for topo level count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_topo); - coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_topo); goto file_reading_error; } /* Store the collective rule informations */ - coll_rules[i].collective_id = coll_id; coll_rules[i].nb_topologic_levels = nb_topo; + coll_rules[i].collective_id = (COLLTYPE_T)coll_id; if(0 == nb_topo) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for topo level count\n", - fname, - fileline, - nb_topo); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count\n", + fname, fileline, nb_topo); continue; } @@ -197,30 +183,21 @@ mca_coll_han_init_dynamic_rules(void) coll_rules[i].topologic_rules = topo_rules; if(NULL == topo_rules) { coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterates on topologic rules */ - for(j=0 ; j= NB_TOPO_LVL) { + if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid topo level %d is given " - "or the reader encountered an unexpected EOF. " - "Topologic level must be at least %d and " - "less than %d\n", - fname, - fileline, - topo_lvl, - INTRA_NODE, - NB_TOPO_LVL); - topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. " + "Topologic level must be at least %d and less than %d\n", + fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL); goto file_reading_error; } @@ -228,38 +205,26 @@ mca_coll_han_init_dynamic_rules(void) * The first information of a topologic rule * is the number of configurations */ - nb_rules = getnext(fptr); - - if(nb_rules < 0) { + nb_rules = -1; + if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for rules count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for rules count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_rules); - topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_rules); goto file_reading_error; } /* Store the topologic rule informations */ topo_rules[j].collective_id = coll_id; - topo_rules[j].topologic_level = topo_lvl; + topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl; topo_rules[j].nb_rules = nb_rules; if(0 == nb_rules) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for configuration rules count\n", - fname, - fileline, - nb_rules); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for configuration rules count\n", + fname, fileline, nb_rules); continue; } @@ -268,32 +233,21 @@ mca_coll_han_init_dynamic_rules(void) topo_rules[j].configuration_rules = conf_rules; if(NULL == conf_rules) { topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterate on configuration rules */ - for(k=0 ; k 1)) { + /* Get the configuration size */ + if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "invalid configuration size %d at line %d " - "or the reader encountered an unexpected EOF " - "the configuration size must be at least %d " - "and the first configuration size " - "of a topologic level must be %d\n", - conf_size, - fileline, - 1, - 1); - conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d " + "or the reader encountered an unexpected EOF the configuration size must be at least %d " + "and the first configuration size of a topologic level must be %d\n", + conf_size, fileline, 1, 1); goto file_reading_error; } @@ -301,21 +255,12 @@ mca_coll_han_init_dynamic_rules(void) * The first information of a configuration rule * is the number of message size rules */ - nb_msg_size = getnext(fptr); - if(nb_msg_size < 0) { + if( (getnext_long(fptr, &nb_msg_size) < 0) 
|| (nb_msg_size < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for message size rules count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_msg_size); - conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_msg_size); goto file_reading_error; } @@ -327,13 +272,9 @@ mca_coll_han_init_dynamic_rules(void) if(0 == nb_msg_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for message size rules count\n", - fname, - fileline, - nb_msg_size); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count\n", + fname, fileline, nb_msg_size); continue; } @@ -342,88 +283,102 @@ mca_coll_han_init_dynamic_rules(void) conf_rules[k].msg_size_rules = msg_size_rules; if(NULL == msg_size_rules) { conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterate on message size rules */ - for(l=0 ; l 1)) { + rc = getnext_size_t(fptr, &msg_size); + if( (rc < 0) || + (0 == l && msg_size > 1)) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for message size " - "or the reader encountered " - 
"an unexpected EOF. " - "The first message size rule of " - "a configuration must be 0\n", - fname, - fileline, - msg_size); - conf_rules[k].nb_msg_size = l+1; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %" PRIsize_t " is given for message size " + "or the reader encountered an unexpected EOF. " + "The first message size rule of a configuration must be 0\n", + fname, fileline, msg_size); goto file_reading_error; } /* Get the component identifier for this message size rule */ - component = getnext(fptr); - if(component < SELF || component >= COMPONENTS_COUNT) { + if( getnext_string(fptr, &target_comp_name) < 0 ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: cannot read the name of a collective component\n", + fname, fileline); + goto file_reading_error; + } + component = mca_coll_han_component_name_to_id(target_comp_name); + if( (component < SELF) || (component >= COMPONENTS_COUNT) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid collective " - "component id %d is given or the " - "reader encountered an unexpected EOF. " - "Collective component id must be at " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid collective component name %s was given or the " + "reader encountered an unexpected EOF. 
Collective component id must be at " "least %d and less than %d\n", - fname, - fileline, - component, - SELF, - COMPONENTS_COUNT); - conf_rules[k].nb_msg_size = l+1; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT); + free(target_comp_name); goto file_reading_error; } - /* Store message size rule informations */ + /* Store message size rule information */ msg_size_rules[l].collective_id = coll_id; msg_size_rules[l].topologic_level = topo_lvl; msg_size_rules[l].configuration_size = conf_size; msg_size_rules[l].msg_size = msg_size; - msg_size_rules[l].component = component; + msg_size_rules[l].component = (COMPONENT_T)component; nb_entries++; + /* do we have the optional segment length */ + if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n"); + long seglength; + if( 0 != topo_lvl ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found segment lengths for topological collective at level != 0 " + "for collective %s component %s. 
These values will be ignored.\n", + fname, fileline, coll_name, target_comp_name); + } + while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) { + if( getnext_long(fptr, &seglength) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found end of file while reading the optional list " + "of segment lengths for collective %s component %s\n", + fname, fileline, coll_name, target_comp_name); + free(target_comp_name); + goto file_reading_error; + } + } + } + free(target_comp_name); } } } } + if( NULL != coll_name ) { + free(coll_name); + coll_name = NULL; + } - if(MYEOF != getnext(fptr)) { + if( getnext_long(fptr, &nb_coll) ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "Warning on file %s at line %d: " "rule reading is over but reader does not seem " "to have reached the end of the file\n", - fname, - fileline); + fname, fileline); } opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "read %d rules from %s\n", - nb_entries, - fname); + nb_entries, fname); if(mca_coll_han_component.dump_dynamic_rules) { mca_coll_han_dump_dynamic_rules(); @@ -447,6 +402,9 @@ mca_coll_han_init_dynamic_rules(void) return OMPI_ERROR; file_reading_error: + if( NULL != coll_name ) { + free(coll_name); + } opal_output_verbose(0, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "could not fully read dynamic rules file. 
" @@ -531,7 +489,8 @@ static void check_dynamic_rules(void) configuration_rule_t *conf_rules; /* Message size informations */ - int nb_msg_size, msg_size; + int nb_msg_size; + size_t msg_size; msg_size_rule_t *msg_size_rules; /* Component informations */ @@ -540,73 +499,49 @@ static void check_dynamic_rules(void) nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; - for(i=0 ; i=1 && conf_rules[k-1].configuration_size > conf_size) { + if( k >= 1 && conf_rules[k-1].configuration_size > conf_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d: " - "configuration sizes %d and %d are " - "not sorted by increasing value\n", - coll_id, - topo_lvl, - conf_rules[k-1].configuration_size, - conf_size); + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d: " + "configuration sizes %d and %d are not sorted by increasing value\n", + coll_id, topo_lvl, conf_rules[k-1].configuration_size, conf_size); } - for(l=0 ; l=1 && msg_size_rules[l-1].msg_size > msg_size) { + if( l >= 1 && msg_size_rules[l-1].msg_size > msg_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d " - "with configuration size %d: " - "message sizes %d and %d are " + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d: " + "message sizes %" PRIsize_t " and %" PRIsize_t " are " "not sorted by increasing value\n", - coll_id, - topo_lvl, - conf_size, - msg_size_rules[l-1].msg_size, - msg_size); + coll_id, topo_lvl, conf_size, msg_size_rules[l-1].msg_size, msg_size); } - if(HAN == component - && GLOBAL_COMMUNICATOR != 
topo_lvl) { + if( (HAN == component) && (GLOBAL_COMMUNICATOR != topo_lvl) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d " - "with configuration size %d " - "for message size %d: " - "han collective component %d " - "can only be activated for " - "topology level %d\n", - coll_id, - topo_lvl, - conf_size, - msg_size, - HAN, - GLOBAL_COMMUNICATOR); + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d " + "for message size %" PRIsize_t ": han collective component %d " + "can only be activated for topology level %d\n", + coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR); } } } @@ -618,9 +553,6 @@ void mca_coll_han_dump_dynamic_rules(void) { int nb_entries = 0; - /* Loop counters */ - int i, j, k, l; - /* Collective informations */ int nb_coll; COLLTYPE_T coll_id; @@ -645,42 +577,32 @@ void mca_coll_han_dump_dynamic_rules(void) nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; - for(i=0 ; i collective component %d (%s)\n", - nb_entries, - coll_id, - mca_coll_han_colltype_to_str(coll_id), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - conf_size, - msg_size, - component, - components_name[component]); + "coll:han:dump_dynamic_rules %d collective %d (%s) " + "topology level %d (%s) configuration size %d " + "mesage size %d -> collective component %d (%s)\n", + nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size, + msg_size, component, available_components[component].component_name); nb_entries++; } diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.h b/ompi/mca/coll/han/coll_han_dynamic_file.h index 846b9b74cc7..b61ba0c5d8d 100644 --- 
a/ompi/mca/coll/han/coll_han_dynamic_file.h +++ b/ompi/mca/coll/han/coll_han_dynamic_file.h @@ -1,5 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. * * $COPYRIGHT$ @@ -60,10 +63,9 @@ * communicator and the corresponding level for sub-communicators * created by han. * - Configuration size: - * The configuration size is the number of elements of the actual - * topology level in the upper topology level. For example, if - * topology levels are intra-node and inter-node, it can be the - * number of MPI ranks per node or the number of nodes in the global + * The configuration size is the number of elements in a topology level. + * For example, if topology levels are intra-node and inter-node, it can + * be the number of MPI ranks per node or the number of nodes in the global * communicator. For the GLOBAL_COMMUNICATOR topologic level, * the configuration size is the communicator size. * - Message_size Component: @@ -101,11 +103,8 @@ * the reader. 
*/ -BEGIN_C_DECLS - int mca_coll_han_init_dynamic_rules(void); void mca_coll_han_free_dynamic_rules(void); void mca_coll_han_dump_dynamic_rules(void); -END_C_DECLS #endif diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c index 2cbd6d976ce..946c2797050 100644 --- a/ompi/mca/coll/han/coll_han_gather.c +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -16,40 +16,44 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_gather_lg_task(void *task_args); +static int mca_coll_han_gather_ug_task(void *task_args); + /* only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) +static inline void +mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root = root; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = 
sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->req = req; } int @@ -81,30 +85,26 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); + "han cannot handle gather with this communicator. It need to fall back on another component\n")); return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + rcount, rdtype, root, + comm, han_module->previous_gather_module); } /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; /* create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; 
/* Get the 'virtual ranks' mapping correspondong to the communicators */ vranks = han_module->cached_vranks; @@ -115,8 +115,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", - w_rank, root, root_low_rank, root_up_rank)); + "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", + w_rank, root, root_low_rank, root_up_rank)); ompi_datatype_type_extent(rdtype, &rextent); @@ -127,7 +127,7 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, * in a increasing order for both patterns */ if (han_module->is_mapbycore) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Han Gather is_bycore: ", w_rank)); + "[%d]: Han Gather is_bycore: ", w_rank)); reorder_rbuf = (char *)rbuf; } else { @@ -145,12 +145,12 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, /* Create lg task */ mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); /* Setup lg task arguments */ - mca_gather_argu_t *lg_argu = malloc(sizeof(mca_gather_argu_t)); - mac_coll_han_set_gather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, + mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t)); + mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, temp_request); /* Init lg task */ - init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_argu)); + init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args)); /* Issure lg task */ issue_task(lg); @@ -167,18 +167,18 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, /* reorder rbuf based on rank */ if (w_rank == root && !han_module->is_mapbycore) { for (i=0; iw_rank)); - OBJ_RELEASE(t->cur_task); /* If the process is one of the 
node leader */ char *tmp_buf = NULL; @@ -203,31 +202,29 @@ int mca_coll_han_gather_lg_task(void *task_argu) int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&t->rdtype->super, - (int64_t)t->rcount * low_size, - &rgap); + (int64_t)t->rcount * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; } - /* shared memory node gather */ + /* Low level (usually intra-node or shared memory) node gather */ t->low_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount, - t->sdtype, - tmp_rbuf, - t->rcount, - t->rdtype, - t->root_low_rank, - t->low_comm, - t->low_comm->c_coll->coll_gather_module); + t->scount, + t->sdtype, + tmp_rbuf, + t->rcount, + t->rdtype, + t->root_low_rank, + t->low_comm, + t->low_comm->c_coll->coll_gather_module); /* Prepare up comm gather */ t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; /* Create ug (upper level all-gather) task */ - mca_coll_task_t *ug = OBJ_NEW(mca_coll_task_t); - /* Setup ug task arguments */ - t->cur_task = ug; + mca_coll_task_t *ug = t->cur_task; /* Init ug task */ init_task(ug, mca_coll_han_gather_ug_task, (void *) t); /* Issure ug task */ @@ -237,9 +234,9 @@ int mca_coll_han_gather_lg_task(void *task_argu) } /* ug: upper level (intra-node) gather task */ -int mca_coll_han_gather_ug_task(void *task_argu) +int mca_coll_han_gather_ug_task(void *task_args) { - mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; + mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args; OBJ_RELEASE(t->cur_task); if (t->noop) { @@ -249,14 +246,14 @@ int mca_coll_han_gather_ug_task(void *task_argu) int low_size = ompi_comm_size(t->low_comm); /* inter node gather */ t->up_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount*low_size, - t->sdtype, - (char *)t->rbuf, - t->rcount*low_size, - t->rdtype, - t->root_up_rank, - t->up_comm, - t->up_comm->c_coll->coll_gather_module); + t->scount*low_size, + t->sdtype, + (char *)t->rbuf, + t->rcount*low_size, + 
t->rdtype, + t->root_up_rank, + t->up_comm, + t->up_comm->c_coll->coll_gather_module); if (t->sbuf_inter_free != NULL) { free(t->sbuf_inter_free); @@ -274,12 +271,12 @@ int mca_coll_han_gather_ug_task(void *task_argu) /* only work with regular situation (each node has equal number of processes) */ int mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { int w_rank = ompi_comm_rank(comm); int w_size = ompi_comm_size(comm); @@ -294,10 +291,10 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, * as the comm_split is made on the base of low_rank */ if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); + "han cannot handle gather with this communicator. 
It need to fall back on another component\n")); return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + rcount, rdtype, root, + comm, han_module->previous_gather_module); } /* create the subcommunicators */ @@ -325,11 +322,11 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, reorder_buf_start = (char *)rbuf; } else { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather needs reordering: ", w_rank)); + "[%d]: Future Gather needs reordering: ", w_rank)); ptrdiff_t rgap = 0; ptrdiff_t rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * w_size, - &rgap); + (int64_t)rcount * w_size, + &rgap); reorder_buf = (char *)malloc(rsize); /* rgap is the size of unused space at the start of the datatype */ reorder_buf_start = reorder_buf - rgap; @@ -338,40 +335,40 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, } /* allocate the intermediary buffer - * * to gather on leaders on the low sub communicator */ + * to gather on leaders on the low sub communicator */ char *tmp_buf = NULL; // allocated memory char *tmp_buf_start = NULL; // start of the data if (low_rank == root_low_rank) { - ptrdiff_t rsize, rgap = 0; + ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * low_size, - &rgap); + (int64_t)rcount * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; } /* 1. low gather on nodes leaders */ low_comm->c_coll->coll_gather((char *)sbuf, - scount, - sdtype, - tmp_buf_start, - rcount, - rdtype, - root_low_rank, - low_comm, - low_comm->c_coll->coll_gather_module); + scount, + sdtype, + tmp_buf_start, + rcount, + rdtype, + root_low_rank, + low_comm, + low_comm->c_coll->coll_gather_module); /* 2. 
upper gather (inter-node) between node leaders */ if (low_rank == root_low_rank) { up_comm->c_coll->coll_gather((char *)tmp_buf_start, - scount*low_size, - sdtype, - (char *)reorder_buf_start, - rcount*low_size, - rdtype, - root_up_rank, - up_comm, - up_comm->c_coll->coll_gather_module); + scount*low_size, + sdtype, + (char *)reorder_buf_start, + rcount*low_size, + rdtype, + root_up_rank, + up_comm, + up_comm->c_coll->coll_gather_module); if (tmp_buf != NULL) { free(tmp_buf); @@ -379,7 +376,7 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, tmp_buf_start = NULL; } OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug gather finish\n", t->w_rank)); + "[%d] Future Gather: ug gather finish\n", w_rank)); } /* 3. reorder data on root into rbuf @@ -388,8 +385,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, */ if (w_rank == root && !han_module->is_mapbycore) { ompi_coll_han_reorder_gather(reorder_buf_start, - rbuf, rcount, rdtype, - comm, topo); + rbuf, rcount, rdtype, + comm, topo); free(reorder_buf); } @@ -408,28 +405,28 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, */ void ompi_coll_han_reorder_gather(const void *sbuf, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - int * topo) { - int i; - int topolevel = 2; // always 2 levels in topo - int w_rank = ompi_comm_rank(comm); - int w_size = ompi_comm_size(comm); - ptrdiff_t rextent; - ompi_datatype_type_extent(rdtype, &rextent); - for (i=0; icached_topo = NULL; module->is_mapbycore = false; module->storage_initialized = false; - for (i = 0 ; i < NB_TOPO_LVL ; i++) { + for( i = 0; i < NB_TOPO_LVL; i++ ) { module->sub_comm[i] = NULL; } - for (i=SELF ; imodules_storage.modules[i].module_handler = NULL; } @@ -72,16 +70,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) } -#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \ - if (NULL != (obj)) { \ - OBJ_RELEASE(obj); \ 
- } \ -} while (0) +#define OBJ_RELEASE_IF_NOT_NULL(obj) \ + do { \ + if (NULL != (obj)) { \ + OBJ_RELEASE(obj); \ + } \ + } while (0) /* * Module destructor */ -static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) +static void +mca_coll_han_module_destruct(mca_coll_han_module_t * module) { int i; @@ -126,7 +126,6 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) han_module_clear(module); } - OBJ_CLASS_INSTANCE(mca_coll_han_module_t, mca_coll_base_module_t, mca_coll_han_module_construct, @@ -191,50 +190,30 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) /* All is good -- return a module */ han_module->topologic_level = mca_coll_han_component.topo_level; - /* - * TODO: When the selector is fully implemented, - * this if will be meaningless - */ + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatterv = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; + if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { /* We are on the global communicator, return topological algorithms */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = 
mca_coll_han_allgather_intra_dynamic; han_module->super.coll_allgatherv = NULL; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; - han_module->super.coll_reduce_scatter = NULL; - han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; - han_module->super.coll_scatterv = NULL; } else { /* We are on a topologic sub-communicator, return only the selector */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; - han_module->super.coll_reduce_scatter = NULL; - han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; - han_module->super.coll_scatterv = NULL; } opal_output_verbose(10, ompi_coll_base_framework.framework_output, @@ -247,28 +226,28 @@ mca_coll_han_comm_query(struct 
ompi_communicator_t * comm, int *priority) /* * In this macro, the following variables are supposed to have been declared * in the caller: - * . ompi_communicator_t *comm + * . ompi_communicator_t *comm * . mca_coll_han_module_t *han_module - */ -#define HAN_SAVE_PREV_COLL_API(__api) do { \ - han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ - han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\ - if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ - opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ - "(%d/%s): no underlying " # __api"; disqualifying myself", \ - comm->c_contextid, comm->c_name); \ - return OMPI_ERROR; \ - } \ - /* TODO add a OBJ_RELEASE at module disabling */ \ - /* + FIXME find why releasing generates memory corruption */ \ - OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ + */ +#define HAN_SAVE_PREV_COLL_API(__api) \ + do { \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + return OMPI_ERROR; \ + } \ + OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ } while(0) /* * Init module on the communicator */ -static int han_module_enable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) +static int +han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; @@ -290,8 +269,9 @@ static int han_module_enable(mca_coll_base_module_t * module, /* * Module disable */ -static int mca_coll_han_module_disable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) 
+static int +mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c index d0dc337ce8b..26f7198a58f 100644 --- a/ompi/mca/coll/han/coll_han_reduce.c +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -15,33 +15,37 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, - int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) +static int mca_coll_han_reduce_t0_task(void *task_args); +static int mca_coll_han_reduce_t1_task(void *task_args); + +static inline void +mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, + int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->rbuf = rbuf; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->op = op; - argu->root_low_rank = root_low_rank; - argu->root_up_rank = root_up_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_low_rank = root_low_rank; + 
args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; } -/* - * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: * lb: low level (shared-memory or intra-node) reduce. * ub: upper level (inter-node) reduce * Hence, in each iteration, there is a combination of collective operations which is called a task. @@ -53,13 +57,13 @@ void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cu * iter 4 | | | | ur | task: t1, contains ur */ int -mca_coll_han_reduce_intra(const void *sbuf, +mca_coll_han_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, ompi_op_t* op, int root, - struct ompi_communicator_t *comm, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { ptrdiff_t extent, lb; @@ -67,8 +71,8 @@ mca_coll_han_reduce_intra(const void *sbuf, int w_rank; w_rank = ompi_comm_rank(comm); int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); + size_t dtype_size; + ompi_datatype_type_size(dtype, &dtype_size); mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; /* Do not initialize topology if the operation cannot commute */ @@ -95,7 +99,7 @@ mca_coll_han_reduce_intra(const void *sbuf, /* use MCA parameters for now */ low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, typelng, + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, dtype_size, seg_count); int num_segments = (count + seg_count - 1) / seg_count; @@ -117,8 +121,8 @@ 
mca_coll_han_reduce_intra(const void *sbuf, /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ - mca_reduce_argu_t *t = malloc(sizeof(mca_reduce_argu_t)); - mac_coll_han_set_reduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, + mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t)); + mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank); @@ -158,9 +162,9 @@ mca_coll_han_reduce_intra(const void *sbuf, } /* t0 task: issue and wait for the low level reduce of segment 0 */ -int mca_coll_han_reduce_t0_task(void *task_argu) +int mca_coll_han_reduce_t0_task(void *task_args) { - mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); @@ -173,8 +177,8 @@ int mca_coll_han_reduce_t0_task(void *task_argu) } /* t1 task */ -int mca_coll_han_reduce_t1_task(void *task_argu) { - mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; +int mca_coll_han_reduce_t1_task(void *task_args) { + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); @@ -207,7 +211,7 @@ int mca_coll_han_reduce_t1_task(void *task_argu) { } if (!t->noop && ireduce_req) { - ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; @@ -349,7 +353,7 @@ mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, opal_output_verbose(30, mca_coll_han_component.han_output, 
"coll:han:reduce_reproducible: " "fallback on %s\n", - components_name[fallback]); + available_components[fallback].component_name); } han_module->reproducible_reduce_module = fallback_module; han_module->reproducible_reduce = fallback_module->coll_reduce; diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c index b2a87529384..bbd781f3517 100644 --- a/ompi/mca/coll/han/coll_han_scatter.c +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -15,51 +15,55 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_scatter_us_task(void *task_args); +static int mca_coll_han_scatter_ls_task(void *task_args); + /* Only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - void *sbuf_reorder_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) +static inline void +mca_coll_han_set_scatter_args(mca_coll_han_scatter_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + void *sbuf_reorder_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->sbuf_reorder_free = sbuf_reorder_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root = root; - argu->root_up_rank = root_up_rank; - 
argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->sbuf_reorder_free = sbuf_reorder_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->req = req; } int mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { int i, j; int w_rank, w_size; @@ -80,9 +84,9 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, /* Create the subcommunicators */ mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); @@ -91,20 +95,15 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, ompi_request_t *temp_request = NULL; /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - 
temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; int root_low_rank; int root_up_rank; - mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, @@ -152,20 +151,20 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, /* Create us task */ mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); /* Setup us task arguments */ - mca_scatter_argu_t *us_argu = malloc(sizeof(mca_scatter_argu_t)); - mac_coll_han_set_scatter_argu(us_argu, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, + mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t)); + mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, (char *) rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, temp_request); /* Init us task */ - init_task(us, mca_coll_han_scatter_us_task, (void *) (us_argu)); + init_task(us, mca_coll_han_scatter_us_task, (void *) (us_args)); /* Issure us task */ issue_task(us); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); return OMPI_SUCCESS; -prev_scatter_intra: + prev_scatter_intra: return han_module->previous_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, @@ -173,10 +172,9 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, } /* us: upper level (intra-node) scatter task */ -int mca_coll_han_scatter_us_task(void *task_argu) +int mca_coll_han_scatter_us_task(void *task_args) { - mca_scatter_argu_t *t = 
(mca_scatter_argu_t *) task_argu; - OBJ_RELEASE(t->cur_task); + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", @@ -202,9 +200,7 @@ int mca_coll_han_scatter_us_task(void *task_argu) t->sbuf_reorder_free = NULL; } /* Create ls tasks for the current union segment */ - mca_coll_task_t *ls = OBJ_NEW(mca_coll_task_t); - /* Setup up ls task arguments */ - t->cur_task = ls; + mca_coll_task_t *ls = t->cur_task; /* Init ls task */ init_task(ls, mca_coll_han_scatter_ls_task, (void *) t); /* Issure ls task */ @@ -213,14 +209,14 @@ int mca_coll_han_scatter_us_task(void *task_argu) return OMPI_SUCCESS; } -/* ls: lower level (shared memory) scatter task */ -int mca_coll_han_scatter_ls_task(void *task_argu) +/* ls: lower level (shared memory or intra-node) scatter task */ +int mca_coll_han_scatter_ls_task(void *task_args) { - mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n", t->w_rank)); OBJ_RELEASE(t->cur_task); - /* Shared memory scatter */ + t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf, t->rcount, t->rdtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_scatter_module); diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index e99f3e614b8..28c1b47db91 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -82,12 +82,9 @@ static void create_internode_comm_new(ompi_communicator_t *comm, * comm: input communicator of the collective */ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) + mca_coll_han_module_t *han_module) { - int low_rank, low_size; - int up_rank; - int w_rank; - int w_size; 
+ int low_rank, low_size, up_rank, w_rank, w_size; ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); const int *origin_priority; @@ -97,7 +94,6 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_base_module_allreduce_fn_t old_allreduce; mca_coll_base_module_t *old_allreduce_module; - mca_coll_base_module_allgather_fn_t old_allgather; mca_coll_base_module_t *old_allgather_module; @@ -208,13 +204,13 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, * vrank */ comm->c_coll->coll_allgather(&vrank, - 1, - MPI_INT, - vranks, - 1, - MPI_INT, - comm, - comm->c_coll->coll_allgather_module); + 1, + MPI_INT, + vranks, + 1, + MPI_INT, + comm, + comm->c_coll->coll_allgather_module); /* * Set the cached info @@ -227,7 +223,7 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - /* Put allreduce, allgather, reduce and bcast back */ + /* Put allreduce, allgather, reduce, bcast and gather back */ comm->c_coll->coll_allreduce = old_allreduce; comm->c_coll->coll_allreduce_module = old_allreduce_module; @@ -353,12 +349,9 @@ static void create_internode_comm(ompi_communicator_t *comm, * comm: input communicator of the collective */ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) + mca_coll_han_module_t *han_module) { - int low_rank, low_size; - int up_rank; - int w_rank; - int w_size; + int low_rank, low_size, up_rank, w_rank, w_size; ompi_communicator_t **low_comms; ompi_communicator_t **up_comms; const int *origin_priority; @@ -368,31 +361,67 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_base_module_allreduce_fn_t old_allreduce; mca_coll_base_module_t *old_allreduce_module; + mca_coll_base_module_allgather_fn_t old_allgather; mca_coll_base_module_t 
*old_allgather_module; + mca_coll_base_module_bcast_fn_t old_bcast; + mca_coll_base_module_t *old_bcast_module; + + mca_coll_base_module_gather_fn_t old_gather; + mca_coll_base_module_t *old_gather_module; + + mca_coll_base_module_reduce_fn_t old_reduce; + mca_coll_base_module_t *old_reduce_module; + /* use cached communicators if possible */ if (han_module->cached_comm == comm && - han_module->cached_low_comms != NULL && - han_module->cached_up_comms != NULL && - han_module->cached_vranks != NULL) { + han_module->cached_low_comms != NULL && + han_module->cached_up_comms != NULL && + han_module->cached_vranks != NULL) { return; } - /* We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones */ + /* + * We cannot use han allreduce and allgather without sub-communicators + * Temporary set previous ones + * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ old_allreduce = comm->c_coll->coll_allreduce; old_allreduce_module = comm->c_coll->coll_allreduce_module; old_allgather = comm->c_coll->coll_allgather; old_allgather_module = comm->c_coll->coll_allgather_module; + old_reduce = comm->c_coll->coll_reduce; + old_reduce_module = comm->c_coll->coll_reduce_module; + + old_bcast = comm->c_coll->coll_bcast; + old_bcast_module = comm->c_coll->coll_bcast_module; + + old_gather = comm->c_coll->coll_gather; + old_gather_module = comm->c_coll->coll_gather_module; + comm->c_coll->coll_allreduce = han_module->previous_allreduce; comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; comm->c_coll->coll_allgather = han_module->previous_allgather; comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + comm->c_coll->coll_reduce = han_module->previous_reduce; + comm->c_coll->coll_reduce_module = 
han_module->previous_reduce_module; + + comm->c_coll->coll_bcast = han_module->previous_bcast; + comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; + + comm->c_coll->coll_gather = han_module->previous_gather; + comm->c_coll->coll_gather_module = han_module->previous_gather_module; + + /* create communicators if there is no cached communicator */ w_rank = ompi_comm_rank(comm); @@ -479,12 +508,21 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), MCA_BASE_VAR_SOURCE_SET, NULL); - /* Put allreduce and allgather back */ + /* Put allreduce, allgather, reduce, bcast and gather back */ comm->c_coll->coll_allreduce = old_allreduce; comm->c_coll->coll_allreduce_module = old_allreduce_module; comm->c_coll->coll_allgather = old_allgather; comm->c_coll->coll_allgather_module = old_allgather_module; + + comm->c_coll->coll_reduce = old_reduce; + comm->c_coll->coll_reduce_module = old_reduce_module; + + comm->c_coll->coll_bcast = old_bcast; + comm->c_coll->coll_bcast_module = old_bcast_module; + + comm->c_coll->coll_gather = old_gather; + comm->c_coll->coll_gather_module = old_gather_module; } diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c index cbcfd698d05..a013a8aa656 100644 --- a/ompi/mca/coll/han/coll_han_topo.c +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -52,7 +52,6 @@ static void mca_coll_han_topo_print(int *topo, struct ompi_communicator_t *comm, int num_topo_level); - /* * takes the number part of a host: hhh2031 -->2031 */ @@ -81,8 +80,8 @@ static int mca_coll_han_hostname_to_number(char* hostname, int size) * processes virtual topids */ static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level) + struct ompi_communicator_t* comm, + int num_topo_level) { int *self_topo = (int *)malloc(sizeof(int) * num_topo_level); char hostname[1024]; @@ -126,7 +125,7 @@ static void mca_coll_han_topo_get(int 
*topo, * */ static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level) + int level, int num_topo_level) { int i, j; int min, min_loc; @@ -173,11 +172,11 @@ static void mca_coll_han_topo_sort(int *topo, int start, int end, } else if (i == end) { new_end = end; mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); + num_topo_level); } else if (last != topo[i * num_topo_level + level]) { new_end = i - 1; mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); + num_topo_level); new_start = i; last = topo[i * num_topo_level + level]; } @@ -197,12 +196,13 @@ static void mca_coll_han_topo_sort(int *topo, int start, int end, * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... | * +----------+-------+----------+-------+------+----------+-------+-----+ */ -static bool mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) +static bool +mca_coll_han_topo_is_mapbycore(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) { - int i; int size = ompi_comm_size(comm); + int i; for (i = 1; i < size; i++) { /* @@ -223,12 +223,15 @@ static bool mca_coll_han_topo_is_mapbycore(int *topo, } /* The topo is supposed sorted by host */ -static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level){ - int i; +static bool +mca_coll_han_topo_are_ppn_imbalanced(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) +{ int size = ompi_comm_size(comm); - if (size < 2){ + int i; + + if (size < 2) { return false; } int ppn; @@ -236,37 +239,37 @@ static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, /* Find the ppn for the first node */ for (i = 1; i < size; i++) { - if (topo[i * num_topo_level] != last_host){ + if (topo[i * num_topo_level] != last_host) { break; } } ppn = i; /* All on one node */ - if ( size == ppn){ + if( size == ppn ) { return false; } /* 
Trivial case */ - if (size % ppn != 0){ + if( size % ppn != 0 ) { return true; } last_host = topo[ppn * num_topo_level]; /* Check that the 2nd and next hosts also this ppn. Since the topo is sorted * one just need to jump ppn ranks to check the supposed switch of host */ - for (i = 2 * ppn; i < size; i += ppn ){ + for (i = 2 * ppn; i < size; i += ppn ) { /* the list of ranks for the last known host have ended before */ - if (topo[(i-1) * num_topo_level] != last_host){ + if (topo[(i-1) * num_topo_level] != last_host) { return true; } /* the list of ranks for the last known host are bigger than excpected */ - if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){ + if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]) { return true; } last_host = topo[i * num_topo_level]; } /* Check the last host */ - if (topo[(size-1) * num_topo_level] != last_host){ + if (topo[(size-1) * num_topo_level] != last_host) { return true; } @@ -280,12 +283,12 @@ static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, * * @param num_topo_level (IN) Number of the topological levels */ -int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module, - int num_topo_level) +int* +mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module, + int num_topo_level) { - int size; - int *topo; + int size, *topo; size = ompi_comm_size(comm); @@ -328,17 +331,17 @@ int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, return topo; } -static void mca_coll_han_topo_print(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) +static void +mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) { int rank = ompi_comm_rank(comm); int size = ompi_comm_size(comm); if (rank == 0) { - int i; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter topo: ", rank)); - for (i=0; ifunc_ptr = NULL; - t->func_argu = NULL; + t->func_args = NULL; } 
static void mca_coll_task_destructor(mca_coll_task_t * t) { t->func_ptr = NULL; - t->func_argu = NULL; + t->func_args = NULL; } OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor, mca_coll_task_destructor); /* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu) +int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args) { + OBJ_CONSTRUCT(t, mca_coll_task_t); t->func_ptr = func_ptr; - t->func_argu = func_argu; + t->func_args = func_args; return OMPI_SUCCESS; } /* Issue the task */ int issue_task(mca_coll_task_t * t) { - t->func_ptr(t->func_argu); - return OMPI_SUCCESS; + return t->func_ptr(t->func_args); } diff --git a/ompi/mca/coll/han/coll_han_trigger.h b/ompi/mca/coll/han/coll_han_trigger.h index c7314d25fb8..3a94661b355 100644 --- a/ompi/mca/coll/han/coll_han_trigger.h +++ b/ompi/mca/coll/han/coll_han_trigger.h @@ -12,25 +12,17 @@ #ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H #define MCA_COLL_HAN_TRIGGER_EXPORT_H -#include "ompi_config.h" -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/coll/coll.h" #include "ompi/communicator/communicator.h" -#include "ompi/win/win.h" -#include "ompi/mca/coll/base/coll_base_functions.h" -#include "opal/util/info.h" #include "ompi/op/op.h" -#include "opal/runtime/opal_progress.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/datatype/ompi_datatype.h" + typedef int (*task_func_ptr) (void *); struct mca_coll_task_s { opal_object_t super; task_func_ptr func_ptr; - void *func_argu; + void *func_args; }; typedef struct mca_coll_task_s mca_coll_task_t; @@ -38,9 +30,9 @@ typedef struct mca_coll_task_s mca_coll_task_t; OBJ_CLASS_DECLARATION(mca_coll_task_t); /* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu); +int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args); /* Issue the task */ int issue_task(mca_coll_task_t * t); -#endif /* 
MCA_COLL_HAN_TRIGGER_EXPORT_H */ +#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_utils.c b/ompi/mca/coll/han/coll_han_utils.c deleted file mode 100644 index 293777a256e..00000000000 --- a/ompi/mca/coll/han/coll_han_utils.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018-2020 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "coll_han.h" - -/* Get root's low_rank and up_rank from vranks array */ -void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, - int *root_up_rank) -{ - *root_up_rank = vranks[root] / low_size; - *root_low_rank = vranks[root] % low_size; -} - -uint32_t han_auto_tuned_get_n(uint32_t n) -{ - uint32_t avail[5] = { 4, 8, 16, 32, 64 }; - uint32_t i; - for (i = 0; i < 5; i++) { - if (avail[i] >= n) { - return i; - } - } - return i - 1; -} - -uint32_t han_auto_tuned_get_c(uint32_t c) -{ - uint32_t avail[3] = { 4, 8, 12 }; - uint32_t i; - for (i = 0; i < 3; i++) { - if (avail[i] >= c) { - return i; - } - } - return i - 1; -} - -uint32_t han_auto_tuned_get_m(uint32_t m) -{ - uint32_t avail[21] = - { 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, -262144, 524288, 1048576, 2097152, 4194304 }; - uint32_t i; - for (i = 0; i < 21; i++) { - if (avail[i] >= m) { - return i; - } - } - return i - 1; -} diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index cc73fcf835b..637122185e5 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -1446,7 +1446,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount, communicator_size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - if (rank == root) { + /* Determine block size */ + if ( (rank == root) || 
(MPI_IN_PLACE == sbuf) ) { ompi_datatype_type_size(rdtype, &dsize); total_dsize = dsize * (ptrdiff_t)rcount; } else { diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index 098a4fa9491..a259c789ac2 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -40,7 +40,7 @@ static int fileline=0; /* used for verbose error messages */ -#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) +#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) /* * Reads a rule file called fname @@ -56,9 +56,8 @@ static int fileline=0; /* used for verbose error messages */ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) { + long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; FILE *fptr = (FILE*) NULL; - int X, CI, NCS, CS, ALG, NMS, FANINOUT; - long MS, SS; int x, ncs, nms; ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ @@ -101,45 +100,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** goto on_file_error; } - X = (int)getnext(fptr); - if (X<0) { + if( (getnext(fptr, &X) < 0) || (X < 0) ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); goto on_file_error; } if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number 
of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); goto on_file_error; } for (x=0;x=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); goto on_file_error; } if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); alg_p = &alg_rules[CI]; alg_p->alg_rule_id = CI; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - NCS = (int)getnext (fptr); - if (NCS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline)); + if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); alg_p->n_com_sizes = NCS; alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); if 
(NULL == alg_p->com_rules) { @@ -151,20 +147,18 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** com_p = &(alg_p->com_rules[ncs]); - CS = (int)getnext (fptr); - if (CS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } com_p->mpi_comsize = CS; - NMS = (int)getnext (fptr); - if (NMS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n", + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", NMS, CI, CS)); com_p->n_msg_sizes = NMS; com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); @@ -179,37 +173,33 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p = &(com_p->msg_rules[nms]); - MS = getnext (fptr); - if (MS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->msg_size = (size_t)MS; - ALG = (int)getnext 
(fptr); - if (ALG<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_alg = ALG; - FANINOUT = (int)getnext (fptr); - if (FANINOUT<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; - SS = getnext (fptr); - if (SS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_segsize = SS; if (!nms && MS) { OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); goto on_file_error; } @@ -222,7 +212,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, 
ompi_coll_alg_rule_t** } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI)); } /* per collective */ diff --git a/ompi/request/request.c b/ompi/request/request.c index a8ddb68ad3a..abf33449d89 100644 --- a/ompi/request/request.c +++ b/ompi/request/request.c @@ -54,7 +54,7 @@ static void ompi_request_construct(ompi_request_t* req) /* don't call _INIT, we don't to set the request to _INACTIVE and there will * be no matching _FINI invocation */ req->req_state = OMPI_REQUEST_INVALID; - req->req_complete = false; + req->req_complete = REQUEST_COMPLETED; req->req_persistent = false; req->req_start = NULL; req->req_free = NULL; From 28a984fcbe7b888c8a16de6f3e2ec8dbf2e873dc Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 1 Oct 2020 00:17:28 -0400 Subject: [PATCH 2/4] Redo the initialization of all HAN collectives. Cleanup the fallback collective support. - In case the module is unable to deliver the expected result, it will fallback executing the collective operation on another collective component. This change make the support for this fallback simpler to use. - Implement a fallback allowing a HAN module to remove itself as potential active collective module, and instead fallback to the next module in line. - Completely disable the HAN modules on error. From the moment an error is encountered they remove themselves from the communicator, and in case some other modules calls them simply behave as a pass-through. 
Communicator: provide ompi_comm_split_with_info to split and provide info at the same time Add ompi_comm_coll_preference info key to control collective component selection COLL HAN: use info keys instead of component-level variable to communicate topology level between abstraction layers - The info value is a comma-separated list of entries, which are chosen with decreasing priorities. This overrides the priority of the component, unless the component has disqualified itself. An entry prefixed with ^ starts the ignore-list. Any entry following this character will be ignored during the collective component selection for the communicator. Example: "sm,libnbc,^han,adapt" gives sm the highest preference, followed by libnbc. The components han and adapt are ignored in the selection process. - Allocate a temporary buffer for all lower-level leaders (length 2 segments) - Fix the handling of MPI_IN_PLACE for gather and scatter. COLL HAN: Fix topology handling - HAN should not rely on node names to determine the ordering of ranks. Instead, use the node leaders as identifiers and short-cut if the node-leaders agree that ranks are consecutive. Also, error out if the rank distribution is imbalanced for now.
Signed-off-by: Joseph Schuchart Signed-off-by: George Bosilca --- ompi/communicator/comm.c | 24 +- ompi/communicator/communicator.h | 15 + ompi/group/group.c | 28 ++ ompi/group/group.h | 8 + ompi/mca/coll/adapt/coll_adapt_ibcast.c | 4 +- ompi/mca/coll/base/coll_base_comm_select.c | 101 ++++- ompi/mca/coll/base/coll_base_util.c | 4 +- ompi/mca/coll/base/coll_base_util.h | 2 +- ompi/mca/coll/han/coll_han.h | 106 +++-- ompi/mca/coll/han/coll_han_allgather.c | 51 ++- ompi/mca/coll/han/coll_han_allreduce.c | 72 ++-- ompi/mca/coll/han/coll_han_bcast.c | 85 ++-- ompi/mca/coll/han/coll_han_component.c | 1 - ompi/mca/coll/han/coll_han_dynamic.c | 9 +- ompi/mca/coll/han/coll_han_dynamic_file.c | 26 +- ompi/mca/coll/han/coll_han_gather.c | 191 ++++++--- ompi/mca/coll/han/coll_han_module.c | 68 ++- ompi/mca/coll/han/coll_han_reduce.c | 144 +++++-- ompi/mca/coll/han/coll_han_scatter.c | 62 ++- ompi/mca/coll/han/coll_han_subcomms.c | 473 ++++++--------------- ompi/mca/coll/han/coll_han_topo.c | 395 ++++++----------- ompi/mca/coll/han/coll_han_trigger.c | 15 - ompi/mca/coll/han/coll_han_trigger.h | 15 +- ompi/mca/coll/sm/coll_sm_module.c | 2 +- 24 files changed, 1004 insertions(+), 897 deletions(-) diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 4c6a7a7b4fa..649979746d6 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group, /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ -/* -** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). 
-*/ -int ompi_comm_split( ompi_communicator_t* comm, int color, int key, - ompi_communicator_t **newcomm, bool pass_on_topo ) + +int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ) { int myinfo[2]; int size, my_size; @@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d", newcomp->c_contextid, comm->c_contextid ); - + /* Copy info if there is one */ + if (info) { + newcomp->super.s_info = OBJ_NEW(opal_info_t); + opal_info_dup(info, &(newcomp->super.s_info)); + } /* Activate the communicator and init coll-component */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, } +/* +** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). +*/ +int ompi_comm_split( ompi_communicator_t* comm, int color, int key, + ompi_communicator_t **newcomm, bool pass_on_topo ) +{ + return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo); +} + /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 8936b7f1df9..01c02614885 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm, OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key, ompi_communicator_t** newcomm, bool pass_on_topo); +/** + * split a communicator based on color and key. Parameters + * are identical to the MPI-counterpart of the function. 
+ * Similar to \see ompi_comm_split with an additional info parameter. + * + * @param comm: input communicator + * @param color + * @param key + * + * @ + */ +OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ); + /** * split a communicator based on type and key. Parameters * are identical to the MPI-counterpart of the function. diff --git a/ompi/group/group.c b/ompi/group/group.c index f5cc88be98c..9e368c96da9 100644 --- a/ompi/group/group.c +++ b/ompi/group/group.c @@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group) return false; } + +/** + * Count the number of processes on this group that share the same node as + * this process. + */ +int ompi_group_count_local_peers (ompi_group_t *group) +{ + int local_peers = 0; + for (int i = 0 ; i < group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; +#if OMPI_GROUP_SPARSE + proc = ompi_group_peer_lookup (group, i); +#else + proc = ompi_group_get_proc_ptr_raw (group, i); + if (ompi_proc_is_sentinel (proc)) { + /* the proc must be stored in the group or cached in the proc + * hash table if the process resides in the local node + * (see ompi_proc_complete_init) */ + continue; + } +#endif + if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + local_peers++; + } + } + + return local_peers; +} diff --git a/ompi/group/group.h b/ompi/group/group.h index 661666246e9..d1cf7d99ae8 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t return ompi_group_get_proc_ptr (group, peer_id, false); } +/** + * Return true if all processes in the group are not on the local node. + */ bool ompi_group_have_remote_peers (ompi_group_t *group); +/** + * Count the number of processes on the local node. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group); + /** * Function to print the group info */ diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index b22982c0114..605d6262303 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req) || (context->con->tree->tree_nextsize > 0 && rank != context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } @@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req) && num_recv_fini == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index 405bd6b388e..8c6023d411d 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -38,6 +38,7 @@ #include "mpi.h" #include "ompi/communicator/communicator.h" #include "opal/util/output.h" +#include "opal/util/argv.h" #include "opal/util/show_help.h" #include "opal/class/opal_list.h" #include "opal/class/opal_object.h" @@ -312,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a, return 0; } +static inline int +component_in_argv(char **argv, const char* component_name) +{ + if( NULL != argv ) { + while( NULL != *argv ) { + if( 0 == 
strcmp(component_name, *argv) ) { + return 1; + } + argv++; /* move to the next argument */ + } + } + return 0; +} + /* * For each module in the list, check and see if it wants to run, and * do the resulting priority comparison. Make a list of modules to be @@ -321,13 +336,66 @@ static int avail_coll_compare (opal_list_item_t **a, static opal_list_t *check_components(opal_list_t * components, ompi_communicator_t * comm) { - int priority; + int priority, flag; const mca_base_component_t *component; mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; opal_list_t *selectable; mca_coll_base_avail_coll_t *avail; - + char info_val[OPAL_MAX_INFO_VAL+1]; + char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL; + + /* Check if this communicator comes with restrictions on the collective modules + * it wants to use. The restrictions are consistent with the MCA parameter + * to limit the collective components loaded, but it applies for each + * communicator and is provided as an info key during the communicator + * creation. Unlike the MCA param, this info key is used not to select + * components but either to prevent components from being used or to + * force a change in the component priority. 
+ */ + if( NULL != comm->super.s_info) { + opal_info_get(comm->super.s_info, "ompi_comm_coll_preference", + sizeof(info_val), info_val, &flag); + if( !flag ) { + goto proceed_to_select; + } + coll_argv = opal_argv_split(info_val, ','); + if(NULL == coll_argv) { + goto proceed_to_select; + } + int idx2, count_include = opal_argv_count(coll_argv); + /* Allocate the coll_include argv */ + coll_include = (char**)malloc((count_include + 1) * sizeof(char*)); + coll_include[count_include] = NULL; /* NULL terminated array */ + /* Dispatch the include/exclude in the corresponding arrays */ + for( int idx = 0; NULL != coll_argv[idx]; idx++ ) { + if( '^' == coll_argv[idx][0] ) { + coll_include[idx] = NULL; /* NULL terminated array */ + + /* Allocate the coll_exclude argv */ + coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*)); + /* save the exclude components */ + for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) { + coll_exclude[idx2 - idx] = coll_argv[idx2]; + } + coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */ + coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */ + count_include = idx; + break; + } + coll_include[idx] = coll_argv[idx]; + } + /* Reverse the order of the coll_inclide argv to faciliate the ordering of + * the selected components reverse. 
+ */ + for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) { + char* temp = coll_include[idx2]; + coll_include[idx2] = coll_include[count_include - 1]; + coll_include[count_include - 1] = temp; + count_include--; + } + } + proceed_to_select: /* Make a list of the components that query successfully */ selectable = OBJ_NEW(opal_list_t); @@ -335,6 +403,13 @@ static opal_list_t *check_components(opal_list_t * components, OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) { component = cli->cli_component; + /* dont bother is we have this component in the exclusion list */ + if( component_in_argv(coll_exclude, component->mca_component_name) ) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:base:comm_select: component disqualified: %s (due to communicator info key)", + component->mca_component_name ); + continue; + } priority = check_one_component(comm, component, &module); if (priority >= 0) { /* We have a component that indicated that it wants to run @@ -370,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components, /* Put this list in priority order */ opal_list_sort(selectable, avail_coll_compare); + /* For all valid component reorder them not on their provided priorities but on + * the order requested in the info key. As at this point the coll_include is + * already ordered backward we can simply prepend the components. 
+ */ + mca_coll_base_avail_coll_t *item, *item_next; + OPAL_LIST_FOREACH_SAFE(item, item_next, + selectable, mca_coll_base_avail_coll_t) { + if( component_in_argv(coll_include, item->ac_component_name) ) { + opal_list_remove_item(selectable, &item->super); + opal_list_prepend(selectable, &item->super); + } + } + + opal_argv_free(coll_argv); + if( NULL != coll_exclude ) { + free(coll_exclude); + } + if( NULL != coll_include ) { + free(coll_include); + } + /* All done */ return selectable; } @@ -403,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm, return priority; } - /************************************************************************** * Query functions **************************************************************************/ diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index e6b1fde3d6e..99c8b516a27 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -551,10 +551,10 @@ static const char* colltype_translation_table[] = { [COLLCOUNT] = NULL }; -const char* mca_coll_base_colltype_to_str(int collid) +char* mca_coll_base_colltype_to_str(int collid) { if( (collid < 0) || (collid >= COLLCOUNT) ) { return NULL; } - return colltype_translation_table[collid]; + return strdup(colltype_translation_table[collid]); } diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index e20ed6652cc..ee649fa63fb 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -187,7 +187,7 @@ int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); /* Miscelaneous function */ -const char* mca_coll_base_colltype_to_str(int collid); +char* mca_coll_base_colltype_to_str(int collid); int mca_coll_base_name_to_colltype(const char* name); END_C_DECLS diff --git a/ompi/mca/coll/han/coll_han.h 
b/ompi/mca/coll/han/coll_han.h index a7feefe082c..16efcbe8e5a 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -64,6 +64,7 @@ struct mca_coll_han_reduce_args_s { int w_rank; int last_seg_count; bool noop; + bool is_tmp_rbuf; }; typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t; @@ -126,6 +127,7 @@ struct mca_coll_han_gather_args_s { int root_low_rank; int w_rank; bool noop; + bool is_mapbycore; }; typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t; @@ -207,7 +209,6 @@ typedef struct mca_coll_han_component_t { mca_coll_han_dynamic_rules_t dynamic_rules; /* Dynamic rules from mca parameter */ COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; - TOPO_LVL_T topo_level; /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ int max_dynamic_errors; @@ -219,7 +220,7 @@ typedef void (*previous_dummy_fn_t) (void); * Structure used to store what is necessary for the collective operations * routines in case of fallback. */ -typedef struct mca_coll_han_collective_fallback_s { +typedef struct mca_coll_han_single_collective_fallback_s { union { mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_allgatherv_fn_t allgatherv; @@ -229,9 +230,24 @@ typedef struct mca_coll_han_collective_fallback_s { mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_scatter_fn_t scatter; previous_dummy_fn_t dummy; - } previous_routine; - mca_coll_base_module_t *previous_module; -} mca_coll_han_collective_fallback_t; + }; + mca_coll_base_module_t* module; +} mca_coll_han_single_collective_fallback_t; + +/* + * The structure containing a replacement for all collective supported + * by HAN. This structure is used as a fallback during subcommunicator + * creation. 
+ */ +typedef struct mca_coll_han_collectives_fallback_s { + mca_coll_han_single_collective_fallback_t allgather; + mca_coll_han_single_collective_fallback_t allgatherv; + mca_coll_han_single_collective_fallback_t allreduce; + mca_coll_han_single_collective_fallback_t bcast; + mca_coll_han_single_collective_fallback_t reduce; + mca_coll_han_single_collective_fallback_t gather; + mca_coll_han_single_collective_fallback_t scatter; +} mca_coll_han_collectives_fallback_t; /** Coll han module */ typedef struct mca_coll_han_module_t { @@ -241,7 +257,6 @@ typedef struct mca_coll_han_module_t { /* Whether this module has been lazily initialized or not yet */ bool enabled; - struct ompi_communicator_t *cached_comm; struct ompi_communicator_t **cached_low_comms; struct ompi_communicator_t **cached_up_comms; int *cached_vranks; @@ -250,7 +265,7 @@ typedef struct mca_coll_han_module_t { bool are_ppn_imbalanced; /* To be able to fallback when the cases are not supported */ - struct mca_coll_han_collective_fallback_s previous_routines[COLLCOUNT]; + struct mca_coll_han_collectives_fallback_s fallback; /* To be able to fallback on reproducible algorithm */ mca_coll_base_module_reduce_fn_t reproducible_reduce; @@ -281,21 +296,53 @@ OBJ_CLASS_DECLARATION(mca_coll_han_module_t); * Some defines to stick to the naming used in the other components in terms of * fallback routines */ -#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather -#define previous_allgatherv previous_routines[ALLGATHERV].previous_routine.allgatherv -#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce -#define previous_bcast previous_routines[BCAST].previous_routine.bcast -#define previous_gather previous_routines[GATHER].previous_routine.gather -#define previous_reduce previous_routines[REDUCE].previous_routine.reduce -#define previous_scatter previous_routines[SCATTER].previous_routine.scatter - -#define previous_allgather_module 
previous_routines[ALLGATHER].previous_module -#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module -#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module -#define previous_bcast_module previous_routines[BCAST].previous_module -#define previous_gather_module previous_routines[GATHER].previous_module -#define previous_reduce_module previous_routines[REDUCE].previous_module -#define previous_scatter_module previous_routines[SCATTER].previous_module +#define previous_allgather fallback.allgather.allgather +#define previous_allgather_module fallback.allgather.module + +#define previous_allgatherv fallback.allgatherv.allgatherv +#define previous_allgatherv_module fallback.allgatherv.module + +#define previous_allreduce fallback.allreduce.allreduce +#define previous_allreduce_module fallback.allreduce.module + +#define previous_bcast fallback.bcast.bcast +#define previous_bcast_module fallback.bcast.module + +#define previous_reduce fallback.reduce.reduce +#define previous_reduce_module fallback.reduce.module + +#define previous_gather fallback.gather.gather +#define previous_gather_module fallback.gather.module + +#define previous_scatter fallback.scatter.scatter +#define previous_scatter_module fallback.scatter.module + + +/* macro to correctly load a fallback collective module */ +#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \ + do { \ + if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \ + (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \ + mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \ + OBJ_RELEASE(coll_module); \ + } \ + } while(0) + +/* macro to correctly load /all/ fallback collectives */ +#define HAN_LOAD_FALLBACK_COLLECTIVES(HANM, COMM) \ + do { \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, 
COMM, bcast); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, scatter); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, gather); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, reduce); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allreduce); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgather); \ + HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgatherv); \ + (HANM)->enabled = false; /* entire module (the HANM argument, not a caller local) set to pass-through from now on */ \ + } while(0) + /** * Global component instance @@ -312,9 +359,18 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm int han_request_free(ompi_request_t ** request); /* Subcommunicator creation */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); -void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); -/* Gather topology information */ +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module); +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); + +/** + * Gather topology information + * + * Returns a pointer to the (potentially already cached) topology. + * NOTE: if the rank distribution is imbalanced, no effort will be made to gather + * the topology at all ranks and instead NULL is returned and han_module->is_mapbycore + * is set to false. + * If HAN ever learns to deal with imbalanced topologies, this needs fixing! 
+ */ int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int num_topo_level); diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c index d8d8cd5b55f..cc7dfaff266 100644 --- a/ompi/mca/coll/han/coll_han_allgather.c +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -67,7 +67,14 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, { /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int low_rank = ompi_comm_rank(low_comm); @@ -75,14 +82,13 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, /* Init topo */ int *topo = mca_coll_han_topo_init(comm, han_module, 2); - /* unbalanced case needs algo adaptation */ - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allgather with this communicator. It need to fall back on another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, - comm, han_module->previous_allgather_module); + "han cannot handle allgather with this communicator (imbalance). 
Fall back on another component\n")); + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); } ompi_request_t *temp_request = NULL; @@ -118,7 +124,7 @@ int mca_coll_han_allgather_lg_task(void *task_args) mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; char *tmp_buf = NULL, *tmp_rbuf = NULL; char *tmp_send = NULL; - + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", t->w_rank)); @@ -159,7 +165,7 @@ int mca_coll_han_allgather_lg_task(void *task_args) t->rdtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_gather_module); } - + t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; @@ -280,22 +286,32 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - mca_coll_han_comm_create_new(comm, han_module); - ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; - ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } /* discovery topology */ int *topo = mca_coll_han_topo_init(comm, han_module, 2); /* unbalanced case needs algo adaptation */ - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allgather with this communicator. 
It need to fall back on another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, - comm, han_module->previous_allgather_module); + "han cannot handle allgather within this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); } + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int w_rank = ompi_comm_rank(comm); /* setup up/low coordinates */ int low_rank = ompi_comm_rank(low_comm); @@ -327,7 +343,6 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send); } } - /* 1. low gather on node leaders into tmp_buf */ if (MPI_IN_PLACE == sbuf) { diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c index 00b50d9e714..afa0e0a220e 100644 --- a/ompi/mca/coll/han/coll_han_allreduce.c +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -86,27 +86,32 @@ mca_coll_han_allreduce_intra(const void *sbuf, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - // Fallback to another component if the op cannot commute mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - if (! ompi_op_is_commute(op)) { + + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allreduce with this operation." 
- "Fall back on another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, - comm, han_module->previous_allreduce_module); + "han cannot handle allreduce with this operation. Fall back on another component\n")); + goto prev_allreduce_intra; } + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives, then call the restored allreduce with its own (allreduce) module */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_allreduce_module); + } ptrdiff_t extent, lb; + size_t dtype_size; ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; + int seg_count = count, w_rank; w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t dtype_size; ompi_datatype_type_size(dtype, &dtype_size); - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -182,6 +187,10 @@ mca_coll_han_allreduce_intra(const void *sbuf, free(t); return OMPI_SUCCESS; + + prev_allreduce_intra: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); } /* t0 task */ @@ -389,12 +398,12 @@ int mca_coll_han_allreduce_t3_task(void *task_args) int mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -409,13 
+418,20 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, // Fallback to another component if the op cannot commute if (! ompi_op_is_commute(op)) { - OPAL_OUTPUT_VERBOSE((30, cs->han_output, - "han cannot handle allreduce with this operation." - "It need to fall back on another component\n")); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); goto prev_allreduce; } - mca_coll_han_comm_create_new(comm, han_module); + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives, then call the restored allreduce with its own (allreduce) module */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_allreduce_module); + } low_comm = han_module->sub_comm[INTRA_NODE]; up_comm = han_module->sub_comm[INTER_NODE]; @@ -475,9 +491,9 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, return OMPI_SUCCESS; -prev_allreduce: - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, - han_module->previous_allreduce_module); + prev_allreduce: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); } /* Find a fallback on reproducible algorithm @@ -499,9 +515,8 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, int i; for (i=0; imodules_storage - .modules[fallback] - .module_handler; + mca_coll_base_module_t *fallback_module + = han_module->modules_storage.modules[fallback].module_handler; if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { if (0 == w_rank) { opal_output_verbose(30, 
mca_coll_han_component.han_output, @@ -520,8 +535,7 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, "coll:han:allreduce_reproducible_decision: " "no reproducible fallback\n"); } - han_module->reproducible_allreduce_module = - han_module->previous_allreduce_module; + han_module->reproducible_allreduce_module = han_module->previous_allreduce_module; han_module->reproducible_allreduce = han_module->previous_allreduce; return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c index 0251ba16192..c32ea745b03 100644 --- a/ompi/mca/coll/han/coll_han_bcast.c +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -63,28 +63,40 @@ mca_coll_han_bcast_intra(void *buff, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - int seg_count = count, w_rank = ompi_comm_rank(comm); + int err, seg_count = count, w_rank = ompi_comm_rank(comm); + ompi_communicator_t *low_comm, *up_comm; ptrdiff_t extent, lb; size_t dtype_size; + /* Create the subcommunicators */ + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); } ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_type_size(dtype, &dtype_size); - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm, *up_comm; - /* use MCA parameters for now */ low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; @@ -166,6 +178,7 @@ int mca_coll_han_bcast_t1_task(void *task_args) { mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; ompi_request_t *ibcast_req = NULL; + int tmp_count = t->seg_count; ptrdiff_t extent, lb; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, @@ -174,7 +187,6 @@ int mca_coll_han_bcast_t1_task(void *task_args) ompi_datatype_get_extent(t->dtype, &lb, &extent); if (!t->noop) { if (t->cur_seg <= t->num_segments - 2 ) { - int tmp_count = 
t->seg_count; if (t->cur_seg == t->num_segments - 2) { tmp_count = t->last_seg_count; } @@ -185,8 +197,10 @@ int mca_coll_han_bcast_t1_task(void *task_args) } } + /* are we the last segment to be pushed downstream ? */ + tmp_count = (t->cur_seg == (t->num_segments - 1)) ? t->last_seg_count : t->seg_count; t->low_comm->c_coll->coll_bcast((char *) t->buff, - t->seg_count, t->dtype, t->root_low_rank, t->low_comm, + tmp_count, t->dtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_bcast_module); if (NULL != ibcast_req) { @@ -206,30 +220,43 @@ mca_coll_han_bcast_intra_simple(void *buff, { /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - mca_coll_han_comm_create_new(comm, han_module); - ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; - ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; - int w_rank = ompi_comm_rank(comm); - - int *vranks = han_module->cached_vranks; - int low_rank = ompi_comm_rank(low_comm); - int low_size = ompi_comm_size(low_comm); - int root_low_rank, root_up_rank; + ompi_communicator_t *low_comm, *up_comm; + int err, w_rank = ompi_comm_rank(comm); + /* Create the subcommunicators */ + err = mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); - } else { - OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, - "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); } + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int root_low_rank, root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: root_low_rank %d root_up_rank %d\n", diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 1a6912cc0ea..ef55a6ac99d 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -99,7 +99,6 @@ static int han_open(void) /* Get the global coll verbosity: it will be ours */ mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; - mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; return mca_coll_han_init_dynamic_rules(); } diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c index d93cf26ad76..d32b12fbcd7 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.c +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -757,8 +757,13 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ - ompi_datatype_type_size(sdtype, &dtype_size); - dtype_size = dtype_size * scount; + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } sub_module = get_module(GATHER, dtype_size, diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index ff12a7652d0..e6673cf9411 100644 --- 
a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -117,10 +117,6 @@ mca_coll_han_init_dynamic_rules(void) coll_rules[i].nb_topologic_levels = 0; mca_coll_han_component.dynamic_rules.nb_collectives = i+1; - if( NULL != coll_name ) { - free(coll_name); - coll_name = NULL; - } /* Get the collective identifier */ if( getnext_string(fptr, &coll_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, @@ -142,7 +138,8 @@ mca_coll_han_init_dynamic_rules(void) coll_name, fileline, ALLGATHER, COLLCOUNT); goto file_reading_error; } - coll_name = (char*)mca_coll_base_colltype_to_str(coll_id); + free(coll_name); + coll_name = mca_coll_base_colltype_to_str(coll_id); } if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { @@ -360,24 +357,21 @@ mca_coll_han_init_dynamic_rules(void) } } } - } - if( NULL != coll_name ) { - free(coll_name); - coll_name = NULL; + if( NULL != coll_name ) { + free(coll_name); + coll_name = NULL; + } } - if( getnext_long(fptr, &nb_coll) ) { + if( getnext_long(fptr, &nb_coll) > 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on file %s at line %d: " - "rule reading is over but reader does not seem " - "to have reached the end of the file\n", + "coll:han:mca_coll_han_init_dynamic_rules. 
Warning on file %s at line %d: " + "rule reading is over but reader does not seem to have reached the end of the file\n", fname, fileline); } opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "read %d rules from %s\n", + "coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n", nb_entries, fname); if(mca_coll_han_component.dump_dynamic_rules) { diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c index 946c2797050..14b87bde926 100644 --- a/ompi/mca/coll/han/coll_han_gather.c +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -36,7 +36,7 @@ mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args, int root_low_rank, struct ompi_communicator_t *up_comm, struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) + int w_rank, bool noop, bool is_mapbycore, ompi_request_t * req) { args->cur_task = cur_task; args->sbuf = sbuf; @@ -53,6 +53,7 @@ mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args, args->low_comm = low_comm; args->w_rank = w_rank; args->noop = noop; + args->is_mapbycore = is_mapbycore; args->req = req; } @@ -65,32 +66,43 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int i; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; int w_rank, w_size; /* information about the global communicator */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ char *reorder_buf = NULL, *reorder_rbuf = NULL; - ptrdiff_t rsize, rgap = 0, rextent; - int *vranks, low_rank, low_size; - int * topo; - + int i, err, *vranks, low_rank, low_size, *topo; ompi_request_t *temp_request = NULL; - w_rank = ompi_comm_rank(comm); - w_size = ompi_comm_size(comm); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + err = mca_coll_han_comm_create(comm, han_module); + if( 
OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ topo = mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); } + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); temp_request->req_state = OMPI_REQUEST_ACTIVE; @@ -100,7 +112,6 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, temp_request->req_complete = REQUEST_PENDING; /* create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; ompi_communicator_t *up_comm = @@ -118,7 +129,6 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", w_rank, root, root_low_rank, root_up_rank)); - ompi_datatype_type_extent(rdtype, &rextent); /* Allocate reorder buffers */ if (w_rank == root) { @@ -132,12 +142,25 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, } else { /* Need a buffer to store unordered final result */ + ptrdiff_t rsize, rgap; rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * w_size, &rgap); reorder_buf = (char *)malloc(rsize); //TODO:free /* rgap is the size of unused space at the start of the datatype */ reorder_rbuf = reorder_buf - rgap; + + if (MPI_IN_PLACE == sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; + ptrdiff_t src_shift = block_size * w_rank; + ptrdiff_t dest_shift = block_size * w_rank; + ompi_datatype_copy_content_same_ddt(rdtype, + (ptrdiff_t)rcount, + (char *)rbuf + dest_shift, + reorder_rbuf + src_shift); + } } } @@ -148,7 +171,7 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t)); mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, 
sdtype, reorder_rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, - low_comm, w_rank, low_rank != root_low_rank, temp_request); + low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, temp_request); /* Init lg task */ init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args)); /* Issure lg task */ @@ -166,6 +189,8 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, */ /* reorder rbuf based on rank */ if (w_rank == root && !han_module->is_mapbycore) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); for (i=0; iw_rank)); + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } /* If the process is one of the node leader */ char *tmp_buf = NULL; @@ -200,21 +234,35 @@ int mca_coll_han_gather_lg_task(void *task_args) /* if the process is one of the node leader, allocate the intermediary * buffer to gather on the low sub communicator */ int low_size = ompi_comm_size(t->low_comm); + int low_rank = ompi_comm_rank(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, - (int64_t)t->rcount * low_size, + rsize = opal_datatype_span(&dtype->super, + count * low_size, &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; + if (t->w_rank == t->root) { + if (MPI_IN_PLACE == t->sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(dtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)count; + ptrdiff_t src_shift = block_size * t->w_rank; + ptrdiff_t dest_shift = block_size * low_rank; + ompi_datatype_copy_content_same_ddt(dtype, + (ptrdiff_t)count, + tmp_rbuf + dest_shift, + (char *)t->rbuf + src_shift); + } + } } /* Low level (usually intra-node or shared memory) node gather */ t->low_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount, - t->sdtype, + count, + dtype, tmp_rbuf, - t->rcount, - t->rdtype, + count, + dtype, t->root_low_rank, 
t->low_comm, t->low_comm->c_coll->coll_gather_module); @@ -243,14 +291,25 @@ int mca_coll_han_gather_ug_task(void *task_args) OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: ug noop\n", t->w_rank)); } else { + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } + + int low_size = ompi_comm_size(t->low_comm); /* inter node gather */ t->up_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount*low_size, - t->sdtype, + count*low_size, + dtype, (char *)t->rbuf, - t->rcount*low_size, - t->rdtype, + count*low_size, + dtype, t->root_up_rank, t->up_comm, t->up_comm->c_coll->coll_gather_module); @@ -278,29 +337,49 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int *topo, w_rank = ompi_comm_rank(comm); int w_size = ompi_comm_size(comm); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ - int *topo = mca_coll_han_topo_init(comm, han_module, 2); - - /* Here root needs to reach all nodes on up_comm. 
- * But in case of unbalance some up_comms are smaller, - * as the comm_split is made on the base of low_rank */ + topo = mca_coll_han_topo_init(comm, han_module, 2); if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); } - /* create the subcommunicators */ - mca_coll_han_comm_create_new(comm, han_module); ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + ompi_datatype_t *dtype; + size_t count; + + if (w_rank == root) { + dtype = rdtype; + count = rcount; + } else { + dtype = sdtype; + count = scount; + } + /* Get the 'virtual ranks' mapping corresponding to the communicators */ int *vranks = han_module->cached_vranks; @@ -340,8 +419,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, char *tmp_buf_start = NULL; // start of the data if (low_rank == root_low_rank) { ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * low_size, + rsize = opal_datatype_span(&dtype->super, + count * low_size, &rgap); tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; @@ -349,11 +428,11 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, /* 1. 
low gather on nodes leaders */ low_comm->c_coll->coll_gather((char *)sbuf, - scount, - sdtype, + count, + dtype, tmp_buf_start, - rcount, - rdtype, + count, + dtype, root_low_rank, low_comm, low_comm->c_coll->coll_gather_module); @@ -361,11 +440,11 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, /* 2. upper gather (inter-node) between node leaders */ if (low_rank == root_low_rank) { up_comm->c_coll->coll_gather((char *)tmp_buf_start, - scount*low_size, - sdtype, + count*low_size, + dtype, (char *)reorder_buf_start, - rcount*low_size, - rdtype, + count*low_size, + dtype, root_up_rank, up_comm, up_comm->c_coll->coll_gather_module); @@ -406,7 +485,7 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, void ompi_coll_han_reorder_gather(const void *sbuf, void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, + struct ompi_datatype_t *dtype, struct ompi_communicator_t *comm, int * topo) { @@ -414,7 +493,7 @@ ompi_coll_han_reorder_gather(const void *sbuf, int w_rank = ompi_comm_rank(comm); int w_size = ompi_comm_size(comm); ptrdiff_t rextent; - ompi_datatype_type_extent(rdtype, &rextent); + ompi_datatype_type_extent(dtype, &rextent); for ( i = 0; i < w_size; i++ ) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Future reorder from %d to %d\n", @@ -424,7 +503,7 @@ ompi_coll_han_reorder_gather(const void *sbuf, ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; ptrdiff_t src_shift = block_size * i; ptrdiff_t dest_shift = block_size * (ptrdiff_t)topo[i * topolevel + 1]; - ompi_datatype_copy_content_same_ddt(rdtype, + ompi_datatype_copy_content_same_ddt(dtype, (ptrdiff_t)rcount, (char *)rbuf + dest_shift, (char *)sbuf + src_shift); diff --git a/ompi/mca/coll/han/coll_han_module.c b/ompi/mca/coll/han/coll_han_module.c index 621de3f0242..1a3a7e5c667 100644 --- a/ompi/mca/coll/han/coll_han_module.c +++ b/ompi/mca/coll/han/coll_han_module.c @@ -25,19 +25,25 @@ static int han_module_enable(mca_coll_base_module_t * module, 
static int mca_coll_han_module_disable(mca_coll_base_module_t * module, struct ompi_communicator_t *comm); +#define CLEAN_PREV_COLL(HANDLE, NAME) \ + do { \ + (HANDLE)->fallback.NAME.NAME = NULL; \ + (HANDLE)->fallback.NAME.module = NULL; \ + } while (0) + /* * Module constructor */ static void han_module_clear(mca_coll_han_module_t *han_module) { - for (int i = 0; i < COLLCOUNT; i++) { - /* - * Since the previous routines function pointers are declared as - * a union, initializing the dummy routineis enough - */ - han_module->previous_routines[i].previous_routine.dummy = NULL; - han_module->previous_routines[i].previous_module = NULL; - } + CLEAN_PREV_COLL(han_module, allgather); + CLEAN_PREV_COLL(han_module, allgatherv); + CLEAN_PREV_COLL(han_module, allreduce); + CLEAN_PREV_COLL(han_module, bcast); + CLEAN_PREV_COLL(han_module, reduce); + CLEAN_PREV_COLL(han_module, gather); + CLEAN_PREV_COLL(han_module, scatter); + han_module->reproducible_reduce = NULL; han_module->reproducible_reduce_module = NULL; han_module->reproducible_allreduce = NULL; @@ -48,9 +54,8 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) { int i; - module->enabled = false; + module->enabled = true; module->super.coll_module_disable = mca_coll_han_module_disable; - module->cached_comm = NULL; module->cached_low_comms = NULL; module->cached_up_comms = NULL; module->cached_vranks = NULL; @@ -154,6 +159,8 @@ int mca_coll_han_init_query(bool enable_progress_threads, mca_coll_base_module_t * mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) { + int flag; + char info_val[OPAL_MAX_INFO_VAL+1]; mca_coll_han_module_t *han_module; /* @@ -171,7 +178,13 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) comm->c_contextid, comm->c_name); return NULL; } - + if( !ompi_group_have_remote_peers(comm->c_local_group) ) { + /* The group only contains local processes. 
Disable HAN for now */ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm has only local processes; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } /* Get the priority level attached to this module. If priority is less * than or equal to 0, then the module is unavailable. */ *priority = mca_coll_han_component.han_priority; @@ -188,7 +201,21 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) } /* All is good -- return a module */ - han_module->topologic_level = mca_coll_han_component.topo_level; + han_module->topologic_level = GLOBAL_COMMUNICATOR; + + if (NULL != comm->super.s_info) { + /* Get the info value disaqualifying coll components */ + opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level", + sizeof(info_val), info_val, &flag); + + if (flag) { + if (0 == strcmp(info_val, "INTER_NODE")) { + han_module->topologic_level = INTER_NODE; + } else { + han_module->topologic_level = INTRA_NODE; + } + } + } han_module->super.coll_module_enable = han_module_enable; han_module->super.ft_event = NULL; @@ -231,14 +258,14 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) */ #define HAN_SAVE_PREV_COLL_API(__api) \ do { \ - han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ - han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ "(%d/%s): no underlying " # __api"; disqualifying myself", \ comm->c_contextid, comm->c_name); \ - return OMPI_ERROR; \ + goto handle_error; \ } \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ } while(0) @@ -264,6 +291,17 @@ 
han_module_enable(mca_coll_base_module_t * module, mca_coll_han_allreduce_reproducible_decision(comm, module); return OMPI_SUCCESS; + +handle_error: + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + return OMPI_ERROR; } /* diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c index 26f7198a58f..03968b6f475 100644 --- a/ompi/mca/coll/han/coll_han_reduce.c +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -25,7 +25,7 @@ mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t struct ompi_communicator_t *up_comm, struct ompi_communicator_t *low_comm, int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) + bool noop, bool is_tmp_rbuf) { args->cur_task = cur_task; args->sbuf = sbuf; @@ -42,6 +42,7 @@ mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t args->w_rank = w_rank; args->last_seg_count = last_seg_count; args->noop = noop; + args->is_tmp_rbuf = is_tmp_rbuf; } /* @@ -66,33 +67,46 @@ mca_coll_han_reduce_intra(const void *sbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; + int seg_count = count, w_rank; size_t dtype_size; - ompi_datatype_type_size(dtype, &dtype_size); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - /* Do not initialize topology if the operation cannot commute */ - 
if(!ompi_op_is_commute(op)){ + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this operation. It needs to fall back on another component\n")); + "han cannot handle reduce with this operation. Fall back on another component\n")); goto prev_reduce_intra; } + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all modules */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); - goto prev_reduce_intra; + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); + ompi_datatype_get_extent(dtype, &lb, &extent); + w_rank = ompi_comm_rank(comm); + ompi_datatype_type_size(dtype, &dtype_size); + ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -110,6 +124,7 @@ mca_coll_han_reduce_intra(const void *sbuf, int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); int root_low_rank; int root_up_rank; @@ -118,14 +133,22 @@ mca_coll_han_reduce_intra(const void *sbuf, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, root_up_rank)); + void *tmp_rbuf = rbuf; + void *tmp_rbuf_to_free = NULL; + if (low_rank == root_low_rank && root_up_rank != up_rank) { + /* allocate 2 segments on node leaders that are not the global root */ + tmp_rbuf = malloc(2*extent*seg_count); + tmp_rbuf_to_free = tmp_rbuf; + } + /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t)); - mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, + mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) tmp_rbuf, seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, - low_rank != root_low_rank); + low_rank != root_low_rank, (NULL != tmp_rbuf_to_free)); /* Init the first task */ init_task(t0, mca_coll_han_reduce_t0_task, (void *) t); issue_task(t0); @@ -144,7 +167,9 @@ mca_coll_han_reduce_intra(const void *sbuf, /* Setup up t1 task arguments */ t->cur_task = t1; t->sbuf = (char *) t->sbuf + extent * t->seg_count; - t->rbuf = 
(char *) t->rbuf + extent * t->seg_count; + if (up_rank == root_up_rank) { + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + } t->cur_seg = t->cur_seg + 1; /* Init the t1 task */ init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); @@ -152,10 +177,11 @@ mca_coll_han_reduce_intra(const void *sbuf, } free(t); + free(tmp_rbuf_to_free); return OMPI_SUCCESS; -prev_reduce_intra: + prev_reduce_intra: return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, comm, han_module->previous_reduce_module); @@ -183,29 +209,43 @@ int mca_coll_han_reduce_t1_task(void *task_args) { t->cur_seg)); OBJ_RELEASE(t->cur_task); ptrdiff_t extent, lb; + int cur_seg = t->cur_seg; ompi_datatype_get_extent(t->dtype, &lb, &extent); ompi_request_t *ireduce_req = NULL; - int tmp_count = t->seg_count; if (!t->noop) { + int tmp_count = t->seg_count; + if (cur_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } int up_rank = ompi_comm_rank(t->up_comm); /* ur of cur_seg */ if (up_rank == t->root_up_rank) { - t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, tmp_count, t->dtype, t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); } else { - t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, + /* this is a node leader that is not root so alternate between the two allocated segments */ + char *tmp_sbuf = (char*)t->rbuf + (cur_seg % 2)*(extent * t->seg_count); + t->up_comm->c_coll->coll_ireduce(tmp_sbuf, NULL, tmp_count, t->dtype, t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); } } /* lr of cur_seg+1 */ - if (t->cur_seg <= t->num_segments - 2) { - if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + int next_seg = cur_seg + 1; + if (next_seg <= t->num_segments - 1) { + int tmp_count = 
t->seg_count; + char *tmp_rbuf = NULL; + if (next_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { tmp_count = t->last_seg_count; } + if (t->is_tmp_rbuf) { + tmp_rbuf = (char*)t->rbuf + (next_seg % 2)*(extent * t->seg_count); + } else if (NULL != t->rbuf) { + tmp_rbuf = (char*)t->rbuf + extent * t->seg_count; + } t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, - (char *) t->rbuf + extent * t->seg_count, tmp_count, + (char *) tmp_rbuf, tmp_count, t->dtype, t->op, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_reduce_module); @@ -221,13 +261,13 @@ int mca_coll_han_reduce_t1_task(void *task_args) { * a fallback is made on the next component that provides a reduce in priority order */ int mca_coll_han_reduce_intra_simple(const void *sbuf, - void* rbuf, - int count, - struct ompi_datatype_t *dtype, - ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { int w_rank; /* information about the global communicator */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ @@ -238,23 +278,37 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - /* Do not initialize topology if the operation cannot commute */ + /* No support for non-commutative operations */ if(!ompi_op_is_commute(op)){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this operation. It needs to fall back on another component\n")); - goto prev_reduce_intra_simple; + "han cannot handle reduce with this operation. 
Fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); - goto prev_reduce_intra_simple; + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; ompi_communicator_t *up_comm = @@ -293,7 +347,7 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/REDUCE: low comm reduce failed. 
" "Falling back to another component\n")); - goto prev_reduce_intra_simple; + goto prev_reduce_intra; } /* Up_comm reduce */ @@ -319,10 +373,9 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, } return OMPI_SUCCESS; -prev_reduce_intra_simple: + prev_reduce_intra: return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, - comm, - han_module->previous_reduce_module); + comm, han_module->previous_reduce_module); } @@ -345,9 +398,8 @@ mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, int i; for (i=0; imodules_storage - .modules[fallback] - .module_handler; + mca_coll_base_module_t *fallback_module + = han_module->modules_storage.modules[fallback].module_handler; if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { if (0 == w_rank) { opal_output_verbose(30, mca_coll_han_component.han_output, diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c index bbd781f3517..c52cc1911ac 100644 --- a/ompi/mca/coll/han/coll_han_scatter.c +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -65,24 +65,35 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int i, j; - int w_rank, w_size; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + int i, j, w_rank, w_size; w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ - mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + int* topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle scatter with this communicator. It needs to fall back on another component\n")); - goto prev_scatter_intra; + "han cannot handle scatter with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatter); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); } - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; ompi_communicator_t *up_comm = @@ -92,9 +103,8 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, int low_size = ompi_comm_size(low_comm); int up_size = ompi_comm_size(up_comm); - ompi_request_t *temp_request = NULL; /* Set up request */ - temp_request = OBJ_NEW(ompi_request_t); + ompi_request_t *temp_request = OBJ_NEW(ompi_request_t); temp_request->req_state = OMPI_REQUEST_ACTIVE; temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; @@ -148,12 +158,22 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, } } + + void *dest_buf = rbuf; + int dest_count = rcount; + ompi_datatype_t *dest_dtype = rdtype; + if (MPI_IN_PLACE == rbuf) { + dest_buf = (void*)sbuf; + dest_count = scount; + dest_dtype = sdtype; + } + /* Create us task */ mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); /* Setup us task arguments */ mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t)); mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, - (char *) rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, + (char *) dest_buf, dest_count, dest_dtype, root, root_up_rank, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, temp_request); /* Init us task */ @@ -164,11 +184,6 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); return OMPI_SUCCESS; - prev_scatter_intra: - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module->previous_scatter_module); } /* us: upper level 
(intra-node) scatter task */ @@ -180,9 +195,18 @@ int mca_coll_han_scatter_us_task(void *task_args) OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", t->w_rank)); } else { + size_t count; + ompi_datatype_t *dtype; + if (t->w_rank == t->root) { + dtype = t->sdtype; + count = t->scount; + } else { + dtype = t->rdtype; + count = t->rcount; + } int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + rsize = opal_datatype_span(&dtype->super, (int64_t) count * low_size, &rgap); char *tmp_buf = (char *) malloc(rsize); char *tmp_rbuf = tmp_buf - rgap; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index 28c1b47db91..bf5b4df523b 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -26,153 +26,100 @@ #include "coll_han.h" #include "coll_han_dynamic.h" - -/* - * Local functions - */ -static void create_intranode_comm_new(ompi_communicator_t *, - ompi_communicator_t **); -static void create_internode_comm_new(ompi_communicator_t *, - int, int, - ompi_communicator_t **); -static void create_intranode_comm(ompi_communicator_t *, - const char *, - int, - ompi_communicator_t **); -static void create_internode_comm(ompi_communicator_t *, - const char *, - int, int, - ompi_communicator_t **); - -/** - * Create a sub-communicator containing the ranks that share my node. 
- * - * @param comm (IN) original communicator for the collective - * target module priority - * @param sub_comm (OUT) created sub-communicator - */ -static void create_intranode_comm_new(ompi_communicator_t *comm, - ompi_communicator_t **sub_comm) -{ - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, - (opal_info_t *)(&ompi_mpi_info_null), sub_comm); - return; -} - -/** - * Create a sub-communicator containing one rank per node. - * - * @param comm (IN) original communicator for the collective - * @param my_rank (IN) my rank in comm - * @param intra_rank (IN) local rank in the intra-node sub-communicator - * @param sub_comm (OUT) created sub-communicator - */ -static void create_internode_comm_new(ompi_communicator_t *comm, - int my_rank, - int intra_rank, - ompi_communicator_t **sub_comm) -{ - ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); - return; -} +#define HAN_SUBCOM_SAVE_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (FALLBACKS).COLL.COLL = (COMM)->c_coll->coll_ ## COLL; \ + (FALLBACKS).COLL.module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + } while(0) + +#define HAN_SUBCOM_LOAD_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (COMM)->c_coll->coll_ ## COLL = (FALLBACKS).COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (FALLBACKS).COLL.module; \ + } while(0) /* * Routine that creates the local hierarchical sub-communicators * Called each time a collective is called. 
* comm: input communicator of the collective */ -void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { int low_rank, low_size, up_rank, w_rank, w_size; ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); - const int *origin_priority; - int han_var_id; - int tmp_han_priority; + mca_coll_han_collectives_fallback_t fallbacks; int vrank, *vranks; - - mca_coll_base_module_allreduce_fn_t old_allreduce; - mca_coll_base_module_t *old_allreduce_module; - mca_coll_base_module_allgather_fn_t old_allgather; - mca_coll_base_module_t *old_allgather_module; - - mca_coll_base_module_bcast_fn_t old_bcast; - mca_coll_base_module_t *old_bcast_module; - - mca_coll_base_module_gather_fn_t old_gather; - mca_coll_base_module_t *old_gather_module; - - mca_coll_base_module_reduce_fn_t old_reduce; - mca_coll_base_module_t *old_reduce_module; + opal_info_t comm_info; /* The sub communicators have already been created */ - if (NULL != han_module->sub_comm[INTRA_NODE] + if (han_module->enabled && NULL != han_module->sub_comm[INTRA_NODE] && NULL != han_module->sub_comm[INTER_NODE] && NULL != han_module->cached_vranks) { - return; + return OMPI_SUCCESS; } /* - * We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. 
* * Allgather is used to compute vranks * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new * Reduce + Bcast may be called by the allreduce implementation * Gather + Bcast may be called by the allgather implementation */ - old_allreduce = comm->c_coll->coll_allreduce; - old_allreduce_module = comm->c_coll->coll_allreduce_module; - - old_allgather = comm->c_coll->coll_allgather; - old_allgather_module = comm->c_coll->coll_allgather_module; - - old_reduce = comm->c_coll->coll_reduce; - old_reduce_module = comm->c_coll->coll_reduce_module; - - old_bcast = comm->c_coll->coll_bcast; - old_bcast_module = comm->c_coll->coll_bcast_module; - - old_gather = comm->c_coll->coll_gather; - old_gather_module = comm->c_coll->coll_gather_module; - - comm->c_coll->coll_allreduce = han_module->previous_allreduce; - comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; - - comm->c_coll->coll_allgather = han_module->previous_allgather; - comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; - - comm->c_coll->coll_reduce = han_module->previous_reduce; - comm->c_coll->coll_reduce_module = han_module->previous_reduce_module; - - comm->c_coll->coll_bcast = han_module->previous_bcast; - comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); + + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. 
+ * However, in order to decide if this is true, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. + */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } - comm->c_coll->coll_gather = han_module->previous_gather; - comm->c_coll->coll_gather_module = han_module->previous_gather_module; + OBJ_CONSTRUCT(&comm_info, opal_info_t); /* Create topological sub-communicators */ w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - origin_priority = NULL; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - - /* - * Maximum priority for selector on sub-communicators - */ - tmp_han_priority = 100; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - /* * This sub-communicator
contains the ranks that share my node. */ - mca_coll_han_component.topo_level = INTRA_NODE; - create_intranode_comm_new(comm, low_comm); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "han"); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, low_comm); /* * Get my local rank and the local size @@ -184,8 +131,8 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, * This sub-communicator contains one process per node: processes with the * same intra-node rank id share such a sub-communicator */ - mca_coll_han_component.topo_level = INTER_NODE; - create_internode_comm_new(comm, w_rank, low_rank, up_comm); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false); up_rank = ompi_comm_rank(*up_comm); @@ -217,236 +164,103 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, */ han_module->cached_vranks = vranks; - /* - * Come back to the original han module priority - */ - mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* Put allreduce, allgather, reduce, bcast and gather back */ - comm->c_coll->coll_allreduce = old_allreduce; - comm->c_coll->coll_allreduce_module = old_allreduce_module; - - comm->c_coll->coll_allgather = old_allgather; - comm->c_coll->coll_allgather_module = old_allgather_module; - - comm->c_coll->coll_reduce = old_reduce; - comm->c_coll->coll_reduce_module = old_reduce_module; - - comm->c_coll->coll_bcast = old_bcast; - comm->c_coll->coll_bcast_module = old_bcast_module; - - comm->c_coll->coll_gather = old_gather; - comm->c_coll->coll_gather_module = old_gather_module; - - mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; + /* Reset the saved collectives to point back to HAN */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + 
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; } -/** - * Create a sub-communicator containing the ranks that share my node. - * Associate this sub-communicator a given collective module. - * module can be one of: - * . sm - * . shared - * - * @param comm (IN) original communicator for the collective - * @param prio_string (IN) string containing the mca variable associated to - * target module priority - * @param my_rank (IN) my rank in comm - * @param sub_comm (OUT) created sub-communicator - */ -static void create_intranode_comm(ompi_communicator_t *comm, - const char *prio_string, - int my_rank, - ompi_communicator_t **sub_comm) -{ - int var_id; - const int *sav_priority; - int tmp_priority = 100; - - /* - * Upgrade the target module priority to make the resulting sub-communicator - * use that collective module - */ - mca_base_var_find_by_name(prio_string, &var_id); - mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] %s origin %d\n", - my_rank, prio_string, *sav_priority)); - - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - /* - * Create the sub-communicator - * Since the target priority has been set to the highest value, this - * sub-communicator will inherit it as a collective module. 
- */ - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, - (opal_info_t *)(&ompi_mpi_info_null), sub_comm); - /* - * Come back to the target module's original priority - */ - mca_base_var_set_value(var_id, sav_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - return; -} - -/** - * Create a sub-communicator containing one rank per node. - * Associate this sub-communicator a given collective module. - * module can be one of: - * . libnbc - * . adapt - * - * @param comm (IN) original communicator for the collective - * @param prio_string (IN) string containing the mca variable associated to - * target module priority - * @param my_rank (IN) my rank in comm - * @param intra_rank (IN) local rank in the intra-node sub-communicator - * @param sub_comm (OUT) created sub-communicator - */ -static void create_internode_comm(ompi_communicator_t *comm, - const char *prio_string, - int my_rank, - int intra_rank, - ompi_communicator_t **sub_comm) -{ - int var_id; - const int *sav_priority; - int tmp_priority = 100; - - /* - * Upgrade the target module priority to make the resulting sub-communicator - * use that collective module - */ - mca_base_var_find_by_name(prio_string, &var_id); - mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] %s origin %d\n", my_rank, prio_string, - *sav_priority)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* - * Create the sub-communicator - * Since the target priority has been set to the highest value, this - * sub-communicator will inherit it as a collective module. 
- */ - ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); - mca_base_var_set_value(var_id, sav_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - return; -} - - /* * Routine that creates the local hierarchical sub-communicators * Called each time a collective is called. * comm: input communicator of the collective */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { int low_rank, low_size, up_rank, w_rank, w_size; + mca_coll_han_collectives_fallback_t fallbacks; ompi_communicator_t **low_comms; ompi_communicator_t **up_comms; - const int *origin_priority; - int han_var_id; - int tmp_han_priority; int vrank, *vranks; - - mca_coll_base_module_allreduce_fn_t old_allreduce; - mca_coll_base_module_t *old_allreduce_module; - - mca_coll_base_module_allgather_fn_t old_allgather; - mca_coll_base_module_t *old_allgather_module; - - mca_coll_base_module_bcast_fn_t old_bcast; - mca_coll_base_module_t *old_bcast_module; - - mca_coll_base_module_gather_fn_t old_gather; - mca_coll_base_module_t *old_gather_module; - - mca_coll_base_module_reduce_fn_t old_reduce; - mca_coll_base_module_t *old_reduce_module; + opal_info_t comm_info; /* use cached communicators if possible */ - if (han_module->cached_comm == comm && - han_module->cached_low_comms != NULL && + if (han_module->enabled && han_module->cached_low_comms != NULL && han_module->cached_up_comms != NULL && han_module->cached_vranks != NULL) { - return; + return OMPI_SUCCESS; } /* - * We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. 
* * Allgather is used to compute vranks * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new * Reduce + Bcast may be called by the allreduce implementation * Gather + Bcast may be called by the allgather implementation */ - old_allreduce = comm->c_coll->coll_allreduce; - old_allreduce_module = comm->c_coll->coll_allreduce_module; - - old_allgather = comm->c_coll->coll_allgather; - old_allgather_module = comm->c_coll->coll_allgather_module; - - old_reduce = comm->c_coll->coll_reduce; - old_reduce_module = comm->c_coll->coll_reduce_module; - - old_bcast = comm->c_coll->coll_bcast; - old_bcast_module = comm->c_coll->coll_bcast_module; - - old_gather = comm->c_coll->coll_gather; - old_gather_module = comm->c_coll->coll_gather_module; - - comm->c_coll->coll_allreduce = han_module->previous_allreduce; - comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; - - comm->c_coll->coll_allgather = han_module->previous_allgather; - comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; - - comm->c_coll->coll_reduce = han_module->previous_reduce; - comm->c_coll->coll_reduce_module = han_module->previous_reduce_module; - - comm->c_coll->coll_bcast = han_module->previous_bcast; - comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; - - comm->c_coll->coll_gather = han_module->previous_gather; - comm->c_coll->coll_gather_module = han_module->previous_gather_module; - + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); + + /** + * HAN is not yet optimized for a single process per node 
case, we should + * avoid selecting it for collective communication support in such cases. + * However, in order to decide if this is true, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. + */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } /* create communicators if there is no cached communicator */ - w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * sizeof(struct ompi_communicator_t *)); up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * sizeof(struct ompi_communicator_t *)); - origin_priority = NULL; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - /* - * Lower down our current priority - */ - tmp_han_priority = 0; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE,
true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); + OBJ_CONSTRUCT(&comm_info, opal_info_t); /* * Upgrade sm module priority to set up low_comms[0] with sm module * This sub-communicator contains the ranks that share my node. */ - create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[0])); /* * Get my local rank and the local size @@ -458,15 +272,17 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade shared module priority to set up low_comms[1] with shared module * This sub-communicator contains the ranks that share my node. */ - create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[1])); /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module * This sub-communicator contains one process per node: processes with the * same intra-node rank id share such a sub-communicator */ - create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank, - &(up_comms[0])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false); up_rank = ompi_comm_rank(up_comms[0]); @@ -474,8 +290,8 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade adapt module priority to set up up_comms[0] with adapt module * This sub-communicator contains one process per node. 
*/ - create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank, - &(up_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false); /* * Set my virtual rank number. @@ -497,32 +313,21 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, /* * Set the cached info */ - han_module->cached_comm = comm; han_module->cached_low_comms = low_comms; han_module->cached_up_comms = up_comms; han_module->cached_vranks = vranks; - /* - * Come back to the original han module priority - */ - mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* Put allreduce, allgather, reduce, bcast and gather back */ - comm->c_coll->coll_allreduce = old_allreduce; - comm->c_coll->coll_allreduce_module = old_allreduce_module; - - comm->c_coll->coll_allgather = old_allgather; - comm->c_coll->coll_allgather_module = old_allgather_module; - - comm->c_coll->coll_reduce = old_reduce; - comm->c_coll->coll_reduce_module = old_reduce_module; - - comm->c_coll->coll_bcast = old_bcast; - comm->c_coll->coll_bcast_module = old_bcast_module; - - comm->c_coll->coll_gather = old_gather; - comm->c_coll->coll_gather_module = old_gather_module; + /* Reset the saved collectives to point back to HAN */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c index 
a013a8aa656..e25e37207e2 100644 --- a/ompi/mca/coll/han/coll_han_topo.c +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -35,316 +35,161 @@ #include "coll_han.h" -/* - * Local functions - */ - -static int mca_coll_han_hostname_to_number(char* hostname, int size); -static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level); -static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level); -static bool mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level); -static void mca_coll_han_topo_print(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level); - -/* - * takes the number part of a host: hhh2031 -->2031 - */ -static int mca_coll_han_hostname_to_number(char* hostname, int size) +#if OPAL_ENABLE_DEBUG +static void +mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) { - int i, j; - char *number_array = (char *)malloc(sizeof(char) * size); - int number = 0; + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); - for (i = 0, j = 0; hostname[i] != '\0'; i++) { - if ('0' <= hostname[i] && '9' >= hostname[i]) { - number_array[j++] = hostname[i]; + if (rank == 0) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank)); + for( int i = 0; i < size*num_topo_level; i++ ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); } - number_array[j] = '\0'; - number = atoi(number_array); - free(number_array); - return number; -} - -/* - * Set the virtual topo id. It is made of num_topo_level ints (2 today): - * . the integer part of the host id - * . 
the rank in the main communicator - * Gather the virtual topoid from each process so every process will know other - * processes virtual topids - */ -static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level) -{ - int *self_topo = (int *)malloc(sizeof(int) * num_topo_level); - char hostname[1024]; - - gethostname(hostname, 1024); - self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); - self_topo[1] = ompi_comm_rank(comm); - - ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, - topo, num_topo_level, MPI_INT, comm, - comm->c_coll->coll_allgather_module); - free(self_topo); - - return; } +#endif /* OPAL_ENABLE_DEBUG */ -/* - * Sort the topology array in order to have ranks sharing the same node - * contiguous in the topology array. - * Called from topo_init whenever the processes are not mapped by core. - * ex: 4 ranks executing on 2 nodes, mapped by node - * ranks 0 and 2 on hid0 - * ranks 1 and 3 on hid1 - * On entry the topo array looks like - * hid0 0 hid1 1 hid0 2 hid1 3 - * After the sort: - * hid0 0 hid0 2 hid1 1 hid1 3 - * This is to have the gather result in the right order - * - * @param topo (IN/OUT) topology description array (sorted in out) - * @param start (IN) where to begin the processing - * The index in topo will actually be: - * start * num_topo_level + level - * topo contains num_topo_level ids per rank. - * @param end (IN) where to stop the processing - * The index in topo will actually be: - * end * num_topo_level + level - * topo contains num_topo_level ids per rank. 
- * @param level (IN) level number we are currently processing - * @param num_topo_level (IN) number of topological levels +/** + * Topology initialization phase + * Called each time a collective that needs buffer reordering is called * + * @param num_topo_level (IN) Number of the topological levels */ -static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level) +int* +mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module, + int num_topo_level) { - int i, j; - int min, min_loc; - int last, new_start, new_end; - - if (level > num_topo_level-1 || start >= end) { - return; + if ( NULL != han_module->cached_topo ) { + return han_module->cached_topo; } - min = INT_MAX; - min_loc = -1; - for (i = start; i <= end; i++) { - int temp; - /* get the min value for current level and its location */ - for (j = i; j <= end; j++) { - /* topo contains num_topo_level ids per rank. */ - if (topo[j * num_topo_level + level] < min) { - min = topo[j*num_topo_level+level]; - min_loc = j; - - } - } - /* - * swap i and min_loc - * We have num_topo_level ids to swap - */ - for (j = 0; j < num_topo_level; j++) { - temp = topo[i * num_topo_level + j]; - topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; - topo[min_loc * num_topo_level + j] = temp; - } - min = INT_MAX; - min_loc = -1; - } - - /* Process next level */ - last = 0; - new_start = 0; - new_end = 0; - for (i = start; i <= end; i++) { - if (i == start) { - last = topo[i * num_topo_level + level]; - new_start = start; - } else if (i == end) { - new_end = end; - mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); - } else if (last != topo[i * num_topo_level + level]) { - new_end = i - 1; - mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); - new_start = i; - last = topo[i * num_topo_level + level]; - } - } - return; -} - -/* - * Check whether the ranks in the communicator given as input 
are mapped by core - * Mapped by core: each node is first filled with as many ranks as needed before - * moving to the next one - * This is checked as follows: - * . 2 contiguous ranks should be either on the same node or on node ids in - * ascending order - * The topology is actually an array of ints: - * +----------+-------+----------+-------+------+----------+-------+-----+ - * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... | - * +----------+-------+----------+-------+------+----------+-------+-----+ - */ -static bool -mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ - int size = ompi_comm_size(comm); - int i; - - for (i = 1; i < size; i++) { - /* - * The host id for a given rank should be < host id for the next rank - */ - if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level]) { - return false; - } - /* - * For the same host id, consecutive ranks should be sorted in - * ascending order. - */ - if (topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { - return false; - } - } - return true; -} + ompi_communicator_t *up_comm, *low_comm; + ompi_request_t *request = MPI_REQUEST_NULL; + int *my_low_rank_map = NULL; + int *ranks_map = NULL; -/* The topo is supposed sorted by host */ -static bool -mca_coll_han_topo_are_ppn_imbalanced(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ int size = ompi_comm_size(comm); - int i; - if (size < 2) { - return false; + if (NULL != han_module->cached_up_comms) { + up_comm = han_module->cached_up_comms[0]; + low_comm = han_module->cached_low_comms[0]; + } else { + up_comm = han_module->sub_comm[INTER_NODE]; + low_comm = han_module->sub_comm[INTRA_NODE]; } - int ppn; - int last_host = topo[0]; + assert(up_comm != NULL && low_comm != NULL); - /* Find the ppn for the first node */ - for (i = 1; i < size; i++) { - if (topo[i * num_topo_level] != last_host) { - break; - } - } - ppn = i; + int low_rank = 
ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); - /* All on one node */ - if( size == ppn ) { - return false; - } - /* Trivial case */ - if( size % ppn != 0 ) { - return true; - } + int *topo = (int *)malloc(sizeof(int) * size * num_topo_level); + int is_imbalanced = 1; + int ranks_consecutive = 1; - last_host = topo[ppn * num_topo_level]; - /* Check that the 2nd and next hosts also this ppn. Since the topo is sorted - * one just need to jump ppn ranks to check the supposed switch of host */ - for (i = 2 * ppn; i < size; i += ppn ) { - /* the list of ranks for the last known host have ended before */ - if (topo[(i-1) * num_topo_level] != last_host) { - return true; + /* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */ + if (0 == low_rank) { + my_low_rank_map = malloc(sizeof(int)*low_size); + for (int i = 0; i < low_size; ++i) { + topo[i] = i; } - /* the list of ranks for the last known host are bigger than excpected */ - if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]) { - return true; + ompi_group_translate_ranks(low_comm->c_local_group, low_size, topo, + comm->c_local_group, my_low_rank_map); + /* check if ranks are consecutive */ + int rank = my_low_rank_map[0] + 1; + for (int i = 1; i < low_size; ++i, ++rank) { + if (my_low_rank_map[i] != rank) { + ranks_consecutive = 0; + break; + } } - last_host = topo[i * num_topo_level]; - } - /* Check the last host */ - if (topo[(size-1) * num_topo_level] != last_host) { - return true; - } - - return false; -} + int reduce_vals[] = {ranks_consecutive, -ranks_consecutive, low_size, -low_size}; -/** - * Topology initialization phase - * Called each time a collective that needs buffer reordering is called - * - * @param num_topo_level (IN) Number of the topological levels - */ -int* -mca_coll_han_topo_init(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module, - int num_topo_level) -{ - int size, *topo; + 
up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4, + MPI_INT, MPI_MAX, up_comm, + up_comm->c_coll->coll_allreduce_module); - size = ompi_comm_size(comm); + /* is the distribution of processes balanced per node? */ + is_imbalanced = (reduce_vals[2] == -reduce_vals[3]) ? 0 : 1; + ranks_consecutive = (reduce_vals[0] == -reduce_vals[1]) ? 1 : 0; - if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) { - if (han_module->cached_topo) { - free(han_module->cached_topo); - han_module->cached_topo = NULL; + if ( !ranks_consecutive && !is_imbalanced ) { + /* kick off up_comm allgather to collect non-consecutive rank information at node leaders */ + ranks_map = malloc(sizeof(int)*size); + up_comm->c_coll->coll_iallgather(my_low_rank_map, low_size, MPI_INT, + ranks_map, low_size, MPI_INT, up_comm, &request, + up_comm->c_coll->coll_iallgather_module); } + } - topo = (int *)malloc(sizeof(int) * size * num_topo_level); - - /* get topo infomation */ - mca_coll_han_topo_get(topo, comm, num_topo_level); - mca_coll_han_topo_print(topo, comm, num_topo_level); - /* - * All the ranks now have the topo information - */ + /* broadcast balanced and consecutive properties from node leaders to remaining ranks */ + int bcast_vals[] = {is_imbalanced, ranks_consecutive}; + low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + is_imbalanced = bcast_vals[0]; + ranks_consecutive = bcast_vals[1]; + + /* error out if the rank distribution is not balanced */ + if (is_imbalanced) { + assert(MPI_REQUEST_NULL == request); + han_module->are_ppn_imbalanced = true; + free(topo); + if( NULL != my_low_rank_map ) free(my_low_rank_map); + if( NULL != ranks_map ) free(ranks_map); + return NULL; + } - /* check if the processes are mapped by core */ - han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); + han_module->are_ppn_imbalanced = false; + if (ranks_consecutive) { + /* fast-path: all ranks 
are consecutive and balanced so fill topology locally */ + for (int i = 0; i < size; ++i) { + topo[2*i] = (i/low_size); // node leader is node ID + topo[2*i+1] = i; + } + han_module->is_mapbycore = true; + } else { /* - * If not, sort the topo such that each group of ids is sorted by rank - * i.e. ids for rank i are contiguous to ids for rank i+1. - * This will be needed for the operations that are order sensitive - * (like gather) + * Slow path: gather global-to-node-local rank mappings at node leaders + * + * The topology will contain a mapping from global consecutive positions + * to ranks in the communicator. + * + * ex: 4 ranks executing on 2 nodes, mapped by node + * ranks 0 and 2 on hid0 + * ranks 1 and 3 on hid1 + * On entry the topo array looks like + * hid0 0 hid1 1 hid0 2 hid1 3 + * After the sort: + * hid0 0 hid0 2 hid1 1 hid1 3 */ - if (!han_module->is_mapbycore) { - mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level); + if (0 == low_rank) { + ompi_request_wait(&request, MPI_STATUS_IGNORE); + /* fill topology */ + for (int i = 0; i < size; ++i) { + topo[2*i] = ranks_map[(i/low_size)*low_size]; // node leader is node ID + topo[2*i+1] = ranks_map[i]; + } + free(ranks_map); } - han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level); - han_module->cached_topo = topo; - han_module->cached_comm = comm; - } else { - topo = han_module->cached_topo; } + /* broadcast topology from node leaders to remaining ranks */ + low_comm->c_coll->coll_bcast(topo, num_topo_level*size, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + free(my_low_rank_map); + han_module->cached_topo = topo; +#if OPAL_ENABLE_DEBUG mca_coll_han_topo_print(topo, comm, num_topo_level); - return topo; -} +#endif /* OPAL_ENABLE_DEBUG */ -static void -mca_coll_han_topo_print(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); - - if (rank == 0) { 
- OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank)); - for( int i = 0; i < size*num_topo_level; i++ ) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); - } - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); - } + return topo; } diff --git a/ompi/mca/coll/han/coll_han_trigger.c b/ompi/mca/coll/han/coll_han_trigger.c index 2eb3cd07061..87c8ed95979 100644 --- a/ompi/mca/coll/han/coll_han_trigger.c +++ b/ompi/mca/coll/han/coll_han_trigger.c @@ -25,18 +25,3 @@ static void mca_coll_task_destructor(mca_coll_task_t * t) OBJ_CLASS_INSTANCE(mca_coll_task_t, opal_object_t, mca_coll_task_constructor, mca_coll_task_destructor); - -/* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args) -{ - OBJ_CONSTRUCT(t, mca_coll_task_t); - t->func_ptr = func_ptr; - t->func_args = func_args; - return OMPI_SUCCESS; -} - -/* Issue the task */ -int issue_task(mca_coll_task_t * t) -{ - return t->func_ptr(t->func_args); -} diff --git a/ompi/mca/coll/han/coll_han_trigger.h b/ompi/mca/coll/han/coll_han_trigger.h index 3a94661b355..413e393be61 100644 --- a/ompi/mca/coll/han/coll_han_trigger.h +++ b/ompi/mca/coll/han/coll_han_trigger.h @@ -30,9 +30,20 @@ typedef struct mca_coll_task_s mca_coll_task_t; OBJ_CLASS_DECLARATION(mca_coll_task_t); /* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args); +static inline int +init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args) +{ + OBJ_CONSTRUCT(t, mca_coll_task_t); + t->func_ptr = func_ptr; + t->func_args = func_args; + return OMPI_SUCCESS; +} /* Issue the task */ -int issue_task(mca_coll_task_t * t); +static inline int +issue_task(mca_coll_task_t * t) +{ + return t->func_ptr(t->func_args); +} #endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 781215251ea..25e9c779467 100644 --- 
a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -176,7 +176,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority) if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", comm->c_contextid, comm->c_name); - return NULL; + return NULL; } /* Get the priority level attached to this module. If priority is less From 5b78a22313e046f5394f3807003e571eadcd3f17 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 24 Aug 2020 17:48:29 -0400 Subject: [PATCH 3/4] Fix partial packing of non data elements. There was a bug allowing for partial packing of non-data elements (such as loop and end_loop markers) during the exit condition of a pack/unpack call. This has basically no meaning. Prevent this bug from happening by making sure the element points to data before trying to partially pack it. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_unpack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index efed62451ac..6f9fdce2774 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -380,7 +380,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, } complete_loop: assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { + if( (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) && (0 != iov_len_local) ) { unsigned char* temp = conv_ptr; /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round.
@@ -391,7 +391,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, 0, iov_len_local, &temp ); - + pConvertor->partial_length = iov_len_local; iov_len_local = 0; } From c230c157ef5c03b83ac5e62e34be6a085d498998 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 22 Oct 2020 15:30:06 +0200 Subject: [PATCH 4/4] COLL BASE: move strdup out of mca_coll_base_colltype_to_str to avoid memory leaks Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_util.c | 4 ++-- ompi/mca/coll/base/coll_base_util.h | 2 +- ompi/mca/coll/han/coll_han_dynamic_file.c | 6 +++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 99c8b516a27..e6b1fde3d6e 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -551,10 +551,10 @@ static const char* colltype_translation_table[] = { [COLLCOUNT] = NULL }; -char* mca_coll_base_colltype_to_str(int collid) +const char* mca_coll_base_colltype_to_str(int collid) { if( (collid < 0) || (collid >= COLLCOUNT) ) { return NULL; } - return strdup(colltype_translation_table[collid]); + return colltype_translation_table[collid]; } diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index ee649fa63fb..e20ed6652cc 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -187,7 +187,7 @@ int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); /* Miscelaneous function */ -char* mca_coll_base_colltype_to_str(int collid); +const char* mca_coll_base_colltype_to_str(int collid); int mca_coll_base_name_to_colltype(const char* name); END_C_DECLS diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index e6673cf9411..fc1fbbaa767 
100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -139,7 +139,11 @@ mca_coll_han_init_dynamic_rules(void) goto file_reading_error; } free(coll_name); - coll_name = mca_coll_base_colltype_to_str(coll_id); + coll_name = NULL; + const char *tmp_name = mca_coll_base_colltype_to_str(coll_id); + if (NULL != tmp_name) { + coll_name = strdup(tmp_name); + } } if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) {