From 36d52d4e3268526547706ec713a5ba411dc99d35 Mon Sep 17 00:00:00 2001 From: dongzhong Date: Fri, 11 Oct 2019 11:12:07 -0400 Subject: [PATCH 01/13] rewrite arm sve ops Signed-off-by: dongzhong --- ompi/mca/op/arm_sve_op/Makefile.am | 73 +++ ompi/mca/op/arm_sve_op/op_sve.h | 61 +++ ompi/mca/op/arm_sve_op/op_sve_component.c | 246 ++++++++++ ompi/mca/op/arm_sve_op/op_sve_functions.c | 517 ++++++++++++++++++++++ ompi/mca/op/arm_sve_op/op_sve_functions.h | 29 ++ test/datatype/Reduce_local_float.c | 290 ++++++++++++ 6 files changed, 1216 insertions(+) create mode 100644 ompi/mca/op/arm_sve_op/Makefile.am create mode 100644 ompi/mca/op/arm_sve_op/op_sve.h create mode 100644 ompi/mca/op/arm_sve_op/op_sve_component.c create mode 100644 ompi/mca/op/arm_sve_op/op_sve_functions.c create mode 100644 ompi/mca/op/arm_sve_op/op_sve_functions.h create mode 100644 test/datatype/Reduce_local_float.c diff --git a/ompi/mca/op/arm_sve_op/Makefile.am b/ompi/mca/op/arm_sve_op/Makefile.am new file mode 100644 index 00000000000..9b2aca888a4 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/Makefile.am @@ -0,0 +1,73 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an sve op component. This Makefile.am is a typical +# sve of how to integrate into Open MPI's Automake-based build +# system. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_sve.h \ + op_sve_component.c \ + op_sve_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. 
As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_arm_sve_op_DSO +lib = +lib_sources = +component = mca_op_sve.la +component_sources = $(sources) +else +lib = libmca_op_sve.la +lib_sources = $(sources) +component = +component_sources = +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_op_sve_la_SOURCES = $(component_sources) +mca_op_sve_la_LDFLAGS = -module -avoid-version +mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. + +noinst_LTLIBRARIES = $(lib) +libmca_op_sve_la_SOURCES = $(lib_sources) +libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/arm_sve_op/op_sve.h b/ompi/mca/op/arm_sve_op/op_sve.h new file mode 100644 index 00000000000..cad9935ae30 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_SVE_EXPORT_H +#define MCA_OP_SVE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is sve-component-specific cached information. We + tend to use this scheme (caching information on the sve + component itself) instead of lots of individual global + variables for the component. The following data fields are + sves; replace them with whatever is relevant for your + component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_sve_component_t; + +/** + * Globally exported variable. Note that it is a *sve* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_sve_component_t + mca_op_sve_component; + +END_C_DECLS + +#endif /* MCA_OP_SVE_EXPORT_H */ diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/arm_sve_op/op_sve_component.c new file mode 100644 index 00000000000..c60df1a56d9 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve_component.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "sve" component source code. + * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/arm_sve_op/op_sve.h" +#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" + +static int sve_component_open(void); +static int sve_component_close(void); +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority); +static int sve_component_register(void); + +ompi_op_sve_component_t mca_op_sve_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "sve", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = sve_component_open, + .mca_close_component = sve_component_close, + .mca_register_component_params = sve_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = sve_component_init_query, + .opc_op_query = sve_component_op_query, + }, +}; + +/* + * Component open + */ +static int sve_component_open(void) +{ + + /* A first level check to see if sve is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). 
+ */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int sve_component_close(void) +{ + + /* If sve was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int sve_component_register(void) +{ + + /* Additionally, since this component is simulating hardware, + let's make MCA params that determine whethere a) the hardware + is available, and b) whether double precision floating point + types are supported. This allows you to change the behavior of + this component at run-time (by setting these MCA params at + run-time), simulating different kinds of hardware. */ + mca_op_sve_component.hardware_available = false; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "hardware_available", + "Whether the hardware is available or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.hardware_available); + + mca_op_sve_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_sve_component.hardware_available && !enable_mpi_thread_multiple) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t); + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + int i=0; + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + /* Corresponds to MPI_MAX */ + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_MIN: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_SUM: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_PROD: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; + 
OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_BOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_BAND: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_BXOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LXOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MAXLOC: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MINLOC: + module= NULL; + break; + default: + module= NULL; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *sve* component pointer + (vs. a *base* component pointer -- where an *sve* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. 
*/ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c new file mode 100644 index 00000000000..c01c663e0fb --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/arm_sve_op/op_sve.h" +#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). + * + * Support ops: max, min, for signed/unsigned 8,16,32,64 + * sum, for integer 8,16,32,64 + * + */ +#define OP_SVE_FUNC(name, type_sign, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + printf("op: %s %s \n ", #op, #type_size);\ + int size = *count/step; \ + int i; \ + int round = size*64; \ + svbool_t Pg = svptrue_b##type_size(); \ +} + +/* + * This macro is for bit-wise operations (out op in). 
+ * + * Support ops: or, xor, and of 512 bits (representing integer data) + * + */ +#define OP_SVE_BIT_FUNC(name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_FLOAT_FUNC(op) \ + static void ompi_op_sve_2buff_##op##_float(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 16; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_DOUBLE_FUNC(op) \ + static void ompi_op_sve_2buff_##op##_double(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 8; \ + int size = *count/step; \ + int i; \ +} + + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC(max, i, 8, int8_t, max) + OP_SVE_FUNC(max, u, 8, uint8_t, max) + OP_SVE_FUNC(max, i, 16, int16_t, max) + OP_SVE_FUNC(max, u, 16, uint16_t, max) + OP_SVE_FUNC(max, i, 32, int32_t, max) + OP_SVE_FUNC(max, u, 32, uint32_t, max) + OP_SVE_FUNC(max, i, 64, int64_t, max) + OP_SVE_FUNC(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(max) + OP_SVE_DOUBLE_FUNC(max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC(min, i, 8, int8_t, min) + OP_SVE_FUNC(min, u, 8, uint8_t, min) + OP_SVE_FUNC(min, i, 16, int16_t, min) + OP_SVE_FUNC(min, u, 16, uint16_t, min) + OP_SVE_FUNC(min, i, 32, int32_t, min) + OP_SVE_FUNC(min, u, 32, uint32_t, min) + OP_SVE_FUNC(min, i, 64, int64_t, min) + OP_SVE_FUNC(min, u, 64, 
uint64_t, min) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(min) + OP_SVE_DOUBLE_FUNC(min) + +/************************************************************************* + * Sum + ************************************************************************/ + OP_SVE_FUNC(sum, i, 8, int8_t, add) + OP_SVE_FUNC(sum, i, 8, uint8_t, add) + OP_SVE_FUNC(sum, i, 16, int16_t, add) + OP_SVE_FUNC(sum, i, 16, uint16_t, add) + OP_SVE_FUNC(sum, i, 32, int32_t, add) + OP_SVE_FUNC(sum, i, 32, uint32_t, add) + OP_SVE_FUNC(sum, i, 64, int64_t, add) + OP_SVE_FUNC(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(add) + OP_SVE_DOUBLE_FUNC(add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC(prod, i, 16, int16_t, mullo) + OP_SVE_FUNC(prod, i, 16, uint16_t, mullo) + OP_SVE_FUNC(prod, i, 32, int32_t, mullo) + OP_SVE_FUNC(prod, i ,32, uint32_t, mullo) + OP_SVE_FUNC(prod, i, 64, int64_t, mullo) + OP_SVE_FUNC(prod, i, 64, uint64_t, mullo) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(mul) + OP_SVE_DOUBLE_FUNC(mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_BIT_FUNC(band, 8, int8_t, and) + OP_SVE_BIT_FUNC(band, 8, uint8_t, and) + OP_SVE_BIT_FUNC(band, 16, int16_t, and) + OP_SVE_BIT_FUNC(band, 16, uint16_t, and) + OP_SVE_BIT_FUNC(band, 32, int32_t, and) + OP_SVE_BIT_FUNC(band, 32, uint32_t, and) + OP_SVE_BIT_FUNC(band, 64, int64_t, and) + OP_SVE_BIT_FUNC(band, 64, uint64_t, and) + + OP_SVE_FLOAT_FUNC(and) + OP_SVE_DOUBLE_FUNC(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_BIT_FUNC(bor, 8, int8_t, or) + OP_SVE_BIT_FUNC(bor, 8, uint8_t, or) + OP_SVE_BIT_FUNC(bor, 16, int16_t, or) 
+ OP_SVE_BIT_FUNC(bor, 16, uint16_t, or) + OP_SVE_BIT_FUNC(bor, 32, int32_t, or) + OP_SVE_BIT_FUNC(bor, 32, uint32_t, or) + OP_SVE_BIT_FUNC(bor, 64, int64_t, or) + OP_SVE_BIT_FUNC(bor, 64, uint64_t, or) + + OP_SVE_FLOAT_FUNC(or) + OP_SVE_DOUBLE_FUNC(or) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + OP_SVE_BIT_FUNC(bxor, 8, int8_t, xor) + OP_SVE_BIT_FUNC(bxor, 8, uint8_t, xor) + OP_SVE_BIT_FUNC(bxor, 16, int16_t, xor) + OP_SVE_BIT_FUNC(bxor, 16, uint16_t, xor) + OP_SVE_BIT_FUNC(bxor, 32, int32_t, xor) + OP_SVE_BIT_FUNC(bxor, 32, uint32_t, xor) + OP_SVE_BIT_FUNC(bxor, 64, int64_t, xor) + OP_SVE_BIT_FUNC(bxor, 64, uint64_t, xor) + + OP_SVE_FLOAT_FUNC(xor) + OP_SVE_DOUBLE_FUNC(xor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_SVE_FUNC_3BUFF(name, type_sign, type_size, type, op)\ + static void ompi_op_sve_3buff_##name##_##type(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_BIT_FUNC_3BUFF(name, type_size, type, op) \ + static void ompi_op_sve_3buff_##op##_##type(void *in1, void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + int size = *count/step; \ + int i; \ +} + +#define OP_SVE_FLOAT_FUNC_3BUFF(op) \ + static void ompi_op_sve_3buff_##op##_float(void *in1, void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 16; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_DOUBLE_FUNC_3BUFF(op) \ + 
static void ompi_op_sve_3buff_##op##_double(void *in1, void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 8; \ + int size = *count/step; \ + int i; \ +} + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC_3BUFF(max, i, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, u, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, i, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, u, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, i, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, u, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, i, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(max) + OP_SVE_DOUBLE_FUNC_3BUFF(max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC_3BUFF(min, i, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, u, 8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, i, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, u, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, i, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, u, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, i, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, u, 64, uint64_t, min) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(min) + OP_SVE_DOUBLE_FUNC_3BUFF(min) + +/************************************************************************* + * Sum + *************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, i, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 64, 
uint64_t, add) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(add) + OP_SVE_DOUBLE_FUNC_3BUFF(add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC_3BUFF(prod, i, 16, int16_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 16, uint16_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 32, int32_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i ,32, uint32_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 64, int64_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 64, uint64_t, mullo) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(mul) + OP_SVE_DOUBLE_FUNC_3BUFF(mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_BIT_FUNC_3BUFF(band, 8, int8_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 8, uint8_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 16, int16_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 16, uint16_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 32, int32_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 32, uint32_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 64, int64_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 64, uint64_t, and) + + OP_SVE_FLOAT_FUNC_3BUFF(and) + OP_SVE_DOUBLE_FUNC_3BUFF(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_BIT_FUNC_3BUFF(bor, 8, int8_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 8, uint8_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 16, int16_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 16, uint16_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 32, int32_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 32, uint32_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 64, int64_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 64, uint64_t, or) + + OP_SVE_FLOAT_FUNC_3BUFF(or) + OP_SVE_DOUBLE_FUNC_3BUFF(or) + +/************************************************************************* + * Bitwise XOR + 
*************************************************************************/ + OP_SVE_BIT_FUNC_3BUFF(bxor, 8, int8_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 8, uint8_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 16, int16_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 32, int32_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 64, int64_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor) + + OP_SVE_FLOAT_FUNC_3BUFF(xor) + OP_SVE_DOUBLE_FUNC_3BUFF(xor) + + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_sve_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_sve_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_sve_##ftype##_##name##_uint64_t + + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_double + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL, \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +#define C_INTEGER_PROD(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + 
[OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_sve_##ftype##_##name##_uint64_t + + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(add, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_PROD(prod, 2buff), + FLOATING_POINT(mul, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + impementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, 
+ +}; + +ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(add, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_PROD(prod, 3buff), + FLOATING_POINT(mul, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(and, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(or, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(xor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + impementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/arm_sve_op/op_sve_functions.h new file mode 100644 index 00000000000..0c621788842 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/arm_sve_op/op_sve.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c new file mode 100644 index 00000000000..e33db456a03 --- /dev/null +++ b/test/datatype/Reduce_local_float.c @@ -0,0 +1,290 @@ +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 1024*1024 + +float in_float[ARRAYSIZE]; +float inout_float[ARRAYSIZE]; + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; + +uint16_t in_uint16[ARRAYSIZE]; +uint16_t inout_uint16[ARRAYSIZE]; + +uint32_t in_uint32[ARRAYSIZE]; +uint32_t inout_uint32[ARRAYSIZE]; + +uint64_t in_uint64[ARRAYSIZE]; +uint64_t inout_uint64[ARRAYSIZE]; + +double in_double[ARRAYSIZE]; +double inout_double[ARRAYSIZE]; + + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i Date: Thu, 31 Oct 2019 18:41:02 -0400 Subject: [PATCH 02/13] sve ops working Signed-off-by: dong zhong --- ompi/mca/op/arm_sve_op/op_sve.h | 3 + ompi/mca/op/arm_sve_op/op_sve_component.c | 3 + ompi/mca/op/arm_sve_op/op_sve_functions.c | 513 +++++++++++----------- ompi/mca/op/arm_sve_op/op_sve_functions.h | 3 + test/datatype/Reduce_local_float.c | 2 +- 5 files changed, 270 insertions(+), 254 deletions(-) diff --git a/ompi/mca/op/arm_sve_op/op_sve.h 
b/ompi/mca/op/arm_sve_op/op_sve.h index cad9935ae30..95b19b95e04 100644 --- a/ompi/mca/op/arm_sve_op/op_sve.h +++ b/ompi/mca/op/arm_sve_op/op_sve.h @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/arm_sve_op/op_sve_component.c index c60df1a56d9..e08bbae35d9 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_component.c +++ b/ompi/mca/op/arm_sve_op/op_sve_component.c @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c index c01c663e0fb..3233ac66e44 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,323 +36,331 @@ * * This macro is for (out op in). 
* - * Support ops: max, min, for signed/unsigned 8,16,32,64 - * sum, for integer 8,16,32,64 - * */ -#define OP_SVE_FUNC(name, type_sign, type_size, type, op) \ - static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ +#define OP_SVE_FUNC(name, type_name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type_name(void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - int step = 512 / type_size; \ - printf("op: %s %s \n ", #op, #type_size);\ - int size = *count/step; \ - int i; \ - int round = size*64; \ + uint64_t i; \ + uint64_t step = 0; \ svbool_t Pg = svptrue_b##type_size(); \ + switch(type_size) { \ + case 8: \ + step = svcntb(); \ + break; \ + case 16: \ + step = svcnth(); \ + break; \ + case 32: \ + step = svcntw(); \ + break; \ + case 64: \ + step = svcntd(); \ + } \ + uint64_t round = *count; \ + uint64_t remain = *count % step; \ + for(i=0; i< round; i=i+step) \ + { \ + sv##type vsrc = svld1(Pg, (type *)in+i); \ + sv##type vdst = svld1(Pg, (type *)out+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ + \ + if (remain !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, remain); \ + sv##type vsrc = svld1(Pg, (type *)in+i); \ + sv##type vdst = svld1(Pg, (type *)out+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ } -/* - * This macro is for bit-wise operations (out op in). 
- * - * Support ops: or, xor, and of 512 bits (representing integer data) - * - */ -#define OP_SVE_BIT_FUNC(name, type_size, type, op) \ - static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 512 / type_size; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ -} - -#define OP_SVE_FLOAT_FUNC(op) \ - static void ompi_op_sve_2buff_##op##_float(void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 16; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ -} - -#define OP_SVE_DOUBLE_FUNC(op) \ - static void ompi_op_sve_2buff_##op##_double(void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 8; \ - int size = *count/step; \ - int i; \ -} - - /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC(max, i, 8, int8_t, max) - OP_SVE_FUNC(max, u, 8, uint8_t, max) - OP_SVE_FUNC(max, i, 16, int16_t, max) - OP_SVE_FUNC(max, u, 16, uint16_t, max) - OP_SVE_FUNC(max, i, 32, int32_t, max) - OP_SVE_FUNC(max, u, 32, uint32_t, max) - OP_SVE_FUNC(max, i, 64, int64_t, max) - OP_SVE_FUNC(max, u, 64, uint64_t, max) + OP_SVE_FUNC(max, int8_t , 8, int8_t, max) + OP_SVE_FUNC(max, uint8_t, 8, uint8_t, max) + OP_SVE_FUNC(max, int16_t, 16, int16_t, max) + OP_SVE_FUNC(max, uint16_t, 16, uint16_t, max) + OP_SVE_FUNC(max, int32_t, 32, int32_t, max) + OP_SVE_FUNC(max, uint32_t, 32, uint32_t, max) + OP_SVE_FUNC(max, int64_t, 64, int64_t, max) + OP_SVE_FUNC(max, uint64_t, 64, uint64_t, max) /* Floating point */ - OP_SVE_FLOAT_FUNC(max) - OP_SVE_DOUBLE_FUNC(max) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(max, short_float, 16, float16_t, max) +#elif 
defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(max, short_float, 16, float16_t, max) +#endif + OP_SVE_FUNC(max, float, 32, float32_t, max) + OP_SVE_FUNC(max, double, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC(min, i, 8, int8_t, min) - OP_SVE_FUNC(min, u, 8, uint8_t, min) - OP_SVE_FUNC(min, i, 16, int16_t, min) - OP_SVE_FUNC(min, u, 16, uint16_t, min) - OP_SVE_FUNC(min, i, 32, int32_t, min) - OP_SVE_FUNC(min, u, 32, uint32_t, min) - OP_SVE_FUNC(min, i, 64, int64_t, min) - OP_SVE_FUNC(min, u, 64, uint64_t, min) + OP_SVE_FUNC(min, int8_t , 8, int8_t, min) + OP_SVE_FUNC(min, uint8_t, 8, uint8_t, min) + OP_SVE_FUNC(min, int16_t, 16, int16_t, min) + OP_SVE_FUNC(min, uint16_t, 16, uint16_t, min) + OP_SVE_FUNC(min, int32_t, 32, int32_t, min) + OP_SVE_FUNC(min, uint32_t, 32, uint32_t, min) + OP_SVE_FUNC(min, int64_t, 64, int64_t, min) + OP_SVE_FUNC(min, uint64_t, 64, uint64_t, min) /* Floating point */ - OP_SVE_FLOAT_FUNC(min) - OP_SVE_DOUBLE_FUNC(min) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(min, short_float, 16, float16_t, min) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(min, short_float, 16, float16_t, min) +#endif + OP_SVE_FUNC(min, float, 32, float32_t, min) + OP_SVE_FUNC(min, double, 64, float64_t, min) /************************************************************************* * Sum ************************************************************************/ - OP_SVE_FUNC(sum, i, 8, int8_t, add) - OP_SVE_FUNC(sum, i, 8, uint8_t, add) - OP_SVE_FUNC(sum, i, 16, int16_t, add) - OP_SVE_FUNC(sum, i, 16, uint16_t, add) - OP_SVE_FUNC(sum, i, 32, int32_t, add) - OP_SVE_FUNC(sum, i, 32, uint32_t, add) - OP_SVE_FUNC(sum, i, 64, int64_t, add) - OP_SVE_FUNC(sum, i, 64, uint64_t, add) - + OP_SVE_FUNC(sum, int8_t , 8, int8_t, add) + OP_SVE_FUNC(sum, uint8_t, 8, uint8_t, add) + OP_SVE_FUNC(sum, int16_t, 16, int16_t, add) + 
OP_SVE_FUNC(sum, uint16_t, 16, uint16_t, add) + OP_SVE_FUNC(sum, int32_t, 32, int32_t, add) + OP_SVE_FUNC(sum, uint32_t, 32, uint32_t, add) + OP_SVE_FUNC(sum, int64_t, 64, int64_t, add) + OP_SVE_FUNC(sum, uint64_t, 64, uint64_t, add) /* Floating point */ - OP_SVE_FLOAT_FUNC(add) - OP_SVE_DOUBLE_FUNC(add) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(sum, short_float, 16, float16_t, add) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(sum, short_float, 16, float16_t, add) +#endif + OP_SVE_FUNC(sum, float, 32, float32_t, add) + OP_SVE_FUNC(sum, double, 64, float64_t, add) + /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC(prod, i, 16, int16_t, mullo) - OP_SVE_FUNC(prod, i, 16, uint16_t, mullo) - OP_SVE_FUNC(prod, i, 32, int32_t, mullo) - OP_SVE_FUNC(prod, i ,32, uint32_t, mullo) - OP_SVE_FUNC(prod, i, 64, int64_t, mullo) - OP_SVE_FUNC(prod, i, 64, uint64_t, mullo) + OP_SVE_FUNC(prod, int8_t, 8, int8_t, mul) + OP_SVE_FUNC(prod, uint8_t, 8, uint8_t, mul) + OP_SVE_FUNC(prod, int16_t, 16, int16_t, mul) + OP_SVE_FUNC(prod, uint16_t, 16, uint16_t, mul) + OP_SVE_FUNC(prod, int32_t, 32, int32_t, mul) + OP_SVE_FUNC(prod, uint32_t, 32, uint32_t, mul) + OP_SVE_FUNC(prod, int64_t, 64, int64_t, mul) + OP_SVE_FUNC(prod, uint64_t, 64, uint64_t, mul) /* Floating point */ - OP_SVE_FLOAT_FUNC(mul) - OP_SVE_DOUBLE_FUNC(mul) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +#endif + OP_SVE_FUNC(prod, float, 32, float32_t, mul) + OP_SVE_FUNC(prod, double, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_BIT_FUNC(band, 8, int8_t, and) - OP_SVE_BIT_FUNC(band, 8, uint8_t, and) - 
OP_SVE_BIT_FUNC(band, 16, int16_t, and) - OP_SVE_BIT_FUNC(band, 16, uint16_t, and) - OP_SVE_BIT_FUNC(band, 32, int32_t, and) - OP_SVE_BIT_FUNC(band, 32, uint32_t, and) - OP_SVE_BIT_FUNC(band, 64, int64_t, and) - OP_SVE_BIT_FUNC(band, 64, uint64_t, and) - - OP_SVE_FLOAT_FUNC(and) - OP_SVE_DOUBLE_FUNC(and) + OP_SVE_FUNC(band, int8_t, 8, int8_t, and) + OP_SVE_FUNC(band, uint8_t, 8, uint8_t, and) + OP_SVE_FUNC(band, int16_t, 16, int16_t, and) + OP_SVE_FUNC(band, uint16_t, 16, uint16_t, and) + OP_SVE_FUNC(band, int32_t, 32, int32_t, and) + OP_SVE_FUNC(band, uint32_t, 32, uint32_t, and) + OP_SVE_FUNC(band, int64_t, 64, int64_t, and) + OP_SVE_FUNC(band, uint64_t, 64, uint64_t, and) /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_BIT_FUNC(bor, 8, int8_t, or) - OP_SVE_BIT_FUNC(bor, 8, uint8_t, or) - OP_SVE_BIT_FUNC(bor, 16, int16_t, or) - OP_SVE_BIT_FUNC(bor, 16, uint16_t, or) - OP_SVE_BIT_FUNC(bor, 32, int32_t, or) - OP_SVE_BIT_FUNC(bor, 32, uint32_t, or) - OP_SVE_BIT_FUNC(bor, 64, int64_t, or) - OP_SVE_BIT_FUNC(bor, 64, uint64_t, or) - - OP_SVE_FLOAT_FUNC(or) - OP_SVE_DOUBLE_FUNC(or) + OP_SVE_FUNC(bor, int8_t, 8, int8_t, orr) + OP_SVE_FUNC(bor, uint8_t, 8, uint8_t, orr) + OP_SVE_FUNC(bor, int16_t, 16, int16_t, orr) + OP_SVE_FUNC(bor, uint16_t, 16, uint16_t, orr) + OP_SVE_FUNC(bor, int32_t, 32, int32_t, orr) + OP_SVE_FUNC(bor, uint32_t, 32, uint32_t, orr) + OP_SVE_FUNC(bor, int64_t, 64, int64_t, orr) + OP_SVE_FUNC(bor, uint64_t, 64, uint64_t, orr) /************************************************************************* * Bitwise XOR *************************************************************************/ - OP_SVE_BIT_FUNC(bxor, 8, int8_t, xor) - OP_SVE_BIT_FUNC(bxor, 8, uint8_t, xor) - OP_SVE_BIT_FUNC(bxor, 16, int16_t, xor) - OP_SVE_BIT_FUNC(bxor, 16, uint16_t, xor) - OP_SVE_BIT_FUNC(bxor, 32, int32_t, xor) - OP_SVE_BIT_FUNC(bxor, 32, uint32_t, 
xor) - OP_SVE_BIT_FUNC(bxor, 64, int64_t, xor) - OP_SVE_BIT_FUNC(bxor, 64, uint64_t, xor) - - OP_SVE_FLOAT_FUNC(xor) - OP_SVE_DOUBLE_FUNC(xor) + OP_SVE_FUNC(bxor, int8_t, 8, int8_t, eor) + OP_SVE_FUNC(bxor, uint8_t, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, int16_t, 16, int16_t, eor) + OP_SVE_FUNC(bxor, uint16_t, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, int32_t, 32, int32_t, eor) + OP_SVE_FUNC(bxor, uint32_t, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, int64_t, 64, int64_t, eor) + OP_SVE_FUNC(bxor, uint64_t, 64, uint64_t, eor) + /* * This is a three buffer (2 input and 1 output) version of the reduction * routines, needed for some optimizations. */ -#define OP_SVE_FUNC_3BUFF(name, type_sign, type_size, type, op)\ - static void ompi_op_sve_3buff_##name##_##type(void * restrict in1, \ +#define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ + static void ompi_op_sve_3buff_##name##_##type_name(void * restrict in1, \ void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - int step = 512 / type_size; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ -} - -#define OP_SVE_BIT_FUNC_3BUFF(name, type_size, type, op) \ - static void ompi_op_sve_3buff_##op##_##type(void *in1, void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 512 / type_size; \ - int size = *count/step; \ - int i; \ -} - -#define OP_SVE_FLOAT_FUNC_3BUFF(op) \ - static void ompi_op_sve_3buff_##op##_float(void *in1, void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 16; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ + uint64_t i; \ + uint64_t step = 0; \ + svbool_t Pg = svptrue_b##type_size(); \ + switch(type_size) { \ + case 8: \ + step = svcntb(); \ + break; \ + case 16: \ + step = svcnth(); \ + break; \ + case 32: \ + 
step = svcntw(); \ + break; \ + case 64: \ + step = svcntd(); \ + } \ + uint64_t round = *count; \ + uint64_t remain = *count % step; \ + for(i=0; i< round; i=i+step) \ + { \ + sv##type vsrc = svld1(Pg, (type *)in1+i); \ + sv##type vdst = svld1(Pg, (type *)in2+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ + if (remain !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, remain); \ + sv##type vsrc = svld1(Pg, (type *)in1+i); \ + sv##type vdst = svld1(Pg, (type *)in2+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ } -#define OP_SVE_DOUBLE_FUNC_3BUFF(op) \ - static void ompi_op_sve_3buff_##op##_double(void *in1, void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 8; \ - int size = *count/step; \ - int i; \ -} /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC_3BUFF(max, i, 8, int8_t, max) - OP_SVE_FUNC_3BUFF(max, u, 8, uint8_t, max) - OP_SVE_FUNC_3BUFF(max, i, 16, int16_t, max) - OP_SVE_FUNC_3BUFF(max, u, 16, uint16_t, max) - OP_SVE_FUNC_3BUFF(max, i, 32, int32_t, max) - OP_SVE_FUNC_3BUFF(max, u, 32, uint32_t, max) - OP_SVE_FUNC_3BUFF(max, i, 64, int64_t, max) - OP_SVE_FUNC_3BUFF(max, u, 64, uint64_t, max) + OP_SVE_FUNC_3BUFF(max, int8_t , 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, uint8_t, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, int16_t, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, uint16_t, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, int32_t, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, uint32_t, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, int64_t, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, uint64_t, 64, uint64_t, max) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(max) - OP_SVE_DOUBLE_FUNC_3BUFF(max) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) +#elif 
defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) +#endif + OP_SVE_FUNC_3BUFF(max, float, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, double, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC_3BUFF(min, i, 8, int8_t, min) - OP_SVE_FUNC_3BUFF(min, u, 8, uint8_t, min) - OP_SVE_FUNC_3BUFF(min, i, 16, int16_t, min) - OP_SVE_FUNC_3BUFF(min, u, 16, uint16_t, min) - OP_SVE_FUNC_3BUFF(min, i, 32, int32_t, min) - OP_SVE_FUNC_3BUFF(min, u, 32, uint32_t, min) - OP_SVE_FUNC_3BUFF(min, i, 64, int64_t, min) - OP_SVE_FUNC_3BUFF(min, u, 64, uint64_t, min) + OP_SVE_FUNC_3BUFF(min, int8_t , 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, uint8_t, 8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, int16_t, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, uint16_t, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, int32_t, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, uint32_t, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, int64_t, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, uint64_t, 64, uint64_t, min) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(min) - OP_SVE_DOUBLE_FUNC_3BUFF(min) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) +#endif + OP_SVE_FUNC_3BUFF(min, float, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, double, 64, float64_t, min) /************************************************************************* * Sum *************************************************************************/ - OP_SVE_FUNC_3BUFF(sum, i, 8, int8_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 8, uint8_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 16, int16_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 16, uint16_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 32, int32_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 32, uint32_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 64, 
int64_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 64, uint64_t, add) - + OP_SVE_FUNC_3BUFF(sum, int8_t , 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, uint8_t, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, int16_t, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, uint16_t, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, int32_t, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, uint32_t, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, int64_t, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, uint64_t, 64, uint64_t, add) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(add) - OP_SVE_DOUBLE_FUNC_3BUFF(add) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) +#endif + OP_SVE_FUNC_3BUFF(sum, float, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, double, 64, float64_t, add) /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC_3BUFF(prod, i, 16, int16_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 16, uint16_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 32, int32_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i ,32, uint32_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 64, int64_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 64, uint64_t, mullo) + OP_SVE_FUNC_3BUFF(prod, int8_t, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint8_t, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, int16_t, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint16_t, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, int32_t, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint32_t, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, int64_t, 64, int64_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint64_t, 64, uint64_t, mul) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(mul) - OP_SVE_DOUBLE_FUNC_3BUFF(mul) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(prod, short_float, 16, float16_t, mul) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(prod, 
short_float, 16, float16_t, mul) +#endif + OP_SVE_FUNC_3BUFF(prod, float, 32, float32_t, mul) + OP_SVE_FUNC_3BUFF(prod, double, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_BIT_FUNC_3BUFF(band, 8, int8_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 8, uint8_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 16, int16_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 16, uint16_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 32, int32_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 32, uint32_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 64, int64_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 64, uint64_t, and) - - OP_SVE_FLOAT_FUNC_3BUFF(and) - OP_SVE_DOUBLE_FUNC_3BUFF(and) + OP_SVE_FUNC_3BUFF(band, int8_t, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, uint8_t, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, int16_t, 16, int16_t, and) + OP_SVE_FUNC_3BUFF(band, uint16_t, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, int32_t, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, uint32_t, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, int64_t, 64, int64_t, and) + OP_SVE_FUNC_3BUFF(band, uint64_t, 64, uint64_t, and) /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_BIT_FUNC_3BUFF(bor, 8, int8_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 8, uint8_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 16, int16_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 16, uint16_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 32, int32_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 32, uint32_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 64, int64_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 64, uint64_t, or) - - OP_SVE_FLOAT_FUNC_3BUFF(or) - OP_SVE_DOUBLE_FUNC_3BUFF(or) - -/************************************************************************* + OP_SVE_FUNC_3BUFF(bor, int8_t, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint8_t, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, int16_t, 16, 
int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint16_t, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, int32_t, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint32_t, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, int64_t, 64, int64_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint64_t, 64, uint64_t, orr) + + /************************************************************************* * Bitwise XOR *************************************************************************/ - OP_SVE_BIT_FUNC_3BUFF(bxor, 8, int8_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 8, uint8_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 16, int16_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 32, int32_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 64, int64_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor) - - OP_SVE_FLOAT_FUNC_3BUFF(xor) - OP_SVE_DOUBLE_FUNC_3BUFF(xor) - + OP_SVE_FUNC_3BUFF(bxor, int8_t, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint8_t, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, int16_t, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint16_t, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, int32_t, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint32_t, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, int64_t, 64, int64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint64_t, 64, uint64_t, eor) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ @@ -364,23 +375,19 @@ /** Floating point, including all the Fortran reals *********************/ +#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) +#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_short_float +#else +#define SHORT_FLOAT(name, ftype) NULL +#endif #define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float #define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_double -#define FLOATING_POINT(name, ftype) \ - [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL, \ - [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ 
+#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) -#define C_INTEGER_PROD(name, ftype) \ - [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ - [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ - [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ - [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ - [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ - [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_sve_##ftype##_##name##_uint64_t - - /* * MPI_OP_NULL * All types @@ -411,12 +418,12 @@ ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMP /* Corresponds to MPI_SUM */ [OMPI_OP_BASE_FORTRAN_SUM] = { C_INTEGER(sum, 2buff), - FLOATING_POINT(add, 2buff), + FLOATING_POINT(sum, 2buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { - C_INTEGER_PROD(prod, 2buff), - FLOATING_POINT(mul, 2buff), + C_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] = { @@ -474,12 +481,12 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN /* Corresponds to MPI_SUM */ [OMPI_OP_BASE_FORTRAN_SUM] = { C_INTEGER(sum, 3buff), - FLOATING_POINT(add, 3buff), + FLOATING_POINT(sum, 3buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { - C_INTEGER_PROD(prod, 3buff), - FLOATING_POINT(mul, 3buff), + C_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] ={ @@ -487,7 +494,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN }, /* Corresponds to MPI_BAND */ [OMPI_OP_BASE_FORTRAN_BAND] = { - C_INTEGER(and, 3buff), + C_INTEGER(band, 3buff), }, /* Corresponds to MPI_LOR */ [OMPI_OP_BASE_FORTRAN_LOR] = { @@ -495,7 
+502,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN }, /* Corresponds to MPI_BOR */ [OMPI_OP_BASE_FORTRAN_BOR] = { - C_INTEGER(or, 3buff), + C_INTEGER(bor, 3buff), }, /* Corresponds to MPI_LXOR */ [OMPI_OP_BASE_FORTRAN_LXOR] = { @@ -503,7 +510,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN }, /* Corresponds to MPI_BXOR */ [OMPI_OP_BASE_FORTRAN_BXOR] = { - C_INTEGER(xor, 3buff), + C_INTEGER(bxor, 3buff), }, /* Corresponds to MPI_REPLACE */ [OMPI_OP_BASE_FORTRAN_REPLACE] = { diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/arm_sve_op/op_sve_functions.h index 0c621788842..6c7b4c7df71 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.h +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.h @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c index e33db456a03..73ef7467b02 100644 --- a/test/datatype/Reduce_local_float.c +++ b/test/datatype/Reduce_local_float.c @@ -50,7 +50,7 @@ int main(int argc, char **argv) { in_double[i] = 10.0+1; inout_double[i] = 1.0+2; - in_uint8[i] = 4; + in_uint8[i] = 5; inout_uint8[i] = 3; in_uint16[i] = 2; From 075bb80b95a525faf48fcbcdb43a33a45d6fd142 Mon Sep 17 00:00:00 2001 From: dong zhong Date: Mon, 4 Nov 2019 11:17:00 -0500 Subject: [PATCH 03/13] Add readme and install scripts Signed-off-by: dong zhong --- ARM_SVE_README | 30 ++++++++++++++++++++++++++++++ arm_install.sh | 11 +++++++++++ 2 files changed, 41 insertions(+) create mode 100644 ARM_SVE_README create mode 100644 arm_install.sh diff --git a/ARM_SVE_README b/ARM_SVE_README new file mode 100644 index 00000000000..a4670d9a8df --- /dev/null +++ b/ARM_SVE_README @@ -0,0 +1,30 @@ +Configuration and installation 
details please check script arm_install.sh
+
+Run test: (Test example takes 4 args)
+arg1 : elements count for operation
+arg2 : elements type could be : i (integer), f (float), d (double)
+arg3: type size in bits, only applies when you set arg2 to i. eg: i 8 will be converted to int8; i 16 to int16
+arg4: operation type. Could be : max, min, sum , mul, band , bor, bxor
+If you want to use the SVE module for MPI ops, you need to pass mca params as : -mca op sve -mca op_sve_hardware_available 1
+=======
+Example for test
+$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 33 i 8 min
+
+If you don't need armie you can remove the ARMIE part in the command line as :
+$PATH_To_BIN//mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 /ompi/test/datatype/Reduce_local_float 33 i 8 min
+
+How do we evaluate the performance?
+======
+Logical:
+
+Start_time;
+MPI_reduce_local(...);
+End_time;
+
+Reduce_time = End_time - Start_time;
+
+Possible issues (this happened on thunder2 machine):
+======
+Reason for "-mca pml ob1" : on Arm machine the default pml module will cause a problem with armie (instruction not supported, I don't know why), but with ob1 it works. 
+ + diff --git a/arm_install.sh b/arm_install.sh new file mode 100644 index 00000000000..0b8afef93b0 --- /dev/null +++ b/arm_install.sh @@ -0,0 +1,11 @@ +mkdir build + +./autogen.pl >/dev/null + +./configure --prefix=$PWD/build --enable-mpirun-prefix-by-default --enable-debug CC=armclang CFLAGS="-march=armv8-a+sve" CXX=armclang++ FC=armflang >/dev/null + +./config.status >/dev/null +make -j 128 install >/dev/null + +## compile the test code, test code under ompi/test/datapyte/Reduce_local_float.c +./build/bin/mpicc -g -O3 -march=armv8-a+sve -o ./test/datatype/Reduce_local_float ./test/datatype/Reduce_local_float.c From e7491d8e2cc97ad194ee05337d6cc0d404494096 Mon Sep 17 00:00:00 2001 From: dong zhong Date: Sun, 1 Dec 2019 17:30:35 -0500 Subject: [PATCH 04/13] improvement of code, add correctness_check scripts --- ompi/mca/op/arm_sve_op/op_sve.h | 2 +- ompi/mca/op/arm_sve_op/op_sve_component.c | 49 +- ompi/mca/op/arm_sve_op/op_sve_functions.c | 409 ++++++++--------- ompi/mca/op/arm_sve_op/op_sve_functions.h | 2 +- test/datatype/Reduce_local_float.c | 531 ++++++++++++++++++---- test/datatype/correctness_check.sh | 52 +++ 6 files changed, 701 insertions(+), 344 deletions(-) create mode 100644 test/datatype/correctness_check.sh diff --git a/ompi/mca/op/arm_sve_op/op_sve.h b/ompi/mca/op/arm_sve_op/op_sve.h index 95b19b95e04..21483d1fa56 100644 --- a/ompi/mca/op/arm_sve_op/op_sve.h +++ b/ompi/mca/op/arm_sve_op/op_sve.h @@ -3,7 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * - * Copyright (c) 2019 ARM Ltd. All rights reserved. + * Copyright (c) 2019 Arm Ltd. All rights reserved. 
* * $COPYRIGHT$ * diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/arm_sve_op/op_sve_component.c index e08bbae35d9..df8281e5ba1 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_component.c +++ b/ompi/mca/op/arm_sve_op/op_sve_component.c @@ -136,7 +136,7 @@ static int sve_component_register(void) static int sve_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple) { - if (mca_op_sve_component.hardware_available && !enable_mpi_thread_multiple) { + if (mca_op_sve_component.hardware_available) { return OMPI_SUCCESS; } return OMPI_ERR_NOT_SUPPORTED; @@ -160,59 +160,16 @@ static struct ompi_op_base_module_1_0_0_t * int i=0; switch (op->o_f_to_c_index) { case OMPI_OP_BASE_FORTRAN_MAX: - /* Corresponds to MPI_MAX */ - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_MIN: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_SUM: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_PROD: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_BOR: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; 
++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_BAND: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_BXOR: for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + module->opm_fns[i] = ompi_op_sve_functions[op->o_f_to_c_index][i]; OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[op->o_f_to_c_index][i]; OBJ_RETAIN(module); } break; diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c index 3233ac66e44..8b2281271ea 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -3,7 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * - * Copyright (c) 2019 ARM Ltd. All rights reserved. + * Copyright (c) 2019 Arm Ltd. All rights reserved. 
* * $COPYRIGHT$ * @@ -38,329 +38,306 @@ * */ #define OP_SVE_FUNC(name, type_name, type_size, type, op) \ - static void ompi_op_sve_2buff_##name##_##type_name(void *in, void *out, int *count, \ + static void ompi_op_sve_2buff_##name##_##type(void *_in, void *_out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - uint64_t i; \ - uint64_t step = 0; \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ svbool_t Pg = svptrue_b##type_size(); \ - switch(type_size) { \ - case 8: \ - step = svcntb(); \ - break; \ - case 16: \ - step = svcnth(); \ - break; \ - case 32: \ - step = svcntw(); \ - break; \ - case 64: \ - step = svcntd(); \ - } \ - uint64_t round = *count; \ - uint64_t remain = *count % step; \ - for(i=0; i< round; i=i+step) \ - { \ - sv##type vsrc = svld1(Pg, (type *)in+i); \ - sv##type vdst = svld1(Pg, (type *)out+i); \ - vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + in += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ } \ \ - if (remain !=0){ \ - Pg = svwhilelt_b##type_size##_u64(0, remain); \ - sv##type vsrc = svld1(Pg, (type *)in+i); \ - sv##type vdst = svld1(Pg, (type *)out+i); \ - vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ } \ } /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC(max, int8_t , 8, int8_t, max) - OP_SVE_FUNC(max, uint8_t, 8, uint8_t, max) - 
OP_SVE_FUNC(max, int16_t, 16, int16_t, max) - OP_SVE_FUNC(max, uint16_t, 16, uint16_t, max) - OP_SVE_FUNC(max, int32_t, 32, int32_t, max) - OP_SVE_FUNC(max, uint32_t, 32, uint32_t, max) - OP_SVE_FUNC(max, int64_t, 64, int64_t, max) - OP_SVE_FUNC(max, uint64_t, 64, uint64_t, max) + OP_SVE_FUNC(max, b, 8, int8_t, max) + OP_SVE_FUNC(max, b, 8, uint8_t, max) + OP_SVE_FUNC(max, h, 16, int16_t, max) + OP_SVE_FUNC(max, h, 16, uint16_t, max) + OP_SVE_FUNC(max, w, 32, int32_t, max) + OP_SVE_FUNC(max, w, 32, uint32_t, max) + OP_SVE_FUNC(max, d, 64, int64_t, max) + OP_SVE_FUNC(max, d, 64, uint64_t, max) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(max, short_float, 16, float16_t, max) + OP_SVE_FUNC(max, h, 16, float16_t, max) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(max, short_float, 16, float16_t, max) + OP_SVE_FUNC(max, h, 16, float16_t, max) #endif - OP_SVE_FUNC(max, float, 32, float32_t, max) - OP_SVE_FUNC(max, double, 64, float64_t, max) + OP_SVE_FUNC(max, w, 32, float32_t, max) + OP_SVE_FUNC(max, d, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC(min, int8_t , 8, int8_t, min) - OP_SVE_FUNC(min, uint8_t, 8, uint8_t, min) - OP_SVE_FUNC(min, int16_t, 16, int16_t, min) - OP_SVE_FUNC(min, uint16_t, 16, uint16_t, min) - OP_SVE_FUNC(min, int32_t, 32, int32_t, min) - OP_SVE_FUNC(min, uint32_t, 32, uint32_t, min) - OP_SVE_FUNC(min, int64_t, 64, int64_t, min) - OP_SVE_FUNC(min, uint64_t, 64, uint64_t, min) + OP_SVE_FUNC(min, b, 8, int8_t, min) + OP_SVE_FUNC(min, b, 8, uint8_t, min) + OP_SVE_FUNC(min, h, 16, int16_t, min) + OP_SVE_FUNC(min, h, 16, uint16_t, min) + OP_SVE_FUNC(min, w, 32, int32_t, min) + OP_SVE_FUNC(min, w, 32, uint32_t, min) + OP_SVE_FUNC(min, d, 64, int64_t, min) + OP_SVE_FUNC(min, d, 64, uint64_t, min) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(min, short_float, 16, 
float16_t, min) + OP_SVE_FUNC(min, h, 16, float16_t, min) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(min, short_float, 16, float16_t, min) + OP_SVE_FUNC(min, h, 16, float16_t, min) #endif - OP_SVE_FUNC(min, float, 32, float32_t, min) - OP_SVE_FUNC(min, double, 64, float64_t, min) + OP_SVE_FUNC(min, w, 32, float32_t, min) + OP_SVE_FUNC(min, d, 64, float64_t, min) -/************************************************************************* + /************************************************************************* * Sum ************************************************************************/ - OP_SVE_FUNC(sum, int8_t , 8, int8_t, add) - OP_SVE_FUNC(sum, uint8_t, 8, uint8_t, add) - OP_SVE_FUNC(sum, int16_t, 16, int16_t, add) - OP_SVE_FUNC(sum, uint16_t, 16, uint16_t, add) - OP_SVE_FUNC(sum, int32_t, 32, int32_t, add) - OP_SVE_FUNC(sum, uint32_t, 32, uint32_t, add) - OP_SVE_FUNC(sum, int64_t, 64, int64_t, add) - OP_SVE_FUNC(sum, uint64_t, 64, uint64_t, add) + OP_SVE_FUNC(sum, b, 8, int8_t, add) + OP_SVE_FUNC(sum, b, 8, uint8_t, add) + OP_SVE_FUNC(sum, h, 16, int16_t, add) + OP_SVE_FUNC(sum, h, 16, uint16_t, add) + OP_SVE_FUNC(sum, w, 32, int32_t, add) + OP_SVE_FUNC(sum, w, 32, uint32_t, add) + OP_SVE_FUNC(sum, d, 64, int64_t, add) + OP_SVE_FUNC(sum, d, 64, uint64_t, add) + /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC(sum, h, 16, float16_t, add) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC(sum, h, 16, float16_t, add) #endif - OP_SVE_FUNC(sum, float, 32, float32_t, add) - OP_SVE_FUNC(sum, double, 64, float64_t, add) - + OP_SVE_FUNC(sum, w, 32, float32_t, add) + OP_SVE_FUNC(sum, d, 64, float64_t, add) /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC(prod, int8_t, 8, int8_t, mul) - OP_SVE_FUNC(prod, uint8_t, 
8, uint8_t, mul) - OP_SVE_FUNC(prod, int16_t, 16, int16_t, mul) - OP_SVE_FUNC(prod, uint16_t, 16, uint16_t, mul) - OP_SVE_FUNC(prod, int32_t, 32, int32_t, mul) - OP_SVE_FUNC(prod, uint32_t, 32, uint32_t, mul) - OP_SVE_FUNC(prod, int64_t, 64, int64_t, mul) - OP_SVE_FUNC(prod, uint64_t, 64, uint64_t, mul) + OP_SVE_FUNC(prod, b, 8, int8_t, mul) + OP_SVE_FUNC(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC(prod, h, 16, int16_t, mul) + OP_SVE_FUNC(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC(prod, w, 32, int32_t, mul) + OP_SVE_FUNC(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC(prod, d, 64, int64_t, mul) +OP_SVE_FUNC(prod, d, 64, uint64_t, mul) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC(prod, h, 16, float16_t, mul) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC(prod, h, 16, float16_t, mul) #endif - OP_SVE_FUNC(prod, float, 32, float32_t, mul) - OP_SVE_FUNC(prod, double, 64, float64_t, mul) + OP_SVE_FUNC(prod, w, 32, float32_t, mul) +OP_SVE_FUNC(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_FUNC(band, int8_t, 8, int8_t, and) - OP_SVE_FUNC(band, uint8_t, 8, uint8_t, and) - OP_SVE_FUNC(band, int16_t, 16, int16_t, and) - OP_SVE_FUNC(band, uint16_t, 16, uint16_t, and) - OP_SVE_FUNC(band, int32_t, 32, int32_t, and) - OP_SVE_FUNC(band, uint32_t, 32, uint32_t, and) - OP_SVE_FUNC(band, int64_t, 64, int64_t, and) - OP_SVE_FUNC(band, uint64_t, 64, uint64_t, and) + OP_SVE_FUNC(band, b, 8, int8_t, and) + OP_SVE_FUNC(band, b, 8, uint8_t, and) + OP_SVE_FUNC(band, h, 16, int16_t, and) + OP_SVE_FUNC(band, h, 16, uint16_t, and) + OP_SVE_FUNC(band, w, 32, int32_t, and) + OP_SVE_FUNC(band, w, 32, uint32_t, and) + OP_SVE_FUNC(band, d, 64, int64_t, and) +OP_SVE_FUNC(band, d, 64, uint64_t, and) 
-/************************************************************************* + /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_FUNC(bor, int8_t, 8, int8_t, orr) - OP_SVE_FUNC(bor, uint8_t, 8, uint8_t, orr) - OP_SVE_FUNC(bor, int16_t, 16, int16_t, orr) - OP_SVE_FUNC(bor, uint16_t, 16, uint16_t, orr) - OP_SVE_FUNC(bor, int32_t, 32, int32_t, orr) - OP_SVE_FUNC(bor, uint32_t, 32, uint32_t, orr) - OP_SVE_FUNC(bor, int64_t, 64, int64_t, orr) - OP_SVE_FUNC(bor, uint64_t, 64, uint64_t, orr) + OP_SVE_FUNC(bor, b, 8, int8_t, orr) + OP_SVE_FUNC(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC(bor, h, 16, int16_t, orr) + OP_SVE_FUNC(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC(bor, w, 32, int32_t, orr) + OP_SVE_FUNC(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC(bor, d, 64, int64_t, orr) +OP_SVE_FUNC(bor, d, 64, uint64_t, orr) /************************************************************************* * Bitwise XOR *************************************************************************/ - OP_SVE_FUNC(bxor, int8_t, 8, int8_t, eor) - OP_SVE_FUNC(bxor, uint8_t, 8, uint8_t, eor) - OP_SVE_FUNC(bxor, int16_t, 16, int16_t, eor) - OP_SVE_FUNC(bxor, uint16_t, 16, uint16_t, eor) - OP_SVE_FUNC(bxor, int32_t, 32, int32_t, eor) - OP_SVE_FUNC(bxor, uint32_t, 32, uint32_t, eor) - OP_SVE_FUNC(bxor, int64_t, 64, int64_t, eor) - OP_SVE_FUNC(bxor, uint64_t, 64, uint64_t, eor) - + OP_SVE_FUNC(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) /* * This is a three buffer (2 input and 1 output) version of the reduction * routines, needed for some optimizations. 
*/ #define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ - static void ompi_op_sve_3buff_##name##_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_sve_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - uint64_t i; \ - uint64_t step = 0; \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ svbool_t Pg = svptrue_b##type_size(); \ - switch(type_size) { \ - case 8: \ - step = svcntb(); \ - break; \ - case 16: \ - step = svcnth(); \ - break; \ - case 32: \ - step = svcntw(); \ - break; \ - case 64: \ - step = svcntd(); \ - } \ - uint64_t round = *count; \ - uint64_t remain = *count % step; \ - for(i=0; i< round; i=i+step) \ - { \ - sv##type vsrc = svld1(Pg, (type *)in1+i); \ - sv##type vdst = svld1(Pg, (type *)in2+i); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ } \ - if (remain !=0){ \ - Pg = svwhilelt_b##type_size##_u64(0, remain); \ - sv##type vsrc = svld1(Pg, (type *)in1+i); \ - sv##type vdst = svld1(Pg, (type *)in2+i); \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + svst1(Pg, out,vdst); \ } \ } - /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC_3BUFF(max, int8_t , 8, int8_t, max) - 
OP_SVE_FUNC_3BUFF(max, uint8_t, 8, uint8_t, max) - OP_SVE_FUNC_3BUFF(max, int16_t, 16, int16_t, max) - OP_SVE_FUNC_3BUFF(max, uint16_t, 16, uint16_t, max) - OP_SVE_FUNC_3BUFF(max, int32_t, 32, int32_t, max) - OP_SVE_FUNC_3BUFF(max, uint32_t, 32, uint32_t, max) - OP_SVE_FUNC_3BUFF(max, int64_t, 64, int64_t, max) - OP_SVE_FUNC_3BUFF(max, uint64_t, 64, uint64_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) #endif - OP_SVE_FUNC_3BUFF(max, float, 32, float32_t, max) - OP_SVE_FUNC_3BUFF(max, double, 64, float64_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC_3BUFF(min, int8_t , 8, int8_t, min) - OP_SVE_FUNC_3BUFF(min, uint8_t, 8, uint8_t, min) - OP_SVE_FUNC_3BUFF(min, int16_t, 16, int16_t, min) - OP_SVE_FUNC_3BUFF(min, uint16_t, 16, uint16_t, min) - OP_SVE_FUNC_3BUFF(min, int32_t, 32, int32_t, min) - OP_SVE_FUNC_3BUFF(min, uint32_t, 32, uint32_t, min) - OP_SVE_FUNC_3BUFF(min, int64_t, 64, int64_t, min) - OP_SVE_FUNC_3BUFF(min, uint64_t, 64, uint64_t, min) + OP_SVE_FUNC_3BUFF(min, b, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, b, 8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, uint16_t, 
min) + OP_SVE_FUNC_3BUFF(min, w, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) #endif - OP_SVE_FUNC_3BUFF(min, float, 32, float32_t, min) - OP_SVE_FUNC_3BUFF(min, double, 64, float64_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) -/************************************************************************* + /************************************************************************* * Sum - *************************************************************************/ - OP_SVE_FUNC_3BUFF(sum, int8_t , 8, int8_t, add) - OP_SVE_FUNC_3BUFF(sum, uint8_t, 8, uint8_t, add) - OP_SVE_FUNC_3BUFF(sum, int16_t, 16, int16_t, add) - OP_SVE_FUNC_3BUFF(sum, uint16_t, 16, uint16_t, add) - OP_SVE_FUNC_3BUFF(sum, int32_t, 32, int32_t, add) - OP_SVE_FUNC_3BUFF(sum, uint32_t, 32, uint32_t, add) - OP_SVE_FUNC_3BUFF(sum, int64_t, 64, int64_t, add) - OP_SVE_FUNC_3BUFF(sum, uint64_t, 64, uint64_t, add) + ************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, b, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, b, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) + /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - 
OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) #endif - OP_SVE_FUNC_3BUFF(sum, float, 32, float32_t, add) - OP_SVE_FUNC_3BUFF(sum, double, 64, float64_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC_3BUFF(prod, int8_t, 8, int8_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint8_t, 8, uint8_t, mul) - OP_SVE_FUNC_3BUFF(prod, int16_t, 16, int16_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint16_t, 16, uint16_t, mul) - OP_SVE_FUNC_3BUFF(prod, int32_t, 32, int32_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint32_t, 32, uint32_t, mul) - OP_SVE_FUNC_3BUFF(prod, int64_t, 64, int64_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint64_t, 64, uint64_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) +OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) #endif - OP_SVE_FUNC_3BUFF(prod, float, 32, float32_t, mul) - OP_SVE_FUNC_3BUFF(prod, double, 64, float64_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) +OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_FUNC_3BUFF(band, int8_t, 8, int8_t, 
and) - OP_SVE_FUNC_3BUFF(band, uint8_t, 8, uint8_t, and) - OP_SVE_FUNC_3BUFF(band, int16_t, 16, int16_t, and) - OP_SVE_FUNC_3BUFF(band, uint16_t, 16, uint16_t, and) - OP_SVE_FUNC_3BUFF(band, int32_t, 32, int32_t, and) - OP_SVE_FUNC_3BUFF(band, uint32_t, 32, uint32_t, and) - OP_SVE_FUNC_3BUFF(band, int64_t, 64, int64_t, and) - OP_SVE_FUNC_3BUFF(band, uint64_t, 64, uint64_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, int16_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) +OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) -/************************************************************************* + /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_FUNC_3BUFF(bor, int8_t, 8, int8_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint8_t, 8, uint8_t, orr) - OP_SVE_FUNC_3BUFF(bor, int16_t, 16, int16_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint16_t, 16, uint16_t, orr) - OP_SVE_FUNC_3BUFF(bor, int32_t, 32, int32_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint32_t, 32, uint32_t, orr) - OP_SVE_FUNC_3BUFF(bor, int64_t, 64, int64_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint64_t, 64, uint64_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) +OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) - /************************************************************************* +/************************************************************************* * Bitwise XOR 
*************************************************************************/ - OP_SVE_FUNC_3BUFF(bxor, int8_t, 8, int8_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint8_t, 8, uint8_t, eor) - OP_SVE_FUNC_3BUFF(bxor, int16_t, 16, int16_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint16_t, 16, uint16_t, eor) - OP_SVE_FUNC_3BUFF(bxor, int32_t, 32, int32_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint32_t, 32, uint32_t, eor) - OP_SVE_FUNC_3BUFF(bxor, int64_t, 64, int64_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint64_t, 64, uint64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ @@ -376,12 +353,12 @@ /** Floating point, including all the Fortran reals *********************/ #if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) -#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_short_float +#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float16_t #else #define SHORT_FLOAT(name, ftype) NULL #endif -#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float -#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_double +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t #define FLOATING_POINT(name, ftype) \ [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/arm_sve_op/op_sve_functions.h index 6c7b4c7df71..d5e050afb21 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.h +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.h @@ -3,7 +3,7 @@ * of Tennessee Research 
Foundation. All rights * reserved. * - * Copyright (c) 2019 ARM Ltd. All rights reserved. + * Copyright (c) 2019 Arm Ltd. All rights reserved. * * $COPYRIGHT$ * diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c index 73ef7467b02..a9546ebcf1d 100644 --- a/test/datatype/Reduce_local_float.c +++ b/test/datatype/Reduce_local_float.c @@ -14,22 +14,27 @@ float in_float[ARRAYSIZE]; float inout_float[ARRAYSIZE]; +float inout_float_for_check[ARRAYSIZE]; int8_t in_uint8[ARRAYSIZE]; int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; uint16_t in_uint16[ARRAYSIZE]; uint16_t inout_uint16[ARRAYSIZE]; +uint16_t inout_uint16_for_check[ARRAYSIZE]; uint32_t in_uint32[ARRAYSIZE]; uint32_t inout_uint32[ARRAYSIZE]; +uint32_t inout_uint32_for_check[ARRAYSIZE]; uint64_t in_uint64[ARRAYSIZE]; uint64_t inout_uint64[ARRAYSIZE]; +uint64_t inout_uint64_for_check[ARRAYSIZE]; double in_double[ARRAYSIZE]; double inout_double[ARRAYSIZE]; - +double inout_double_for_check[ARRAYSIZE]; int main(int argc, char **argv) { @@ -45,22 +50,22 @@ int main(int argc, char **argv) { for (i=0; i Date: Sun, 1 Dec 2019 18:01:38 -0500 Subject: [PATCH 05/13] improvement of code, add duff device Signed-off-by: dong zhong --- ompi/mca/op/arm_sve_op/op_sve_functions.c | 283 +++++++++++++++++++++- test/datatype/correctness_check.sh | 77 +++--- 2 files changed, 317 insertions(+), 43 deletions(-) diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c index 8b2281271ea..c9e3bf7b137 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -57,17 +57,272 @@ } \ \ if (left_over !=0){ \ - Pg = svwhilelt_b##type_size##_u64(0, left_over); \ - sv##type vsrc = svld1(Pg, in); \ - sv##type vdst = svld1(Pg, out); \ - vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, out,vdst); \ + switch(left_over) { \ + case 256: out[255] = current_func(out[255],in[255]) ; \ + case 255: out[254] = 
current_func(out[254],in[254]) ; \ + case 254: out[253] = current_func(out[253],in[253]) ; \ + case 253: out[252] = current_func(out[252],in[252]) ; \ + case 252: out[251] = current_func(out[251],in[251]) ; \ + case 251: out[250] = current_func(out[250],in[250]) ; \ + case 250: out[249] = current_func(out[249],in[249]) ; \ + case 249: out[248] = current_func(out[248],in[248]) ; \ + case 248: out[247] = current_func(out[247],in[247]) ; \ + case 247: out[246] = current_func(out[246],in[246]) ; \ + case 246: out[245] = current_func(out[245],in[245]) ; \ + case 245: out[244] = current_func(out[244],in[244]) ; \ + case 244: out[243] = current_func(out[243],in[243]) ; \ + case 243: out[242] = current_func(out[242],in[242]) ; \ + case 242: out[241] = current_func(out[241],in[241]) ; \ + case 241: out[240] = current_func(out[240],in[240]) ; \ + case 240: out[239] = current_func(out[239],in[239]) ; \ + case 239: out[238] = current_func(out[238],in[238]) ; \ + case 238: out[237] = current_func(out[237],in[237]) ; \ + case 237: out[236] = current_func(out[236],in[236]) ; \ + case 236: out[235] = current_func(out[235],in[235]) ; \ + case 235: out[234] = current_func(out[234],in[234]) ; \ + case 234: out[233] = current_func(out[233],in[233]) ; \ + case 233: out[232] = current_func(out[232],in[232]) ; \ + case 232: out[231] = current_func(out[231],in[231]) ; \ + case 231: out[230] = current_func(out[230],in[230]) ; \ + case 230: out[229] = current_func(out[229],in[229]) ; \ + case 229: out[228] = current_func(out[228],in[228]) ; \ + case 228: out[227] = current_func(out[227],in[227]) ; \ + case 227: out[226] = current_func(out[226],in[226]) ; \ + case 226: out[225] = current_func(out[225],in[225]) ; \ + case 225: out[224] = current_func(out[224],in[224]) ; \ + case 224: out[223] = current_func(out[223],in[223]) ; \ + case 223: out[222] = current_func(out[222],in[222]) ; \ + case 222: out[221] = current_func(out[221],in[221]) ; \ + case 221: out[220] = 
current_func(out[220],in[220]) ; \ + case 220: out[219] = current_func(out[219],in[219]) ; \ + case 219: out[218] = current_func(out[218],in[218]) ; \ + case 218: out[217] = current_func(out[217],in[217]) ; \ + case 217: out[216] = current_func(out[216],in[216]) ; \ + case 216: out[215] = current_func(out[215],in[215]) ; \ + case 215: out[214] = current_func(out[214],in[214]) ; \ + case 214: out[213] = current_func(out[213],in[213]) ; \ + case 213: out[212] = current_func(out[212],in[212]) ; \ + case 212: out[211] = current_func(out[211],in[211]) ; \ + case 211: out[210] = current_func(out[210],in[210]) ; \ + case 210: out[209] = current_func(out[209],in[209]) ; \ + case 209: out[208] = current_func(out[208],in[208]) ; \ + case 208: out[207] = current_func(out[207],in[207]) ; \ + case 207: out[206] = current_func(out[206],in[206]) ; \ + case 206: out[205] = current_func(out[205],in[205]) ; \ + case 205: out[204] = current_func(out[204],in[204]) ; \ + case 204: out[203] = current_func(out[203],in[203]) ; \ + case 203: out[202] = current_func(out[202],in[202]) ; \ + case 202: out[201] = current_func(out[201],in[201]) ; \ + case 201: out[200] = current_func(out[200],in[200]) ; \ + case 200: out[199] = current_func(out[199],in[199]) ; \ + case 199: out[198] = current_func(out[198],in[198]) ; \ + case 198: out[197] = current_func(out[197],in[197]) ; \ + case 197: out[196] = current_func(out[196],in[196]) ; \ + case 196: out[195] = current_func(out[195],in[195]) ; \ + case 195: out[194] = current_func(out[194],in[194]) ; \ + case 194: out[193] = current_func(out[193],in[193]) ; \ + case 193: out[192] = current_func(out[192],in[192]) ; \ + case 192: out[191] = current_func(out[191],in[191]) ; \ + case 191: out[190] = current_func(out[190],in[190]) ; \ + case 190: out[189] = current_func(out[189],in[189]) ; \ + case 189: out[188] = current_func(out[188],in[188]) ; \ + case 188: out[187] = current_func(out[187],in[187]) ; \ + case 187: out[186] = 
current_func(out[186],in[186]) ; \ + case 186: out[185] = current_func(out[185],in[185]) ; \ + case 185: out[184] = current_func(out[184],in[184]) ; \ + case 184: out[183] = current_func(out[183],in[183]) ; \ + case 183: out[182] = current_func(out[182],in[182]) ; \ + case 182: out[181] = current_func(out[181],in[181]) ; \ + case 181: out[180] = current_func(out[180],in[180]) ; \ + case 180: out[179] = current_func(out[179],in[179]) ; \ + case 179: out[178] = current_func(out[178],in[178]) ; \ + case 178: out[177] = current_func(out[177],in[177]) ; \ + case 177: out[176] = current_func(out[176],in[176]) ; \ + case 176: out[175] = current_func(out[175],in[175]) ; \ + case 175: out[174] = current_func(out[174],in[174]) ; \ + case 174: out[173] = current_func(out[173],in[173]) ; \ + case 173: out[172] = current_func(out[172],in[172]) ; \ + case 172: out[171] = current_func(out[171],in[171]) ; \ + case 171: out[170] = current_func(out[170],in[170]) ; \ + case 170: out[169] = current_func(out[169],in[169]) ; \ + case 169: out[168] = current_func(out[168],in[168]) ; \ + case 168: out[167] = current_func(out[167],in[167]) ; \ + case 167: out[166] = current_func(out[166],in[166]) ; \ + case 166: out[165] = current_func(out[165],in[165]) ; \ + case 165: out[164] = current_func(out[164],in[164]) ; \ + case 164: out[163] = current_func(out[163],in[163]) ; \ + case 163: out[162] = current_func(out[162],in[162]) ; \ + case 162: out[161] = current_func(out[161],in[161]) ; \ + case 161: out[160] = current_func(out[160],in[160]) ; \ + case 160: out[159] = current_func(out[159],in[159]) ; \ + case 159: out[158] = current_func(out[158],in[158]) ; \ + case 158: out[157] = current_func(out[157],in[157]) ; \ + case 157: out[156] = current_func(out[156],in[156]) ; \ + case 156: out[155] = current_func(out[155],in[155]) ; \ + case 155: out[154] = current_func(out[154],in[154]) ; \ + case 154: out[153] = current_func(out[153],in[153]) ; \ + case 153: out[152] = 
current_func(out[152],in[152]) ; \ + case 152: out[151] = current_func(out[151],in[151]) ; \ + case 151: out[150] = current_func(out[150],in[150]) ; \ + case 150: out[149] = current_func(out[149],in[149]) ; \ + case 149: out[148] = current_func(out[148],in[148]) ; \ + case 148: out[147] = current_func(out[147],in[147]) ; \ + case 147: out[146] = current_func(out[146],in[146]) ; \ + case 146: out[145] = current_func(out[145],in[145]) ; \ + case 145: out[144] = current_func(out[144],in[144]) ; \ + case 144: out[143] = current_func(out[143],in[143]) ; \ + case 143: out[142] = current_func(out[142],in[142]) ; \ + case 142: out[141] = current_func(out[141],in[141]) ; \ + case 141: out[140] = current_func(out[140],in[140]) ; \ + case 140: out[139] = current_func(out[139],in[139]) ; \ + case 139: out[138] = current_func(out[138],in[138]) ; \ + case 138: out[137] = current_func(out[137],in[137]) ; \ + case 137: out[136] = current_func(out[136],in[136]) ; \ + case 136: out[135] = current_func(out[135],in[135]) ; \ + case 135: out[134] = current_func(out[134],in[134]) ; \ + case 134: out[133] = current_func(out[133],in[133]) ; \ + case 133: out[132] = current_func(out[132],in[132]) ; \ + case 132: out[131] = current_func(out[131],in[131]) ; \ + case 131: out[130] = current_func(out[130],in[130]) ; \ + case 130: out[129] = current_func(out[129],in[129]) ; \ + case 129: out[128] = current_func(out[128],in[128]) ; \ + case 128: out[127] = current_func(out[127],in[127]) ; \ + case 127: out[126] = current_func(out[126],in[126]) ; \ + case 126: out[125] = current_func(out[125],in[125]) ; \ + case 125: out[124] = current_func(out[124],in[124]) ; \ + case 124: out[123] = current_func(out[123],in[123]) ; \ + case 123: out[122] = current_func(out[122],in[122]) ; \ + case 122: out[121] = current_func(out[121],in[121]) ; \ + case 121: out[120] = current_func(out[120],in[120]) ; \ + case 120: out[119] = current_func(out[119],in[119]) ; \ + case 119: out[118] = 
current_func(out[118],in[118]) ; \ + case 118: out[117] = current_func(out[117],in[117]) ; \ + case 117: out[116] = current_func(out[116],in[116]) ; \ + case 116: out[115] = current_func(out[115],in[115]) ; \ + case 115: out[114] = current_func(out[114],in[114]) ; \ + case 114: out[113] = current_func(out[113],in[113]) ; \ + case 113: out[112] = current_func(out[112],in[112]) ; \ + case 112: out[111] = current_func(out[111],in[111]) ; \ + case 111: out[110] = current_func(out[110],in[110]) ; \ + case 110: out[109] = current_func(out[109],in[109]) ; \ + case 109: out[108] = current_func(out[108],in[108]) ; \ + case 108: out[107] = current_func(out[107],in[107]) ; \ + case 107: out[106] = current_func(out[106],in[106]) ; \ + case 106: out[105] = current_func(out[105],in[105]) ; \ + case 105: out[104] = current_func(out[104],in[104]) ; \ + case 104: out[103] = current_func(out[103],in[103]) ; \ + case 103: out[102] = current_func(out[102],in[102]) ; \ + case 102: out[101] = current_func(out[101],in[101]) ; \ + case 101: out[100] = current_func(out[100],in[100]) ; \ + case 100: out[99] = current_func(out[99],in[99]) ; \ + case 99: out[98] = current_func(out[98],in[98]) ; \ + case 98: out[97] = current_func(out[97],in[97]) ; \ + case 97: out[96] = current_func(out[96],in[96]) ; \ + case 96: out[95] = current_func(out[95],in[95]) ; \ + case 95: out[94] = current_func(out[94],in[94]) ; \ + case 94: out[93] = current_func(out[93],in[93]) ; \ + case 93: out[92] = current_func(out[92],in[92]) ; \ + case 92: out[91] = current_func(out[91],in[91]) ; \ + case 91: out[90] = current_func(out[90],in[90]) ; \ + case 90: out[89] = current_func(out[89],in[89]) ; \ + case 89: out[88] = current_func(out[88],in[88]) ; \ + case 88: out[87] = current_func(out[87],in[87]) ; \ + case 87: out[86] = current_func(out[86],in[86]) ; \ + case 86: out[85] = current_func(out[85],in[85]) ; \ + case 85: out[84] = current_func(out[84],in[84]) ; \ + case 84: out[83] = current_func(out[83],in[83]) ; \ + 
case 83: out[82] = current_func(out[82],in[82]) ; \ + case 82: out[81] = current_func(out[81],in[81]) ; \ + case 81: out[80] = current_func(out[80],in[80]) ; \ + case 80: out[79] = current_func(out[79],in[79]) ; \ + case 79: out[78] = current_func(out[78],in[78]) ; \ + case 78: out[77] = current_func(out[77],in[77]) ; \ + case 77: out[76] = current_func(out[76],in[76]) ; \ + case 76: out[75] = current_func(out[75],in[75]) ; \ + case 75: out[74] = current_func(out[74],in[74]) ; \ + case 74: out[73] = current_func(out[73],in[73]) ; \ + case 73: out[72] = current_func(out[72],in[72]) ; \ + case 72: out[71] = current_func(out[71],in[71]) ; \ + case 71: out[70] = current_func(out[70],in[70]) ; \ + case 70: out[69] = current_func(out[69],in[69]) ; \ + case 69: out[68] = current_func(out[68],in[68]) ; \ + case 68: out[67] = current_func(out[67],in[67]) ; \ + case 67: out[66] = current_func(out[66],in[66]) ; \ + case 66: out[65] = current_func(out[65],in[65]) ; \ + case 65: out[64] = current_func(out[64],in[64]) ; \ + case 64: out[63] = current_func(out[63],in[63]) ; \ + case 63: out[62] = current_func(out[62],in[62]) ; \ + case 62: out[61] = current_func(out[61],in[61]) ; \ + case 61: out[60] = current_func(out[60],in[60]) ; \ + case 60: out[59] = current_func(out[59],in[59]) ; \ + case 59: out[58] = current_func(out[58],in[58]) ; \ + case 58: out[57] = current_func(out[57],in[57]) ; \ + case 57: out[56] = current_func(out[56],in[56]) ; \ + case 56: out[55] = current_func(out[55],in[55]) ; \ + case 55: out[54] = current_func(out[54],in[54]) ; \ + case 54: out[53] = current_func(out[53],in[53]) ; \ + case 53: out[52] = current_func(out[52],in[52]) ; \ + case 52: out[51] = current_func(out[51],in[51]) ; \ + case 51: out[50] = current_func(out[50],in[50]) ; \ + case 50: out[49] = current_func(out[49],in[49]) ; \ + case 49: out[48] = current_func(out[48],in[48]) ; \ + case 48: out[47] = current_func(out[47],in[47]) ; \ + case 47: out[46] = current_func(out[46],in[46]) ; \ + 
case 46: out[45] = current_func(out[45],in[45]) ; \ + case 45: out[44] = current_func(out[44],in[44]) ; \ + case 44: out[43] = current_func(out[43],in[43]) ; \ + case 43: out[42] = current_func(out[42],in[42]) ; \ + case 42: out[41] = current_func(out[41],in[41]) ; \ + case 41: out[40] = current_func(out[40],in[40]) ; \ + case 40: out[39] = current_func(out[39],in[39]) ; \ + case 39: out[38] = current_func(out[38],in[38]) ; \ + case 38: out[37] = current_func(out[37],in[37]) ; \ + case 37: out[36] = current_func(out[36],in[36]) ; \ + case 36: out[35] = current_func(out[35],in[35]) ; \ + case 35: out[34] = current_func(out[34],in[34]) ; \ + case 34: out[33] = current_func(out[33],in[33]) ; \ + case 33: out[32] = current_func(out[32],in[32]) ; \ + case 32: out[31] = current_func(out[31],in[31]) ; \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 
9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ } \ } /************************************************************************* * Max *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) OP_SVE_FUNC(max, b, 8, int8_t, max) OP_SVE_FUNC(max, b, 8, uint8_t, max) OP_SVE_FUNC(max, h, 16, int16_t, max) @@ -89,6 +344,8 @@ /************************************************************************* * Min *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) OP_SVE_FUNC(min, b, 8, int8_t, min) OP_SVE_FUNC(min, b, 8, uint8_t, min) OP_SVE_FUNC(min, h, 16, int16_t, min) @@ -110,6 +367,8 @@ /************************************************************************* * Sum ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) OP_SVE_FUNC(sum, b, 8, int8_t, add) OP_SVE_FUNC(sum, b, 8, uint8_t, add) OP_SVE_FUNC(sum, h, 16, int16_t, add) @@ -131,6 +390,8 @@ /************************************************************************* * Product *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) OP_SVE_FUNC(prod, b, 8, int8_t, mul) OP_SVE_FUNC(prod, b, 8, uint8_t, mul) OP_SVE_FUNC(prod, h, 16, int16_t, mul) @@ -138,7 +399,7 @@ OP_SVE_FUNC(prod, w, 32, int32_t, mul) OP_SVE_FUNC(prod, w, 32, uint32_t, mul) OP_SVE_FUNC(prod, d, 64, int64_t, mul) -OP_SVE_FUNC(prod, d, 64, uint64_t, mul) + OP_SVE_FUNC(prod, d, 64, uint64_t, mul) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) @@ -152,6 +413,8 @@ OP_SVE_FUNC(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) OP_SVE_FUNC(band, b, 8, int8_t, and) OP_SVE_FUNC(band, b, 8, uint8_t, and) OP_SVE_FUNC(band, h, 16, int16_t, and) @@ -164,6 +427,8 @@ OP_SVE_FUNC(band, d, 64, uint64_t, and) /************************************************************************* * Bitwise OR *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) OP_SVE_FUNC(bor, b, 8, int8_t, orr) OP_SVE_FUNC(bor, b, 8, uint8_t, orr) OP_SVE_FUNC(bor, h, 16, int16_t, orr) @@ -176,6 +441,8 @@ OP_SVE_FUNC(bor, d, 64, uint64_t, orr) 
/************************************************************************* * Bitwise XOR *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) OP_SVE_FUNC(bxor, b, 8, int8_t, eor) OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) OP_SVE_FUNC(bxor, h, 16, int16_t, eor) @@ -430,7 +697,7 @@ ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMP [OMPI_OP_BASE_FORTRAN_REPLACE] = { /* (MPI_ACCUMULATE is handled differently than the other reductions, so just zero out its function - impementations here to ensure that users don't invoke + implementations here to ensure that users don't invoke MPI_REPLACE with any reduction operations other than ACCUMULATE) */ NULL, @@ -493,7 +760,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN [OMPI_OP_BASE_FORTRAN_REPLACE] = { /* MPI_ACCUMULATE is handled differently than the other reductions, so just zero out its function - impementations here to ensure that users don't invoke + implementations here to ensure that users don't invoke MPI_REPLACE with any reduction operations other than ACCUMULATE */ NULL, diff --git a/test/datatype/correctness_check.sh b/test/datatype/correctness_check.sh index 4ac4f08cba0..cf9f09a801e 100644 --- a/test/datatype/correctness_check.sh +++ b/test/datatype/correctness_check.sh @@ -1,5 +1,5 @@ -echo "ompi version with AVX512 -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. args4 operation" -echo "/home/zhongdong/opt/git/avx512_reduction/bin/mpirun -mca op_sve_hardware_available 0 -mca op_avx_hardware_available 0 -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1048576 i 8 max" +echo "ompi version with SVE -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. 
args4 operation" +echo "your_path/mpirun -mca op sve -mca op_sve_hardware_available 1 -np 1 /your_test_path/Reduce_local_float 1048576 i 8 max" Orange= "\033[0;33m" Blue= '\033[0;34m' @@ -7,46 +7,53 @@ Purple= '\033[0;35m' NC='\033[0m' -echo "=========Integer type all operations & all sizes========" -echo "" -for op in max min sum mul band bor bxor +# test all vector size +for vector_len in 128 256 512 1024 2048 do + + echo "=========Integer type all operations & all sizes========" echo "" - echo "===Operation $op test===" - for size in 8 16 32 64 + for op in max min sum mul band bor bxor do - echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N " - /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 i $size $op - echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) < Total_num_bits < 512*N " - echo -e "Test \e[1;35m duff device code \e[m 512*N < Total_num_bits < 512*N + 256 " + echo "" + echo "===Operation $op test===" + for size in 8 16 32 64 + do + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 i $size $op + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient 
libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 i $size $op + done done -done - -echo "=======Float type all operations========" -echo "" -echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N " -for op in max min sum mul -do - /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 f 32 $op -done -echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) < Total_num_bits < 512*N " -## 28= 16+8 + 4 + echo "=======Float type all operations========" + echo "" + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 f 32 $op + done -echo -e "Test \e[1;35m duff device code \e[m 512*N < Total_num_bits < 512*N + 256 " -##40=16+4 + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 f 32 $op + done -echo "========Double type all operations=========" -echo "" -echo -e "Test \e[1;33m __mm512 instruction 
for loop \e[m Total_num_bits = 512*N " -for op in max min sum mul -do - /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 d 64 $op -done + echo "========Double type all operations=========" + echo "" -echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) < Total_num_bits < 512*N " -## 20= 8 +8 + 3 + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 d 64 $op + done -echo -e "Test \e[1;35m duff device code \e[m 512*N < Total_num_bits < 512*N + 256 " -##12=8+2 + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 d 64 $op + done +done +##echo -e "Test \e[1;35m duff device code \e[m 2048*N < Total_num_bits < 2048*N + 256 " From 0914f2aa9ef9dff34897bf49eb24e4d710d12061 Mon Sep 17 00:00:00 2001 From: dong zhong Date: Wed, 11 Dec 2019 00:25:32 -0500 Subject: [PATCH 06/13] Add test script and plot script Signed-off-by: dong zhong --- test/datatype/Reduce_uint8.c | 62 ++++++++++++++++++++++ 
test/datatype/SVE_MPI_Op.py | 91 +++++++++++++++++++++++++++++++++ test/datatype/sve_uint8_test.sh | 21 ++++++++ 3 files changed, 174 insertions(+) create mode 100644 test/datatype/Reduce_uint8.c create mode 100644 test/datatype/SVE_MPI_Op.py create mode 100644 test/datatype/sve_uint8_test.sh diff --git a/test/datatype/Reduce_uint8.c b/test/datatype/Reduce_uint8.c new file mode 100644 index 00000000000..35570d8fe23 --- /dev/null +++ b/test/datatype/Reduce_uint8.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 32*1024*1024 + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i Date: Wed, 11 Dec 2019 00:51:37 -0500 Subject: [PATCH 07/13] Add test README Signed-off-by: dong zhong --- test/datatype/HOW_TO_TEST_README.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 test/datatype/HOW_TO_TEST_README.txt diff --git a/test/datatype/HOW_TO_TEST_README.txt b/test/datatype/HOW_TO_TEST_README.txt new file mode 100644 index 00000000000..4d6025c95ee --- /dev/null +++ b/test/datatype/HOW_TO_TEST_README.txt @@ -0,0 +1,17 @@ +(1) Reduce_uint8.c : test code for MPI_SUM operation with type uint8. + compile as : $path/mpicc -march=armv8-a+sve -O3 -o Reduce_uint8 Reduce_uint8.c  +(2) sve_uint8_test.sh: A shell script that will generate results with time information for MPI_SUM operation with different message size.  +(3) SVE_MPI_Op.py: A python script which you can use to generate plot. + +ALL YOU NEED TO DO IS: +(1) Change the path of your mpirun and test binary in sve_uint8_test.sh. 
+(2) Run sve_uint8_test.sh + This will generate 3 types of output files with names + a. sve-sum-vectorlength.txt ##MPI_SUM with SVE-enabled operation + b. no-sve-sum-vectorlength.txt ##MPI_SUM without SVE-enabled operation + c. sve-cpy-vectorlength.txt ## memcpy +(3) run python scripts as + $python SVE_MPI_Op.py sve-sum-vectorlength.txt no-sve-sum-vectorlength.txt + make sure to put sve-sum-vectorlength.txt before no-sve-sum-vectorlength.txt + example : SVE_MPI_Op.py sve-sum-128.txt no-sve-sum-128.txt + From 86f3965b57a040a7d0b7d6cf607eb11f74831525 Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Wed, 11 Dec 2019 01:28:52 -0500 Subject: [PATCH 08/13] Change to 30 runs --- test/datatype/sve_uint8_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datatype/sve_uint8_test.sh b/test/datatype/sve_uint8_test.sh index b7f857db157..ed0cc151592 100644 --- a/test/datatype/sve_uint8_test.sh +++ b/test/datatype/sve_uint8_test.sh @@ -9,7 +9,7 @@ do echo "" echo "" echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " - for (( i=1; i<11; i++ )) + for (( i=1; i<31; i++ )) do for val in 1024 4096 16384 65536 262144 1048576 4194304 16777216 33554432 do From 72195f8cd1c236ece32f889f4e7f3646ddfbc43c Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 14:27:42 -0500 Subject: [PATCH 09/13] update makefile --- ompi/mca/op/Makefile.am | 4 +++- ompi/mca/op/arm_sve_op/Makefile.am | 21 +++++++++------------ test/datatype/Makefile.am | 8 +++++++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..e942d1a8c21 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. 
-# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/arm_sve_op/Makefile.am b/ompi/mca/op/arm_sve_op/Makefile.am index 9b2aca888a4..7a0d2162581 100644 --- a/ompi/mca/op/arm_sve_op/Makefile.am +++ b/ompi/mca/op/arm_sve_op/Makefile.am @@ -22,6 +22,7 @@ sources = \ op_sve.h \ op_sve_component.c \ + op_sve_functions.h \ op_sve_functions.c # Open MPI components can be compiled two ways: @@ -41,15 +42,11 @@ sources = \ # which way this component should be built. if MCA_BUILD_ompi_op_arm_sve_op_DSO -lib = -lib_sources = -component = mca_op_sve.la -component_sources = $(sources) +component_noinst = +component_install = mca_op_sve.la else -lib = libmca_op_sve.la -lib_sources = $(sources) -component = -component_sources = +component_install = +component_noinst = component_noinst endif # Specific information for DSO builds. @@ -58,8 +55,8 @@ endif # $prefix/lib/openmpi). mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_op_sve_la_SOURCES = $(component_sources) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_sve_la_SOURCES = $(sources) mca_op_sve_la_LDFLAGS = -module -avoid-version mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la @@ -68,6 +65,6 @@ mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la # Note that we *must* "noinst"; the upper-layer Makefile.am's will # slurp in the resulting .la library into libmpi. 
-noinst_LTLIBRARIES = $(lib) -libmca_op_sve_la_SOURCES = $(lib_sources) +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_sve_la_SOURCES = $(sources) libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 4366724a523..19c0d6489b3 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -16,7 +16,7 @@ if PROJECT_OMPI MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data - MPI_CHECKS = to_self + MPI_CHECKS = to_self reduce_local endif TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS) @@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) unpack_hetero_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +reduce_local_SOURCES = reduce_local.c +reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +reduce_local_LDADD = \ + $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + distclean: rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile From f632d687907c08584a9d2d5c29183df648698f6b Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 16:57:17 -0500 Subject: [PATCH 10/13] rename sve folder --- ompi/mca/op/{arm_sve_op => sve}/Makefile.am | 2 +- ompi/mca/op/sve/configure.m4 | 21 +++++++++++++++++++ ompi/mca/op/{arm_sve_op => sve}/op_sve.h | 0 .../op/{arm_sve_op => sve}/op_sve_component.c | 4 ++-- .../op/{arm_sve_op => sve}/op_sve_functions.c | 4 ++-- .../op/{arm_sve_op => sve}/op_sve_functions.h | 2 +- 6 files changed, 27 insertions(+), 6 deletions(-) rename ompi/mca/op/{arm_sve_op => sve}/Makefile.am (98%) create mode 100644 ompi/mca/op/sve/configure.m4 rename ompi/mca/op/{arm_sve_op => sve}/op_sve.h (100%) rename ompi/mca/op/{arm_sve_op => sve}/op_sve_component.c (98%) rename ompi/mca/op/{arm_sve_op => sve}/op_sve_functions.c (99%) rename ompi/mca/op/{arm_sve_op => sve}/op_sve_functions.h (94%) diff --git 
a/ompi/mca/op/arm_sve_op/Makefile.am b/ompi/mca/op/sve/Makefile.am similarity index 98% rename from ompi/mca/op/arm_sve_op/Makefile.am rename to ompi/mca/op/sve/Makefile.am index 7a0d2162581..1e70e2fead8 100644 --- a/ompi/mca/op/arm_sve_op/Makefile.am +++ b/ompi/mca/op/sve/Makefile.am @@ -41,7 +41,7 @@ sources = \ # MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate # which way this component should be built. -if MCA_BUILD_ompi_op_arm_sve_op_DSO +if MCA_BUILD_ompi_op_sve_DSO component_noinst = component_install = mca_op_sve.la else diff --git a/ompi/mca/op/sve/configure.m4 b/ompi/mca/op/sve/configure.m4 new file mode 100644 index 00000000000..9f71d56c951 --- /dev/null +++ b/ompi/mca/op/sve/configure.m4 @@ -0,0 +1,21 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_sve_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. 
+AC_DEFUN([MCA_ompi_op_sve_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/sve/Makefile]) + [$1], +])dnl diff --git a/ompi/mca/op/arm_sve_op/op_sve.h b/ompi/mca/op/sve/op_sve.h similarity index 100% rename from ompi/mca/op/arm_sve_op/op_sve.h rename to ompi/mca/op/sve/op_sve.h diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/sve/op_sve_component.c similarity index 98% rename from ompi/mca/op/arm_sve_op/op_sve_component.c rename to ompi/mca/op/sve/op_sve_component.c index df8281e5ba1..16393aefe23 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_component.c +++ b/ompi/mca/op/sve/op_sve_component.c @@ -26,8 +26,8 @@ #include "ompi/op/op.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" -#include "ompi/mca/op/arm_sve_op/op_sve.h" -#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" static int sve_component_open(void); static int sve_component_close(void); diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c similarity index 99% rename from ompi/mca/op/arm_sve_op/op_sve_functions.c rename to ompi/mca/op/sve/op_sve_functions.c index c9e3bf7b137..265f41d9988 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -22,8 +22,8 @@ #include "ompi/op/op.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" -#include "ompi/mca/op/arm_sve_op/op_sve.h" -#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" #ifdef __ARM_FEATURE_SVE #include diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/sve/op_sve_functions.h similarity index 94% rename from ompi/mca/op/arm_sve_op/op_sve_functions.h rename to ompi/mca/op/sve/op_sve_functions.h index d5e050afb21..00db651cfac 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.h +++ b/ompi/mca/op/sve/op_sve_functions.h @@ -19,7 +19,7 @@ #endif 
#include "ompi/mca/op/op.h" -#include "ompi/mca/op/arm_sve_op/op_sve.h" +#include "ompi/mca/op/sve/op_sve.h" BEGIN_C_DECLS From 3d9a27926f04f6fa627c7cdc5ed2ef6025dbdadd Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 16:27:03 -0500 Subject: [PATCH 11/13] remove short_float, rewrite duff device Signed-off-by: Dong Zhong --- ompi/mca/op/sve/op_sve_functions.c | 364 ++++------------------------- 1 file changed, 45 insertions(+), 319 deletions(-) diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c index 265f41d9988..1b14affe336 100644 --- a/ompi/mca/op/sve/op_sve_functions.c +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -56,265 +56,45 @@ out += types_per_step; \ } \ \ - if (left_over !=0){ \ - switch(left_over) { \ - case 256: out[255] = current_func(out[255],in[255]) ; \ - case 255: out[254] = current_func(out[254],in[254]) ; \ - case 254: out[253] = current_func(out[253],in[253]) ; \ - case 253: out[252] = current_func(out[252],in[252]) ; \ - case 252: out[251] = current_func(out[251],in[251]) ; \ - case 251: out[250] = current_func(out[250],in[250]) ; \ - case 250: out[249] = current_func(out[249],in[249]) ; \ - case 249: out[248] = current_func(out[248],in[248]) ; \ - case 248: out[247] = current_func(out[247],in[247]) ; \ - case 247: out[246] = current_func(out[246],in[246]) ; \ - case 246: out[245] = current_func(out[245],in[245]) ; \ - case 245: out[244] = current_func(out[244],in[244]) ; \ - case 244: out[243] = current_func(out[243],in[243]) ; \ - case 243: out[242] = current_func(out[242],in[242]) ; \ - case 242: out[241] = current_func(out[241],in[241]) ; \ - case 241: out[240] = current_func(out[240],in[240]) ; \ - case 240: out[239] = current_func(out[239],in[239]) ; \ - case 239: out[238] = current_func(out[238],in[238]) ; \ - case 238: out[237] = current_func(out[237],in[237]) ; \ - case 237: out[236] = current_func(out[236],in[236]) ; \ - case 236: out[235] = current_func(out[235],in[235]) ; \ - case 235: 
out[234] = current_func(out[234],in[234]) ; \ - case 234: out[233] = current_func(out[233],in[233]) ; \ - case 233: out[232] = current_func(out[232],in[232]) ; \ - case 232: out[231] = current_func(out[231],in[231]) ; \ - case 231: out[230] = current_func(out[230],in[230]) ; \ - case 230: out[229] = current_func(out[229],in[229]) ; \ - case 229: out[228] = current_func(out[228],in[228]) ; \ - case 228: out[227] = current_func(out[227],in[227]) ; \ - case 227: out[226] = current_func(out[226],in[226]) ; \ - case 226: out[225] = current_func(out[225],in[225]) ; \ - case 225: out[224] = current_func(out[224],in[224]) ; \ - case 224: out[223] = current_func(out[223],in[223]) ; \ - case 223: out[222] = current_func(out[222],in[222]) ; \ - case 222: out[221] = current_func(out[221],in[221]) ; \ - case 221: out[220] = current_func(out[220],in[220]) ; \ - case 220: out[219] = current_func(out[219],in[219]) ; \ - case 219: out[218] = current_func(out[218],in[218]) ; \ - case 218: out[217] = current_func(out[217],in[217]) ; \ - case 217: out[216] = current_func(out[216],in[216]) ; \ - case 216: out[215] = current_func(out[215],in[215]) ; \ - case 215: out[214] = current_func(out[214],in[214]) ; \ - case 214: out[213] = current_func(out[213],in[213]) ; \ - case 213: out[212] = current_func(out[212],in[212]) ; \ - case 212: out[211] = current_func(out[211],in[211]) ; \ - case 211: out[210] = current_func(out[210],in[210]) ; \ - case 210: out[209] = current_func(out[209],in[209]) ; \ - case 209: out[208] = current_func(out[208],in[208]) ; \ - case 208: out[207] = current_func(out[207],in[207]) ; \ - case 207: out[206] = current_func(out[206],in[206]) ; \ - case 206: out[205] = current_func(out[205],in[205]) ; \ - case 205: out[204] = current_func(out[204],in[204]) ; \ - case 204: out[203] = current_func(out[203],in[203]) ; \ - case 203: out[202] = current_func(out[202],in[202]) ; \ - case 202: out[201] = current_func(out[201],in[201]) ; \ - case 201: out[200] = 
current_func(out[200],in[200]) ; \ - case 200: out[199] = current_func(out[199],in[199]) ; \ - case 199: out[198] = current_func(out[198],in[198]) ; \ - case 198: out[197] = current_func(out[197],in[197]) ; \ - case 197: out[196] = current_func(out[196],in[196]) ; \ - case 196: out[195] = current_func(out[195],in[195]) ; \ - case 195: out[194] = current_func(out[194],in[194]) ; \ - case 194: out[193] = current_func(out[193],in[193]) ; \ - case 193: out[192] = current_func(out[192],in[192]) ; \ - case 192: out[191] = current_func(out[191],in[191]) ; \ - case 191: out[190] = current_func(out[190],in[190]) ; \ - case 190: out[189] = current_func(out[189],in[189]) ; \ - case 189: out[188] = current_func(out[188],in[188]) ; \ - case 188: out[187] = current_func(out[187],in[187]) ; \ - case 187: out[186] = current_func(out[186],in[186]) ; \ - case 186: out[185] = current_func(out[185],in[185]) ; \ - case 185: out[184] = current_func(out[184],in[184]) ; \ - case 184: out[183] = current_func(out[183],in[183]) ; \ - case 183: out[182] = current_func(out[182],in[182]) ; \ - case 182: out[181] = current_func(out[181],in[181]) ; \ - case 181: out[180] = current_func(out[180],in[180]) ; \ - case 180: out[179] = current_func(out[179],in[179]) ; \ - case 179: out[178] = current_func(out[178],in[178]) ; \ - case 178: out[177] = current_func(out[177],in[177]) ; \ - case 177: out[176] = current_func(out[176],in[176]) ; \ - case 176: out[175] = current_func(out[175],in[175]) ; \ - case 175: out[174] = current_func(out[174],in[174]) ; \ - case 174: out[173] = current_func(out[173],in[173]) ; \ - case 173: out[172] = current_func(out[172],in[172]) ; \ - case 172: out[171] = current_func(out[171],in[171]) ; \ - case 171: out[170] = current_func(out[170],in[170]) ; \ - case 170: out[169] = current_func(out[169],in[169]) ; \ - case 169: out[168] = current_func(out[168],in[168]) ; \ - case 168: out[167] = current_func(out[167],in[167]) ; \ - case 167: out[166] = 
current_func(out[166],in[166]) ; \ - case 166: out[165] = current_func(out[165],in[165]) ; \ - case 165: out[164] = current_func(out[164],in[164]) ; \ - case 164: out[163] = current_func(out[163],in[163]) ; \ - case 163: out[162] = current_func(out[162],in[162]) ; \ - case 162: out[161] = current_func(out[161],in[161]) ; \ - case 161: out[160] = current_func(out[160],in[160]) ; \ - case 160: out[159] = current_func(out[159],in[159]) ; \ - case 159: out[158] = current_func(out[158],in[158]) ; \ - case 158: out[157] = current_func(out[157],in[157]) ; \ - case 157: out[156] = current_func(out[156],in[156]) ; \ - case 156: out[155] = current_func(out[155],in[155]) ; \ - case 155: out[154] = current_func(out[154],in[154]) ; \ - case 154: out[153] = current_func(out[153],in[153]) ; \ - case 153: out[152] = current_func(out[152],in[152]) ; \ - case 152: out[151] = current_func(out[151],in[151]) ; \ - case 151: out[150] = current_func(out[150],in[150]) ; \ - case 150: out[149] = current_func(out[149],in[149]) ; \ - case 149: out[148] = current_func(out[148],in[148]) ; \ - case 148: out[147] = current_func(out[147],in[147]) ; \ - case 147: out[146] = current_func(out[146],in[146]) ; \ - case 146: out[145] = current_func(out[145],in[145]) ; \ - case 145: out[144] = current_func(out[144],in[144]) ; \ - case 144: out[143] = current_func(out[143],in[143]) ; \ - case 143: out[142] = current_func(out[142],in[142]) ; \ - case 142: out[141] = current_func(out[141],in[141]) ; \ - case 141: out[140] = current_func(out[140],in[140]) ; \ - case 140: out[139] = current_func(out[139],in[139]) ; \ - case 139: out[138] = current_func(out[138],in[138]) ; \ - case 138: out[137] = current_func(out[137],in[137]) ; \ - case 137: out[136] = current_func(out[136],in[136]) ; \ - case 136: out[135] = current_func(out[135],in[135]) ; \ - case 135: out[134] = current_func(out[134],in[134]) ; \ - case 134: out[133] = current_func(out[133],in[133]) ; \ - case 133: out[132] = 
current_func(out[132],in[132]) ; \ - case 132: out[131] = current_func(out[131],in[131]) ; \ - case 131: out[130] = current_func(out[130],in[130]) ; \ - case 130: out[129] = current_func(out[129],in[129]) ; \ - case 129: out[128] = current_func(out[128],in[128]) ; \ - case 128: out[127] = current_func(out[127],in[127]) ; \ - case 127: out[126] = current_func(out[126],in[126]) ; \ - case 126: out[125] = current_func(out[125],in[125]) ; \ - case 125: out[124] = current_func(out[124],in[124]) ; \ - case 124: out[123] = current_func(out[123],in[123]) ; \ - case 123: out[122] = current_func(out[122],in[122]) ; \ - case 122: out[121] = current_func(out[121],in[121]) ; \ - case 121: out[120] = current_func(out[120],in[120]) ; \ - case 120: out[119] = current_func(out[119],in[119]) ; \ - case 119: out[118] = current_func(out[118],in[118]) ; \ - case 118: out[117] = current_func(out[117],in[117]) ; \ - case 117: out[116] = current_func(out[116],in[116]) ; \ - case 116: out[115] = current_func(out[115],in[115]) ; \ - case 115: out[114] = current_func(out[114],in[114]) ; \ - case 114: out[113] = current_func(out[113],in[113]) ; \ - case 113: out[112] = current_func(out[112],in[112]) ; \ - case 112: out[111] = current_func(out[111],in[111]) ; \ - case 111: out[110] = current_func(out[110],in[110]) ; \ - case 110: out[109] = current_func(out[109],in[109]) ; \ - case 109: out[108] = current_func(out[108],in[108]) ; \ - case 108: out[107] = current_func(out[107],in[107]) ; \ - case 107: out[106] = current_func(out[106],in[106]) ; \ - case 106: out[105] = current_func(out[105],in[105]) ; \ - case 105: out[104] = current_func(out[104],in[104]) ; \ - case 104: out[103] = current_func(out[103],in[103]) ; \ - case 103: out[102] = current_func(out[102],in[102]) ; \ - case 102: out[101] = current_func(out[101],in[101]) ; \ - case 101: out[100] = current_func(out[100],in[100]) ; \ - case 100: out[99] = current_func(out[99],in[99]) ; \ - case 99: out[98] = current_func(out[98],in[98]) ; \ 
- case 98: out[97] = current_func(out[97],in[97]) ; \ - case 97: out[96] = current_func(out[96],in[96]) ; \ - case 96: out[95] = current_func(out[95],in[95]) ; \ - case 95: out[94] = current_func(out[94],in[94]) ; \ - case 94: out[93] = current_func(out[93],in[93]) ; \ - case 93: out[92] = current_func(out[92],in[92]) ; \ - case 92: out[91] = current_func(out[91],in[91]) ; \ - case 91: out[90] = current_func(out[90],in[90]) ; \ - case 90: out[89] = current_func(out[89],in[89]) ; \ - case 89: out[88] = current_func(out[88],in[88]) ; \ - case 88: out[87] = current_func(out[87],in[87]) ; \ - case 87: out[86] = current_func(out[86],in[86]) ; \ - case 86: out[85] = current_func(out[85],in[85]) ; \ - case 85: out[84] = current_func(out[84],in[84]) ; \ - case 84: out[83] = current_func(out[83],in[83]) ; \ - case 83: out[82] = current_func(out[82],in[82]) ; \ - case 82: out[81] = current_func(out[81],in[81]) ; \ - case 81: out[80] = current_func(out[80],in[80]) ; \ - case 80: out[79] = current_func(out[79],in[79]) ; \ - case 79: out[78] = current_func(out[78],in[78]) ; \ - case 78: out[77] = current_func(out[77],in[77]) ; \ - case 77: out[76] = current_func(out[76],in[76]) ; \ - case 76: out[75] = current_func(out[75],in[75]) ; \ - case 75: out[74] = current_func(out[74],in[74]) ; \ - case 74: out[73] = current_func(out[73],in[73]) ; \ - case 73: out[72] = current_func(out[72],in[72]) ; \ - case 72: out[71] = current_func(out[71],in[71]) ; \ - case 71: out[70] = current_func(out[70],in[70]) ; \ - case 70: out[69] = current_func(out[69],in[69]) ; \ - case 69: out[68] = current_func(out[68],in[68]) ; \ - case 68: out[67] = current_func(out[67],in[67]) ; \ - case 67: out[66] = current_func(out[66],in[66]) ; \ - case 66: out[65] = current_func(out[65],in[65]) ; \ - case 65: out[64] = current_func(out[64],in[64]) ; \ - case 64: out[63] = current_func(out[63],in[63]) ; \ - case 63: out[62] = current_func(out[62],in[62]) ; \ - case 62: out[61] = current_func(out[61],in[61]) ; \ - 
case 61: out[60] = current_func(out[60],in[60]) ; \ - case 60: out[59] = current_func(out[59],in[59]) ; \ - case 59: out[58] = current_func(out[58],in[58]) ; \ - case 58: out[57] = current_func(out[57],in[57]) ; \ - case 57: out[56] = current_func(out[56],in[56]) ; \ - case 56: out[55] = current_func(out[55],in[55]) ; \ - case 55: out[54] = current_func(out[54],in[54]) ; \ - case 54: out[53] = current_func(out[53],in[53]) ; \ - case 53: out[52] = current_func(out[52],in[52]) ; \ - case 52: out[51] = current_func(out[51],in[51]) ; \ - case 51: out[50] = current_func(out[50],in[50]) ; \ - case 50: out[49] = current_func(out[49],in[49]) ; \ - case 49: out[48] = current_func(out[48],in[48]) ; \ - case 48: out[47] = current_func(out[47],in[47]) ; \ - case 47: out[46] = current_func(out[46],in[46]) ; \ - case 46: out[45] = current_func(out[45],in[45]) ; \ - case 45: out[44] = current_func(out[44],in[44]) ; \ - case 44: out[43] = current_func(out[43],in[43]) ; \ - case 43: out[42] = current_func(out[42],in[42]) ; \ - case 42: out[41] = current_func(out[41],in[41]) ; \ - case 41: out[40] = current_func(out[40],in[40]) ; \ - case 40: out[39] = current_func(out[39],in[39]) ; \ - case 39: out[38] = current_func(out[38],in[38]) ; \ - case 38: out[37] = current_func(out[37],in[37]) ; \ - case 37: out[36] = current_func(out[36],in[36]) ; \ - case 36: out[35] = current_func(out[35],in[35]) ; \ - case 35: out[34] = current_func(out[34],in[34]) ; \ - case 34: out[33] = current_func(out[33],in[33]) ; \ - case 33: out[32] = current_func(out[32],in[32]) ; \ - case 32: out[31] = current_func(out[31],in[31]) ; \ - case 31: out[30] = current_func(out[30],in[30]) ; \ - case 30: out[29] = current_func(out[29],in[29]) ; \ - case 29: out[28] = current_func(out[28],in[28]) ; \ - case 28: out[27] = current_func(out[27],in[27]) ; \ - case 27: out[26] = current_func(out[26],in[26]) ; \ - case 26: out[25] = current_func(out[25],in[25]) ; \ - case 25: out[24] = current_func(out[24],in[24]) ; \ - 
case 24: out[23] = current_func(out[23],in[23]) ; \ - case 23: out[22] = current_func(out[22],in[22]) ; \ - case 22: out[21] = current_func(out[21],in[21]) ; \ - case 21: out[20] = current_func(out[20],in[20]) ; \ - case 20: out[19] = current_func(out[19],in[19]) ; \ - case 19: out[18] = current_func(out[18],in[18]) ; \ - case 18: out[17] = current_func(out[17],in[17]) ; \ - case 17: out[16] = current_func(out[16],in[16]) ; \ - case 16: out[15] = current_func(out[15],in[15]) ; \ - case 15: out[14] = current_func(out[14],in[14]) ; \ - case 14: out[13] = current_func(out[13],in[13]) ; \ - case 13: out[12] = current_func(out[12],in[12]) ; \ - case 12: out[11] = current_func(out[11],in[11]) ; \ - case 11: out[10] = current_func(out[10],in[10]) ; \ - case 10: out[9] = current_func(out[9],in[9]) ; \ - case 9: out[8] = current_func(out[8],in[8]) ; \ - case 8: out[7] = current_func(out[7],in[7]) ; \ - case 7: out[6] = current_func(out[6],in[6]) ; \ - case 6: out[5] = current_func(out[5],in[5]) ; \ - case 5: out[4] = current_func(out[4],in[4]) ; \ - case 4: out[3] = current_func(out[3],in[3]) ; \ - case 3: out[2] = current_func(out[2],in[2]) ; \ - case 2: out[1] = current_func(out[1],in[1]) ; \ - case 1: out[0] = current_func(out[0],in[0]) ; \ - }\ + while( left_over > 0 ) { \ + int how_much = (left_over > 32) ? 
32 : left_over; \ + switch(left_over) { \ + case 32: out[31] = current_func(out[31],in[31]) ; \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ } \ } @@ -332,12 +112,6 @@ OP_SVE_FUNC(max, d, 64, int64_t, max) OP_SVE_FUNC(max, d, 64, uint64_t, max) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(max, h, 16, float16_t, 
max) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(max, h, 16, float16_t, max) -#endif OP_SVE_FUNC(max, w, 32, float32_t, max) OP_SVE_FUNC(max, d, 64, float64_t, max) @@ -355,12 +129,6 @@ OP_SVE_FUNC(min, d, 64, int64_t, min) OP_SVE_FUNC(min, d, 64, uint64_t, min) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(min, h, 16, float16_t, min) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(min, h, 16, float16_t, min) -#endif OP_SVE_FUNC(min, w, 32, float32_t, min) OP_SVE_FUNC(min, d, 64, float64_t, min) @@ -378,12 +146,6 @@ OP_SVE_FUNC(sum, d, 64, int64_t, add) OP_SVE_FUNC(sum, d, 64, uint64_t, add) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(sum, h, 16, float16_t, add) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(sum, h, 16, float16_t, add) -#endif OP_SVE_FUNC(sum, w, 32, float32_t, add) OP_SVE_FUNC(sum, d, 64, float64_t, add) @@ -401,14 +163,8 @@ OP_SVE_FUNC(prod, d, 64, int64_t, mul) OP_SVE_FUNC(prod, d, 64, uint64_t, mul) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -OP_SVE_FUNC(prod, h, 16, float16_t, mul) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_SVE_FUNC(prod, h, 16, float16_t, mul) -#endif OP_SVE_FUNC(prod, w, 32, float32_t, mul) -OP_SVE_FUNC(prod, d, 64, float64_t, mul) + OP_SVE_FUNC(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND @@ -498,12 +254,6 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) -#endif OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) @@ -519,12 +269,6 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) - /* 
Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) -#endif OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) @@ -540,12 +284,6 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) -#endif OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) @@ -559,16 +297,10 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) -OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) -#endif OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) -OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND @@ -580,7 +312,7 @@ OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) -OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) /************************************************************************* * Bitwise OR @@ -592,7 +324,7 @@ OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) 
OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) -OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) /************************************************************************* * Bitwise XOR @@ -604,7 +336,7 @@ OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) -OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ @@ -619,16 +351,10 @@ OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) /** Floating point, including all the Fortran reals *********************/ -#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) -#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float16_t -#else -#define SHORT_FLOAT(name, ftype) NULL -#endif #define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t #define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t #define FLOATING_POINT(name, ftype) \ - [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) From 6c3bde95b7cad83b28cc866c9dc8f0f56358c57d Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 17:08:19 -0500 Subject: [PATCH 12/13] add reduce_local test code Signed-off-by: Dong Zhong --- test/datatype/reduce_local.c | 1149 ++++++++++++++++++++++++++++++++++ 1 file changed, 1149 insertions(+) create mode 100644 test/datatype/reduce_local.c diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c new file mode 100644 index 00000000000..b2fe9b352c3 --- /dev/null +++ b/test/datatype/reduce_local.c @@ -0,0 +1,1149 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2020 The University of Tennessee and The 
University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" +#include "ompi/communicator/communicator.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/datatype/ompi_datatype.h" + +static void print_status(char* op, char* type, int correctness) +{ + if(correctness) + printf("%s %s [\033[1;32msuccess\033[0m]", op, type); + else + printf("%s %s [\033[1;31mfail\033[0m]", op, type); +} + +int main(int argc, char **argv) +{ + static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; + int count, elem_size, rank, size, len, provided, correctness = 1, i; + double tstart, tend; + char *type, *op; + + if(argc < 4 ) { + fprintf(stderr, + "Less arguments than expected (we need at least 3): \n" + " : [i, u, f, d]\n" + " ; [sum, max, min, bor, bxor, mul, band]\n"); + exit(-1); + } + count = atoi(argv[1]); + type = argv[2]; + elem_size = atoi(argv[3]); + op = argv[4]; + + if( count <= 0 ) { + printf("The number of elements should be positive\n"); + exit(-1); + } + if( (0 != (elem_size%8)) || (elem_size <= 0) || (elem_size > 64) ) { + printf("The element type should be 8, 16, 32 or 64\n"); + exit(-2); + } + + in_buf = malloc(count * sizeof(double)); + inout_buf = malloc(count * sizeof(double)); + inout_check_buf = malloc(count * sizeof(double)); + + ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); + + rank = ompi_comm_rank(MPI_COMM_WORLD); + size = ompi_comm_size(MPI_COMM_WORLD); + + if(*type=='i') { + if( 8 == elem_size ) { + int8_t *in_int8 = (int8_t*)in_buf, + *inout_int8 = (int8_t*)inout_buf, + *inout_int8_for_check = (int8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int8[i] = 5; + inout_int8[i] = inout_int8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + 
printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] + inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int8, in_int8, count, MPI_INT8_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] | inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s 
for %s: %d \n", "MPI_BXOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] ^ inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8,inout_int8,count, MPI_INT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] * inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] & inout_int8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + int16_t *in_int16 = (int16_t*)in_buf, + *inout_int16 = (int16_t*)inout_buf, + *inout_int16_for_check = (int16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int16[i] = 5; + inout_int16[i] = inout_int16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_int16[i] != 
(int16_t)(in_int16[i] + inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int16, in_int16, count, MPI_INT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] | inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] ^ 
inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (int16_t)(in_int16[i] * inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] & inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", correctness); + } + } + if( 32 == elem_size ) { + int32_t *in_int32 = (int32_t*)in_buf, + *inout_int32 = (int32_t*)inout_buf, + *inout_int32_for_check = (int32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int32[i] = 5; + inout_int32[i] = inout_int32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] + inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + 
if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int32, in_int32, count, MPI_INT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] | inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] * inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, 
"band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] & inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] ^ inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_int64 = (int64_t*)in_buf, + *inout_int64 = (int64_t*)inout_buf, + *inout_int64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int64[i] = 5; + inout_int64[i] = inout_int64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] + inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_MAX); + tend = 
MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] | inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] ^ inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64,inout_int64,count, MPI_INT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 
1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] * inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] & inout_int64_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + if(*type=='u') { + if( 8 == elem_size ) { + uint8_t *in_uint8 = (uint8_t*)in_buf, + *inout_uint8 = (uint8_t*)inout_buf, + *inout_uint8_for_check = (uint8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint8[i] = 5; + inout_uint8[i] = inout_uint8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (uint8_t)(in_uint8[i] + inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != inout_uint8_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + 
break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != in_uint8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] | inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] ^ inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_UINT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (int8_t)(in_uint8[i] * inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at 
position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] & inout_uint8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + uint16_t *in_uint16 = (uint16_t*)in_buf, + *inout_uint16 = (uint16_t*)inout_buf, + *inout_uint16_for_check = (uint16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint16[i] = 5; + inout_uint16[i] = inout_uint16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] + inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != inout_uint16_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", 
"MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != in_uint16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] | inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] ^ inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] * inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local 
Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] & inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", correctness); + } + } + if( 32 == elem_size ) { + uint32_t *in_uint32 = (uint32_t*)in_buf, + *inout_uint32 = (uint32_t*)inout_buf, + *inout_uint32_for_check = (uint32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint32[i] = 5; + inout_uint32[i] = inout_uint32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] + inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != inout_uint32_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( 
correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != in_uint32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] | inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] * inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] & inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, 
MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] ^ inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_uint64 = (int64_t*)in_buf, + *inout_uint64 = (int64_t*)inout_buf, + *inout_uint64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint64[i] = 5; + inout_uint64[i] = inout_uint64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] + inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_uint64, in_uint64, count, MPI_UINT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + 
break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] | inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] ^ inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] * inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] & inout_uint64_for_check[i]) ) { + if( correctness ) 
+ printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + + if(*type=='f') { + float *in_float = (float*)in_buf, + *inout_float = (float*)inout_buf, + *inout_float_for_check = (float*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_float[i] = 1000.0+1; + inout_float[i] = inout_float_for_check[i] = 100.0+2; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != inout_float_for_check[i]+in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_float,in_float,count, MPI_FLOAT, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + 
MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i] * inout_float_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_FLOAT", correctness); + } + } + + if(*type=='d') { + double *in_double = (double*)in_buf, + *inout_double = (double*)inout_buf, + *inout_double_for_check = (double*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_double[i] = 10.0+1; + inout_double[i] = inout_double_for_check[i] = 1.0+2; + } + + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]+in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_double, in_double, count, MPI_DOUBLE, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position 
%d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_DOUBLE", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]*in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_DOUBLE", correctness); + } + } + //tstart = MPI_Wtime(); + //memcpy(in_uint8,inout_uint8, count); + //memcpy(in_float, inout_float, count); + //memcpy(in_double, inout_double, count); + printf(" count %d time %.6f seconds\n",count, tend-tstart); + ompi_mpi_finalize(); + + free(in_buf); + free(inout_buf); + free(inout_check_buf); + + return correctness ? 0 : -1; +} + From b418d37886a16f7af0f70df610dde40ad30ac974 Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Mon, 24 Feb 2020 15:09:02 -0500 Subject: [PATCH 13/13] shrink duff device code Signed-off-by: Dong Zhong --- ompi/mca/op/sve/op_sve_functions.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c index 1b14affe336..4e86f5b2471 100644 --- a/ompi/mca/op/sve/op_sve_functions.c +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -57,32 +57,8 @@ } \ \ while( left_over > 0 ) { \ - int how_much = (left_over > 32) ? 32 : left_over; \ + int how_much = (left_over > 8) ? 
8 : left_over; \ switch(left_over) { \ - case 32: out[31] = current_func(out[31],in[31]) ; \ - case 31: out[30] = current_func(out[30],in[30]) ; \ - case 30: out[29] = current_func(out[29],in[29]) ; \ - case 29: out[28] = current_func(out[28],in[28]) ; \ - case 28: out[27] = current_func(out[27],in[27]) ; \ - case 27: out[26] = current_func(out[26],in[26]) ; \ - case 26: out[25] = current_func(out[25],in[25]) ; \ - case 25: out[24] = current_func(out[24],in[24]) ; \ - case 24: out[23] = current_func(out[23],in[23]) ; \ - case 23: out[22] = current_func(out[22],in[22]) ; \ - case 22: out[21] = current_func(out[21],in[21]) ; \ - case 21: out[20] = current_func(out[20],in[20]) ; \ - case 20: out[19] = current_func(out[19],in[19]) ; \ - case 19: out[18] = current_func(out[18],in[18]) ; \ - case 18: out[17] = current_func(out[17],in[17]) ; \ - case 17: out[16] = current_func(out[16],in[16]) ; \ - case 16: out[15] = current_func(out[15],in[15]) ; \ - case 15: out[14] = current_func(out[14],in[14]) ; \ - case 14: out[13] = current_func(out[13],in[13]) ; \ - case 13: out[12] = current_func(out[12],in[12]) ; \ - case 12: out[11] = current_func(out[11],in[11]) ; \ - case 11: out[10] = current_func(out[10],in[10]) ; \ - case 10: out[9] = current_func(out[9],in[9]) ; \ - case 9: out[8] = current_func(out[8],in[8]) ; \ case 8: out[7] = current_func(out[7],in[7]) ; \ case 7: out[6] = current_func(out[6],in[6]) ; \ case 6: out[5] = current_func(out[5],in[5]) ; \