diff --git a/ARM_SVE_README b/ARM_SVE_README new file mode 100644 index 00000000000..a4670d9a8df --- /dev/null +++ b/ARM_SVE_README @@ -0,0 +1,30 @@ +For configuration and installation details please check the script arm_install.sh + +Run test: (Test example takes 4 args) +arg1: elements count for operation +arg2: elements type, could be: i (integer), f (float), d (double) +arg3: type size in bits, only applies when you set arg2 to i. e.g.: i 8 will be converted to int8; i 16 to int16 +arg4: operation type. Could be: max, min, sum, mul, band, bor, bxor +If you want to use the SVE module for MPI ops, you need to pass mca params as: -mca op sve -mca op_sve_hardware_available 1 +======= +Example for test +$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 33 i 8 min + +If you don't need armie you can remove the ARMIE part of the command line, as in: +$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 /ompi/test/datatype/Reduce_local_float 33 i 8 min + +How do we evaluate the performance? +====== +Logic: + +Start_time; +MPI_reduce_local(...); +End_time; + +Reduce_time = End_time - Start_time; + +Possible issues (this happened on a ThunderX2 machine): +====== +Reason for "-mca pml ob1": on the Arm machine the default pml module causes a problem with armie (instruction not supported, reason unknown), but with ob1 it works.
+ + diff --git a/arm_install.sh b/arm_install.sh new file mode 100644 index 00000000000..0b8afef93b0 --- /dev/null +++ b/arm_install.sh @@ -0,0 +1,11 @@ +mkdir build + +./autogen.pl >/dev/null + +./configure --prefix=$PWD/build --enable-mpirun-prefix-by-default --enable-debug CC=armclang CFLAGS="-march=armv8-a+sve" CXX=armclang++ FC=armflang >/dev/null + +./config.status >/dev/null +make -j 128 install >/dev/null + +## compile the test code, test code under ompi/test/datapyte/Reduce_local_float.c +./build/bin/mpicc -g -O3 -march=armv8-a+sve -o ./test/datatype/Reduce_local_float ./test/datatype/Reduce_local_float.c diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..e942d1a8c21 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/sve/Makefile.am b/ompi/mca/op/sve/Makefile.am new file mode 100644 index 00000000000..9f6e747b384 --- /dev/null +++ b/ompi/mca/op/sve/Makefile.am @@ -0,0 +1,70 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an sve op component. This Makefile.am is a typical +# sve of how to integrate into Open MPI's Automake-based build +# system. 
+# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_sve.h \ + op_sve_component.c \ + op_sve_functions.h \ + op_sve_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_sve_DSO +component_noinst = +component_install = mca_op_sve.la +else +component_install = +component_noinst = libmca_op_sve.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_sve_la_SOURCES = $(sources) +mca_op_sve_la_LDFLAGS = -module -avoid-version +mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. 
+ +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_sve_la_SOURCES = $(sources) +libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/sve/configure.m4 b/ompi/mca/op/sve/configure.m4 new file mode 100644 index 00000000000..47138dd44a5 --- /dev/null +++ b/ompi/mca/op/sve/configure.m4 @@ -0,0 +1,34 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_sve_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. +AC_DEFUN([MCA_ompi_op_sve_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/sve/Makefile]) + case "${host}" in + aarch64) + op_sve_support="yes";; + *) + op_sve_support="no";; + esac + [$1] + AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sve_support], + [test "$op_sve_support" = "yes"]) + AC_SUBST(MCA_BUILD_ompi_op_has_sve_support) + + AS_IF([test $op_sve_support == "yes"], + [$1], + [$2]) +])dnl diff --git a/ompi/mca/op/sve/op_sve.h b/ompi/mca/op/sve/op_sve.h new file mode 100644 index 00000000000..21483d1fa56 --- /dev/null +++ b/ompi/mca/op/sve/op_sve.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_SVE_EXPORT_H +#define MCA_OP_SVE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. 
+ */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is sve-component-specific cached information. We + tend to use this scheme (caching information on the sve + component itself) instead of lots of individual global + variables for the component. The following data fields are + sves; replace them with whatever is relevant for your + component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_sve_component_t; + +/** + * Globally exported variable. Note that it is a *sve* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_sve_component_t + mca_op_sve_component; + +END_C_DECLS + +#endif /* MCA_OP_SVE_EXPORT_H */ diff --git a/ompi/mca/op/sve/op_sve_component.c b/ompi/mca/op/sve/op_sve_component.c new file mode 100644 index 00000000000..16393aefe23 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_component.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "sve" component source code. 
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +static int sve_component_open(void); +static int sve_component_close(void); +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority); +static int sve_component_register(void); + +ompi_op_sve_component_t mca_op_sve_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "sve", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = sve_component_open, + .mca_close_component = sve_component_close, + .mca_register_component_params = sve_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = sve_component_init_query, + .opc_op_query = sve_component_op_query, + }, +}; + +/* + * Component open + */ +static int sve_component_open(void) +{ + + /* A first level check to see if sve is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). 
+ */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int sve_component_close(void) +{ + + /* If sve was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int sve_component_register(void) +{ + + /* Additionally, since this component is simulating hardware, + let's make MCA params that determine whethere a) the hardware + is available, and b) whether double precision floating point + types are supported. This allows you to change the behavior of + this component at run-time (by setting these MCA params at + run-time), simulating different kinds of hardware. */ + mca_op_sve_component.hardware_available = false; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "hardware_available", + "Whether the hardware is available or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.hardware_available); + + mca_op_sve_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_sve_component.hardware_available) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t); + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + int i=0; + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LXOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MAXLOC: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MINLOC: + module= NULL; + break; + default: + module= NULL; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *sve* component pointer + (vs. 
a *base* component pointer -- where an *sve* component +       is a base component plus some other module-specific cached +       information), so we have to cast it to the right pointer type +       before returning. */ +    if (NULL != module) { +        *priority = 50; +    } +    return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c new file mode 100644 index 00000000000..4e86f5b2471 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2019      The University of Tennessee and The University + *                         of Tennessee Research Foundation.  All rights + *                         reserved. + * + * Copyright (c) 2019      Arm Ltd.  All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif /* __ARM_FEATURE_SVE */ + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types.  The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in).
+ * + */ +#define OP_SVE_FUNC(name, type_name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + in += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 8 : left_over; \ + switch(left_over) { \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) + OP_SVE_FUNC(max, b, 8, int8_t, max) + OP_SVE_FUNC(max, b, 8, uint8_t, max) + OP_SVE_FUNC(max, h, 16, int16_t, max) + OP_SVE_FUNC(max, h, 16, uint16_t, max) + OP_SVE_FUNC(max, w, 32, int32_t, max) + OP_SVE_FUNC(max, w, 32, uint32_t, max) + OP_SVE_FUNC(max, d, 64, int64_t, max) + OP_SVE_FUNC(max, d, 64, uint64_t, max) + + OP_SVE_FUNC(max, w, 32, float32_t, max) + OP_SVE_FUNC(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_SVE_FUNC(min, b, 8, int8_t, min) + OP_SVE_FUNC(min, b, 8, uint8_t, min) + OP_SVE_FUNC(min, h, 16, int16_t, min) + OP_SVE_FUNC(min, h, 16, uint16_t, min) + OP_SVE_FUNC(min, w, 32, int32_t, min) + OP_SVE_FUNC(min, w, 32, uint32_t, min) + OP_SVE_FUNC(min, d, 64, int64_t, min) + OP_SVE_FUNC(min, d, 64, uint64_t, min) + + OP_SVE_FUNC(min, w, 32, float32_t, min) + OP_SVE_FUNC(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_SVE_FUNC(sum, b, 8, int8_t, add) + OP_SVE_FUNC(sum, b, 8, uint8_t, add) + OP_SVE_FUNC(sum, h, 16, int16_t, add) + OP_SVE_FUNC(sum, h, 16, uint16_t, add) + OP_SVE_FUNC(sum, w, 32, int32_t, add) + OP_SVE_FUNC(sum, w, 32, uint32_t, add) + OP_SVE_FUNC(sum, d, 64, int64_t, add) + OP_SVE_FUNC(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC(sum, w, 32, float32_t, add) + OP_SVE_FUNC(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_SVE_FUNC(prod, b, 8, int8_t, mul) + OP_SVE_FUNC(prod, b, 8, uint8_t, mul) + 
OP_SVE_FUNC(prod, h, 16, int16_t, mul) + OP_SVE_FUNC(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC(prod, w, 32, int32_t, mul) + OP_SVE_FUNC(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC(prod, d, 64, int64_t, mul) + OP_SVE_FUNC(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC(prod, w, 32, float32_t, mul) + OP_SVE_FUNC(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_SVE_FUNC(band, b, 8, int8_t, and) + OP_SVE_FUNC(band, b, 8, uint8_t, and) + OP_SVE_FUNC(band, h, 16, int16_t, and) + OP_SVE_FUNC(band, h, 16, uint16_t, and) + OP_SVE_FUNC(band, w, 32, int32_t, and) + OP_SVE_FUNC(band, w, 32, uint32_t, and) + OP_SVE_FUNC(band, d, 64, int64_t, and) +OP_SVE_FUNC(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_SVE_FUNC(bor, b, 8, int8_t, orr) + OP_SVE_FUNC(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC(bor, h, 16, int16_t, orr) + OP_SVE_FUNC(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC(bor, w, 32, int32_t, orr) + OP_SVE_FUNC(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC(bor, d, 64, int64_t, orr) +OP_SVE_FUNC(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_SVE_FUNC(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC(bxor, d, 64, uint64_t, 
eor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ + static void ompi_op_sve_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC_3BUFF(max, b, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) + + OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC_3BUFF(min, b, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, b, 
8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) + + OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, b, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, b, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC_3BUFF(prod, b, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_FUNC_3BUFF(band, b, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, int16_t, and) + 
OP_SVE_FUNC_3BUFF(band, h, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bor, b, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_sve_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_sve_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = 
ompi_op_sve_##ftype##_##name##_uint64_t + + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(sum, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + 
[OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + +}; + +ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(sum, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/sve/op_sve_functions.h b/ompi/mca/op/sve/op_sve_functions.h new file mode 100644 index 
00000000000..00db651cfac --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019      The University of Tennessee and The University + *                         of Tennessee Research Foundation.  All rights + *                         reserved. + * + * Copyright (c) 2019      Arm Ltd.  All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/sve/op_sve.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/HOW_TO_TEST_README.txt b/test/datatype/HOW_TO_TEST_README.txt new file mode 100644 index 00000000000..4d6025c95ee --- /dev/null +++ b/test/datatype/HOW_TO_TEST_README.txt @@ -0,0 +1,17 @@ +(1) Reduce_uint8.c : test code for MPI_SUM operation with type uint8. +    compile as : $path/mpicc -march=armv8-a+sve -O3 -o Reduce_uint8 Reduce_uint8.c +(2) sve_uint8_test.sh: A shell script that will generate results with time information for MPI_SUM operation with different message size. +(3) SVE_MPI_Op.py: A python script which you can use to generate plot. + +ALL YOU NEED TO DO IS: +(1) Change the path of your mpirun and test binary in sve_uint8_test.sh. +(2) Run sve_uint8_test.sh +    This will generate 3 types of output files with names +    a. sve-sum-vectorlength.txt     ##MPI_SUM with SVE-enabled operation +    b. no-sve-sum-vectorlength.txt  ##MPI_SUM without SVE-enabled operation +    c.
sve-cpy-vectorlength.txt ## memcpy +(3) run python scripts as + $python SVE_MPI_Op.py sve-sum-vectorlength.txt no-sve-sum-vectorlength.txt + make sure to put sve-sum-vectorlength.txt before no-sve-sum-vectorlength.txt + example : SVE_MPI_Op.py sve-sum-128.txt no-sve-sum-128.txt + diff --git a/test/datatype/SVE_MPI_Op.py b/test/datatype/SVE_MPI_Op.py new file mode 100644 index 00000000000..5051936e29d --- /dev/null +++ b/test/datatype/SVE_MPI_Op.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +import numpy as np +import matplotlib.pyplot as plt +import sys + +print("\n \n Processing first file \n") + +filename1 = sys.argv[1] +f1 = open(filename1,"r") +lines1 = f1.readlines() +f1.close() + +noMin1 = {} +sortednomin1 = {} + +#Select lines with key +for line in lines1: + if "time" in line: + vals = line.split( ) + noMin1.setdefault((int)(vals[2]), []).append((float)(vals[4])) + +sortednomin1=sorted(noMin1.items()) +num_of_elements1 = [x[0] for x in sortednomin1] +Reduce_latency1 = [x[1] for x in sortednomin1] +print(Reduce_latency1) + +print("\n \n Processing second file \n") + + +filename2 = sys.argv[2] +f2 = open(filename2,"r") +lines2 = f2.readlines() +f2.close() + +noMin2 = {} +sortednomin2 = {} + + +#Select lines with key +for line in lines2: + if "time" in line: + vals = line.split( ) + noMin2.setdefault((int)(vals[2]), []).append((float)(vals[4])) + +sortednomin2=sorted(noMin2.items()) +num_of_elements2 = [x[0] for x in sortednomin2] +Reduce_latency2 = [x[1] for x in sortednomin2] +print(Reduce_latency2) + +Msg=[1,2,3,4,5,6,7,8,9] + +Msg_size=['1k', '4k', '16k', '64k', '256k', '1M', '4M', '16M', '32M'] + +meanlineprops = dict(linewidth=2) +fig, (ax1) = plt.subplots(1, figsize=(6, 4)) + +ax1.grid(True,linestyle='--') + + +bp1 =ax1.boxplot(Reduce_latency1, notch=1, sym='+', vert=1, whis=1.5,patch_artist=True) +bp2 =ax1.boxplot(Reduce_latency2, notch=0, sym='+', vert=1, whis=1.5,patch_artist=True) +#bp3 =ax1.boxplot(cpy, notch=0, sym='+', vert=1, 
whis=1.5,patch_artist=True) +#ax1.plot(Msg, cpy, linestyle='-.', lw=2.5, marker='o',markersize='8', color='b', label="0.1s") + +colors = ['red','red','red','red','red','red','red','red','red'] +for patch, color in zip(bp2['boxes'], colors): + patch.set_facecolor(color) + +colors = ['purple','purple','purple','purple','purple','purple','purple','purple','purple'] +for patch, color in zip(bp1['boxes'], colors): + patch.set_facecolor(color) + +""" +colors = ['green','green','green','green','green','green'] +for patch, color in zip(bp3['boxes'], colors): + patch.set_facecolor(color) +""" +fig.suptitle('MPI_Op: mul; MPI_type: uint8 \n(Configured with optimized, Flushed cache)', x=0.45, y=1.01, va='center',fontsize=15) +fig.text(-0.03, 0.5, 'Reduce time (s)', va='center', rotation='vertical',fontsize=12) +plt.xlabel('Buffer size (Bytes)',fontsize=12) + +ax1.set_xticks(Msg) +ax1.set_xticklabels(Msg_size, minor=False) + + +ax1.legend([bp2["boxes"][0], bp1["boxes"][0]], ['NO_SVE','SVE'], loc='upper left',fontsize=12)#, bbox_to_anchor=(1, 0.5),fontsize=15) + +plt.yscale('log') + +plt.savefig(filename1+"_sum.pdf",dpi=300,bbox_inches='tight') diff --git a/test/datatype/correctness_check.sh b/test/datatype/correctness_check.sh new file mode 100644 index 00000000000..cf9f09a801e --- /dev/null +++ b/test/datatype/correctness_check.sh @@ -0,0 +1,59 @@ +echo "ompi version with SVE -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. 
args4 operation" +echo "your_path/mpirun -mca op sve -mca op_sve_hardware_available 1 -np 1 /your_test_path/Reduce_local_float 1048576 i 8 max" + +Orange="\033[0;33m" +Blue='\033[0;34m' +Purple='\033[0;35m' + +NC='\033[0m' + +# test all vector size +for vector_len in 128 256 512 1024 2048 +do + + echo "=========Integer type all operations & all sizes========" + echo "" + for op in max min sum mul band bor bxor + do + echo "" + echo "===Operation $op test===" + for size in 8 16 32 64 + do + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 i $size $op + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 i $size $op + done + done + + echo "=======Float type all operations========" + echo "" + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 f 32 $op + done + + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < 
Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 f 32 $op + done + + + echo "========Double type all operations=========" + echo "" + + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 d 64 $op + done + + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 d 64 $op + done +done +##echo -e "Test \e[1;35m duff device code \e[m 2048*N < Total_num_bits < 2048*N + 256 "