diff --git a/config/opal_config_asm.m4 b/config/opal_config_asm.m4 index b90c4aa61bd..a11682ce24e 100644 --- a/config/opal_config_asm.m4 +++ b/config/opal_config_asm.m4 @@ -1374,7 +1374,7 @@ AC_MSG_ERROR([Can not continue.]) # Check for RDTSCP support result=0 - AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"], + AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"], [AC_MSG_CHECKING([for RDTSCP assembly support]) AC_LANG_PUSH([C]) AC_TRY_RUN([[ diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..c4d47f9e64a 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am new file mode 100644 index 00000000000..3a02d7510fb --- /dev/null +++ b/ompi/mca/op/avx/Makefile.am @@ -0,0 +1,69 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This component provide support for the Advanced Vector Extensions (AVX) +# available in recent versions of x86 processors. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +AM_CPPFLAGS=$(op_avx_CPPFLAGS) + +sources = op_avx_component.c op_avx_functions.c \ + op_avx_functions.h op_avx.h + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. 
As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_avx_DSO +component_noinst = +component_install = mca_op_avx.la +else +component_install = +component_noinst = libmca_op_avx.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_avx_la_SOURCES = $(sources) +mca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS) +mca_op_avx_la_LDFLAGS = -module -avoid-version + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. + +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_avx_la_SOURCES = $(sources) +libmca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS) +libmca_op_avx_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4 new file mode 100644 index 00000000000..ab21f057933 --- /dev/null +++ b/ompi/mca/op/avx/configure.m4 @@ -0,0 +1,67 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_avx_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. 
AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
    # The component Makefile must always be generated -- even when the
    # component turns out not to be buildable -- so that "make dist"
    # and out-of-tree builds keep working.  (The original only called
    # AC_CONFIG_FILES when op_avx_support was 1.)
    AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])

    op_avx_support=0
    OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save op_avx_CPPFLAGS])

    AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
          [AC_LANG_PUSH([C])

           # Check for AVX512 support with default flags
           op_avx_CPPFLAGS=""
           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
           AC_LINK_IFELSE(
               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
                       [[
    __m512 vA, vB;
    _mm512_add_ps(vA, vB);
                       ]])],
               [op_avx_support=1
                AC_MSG_RESULT([yes])],
               [AC_MSG_RESULT([no])])

           # Retry with an explicit -march that enables the AVX-512 ISA
           AS_IF([test $op_avx_support -eq 0],
                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
                  op_avx_cflags_save="$CFLAGS"
                  CFLAGS="$CFLAGS -march=skylake-avx512"
                  AC_LINK_IFELSE(
                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
                              [[
    __m512 vA, vB;
    _mm512_add_ps(vA, vB);
                              ]])],
                      [op_avx_support=1
                       op_avx_CPPFLAGS="-march=skylake-avx512"
                       AC_MSG_RESULT([yes])],
                      [AC_MSG_RESULT([no])])
                  CFLAGS="$op_avx_cflags_save"
                 ])
           AC_LANG_POP([C])
          ])

    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
                       [$op_avx_support],
                       [Whether AVX512 is supported in the current compilation context])
    AC_SUBST([op_avx_CPPFLAGS])
    OPAL_VAR_SCOPE_POP
    AS_IF([test $op_avx_support -eq 1],
          [$1],
          [$2])
])dnl
diff --git a/ompi/mca/op/avx/op_avx.h b/ompi/mca/op/avx/op_avx.h
new file mode 100644
index 00000000000..927a48db947
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx.h
@@ -0,0 +1,58 @@
/*
 * Copyright (c) 2019 The University of Tennessee and The University
 *                    of Tennessee Research Foundation.  All rights
 *                    reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_OP_AVX_EXPORT_H
#define MCA_OP_AVX_EXPORT_H

#include "ompi_config.h"

#include "ompi/mca/mca.h"
#include "opal/class/opal_object.h"

#include "ompi/mca/op/op.h"

BEGIN_C_DECLS

/**
 * Derive a struct from the base op component struct, allowing us to
 * cache some component-specific information on our well-known
 * component struct.
 */
typedef struct {
    /** The base op component struct */
    ompi_op_base_component_1_0_0_t super;

    /* Component-specific cached information lives here, on the
       component itself, instead of in scattered global variables. */

    /** A simple boolean indicating whether double precision is
        supported. */
    bool double_supported;
} ompi_op_avx_component_t;

/**
 * Globally exported variable.  Note that it is an *avx* component
 * (defined above), which has the ompi_op_base_component_t as its
 * first member.  Hence, the MCA/op framework will find the data that
 * it expects in the first memory locations, but then the component
 * itself can cache additional information after that that can be used
 * by both the component and modules.
 */
OMPI_DECLSPEC extern ompi_op_avx_component_t
    mca_op_avx_component;

END_C_DECLS

#endif /* MCA_OP_AVX_EXPORT_H */
diff --git a/ompi/mca/op/avx/op_avx_component.c b/ompi/mca/op/avx/op_avx_component.c
new file mode 100644
index 00000000000..5113a6cf437
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx_component.c
@@ -0,0 +1,231 @@
/*
 * Copyright (c) 2019 The University of Tennessee and The University
 *                    of Tennessee Research Foundation.  All rights
 *                    reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/** @file
 *
 * This is the "avx" component source code.
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/avx/op_avx.h" +#include "ompi/mca/op/avx/op_avx_functions.h" + +static int avx_component_open(void); +static int avx_component_close(void); +static int avx_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + avx_component_op_query(struct ompi_op_t *op, int *priority); +static int avx_component_register(void); + +/** + * A slightly modified code from + * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300) + +#include + +int has_intel_AVX512_features(void) +{ + const unsigned long avx512_features = _FEATURE_AVX2; + + return _may_i_use_cpu_feature( avx512_features ); +} +#else /* non-Intel compiler */ +#include + +#if defined(_MSC_VER) +#include +#endif + +void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + uint32_t ebx, edx; +#if defined( __i386__ ) && defined ( __PIC__ ) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), + "+a" (eax), "=c" (ecx), "=d" (edx) ); +#else + __asm__ ( "cpuid" : "=b" (ebx), + "+a" (eax), "=c" (ecx), "=d" (edx) ); +#endif /* defined( __i386__ ) && defined ( __PIC__ ) */ + abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; +#endif +} + +int has_intel_AVX512_features(void) +{ + uint32_t abcd[4]; + uint32_t osxsave_mask = (1 << 27); // OSX. + uint32_t avx2_mask = (1 << 5); // AVX2 + + run_cpuid( 1, 0, abcd ); + // OS supports extended processor state management ? 
+ if ( (abcd[2] & osxsave_mask) != osxsave_mask ) + return 0; + + run_cpuid( 7, 0, abcd ); + return !!((abcd[1] & avx2_mask) != avx2_mask); +} +#endif /* non-Intel compiler */ + +ompi_op_avx_component_t mca_op_avx_component = { + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "avx", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = avx_component_open, + .mca_close_component = avx_component_close, + .mca_register_component_params = avx_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = avx_component_init_query, + .opc_op_query = avx_component_op_query, + }, +}; + +/* + * Component open + */ +static int avx_component_open(void) +{ + + /* A first level check to see if avx is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). + */ + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int avx_component_close(void) +{ + + /* If avx was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. 
+ */ +static int +avx_component_register(void) +{ + mca_op_avx_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_avx_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. + */ +static int +avx_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if( !has_intel_AVX512_features() ) + return OMPI_ERR_NOT_SUPPORTED; + return OMPI_SUCCESS; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t* +avx_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = NULL; + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. 
*/ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + module = OBJ_NEW(ompi_op_base_module_t); + for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_avx_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + case OMPI_OP_BASE_FORTRAN_LOR: + case OMPI_OP_BASE_FORTRAN_LXOR: + case OMPI_OP_BASE_FORTRAN_MAXLOC: + case OMPI_OP_BASE_FORTRAN_MINLOC: + case OMPI_OP_BASE_FORTRAN_REPLACE: + default: + break; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *avx* component pointer + (vs. a *base* component pointer -- where an *avx* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. */ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c new file mode 100644 index 00000000000..231696f12ca --- /dev/null +++ b/ompi/mca/op/avx/op_avx_functions.c @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/avx/op_avx.h" +#include "ompi/mca/op/avx/op_avx_functions.h" + +#include +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). + * + * Support ops: max, min, for signed/unsigned 8,16,32,64 + * sum, for integer 8,16,32,64 + * + */ +#define OP_AVX_FUNC(name, type_sign, type_size, type, op) \ + static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in);\ + __m512i vecB = _mm512_loadu_si512(out);\ + in += types_per_step; \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((out), res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in); \ + __m256i vecB = _mm256_loadu_si256(out); \ + in += types_per_step; \ + __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: 
out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ + }\ +} + +/* special case for int8 mul */ +#define OP_AVX_MUL(name, type_sign, type_size, type, op) \ + static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 256 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + int store_mask = 0xFFFFFFFF;\ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m256i 
vecA_tmp = _mm256_loadu_si256(in);\ + __m256i vecB_tmp = _mm256_loadu_si256(out);\ + in += types_per_step; \ + __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ + __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \ + __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \ + _mm512_mask_cvtepi16_storeu_epi8((out), store_mask, res);\ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + switch(left_over) { \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + 
case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ +} + +/* + * This macro is for bit-wise operations (out op in). + * + * Support ops: or, xor, and of 512 bits (representing integer data) + * + */ +#define OP_AVX_BIT_FUNC(name, type_size, type, op) \ + static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in);\ + __m512i vecB = _mm512_loadu_si512(out);\ + in += types_per_step; \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512(out, res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in); \ + __m256i vecB = _mm256_loadu_si256(out); \ + in += types_per_step; \ + __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = 
current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ + }\ +} + +#if 0 +#define OP_AVX_FLOAT_FUNC(op) \ + static void ompi_op_avx_2buff_##op##_float(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ + { \ + int step = 16; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ + for (i = 0; i < round; i+=64) { \ + __m512 vecA = _mm512_load_ps((in+i)); \ + __m512 vecB = _mm512_load_ps((out+i)); \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_store_ps((out+i), res); \ + } \ + uint64_t left_over = *count - (size*step); \ + if(left_over!=0) { \ + uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \ + left_over = left_over64 >>(64-left_over); \ + __m512 vecA = _mm512_maskz_load_ps(left_over, (in+round)); \ + __m512 vecB = _mm512_maskz_load_ps(left_over, (out+round)); \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_mask_store_ps((out+round), left_over, res); \ + } \ + } + +#else +#define OP_AVX_FLOAT_FUNC(op) \ +static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + 
struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(float)); \ + int left_over = *count; \ + float* in = (float*)_in; \ + float* out = (float*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512 vecA = _mm512_load_ps(in); \ + __m512 vecB = _mm512_load_ps(out); \ + in += types_per_step; \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_store_ps(out, res); \ + out += types_per_step; \ + } \ + if( 0 != left_over ) { \ + types_per_step >>= 1; /* 256 / (8 * sizeof(float)); */ \ + if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_ps(in); \ + __m256 vecB = _mm256_load_ps(out); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ + _mm256_store_ps(out, res); \ + in += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + } \ + } \ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ +} +#endif + +#define OP_AVX_DOUBLE_FUNC(op) \ + static void ompi_op_avx_2buff_##op##_double(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(double)); \ + int left_over = *count; \ + double* in = (double*)_in; \ + double* out = (double*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512d vecA = _mm512_load_pd(in);\ + __m512d vecB = _mm512_load_pd(out);\ + in += types_per_step; \ + __m512d res = _mm512_##op##_pd(vecA, vecB); \ + _mm512_store_pd((out), res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; /* 256 / (8 * sizeof(double)); */ \ 
+ if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_pd(in); \ + __m256 vecB = _mm256_load_pd(out); \ + __m256 res = _mm256_##op##_pd(vecA, vecB); \ + _mm256_store_pd(out, res); \ + in += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ +} + + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) + OP_AVX_FUNC(max, i, 8, int8_t, max) + OP_AVX_FUNC(max, u, 8, uint8_t, max) + OP_AVX_FUNC(max, i, 16, int16_t, max) + OP_AVX_FUNC(max, u, 16, uint16_t, max) + OP_AVX_FUNC(max, i, 32, int32_t, max) + OP_AVX_FUNC(max, u, 32, uint32_t, max) + OP_AVX_FUNC(max, i, 64, int64_t, max) + OP_AVX_FUNC(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(max) + OP_AVX_DOUBLE_FUNC(max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) + OP_AVX_FUNC(min, i, 8, int8_t, min) + OP_AVX_FUNC(min, u, 8, uint8_t, min) + OP_AVX_FUNC(min, i, 16, int16_t, min) + OP_AVX_FUNC(min, u, 16, uint16_t, min) + OP_AVX_FUNC(min, i, 32, int32_t, min) + OP_AVX_FUNC(min, u, 32, uint32_t, min) + OP_AVX_FUNC(min, i, 64, int64_t, min) + OP_AVX_FUNC(min, u, 64, uint64_t, min) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(min) + OP_AVX_DOUBLE_FUNC(min) + +/************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_AVX_FUNC(sum, i, 8, int8_t, add) + OP_AVX_FUNC(sum, i, 8, uint8_t, add) + OP_AVX_FUNC(sum, i, 16, int16_t, add) + OP_AVX_FUNC(sum, i, 16, uint16_t, add) + OP_AVX_FUNC(sum, i, 32, int32_t, add) + OP_AVX_FUNC(sum, i, 32, uint32_t, add) + OP_AVX_FUNC(sum, i, 64, int64_t, add) + OP_AVX_FUNC(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(add) + OP_AVX_DOUBLE_FUNC(add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_AVX_MUL(prod, i, 8, int8_t, mullo) + OP_AVX_MUL(prod, i, 8, uint8_t, mullo) + OP_AVX_FUNC(prod, i, 16, int16_t, mullo) + OP_AVX_FUNC(prod, i, 16, uint16_t, mullo) + OP_AVX_FUNC(prod, i, 32, int32_t, mullo) + OP_AVX_FUNC(prod, i ,32, uint32_t, mullo) + OP_AVX_FUNC(prod, i, 64, int64_t, mullo) + OP_AVX_FUNC(prod, i, 64, uint64_t, mullo) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(mul) + OP_AVX_DOUBLE_FUNC(mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_AVX_BIT_FUNC(band, 8, int8_t, and) + OP_AVX_BIT_FUNC(band, 8, uint8_t, and) + 
OP_AVX_BIT_FUNC(band, 16, int16_t, and) + OP_AVX_BIT_FUNC(band, 16, uint16_t, and) + OP_AVX_BIT_FUNC(band, 32, int32_t, and) + OP_AVX_BIT_FUNC(band, 32, uint32_t, and) + OP_AVX_BIT_FUNC(band, 64, int64_t, and) + OP_AVX_BIT_FUNC(band, 64, uint64_t, and) + + // not defined - OP_AVX_FLOAT_FUNC(and) + // not defined - OP_AVX_DOUBLE_FUNC(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_AVX_BIT_FUNC(bor, 8, int8_t, or) + OP_AVX_BIT_FUNC(bor, 8, uint8_t, or) + OP_AVX_BIT_FUNC(bor, 16, int16_t, or) + OP_AVX_BIT_FUNC(bor, 16, uint16_t, or) + OP_AVX_BIT_FUNC(bor, 32, int32_t, or) + OP_AVX_BIT_FUNC(bor, 32, uint32_t, or) + OP_AVX_BIT_FUNC(bor, 64, int64_t, or) + OP_AVX_BIT_FUNC(bor, 64, uint64_t, or) + + // not defined - OP_AVX_FLOAT_FUNC(or) + // not defined - OP_AVX_DOUBLE_FUNC(or) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_AVX_BIT_FUNC(bxor, 8, int8_t, xor) + OP_AVX_BIT_FUNC(bxor, 8, uint8_t, xor) + OP_AVX_BIT_FUNC(bxor, 16, int16_t, xor) + OP_AVX_BIT_FUNC(bxor, 16, uint16_t, xor) + OP_AVX_BIT_FUNC(bxor, 32, int32_t, xor) + OP_AVX_BIT_FUNC(bxor, 32, uint32_t, xor) + OP_AVX_BIT_FUNC(bxor, 64, int64_t, xor) + OP_AVX_BIT_FUNC(bxor, 64, uint64_t, xor) + + // not defined - OP_AVX_FLOAT_FUNC(xor) + // not defined - OP_AVX_DOUBLE_FUNC(xor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. 
+ */ +#define OP_AVX_FUNC_3BUFF(name, type_sign, type_size, type, op)\ + static void ompi_op_avx_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in1);\ + __m512i vecB = _mm512_loadu_si512(in2);\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((out), res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in1); \ + __m256i vecB = _mm256_loadu_si256(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(in1[30],in2[30]) ; \ + case 30: out[29] = current_func(in1[29],in2[29]) ; \ + case 29: out[28] = current_func(in1[28],in2[28]) ; \ + case 28: out[27] = current_func(in1[27],in2[27]) ; \ + case 27: out[26] = current_func(in1[26],in2[26]) ; \ + case 26: out[25] = current_func(in1[25],in2[25]) ; \ + case 25: out[24] = current_func(in1[24],in2[24]) ; \ + case 24: out[23] = current_func(in1[23],in2[23]) ; \ + case 23: out[22] = current_func(in1[22],in2[22]) ; \ + case 22: out[21] = current_func(in1[21],in2[21]) ; \ + case 21: out[20] = current_func(in1[20],in2[20]) ; \ + case 20: out[19] = current_func(in1[19],in2[19]) ; \ + case 19: out[18] = current_func(in1[18],in2[18]) ; \ + case 18: out[17] = 
current_func(in1[17],in2[17]) ; \ + case 17: out[16] = current_func(in1[16],in2[16]) ; \ + case 16: out[15] = current_func(in1[15],in2[15]) ; \ + case 15: out[14] = current_func(in1[14],in2[14]) ; \ + case 14: out[13] = current_func(in1[13],in2[13]) ; \ + case 13: out[12] = current_func(in1[12],in2[12]) ; \ + case 12: out[11] = current_func(in1[11],in2[11]) ; \ + case 11: out[10] = current_func(in1[10],in2[10]) ; \ + case 10: out[9] = current_func(in1[9],in2[9]) ; \ + case 9: out[8] = current_func(in1[8],in2[8]) ; \ + case 8: out[7] = current_func(in1[7],in2[7]) ; \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + } \ + }\ + }\ +} + +/* special case for int8 mul */ +#define OP_AVX_MUL_3BUFF(name, type_sign, type_size, type, op) \ + static void ompi_op_avx_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 256 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + int store_mask = 0xFFFFFFFF;\ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m256i vecA_tmp = _mm256_loadu_si256(in1);\ + __m256i vecB_tmp = _mm256_loadu_si256(in2);\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ + __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \ + __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \ + _mm512_mask_cvtepi16_storeu_epi8((out), store_mask, res);\ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + switch(left_over) { \ + case 
31: out[30] = current_func(in1[30],in2[30]) ; \ + case 30: out[29] = current_func(in1[29],in2[29]) ; \ + case 29: out[28] = current_func(in1[28],in2[28]) ; \ + case 28: out[27] = current_func(in1[27],in2[27]) ; \ + case 27: out[26] = current_func(in1[26],in2[26]) ; \ + case 26: out[25] = current_func(in1[25],in2[25]) ; \ + case 25: out[24] = current_func(in1[24],in2[24]) ; \ + case 24: out[23] = current_func(in1[23],in2[23]) ; \ + case 23: out[22] = current_func(in1[22],in2[22]) ; \ + case 22: out[21] = current_func(in1[21],in2[21]) ; \ + case 21: out[20] = current_func(in1[20],in2[20]) ; \ + case 20: out[19] = current_func(in1[19],in2[19]) ; \ + case 19: out[18] = current_func(in1[18],in2[18]) ; \ + case 18: out[17] = current_func(in1[17],in2[17]) ; \ + case 17: out[16] = current_func(in1[16],in2[16]) ; \ + case 16: out[15] = current_func(in1[15],in2[15]) ; \ + case 15: out[14] = current_func(in1[14],in2[14]) ; \ + case 14: out[13] = current_func(in1[13],in2[13]) ; \ + case 13: out[12] = current_func(in1[12],in2[12]) ; \ + case 12: out[11] = current_func(in1[11],in2[11]) ; \ + case 11: out[10] = current_func(in1[10],in2[10]) ; \ + case 10: out[9] = current_func(in1[9],in2[9]) ; \ + case 9: out[8] = current_func(in1[8],in2[8]) ; \ + case 8: out[7] = current_func(in1[7],in2[7]) ; \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + } \ + }\ +} + +#define OP_AVX_BIT_FUNC_3BUFF(name, type_size, type, op) \ + static void ompi_op_avx_3buff_##op##_##type(void *_in1, void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ 
+ type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in1);\ + __m512i vecB = _mm512_loadu_si512(in2);\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512(out, res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in1); \ + __m256i vecB = _mm256_loadu_si256(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(in1[30],in2[30]) ; \ + case 30: out[29] = current_func(in1[29],in2[29]) ; \ + case 29: out[28] = current_func(in1[28],in2[28]) ; \ + case 28: out[27] = current_func(in1[27],in2[27]) ; \ + case 27: out[26] = current_func(in1[26],in2[26]) ; \ + case 26: out[25] = current_func(in1[25],in2[25]) ; \ + case 25: out[24] = current_func(in1[24],in2[24]) ; \ + case 24: out[23] = current_func(in1[23],in2[23]) ; \ + case 23: out[22] = current_func(in1[22],in2[22]) ; \ + case 22: out[21] = current_func(in1[21],in2[21]) ; \ + case 21: out[20] = current_func(in1[20],in2[20]) ; \ + case 20: out[19] = current_func(in1[19],in2[19]) ; \ + case 19: out[18] = current_func(in1[18],in2[18]) ; \ + case 18: out[17] = current_func(in1[17],in2[17]) ; \ + case 17: out[16] = current_func(in1[16],in2[16]) ; \ + case 16: out[15] = current_func(in1[15],in2[15]) ; \ + case 15: out[14] = current_func(in1[14],in2[14]) ; \ + case 14: out[13] = current_func(in1[13],in2[13]) ; \ + case 13: out[12] = current_func(in1[12],in2[12]) ; \ + case 12: out[11] = current_func(in1[11],in2[11]) ; \ + case 11: out[10] = 
current_func(in1[10],in2[10]) ; \ + case 10: out[9] = current_func(in1[9],in2[9]) ; \ + case 9: out[8] = current_func(in1[8],in2[8]) ; \ + case 8: out[7] = current_func(in1[7],in2[7]) ; \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + } \ + }\ +} + +#define OP_AVX_FLOAT_FUNC_3BUFF(op) \ + static void ompi_op_avx_3buff_##op##_float(void *_in1, void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(float)); \ + int left_over = *count; \ + float* in1 = (float*)_in1; \ + float* in2 = (float*)_in2; \ + float* out = (float*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512 vecA = _mm512_load_ps(in1); \ + __m512 vecB = _mm512_load_ps(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_store_ps(out, res); \ + out += types_per_step; \ + } \ + if( 0 != left_over ) { \ + types_per_step >>= 1; /* 256 / (8 * sizeof(float)); */ \ + if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_ps(in1); \ + __m256 vecB = _mm256_load_ps(in2); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ + _mm256_store_ps(out, res); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + } \ + } \ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: 
out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + }\ + }\ +} + +#define OP_AVX_DOUBLE_FUNC_3BUFF(op) \ + static void ompi_op_avx_3buff_##op##_double(void *_in1, void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(double)); \ + int left_over = *count; \ + double* in1 = (double*)_in1; \ + double* in2 = (double*)_in2; \ + double* out = (double*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512d vecA = _mm512_load_pd((in1));\ + __m512d vecB = _mm512_load_pd((in2));\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512d res = _mm512_##op##_pd(vecA, vecB); \ + _mm512_store_pd((out), res); \ + out += types_per_step; \ + }\ + if( 0 != left_over ) { \ + types_per_step >>= 1; /* 256 / (8 * sizeof(double)); */ \ + if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_ps(in1); \ + __m256 vecB = _mm256_load_ps(in2); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ + _mm256_store_ps(out, res); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + } \ + } \ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + }\ + }\ +} + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) + + OP_AVX_FUNC_3BUFF(max, i, 8, int8_t, max) + OP_AVX_FUNC_3BUFF(max, u, 8, uint8_t, max) + OP_AVX_FUNC_3BUFF(max, i, 16, int16_t, max) + OP_AVX_FUNC_3BUFF(max, u, 16, uint16_t, max) + OP_AVX_FUNC_3BUFF(max, i, 32, int32_t, max) + OP_AVX_FUNC_3BUFF(max, u, 32, uint32_t, max) + OP_AVX_FUNC_3BUFF(max, i, 64, int64_t, max) + OP_AVX_FUNC_3BUFF(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_AVX_FLOAT_FUNC_3BUFF(max) + OP_AVX_DOUBLE_FUNC_3BUFF(max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_AVX_FUNC_3BUFF(min, i, 8, int8_t, min) + OP_AVX_FUNC_3BUFF(min, u, 8, uint8_t, min) + OP_AVX_FUNC_3BUFF(min, i, 16, int16_t, min) + OP_AVX_FUNC_3BUFF(min, u, 16, uint16_t, min) + OP_AVX_FUNC_3BUFF(min, i, 32, int32_t, min) + OP_AVX_FUNC_3BUFF(min, u, 32, uint32_t, min) + OP_AVX_FUNC_3BUFF(min, i, 64, int64_t, min) + OP_AVX_FUNC_3BUFF(min, u, 64, uint64_t, min) + + /* Floating point */ + OP_AVX_FLOAT_FUNC_3BUFF(min) + OP_AVX_DOUBLE_FUNC_3BUFF(min) + +/************************************************************************* + * Sum + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + + OP_AVX_FUNC_3BUFF(sum, i, 8, int8_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 8, uint8_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 16, int16_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 16, uint16_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 32, int32_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 32, uint32_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 64, int64_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_AVX_FLOAT_FUNC_3BUFF(add) + OP_AVX_DOUBLE_FUNC_3BUFF(add) + +/************************************************************************* + * Product + 
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) * (b))
    /* 8-bit products go through the widen-to-16-bit OP_AVX_MUL_3BUFF path
     * (there is no byte multiply in AVX-512).
     * NOTE(review): _mm512_mullo_epi64 (the 64-bit rows) requires AVX-512DQ
     * — confirm the component checks that feature before selecting them. */
    OP_AVX_MUL_3BUFF(prod, i, 8, int8_t, mullo)
    OP_AVX_MUL_3BUFF(prod, i, 8, uint8_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 16, int16_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 16, uint16_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 32, int32_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 32, uint32_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 64, int64_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 64, uint64_t, mullo)

    /* Floating point */
    OP_AVX_FLOAT_FUNC_3BUFF(mul)
    OP_AVX_DOUBLE_FUNC_3BUFF(mul)

/*************************************************************************
 * Bitwise AND
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) & (b))
    /* NOTE(review): OP_AVX_BIT_FUNC_3BUFF names the generated function after
     * its last argument (and/or/xor), not the first (band/bor/bxor); the
     * dispatch tables index the and/or/xor names — keep them in sync. */
    OP_AVX_BIT_FUNC_3BUFF(band, 8, int8_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 8, uint8_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 16, int16_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 16, uint16_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 32, int32_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 32, uint32_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 64, int64_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 64, uint64_t, and)

    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(and)
    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(and)

/*************************************************************************
 * Bitwise OR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) | (b))
    OP_AVX_BIT_FUNC_3BUFF(bor, 8, int8_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 8, uint8_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 16, int16_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 16, uint16_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 32, int32_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 32, uint32_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 64, int64_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 64, uint64_t, or)

    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(or)
    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(or)

/*************************************************************************
 * Bitwise XOR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) ^ (b))
    OP_AVX_BIT_FUNC_3BUFF(bxor, 8, int8_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 8, uint8_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 16, int16_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 32, int32_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 64, int64_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor)

    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(xor)
    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(xor)


/** C integer ***********************************************************/
/* Expands to one designated-initializer row per fixed-width integer type,
 * pointing at the ompi_op_avx_<ftype>_<name>_<type> handlers above. */
#define C_INTEGER(name, ftype)                                          \
    [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_avx_##ftype##_##name##_int8_t,   \
    [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_avx_##ftype##_##name##_uint8_t, \
    [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_avx_##ftype##_##name##_int16_t, \
    [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_avx_##ftype##_##name##_uint16_t, \
    [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_avx_##ftype##_##name##_int32_t, \
    [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_avx_##ftype##_##name##_uint32_t, \
    [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_avx_##ftype##_##name##_int64_t, \
    [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_avx_##ftype##_##name##_uint64_t


/** Floating point, including all the Fortran reals *********************/
#define FLOAT(name, ftype) ompi_op_avx_##ftype##_##name##_float
#define DOUBLE(name, ftype) ompi_op_avx_##ftype##_##name##_double

/* SHORT_FLOAT is explicitly NULL: no AVX handler is provided for it. */
#define FLOATING_POINT(name, ftype)                                     \
    [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL,                             \
    [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype),                     \
    [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype)

/*
 * MPI_OP_NULL
 * All types
 */
#define FLAGS_NO_FLOAT \
    (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE)
#define FLAGS \
    (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \
     OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE)

/*
 * Dispatch table for the two-buffer (inout) reduction path: one handler per
 * (MPI op, datatype) pair.  Rows/entries left NULL fall back to the base
 * op implementation.
 */
ompi_op_base_handler_fn_t ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
{
    /* Corresponds to MPI_OP_NULL */
    [OMPI_OP_BASE_FORTRAN_NULL] = {
        /* Leaving this empty puts in NULL for all entries */
        NULL,
    },
    /* Corresponds to MPI_MAX */
    [OMPI_OP_BASE_FORTRAN_MAX] = {
        C_INTEGER(max, 2buff),
        FLOATING_POINT(max, 2buff),
    },
    /* Corresponds to MPI_MIN */
    [OMPI_OP_BASE_FORTRAN_MIN] = {
        C_INTEGER(min, 2buff),
        FLOATING_POINT(min, 2buff),
    },
    /* Corresponds to MPI_SUM */
    [OMPI_OP_BASE_FORTRAN_SUM] = {
        C_INTEGER(sum, 2buff),
        FLOATING_POINT(add, 2buff),
    },
    /* Corresponds to MPI_PROD */
    [OMPI_OP_BASE_FORTRAN_PROD] = {
        C_INTEGER(prod, 2buff),
        FLOATING_POINT(mul, 2buff),
    },
    /* Corresponds to MPI_LAND: no AVX handlers (logical ops stay scalar) */
    [OMPI_OP_BASE_FORTRAN_LAND] = {
        NULL,
    },
    /* Corresponds to MPI_BAND */
    [OMPI_OP_BASE_FORTRAN_BAND] = {
        C_INTEGER(band, 2buff),
    },
    /* Corresponds to MPI_LOR */
    [OMPI_OP_BASE_FORTRAN_LOR] = {
        NULL,
    },
    /* Corresponds to MPI_BOR */
    [OMPI_OP_BASE_FORTRAN_BOR] = {
        C_INTEGER(bor, 2buff),
    },
    /* Corresponds to MPI_LXOR */
    [OMPI_OP_BASE_FORTRAN_LXOR] = {
        NULL,
    },
    /* Corresponds to MPI_BXOR */
    [OMPI_OP_BASE_FORTRAN_BXOR] = {
        C_INTEGER(bxor, 2buff),
    },
    /* Corresponds to MPI_REPLACE */
    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
        /* (MPI_ACCUMULATE is handled differently than the other
           reductions, so just zero out its function
           implementations here to ensure that users don't invoke
           MPI_REPLACE with any reduction operations other than
           ACCUMULATE) */
        NULL,
    },

};

/*
 * Same layout for the three-buffer (in1/in2/out) path.
 * NOTE(review): the bitwise rows index C_INTEGER(and/or/xor, 3buff) rather
 * than band/bor/bxor because OP_AVX_BIT_FUNC_3BUFF names its functions after
 * the intrinsic suffix — the 2buff table above uses band/bor/bxor; keep the
 * two naming schemes in sync with their generating macros.
 */
ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
{
    /* Corresponds to MPI_OP_NULL */
    [OMPI_OP_BASE_FORTRAN_NULL] = {
        /* Leaving this empty puts in NULL for all entries */
        NULL,
    },
    /* Corresponds to MPI_MAX */
    [OMPI_OP_BASE_FORTRAN_MAX] = {
        C_INTEGER(max, 3buff),
        FLOATING_POINT(max, 3buff),
    },
    /* Corresponds to MPI_MIN */
    [OMPI_OP_BASE_FORTRAN_MIN] = {
        C_INTEGER(min, 3buff),
        FLOATING_POINT(min, 3buff),
    },
    /* Corresponds to MPI_SUM */
    [OMPI_OP_BASE_FORTRAN_SUM] = {
        C_INTEGER(sum, 3buff),
        FLOATING_POINT(add, 3buff),
    },
    /* Corresponds to MPI_PROD */
    [OMPI_OP_BASE_FORTRAN_PROD] = {
        C_INTEGER(prod, 3buff),
        FLOATING_POINT(mul, 3buff),
    },
    /* Corresponds to MPI_LAND */
    [OMPI_OP_BASE_FORTRAN_LAND] = {
        NULL,
    },
    /* Corresponds to MPI_BAND */
    [OMPI_OP_BASE_FORTRAN_BAND] = {
        C_INTEGER(and, 3buff),
    },
    /* Corresponds to MPI_LOR */
    [OMPI_OP_BASE_FORTRAN_LOR] = {
        NULL,
    },
    /* Corresponds to MPI_BOR */
    [OMPI_OP_BASE_FORTRAN_BOR] = {
        C_INTEGER(or, 3buff),
    },
    /* Corresponds to MPI_LXOR */
    [OMPI_OP_BASE_FORTRAN_LXOR] = {
        NULL,
    },
    /* Corresponds to MPI_BXOR */
    [OMPI_OP_BASE_FORTRAN_BXOR] = {
        C_INTEGER(xor, 3buff),
    },
    /* Corresponds to MPI_REPLACE */
    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
        /* MPI_ACCUMULATE is handled differently than the other
           reductions, so just zero out its function
           implementations here to ensure that users don't invoke
           MPI_REPLACE with any reduction operations other than
           ACCUMULATE */
        NULL,
    },
};
diff --git a/ompi/mca/op/avx/op_avx_functions.h b/ompi/mca/op/avx/op_avx_functions.h
new file mode 100644
index 00000000000..621f8538d92
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx_functions.h
@@ -0,0 +1,29 @@
/*
 * Copyright (c) 2019      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/avx/op_avx.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c new file mode 100644 index 00000000000..c31020e04e6 --- /dev/null +++ b/test/datatype/Reduce_local_float.c @@ -0,0 +1,716 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 1024*1024 + +float in_float[ARRAYSIZE]; +float inout_float[ARRAYSIZE]; +float inout_float_for_check[ARRAYSIZE]; + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +uint16_t in_uint16[ARRAYSIZE]; +uint16_t inout_uint16[ARRAYSIZE]; +uint16_t inout_uint16_for_check[ARRAYSIZE]; + +uint32_t in_uint32[ARRAYSIZE]; +uint32_t inout_uint32[ARRAYSIZE]; +uint32_t inout_uint32_for_check[ARRAYSIZE]; + +uint64_t in_uint64[ARRAYSIZE]; +uint64_t inout_uint64[ARRAYSIZE]; +uint64_t inout_uint64_for_check[ARRAYSIZE]; + +double in_double[ARRAYSIZE]; +double inout_double[ARRAYSIZE]; +double inout_double_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i