From f7a900b5b4594c0bbeb7df0b50d6cda03a54ad65 Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Fri, 11 Oct 2019 11:12:07 -0400
Subject: [PATCH 01/13] add AVX512 op

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 ompi/mca/op/intel_avx_op/Makefile.am        |  73 +++
 ompi/mca/op/intel_avx_op/op_avx.h           |  61 ++
 ompi/mca/op/intel_avx_op/op_avx_component.c | 250 +++++++
 ompi/mca/op/intel_avx_op/op_avx_functions.c | 688 ++++++++++++++++++++
 ompi/mca/op/intel_avx_op/op_avx_functions.h |  29 +
 test/datatype/Reduce_local_float.c          | 277 ++++++++
 6 files changed, 1378 insertions(+)
 create mode 100644 ompi/mca/op/intel_avx_op/Makefile.am
 create mode 100644 ompi/mca/op/intel_avx_op/op_avx.h
 create mode 100644 ompi/mca/op/intel_avx_op/op_avx_component.c
 create mode 100644 ompi/mca/op/intel_avx_op/op_avx_functions.c
 create mode 100644 ompi/mca/op/intel_avx_op/op_avx_functions.h
 create mode 100644 test/datatype/Reduce_local_float.c
diff --git a/ompi/mca/op/intel_avx_op/Makefile.am b/ompi/mca/op/intel_avx_op/Makefile.am
new file mode 100644
index 00000000000..8bf31fc58c2
--- /dev/null
+++ b/ompi/mca/op/intel_avx_op/Makefile.am
@@ -0,0 +1,73 @@
+#
+# Copyright (c) 2019      The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# This is an avx op component.  This Makefile.am is a typical
+# avx of how to integrate into Open MPI's Automake-based build
+# system.
+#
+# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
+# for more details on how to make Open MPI components.
+
+# First, list all .h and .c sources.  It is necessary to list all .h
+# files so that they will be picked up in the distribution tarball.
+
+sources = \
+    op_avx.h \
+    op_avx_component.c \
+	op_avx_functions.c
+
+# Open MPI components can be compiled two ways:
+#
+# 1. As a standalone dynamic shared object (DSO), sometimes called a
+# dynamically loadable library (DLL).
+#
+# 2. As a static library that is slurped up into the upper-level
+# libmpi library (regardless of whether libmpi is a static or dynamic
+# library).  This is called a "Libtool convenience library".
+#
+# The component needs to create an output library in this top-level
+# component directory, and named either mca_<type>_<name>.la (for DSO
+# builds) or libmca_<type>_<name>.la (for static builds).  The OMPI
+# build system will have set the
+# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
+# which way this component should be built.
+
+if MCA_BUILD_ompi_op_intel_avx_op_DSO
+lib =
+lib_sources =
+component = mca_op_avx.la
+component_sources = $(sources)
+else
+lib = libmca_op_avx.la
+lib_sources = $(sources)
+component =
+component_sources =
+endif
+
+# Specific information for DSO builds.
+#
+# The DSO should install itself in $(ompilibdir) (by default,
+# $prefix/lib/openmpi).
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component)
+mca_op_avx_la_SOURCES = $(component_sources)
+mca_op_avx_la_LDFLAGS = -module -avoid-version
+mca_op_avx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
+
+# Specific information for static builds.
+#
+# Note that we *must* "noinst"; the upper-layer Makefile.am's will
+# slurp in the resulting .la library into libmpi.
+
+noinst_LTLIBRARIES = $(lib)
+libmca_op_avx_la_SOURCES = $(lib_sources)
+libmca_op_avx_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/op/intel_avx_op/op_avx.h b/ompi/mca/op/intel_avx_op/op_avx.h
new file mode 100644
index 00000000000..5ec4b3f9e00
--- /dev/null
+++ b/ompi/mca/op/intel_avx_op/op_avx.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_OP_AVX_EXPORT_H
+#define MCA_OP_AVX_EXPORT_H
+
+#include "ompi_config.h"
+
+#include "ompi/mca/mca.h"
+#include "opal/class/opal_object.h"
+
+#include "ompi/mca/op/op.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Derive a struct from the base op component struct, allowing us to
+ * cache some component-specific information on our well-known
+ * component struct.
+ */
+typedef struct {
+    /** The base op component struct */
+    ompi_op_base_component_1_0_0_t super;
+
+    /* What follows is avx-component-specific cached information.  We
+       tend to use this scheme (caching information on the avx
+       component itself) instead of lots of individual global
+       variables for the component.  The following data fields are
+       avxs; replace them with whatever is relevant for your
+       component. */
+
+    /** A simple boolean indicating that the hardware is available. */
+    bool hardware_available;
+
+    /** A simple boolean indicating whether double precision is
+        supported. */
+    bool double_supported;
+} ompi_op_avx_component_t;
+
+/**
+ * Globally exported variable.  Note that it is a *avx* component
+ * (defined above), which has the ompi_op_base_component_t as its
+ * first member.  Hence, the MCA/op framework will find the data that
+ * it expects in the first memory locations, but then the component
+ * itself can cache additional information after that that can be used
+ * by both the component and modules.
+ */
+OMPI_DECLSPEC extern ompi_op_avx_component_t
+    mca_op_avx_component;
+
+END_C_DECLS
+
+#endif /* MCA_OP_AVX_EXPORT_H */
diff --git a/ompi/mca/op/intel_avx_op/op_avx_component.c b/ompi/mca/op/intel_avx_op/op_avx_component.c
new file mode 100644
index 00000000000..ddb9d6c759c
--- /dev/null
+++ b/ompi/mca/op/intel_avx_op/op_avx_component.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/** @file
+ *
+ * This is the "avx" component source code.
+ *
+ */
+
+#include "ompi_config.h"
+
+#include "opal/util/printf.h"
+
+#include "ompi/constants.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/intel_avx_op/op_avx.h"
+#include "ompi/mca/op/intel_avx_op/op_avx_functions.h"
+
+static int avx_component_open(void);
+static int avx_component_close(void);
+static int avx_component_init_query(bool enable_progress_threads,
+                                     bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    avx_component_op_query(struct ompi_op_t *op, int *priority);
+static int avx_component_register(void);
+
+ompi_op_avx_component_t mca_op_avx_component = {
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+    {
+        .opc_version = {
+            OMPI_OP_BASE_VERSION_1_0_0,
+
+            .mca_component_name = "avx",
+            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                                  OMPI_RELEASE_VERSION),
+            .mca_open_component = avx_component_open,
+            .mca_close_component = avx_component_close,
+            .mca_register_component_params = avx_component_register,
+        },
+        .opc_data = {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        },
+
+        .opc_init_query = avx_component_init_query,
+        .opc_op_query = avx_component_op_query,
+    },
+};
+
+/*
+ * Component open
+ */
+static int avx_component_open(void)
+{
+    opal_output(ompi_op_base_framework.framework_output, "avx component open");
+
+    /* A first level check to see if avx is even available in this
+       process.  E.g., you may want to do a first-order check to see
+       if hardware is available.  If so, return OMPI_SUCCESS.  If not,
+       return anything other than OMPI_SUCCESS and the component will
+       silently be ignored.
+
+       Note that if this function returns non-OMPI_SUCCESS, then this
+       component won't even be shown in ompi_info output (which is
+       probably not what you want).
+    */
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Component close
+ */
+static int avx_component_close(void)
+{
+    opal_output(ompi_op_base_framework.framework_output, "avx component close");
+
+    /* If avx was opened successfully, close it (i.e., release any
+       resources that may have been allocated on this component).
+       Note that _component_close() will always be called at the end
+       of the process, so it may have been after any/all of the other
+       component functions have been invoked (and possibly even after
+       modules have been created and/or destroyed). */
+
+    return OMPI_SUCCESS;
+}
+
+static char *avx_component_version;
+
+/*
+ * Register MCA params.
+ */
+static int avx_component_register(void)
+{
+
+    opal_output(ompi_op_base_framework.framework_output, "avx component register");
+
+    /* Additionally, since this component is simulating hardware,
+       let's make MCA params that determine whethere a) the hardware
+       is available, and b) whether double precision floating point
+       types are supported.  This allows you to change the behavior of
+       this component at run-time (by setting these MCA params at
+       run-time), simulating different kinds of hardware. */
+    mca_op_avx_component.hardware_available = true;
+    (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
+                                           "hardware_available",
+                                           "Whether the hardware is available or not",
+                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_op_avx_component.hardware_available);
+
+    mca_op_avx_component.double_supported = true;
+    (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
+                                           "double_supported",
+                                           "Whether the double precision data types are supported or not",
+                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_op_avx_component.double_supported);
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Query whether this component wants to be used in this process.
+ */
+static int avx_component_init_query(bool enable_progress_threads,
+                                        bool enable_mpi_thread_multiple)
+{
+    opal_output(ompi_op_base_framework.framework_output, "avx component init query");
+
+    if (mca_op_avx_component.hardware_available && !enable_mpi_thread_multiple) {
+        return OMPI_SUCCESS;
+    }
+    return OMPI_ERR_NOT_SUPPORTED;
+}
+
+
+/*
+ * Query whether this component can be used for a specific op
+ */
+static struct ompi_op_base_module_1_0_0_t *
+    avx_component_op_query(struct ompi_op_t *op, int *priority)
+{
+    ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t);
+
+    opal_output(ompi_op_base_framework.framework_output, "avx component op query");
+
+    /* Sanity check -- although the framework should never invoke the
+       _component_op_query() on non-intrinsic MPI_Op's, we'll put a
+       check here just to be sure. */
+    if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
+        opal_output(0, "avx component op query: not an intrinsic MPI_Op -- skipping");
+        return NULL;
+    }
+
+    int i=0;
+    switch (op->o_f_to_c_index) {
+    case OMPI_OP_BASE_FORTRAN_MAX:
+        /* Corresponds to MPI_MAX */
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick MAX");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_MAX][i];
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_MIN:
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick MIN");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_MIN][i];
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_SUM:
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick SUM");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
+             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_PROD:
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick PRO2BUF");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_PROD][i];
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_BOR:
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick BOR");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
+             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_BAND:
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick BAND");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
+             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_BXOR:
+        opal_output(ompi_op_base_framework.framework_output, "avx component op pick BXOR");
+        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BXOR][i];
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i];
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_LAND:
+        module = NULL;
+        break;
+    case OMPI_OP_BASE_FORTRAN_LOR:
+        module = NULL;
+        break;
+    case OMPI_OP_BASE_FORTRAN_LXOR:
+        module = NULL;
+        break;
+    case OMPI_OP_BASE_FORTRAN_MAXLOC:
+        module = NULL;
+        break;
+    case OMPI_OP_BASE_FORTRAN_MINLOC:
+        module= NULL;
+        break;
+    }
+
+    /* If we got a module from above, we'll return it.  Otherwise,
+       we'll return NULL, indicating that this component does not want
+       to be considered for selection for this MPI_Op.  Note that the
+       functions each returned a *avx* component pointer
+       (vs. a *base* component pointer -- where an *avx* component
+       is a base component plus some other module-specific cached
+       information), so we have to cast it to the right pointer type
+       before returning. */
+    if (NULL != module) {
+        *priority = 50;
+    }
+    return (ompi_op_base_module_1_0_0_t *) module;
+}
diff --git a/ompi/mca/op/intel_avx_op/op_avx_functions.c b/ompi/mca/op/intel_avx_op/op_avx_functions.c
new file mode 100644
index 00000000000..f5b168a59d6
--- /dev/null
+++ b/ompi/mca/op/intel_avx_op/op_avx_functions.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#include "opal/util/output.h"
+
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/intel_avx_op/op_avx.h"
+#include "ompi/mca/op/intel_avx_op/op_avx_functions.h"
+
+#include <immintrin.h>
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for (out op in).
+ *
+ * Support ops: max, min, for signed/unsigned 8,16,32,64
+ *              sum, for integer 8,16,32,64
+ *
+ */
+#define OP_AVX_FUNC(name, type_sign, type_size, type, op) \
+    static void ompi_op_avx_2buff_##name##_##type(void *in, void *out, int *count, \
+            struct ompi_datatype_t **dtype, \
+            struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step;                                                          \
+    switch(type_size) {                                                \
+        case 8:  \
+                 step = 64; \
+        break; \
+        case 16: \
+                 step = 32; \
+        break; \
+        case 32: \
+                 step = 16; \
+        break; \
+        case 64: \
+                 step = 8;  \
+    }\
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512i vecA =  _mm512_loadu_si512((in+i));\
+        __m512i vecB =  _mm512_loadu_si512((out+i));\
+        __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
+        _mm512_storeu_si512((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in+round)); \
+        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (out+round)); \
+        __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
+        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+    }\
+}
+
+/*
+ *  This macro is for bit-wise operations (out op in).
+ *
+ *  Support ops: or, xor, and of 512 bits (representing integer data)
+ *
+ */
+#define OP_AVX_BIT_FUNC(name, type_size, type, op) \
+    static void ompi_op_avx_2buff_##name##_##type(void *in, void *out, int *count, \
+            struct ompi_datatype_t **dtype, \
+            struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step;                                                          \
+    switch(type_size) {                                                \
+        case 8:  \
+                 step = 64; \
+        break; \
+        case 16: \
+                 step = 32; \
+        break; \
+        case 32: \
+                 step = 16; \
+        break; \
+        case 64: \
+                 step = 8;  \
+    }\
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512i vecA =  _mm512_loadu_si512((in+i));\
+        __m512i vecB =  _mm512_loadu_si512((out+i));\
+        __m512i res = _mm512_##op##_si512(vecA, vecB); \
+        _mm512_storeu_si512((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in+round)); \
+        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (out+round)); \
+        __m512i res =  _mm512_##op##_si512(vecA, vecB); \
+        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+    }\
+}
+
+#define OP_AVX_FLOAT_FUNC(op) \
+    static void ompi_op_avx_2buff_##op##_float(void *in, void *out, int *count, \
+            struct ompi_datatype_t **dtype, \
+            struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step = 16;                                                          \
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512 vecA =  _mm512_load_ps((in+i));\
+        __m512 vecB =  _mm512_load_ps((out+i));\
+        __m512 res = _mm512_##op##_ps(vecA, vecB); \
+        _mm512_store_ps((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512 vecA = _mm512_maskz_load_ps(left_over, (in+round)); \
+        __m512 vecB = _mm512_maskz_load_ps(left_over, (out+round)); \
+        __m512 res = _mm512_##op##_ps(vecA, vecB); \
+        _mm512_mask_store_ps((out+round), left_over, res); \
+    }\
+}
+
+#define OP_AVX_DOUBLE_FUNC(op) \
+    static void ompi_op_avx_2buff_##op##_double(void *in, void *out, int *count, \
+            struct ompi_datatype_t **dtype, \
+            struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step = 8;                                                          \
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512d vecA =  _mm512_load_pd((in+i));\
+        __m512d vecB =  _mm512_load_pd((out+i));\
+        __m512d res = _mm512_##op##_pd(vecA, vecB); \
+        _mm512_store_pd((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512d vecA = _mm512_maskz_load_pd(left_over, (in+round)); \
+        __m512d vecB = _mm512_maskz_load_pd(left_over, (out+round)); \
+        __m512d res = _mm512_##op##_pd(vecA, vecB); \
+        _mm512_mask_store_pd((out+round), left_over, res); \
+    }\
+}
+
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+    OP_AVX_FUNC(max, i, 8,    int8_t, max)
+    OP_AVX_FUNC(max, u, 8,   uint8_t, max)
+    OP_AVX_FUNC(max, i, 16,  int16_t, max)
+    OP_AVX_FUNC(max, u, 16, uint16_t, max)
+    OP_AVX_FUNC(max, i, 32,  int32_t, max)
+    OP_AVX_FUNC(max, u, 32, uint32_t, max)
+    OP_AVX_FUNC(max, i, 64,  int64_t, max)
+    OP_AVX_FUNC(max, u, 64, uint64_t, max)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC(max)
+    OP_AVX_DOUBLE_FUNC(max)
+
+/*************************************************************************
+ * Min
+ *************************************************************************/
+    OP_AVX_FUNC(min, i, 8,    int8_t, min)
+    OP_AVX_FUNC(min, u, 8,   uint8_t, min)
+    OP_AVX_FUNC(min, i, 16,  int16_t, min)
+    OP_AVX_FUNC(min, u, 16, uint16_t, min)
+    OP_AVX_FUNC(min, i, 32,  int32_t, min)
+    OP_AVX_FUNC(min, u, 32, uint32_t, min)
+    OP_AVX_FUNC(min, i, 64,  int64_t, min)
+    OP_AVX_FUNC(min, u, 64, uint64_t, min)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC(min)
+    OP_AVX_DOUBLE_FUNC(min)
+
+/*************************************************************************
+ * Sum
+ ************************************************************************/
+    OP_AVX_FUNC(sum, i, 8,    int8_t, add)
+    OP_AVX_FUNC(sum, i, 8,   uint8_t, add)
+    OP_AVX_FUNC(sum, i, 16,  int16_t, add)
+    OP_AVX_FUNC(sum, i, 16, uint16_t, add)
+    OP_AVX_FUNC(sum, i, 32,  int32_t, add)
+    OP_AVX_FUNC(sum, i, 32, uint32_t, add)
+    OP_AVX_FUNC(sum, i, 64,  int64_t, add)
+    OP_AVX_FUNC(sum, i, 64, uint64_t, add)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC(add)
+    OP_AVX_DOUBLE_FUNC(add)
+
+/*************************************************************************
+ * Product
+ *************************************************************************/
+    OP_AVX_FUNC(prod, i, 16,  int16_t, mullo)
+    OP_AVX_FUNC(prod, i, 16, uint16_t, mullo)
+    OP_AVX_FUNC(prod, i, 32,  int32_t, mullo)
+    OP_AVX_FUNC(prod, i ,32, uint32_t, mullo)
+    OP_AVX_FUNC(prod, i, 64,  int64_t, mullo)
+    OP_AVX_FUNC(prod, i, 64, uint64_t, mullo)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC(mul)
+    OP_AVX_DOUBLE_FUNC(mul)
+
+/*************************************************************************
+ * Bitwise AND
+ *************************************************************************/
+    OP_AVX_BIT_FUNC(band, 8,    int8_t, and)
+    OP_AVX_BIT_FUNC(band, 8,   uint8_t, and)
+    OP_AVX_BIT_FUNC(band, 16,  int16_t, and)
+    OP_AVX_BIT_FUNC(band, 16, uint16_t, and)
+    OP_AVX_BIT_FUNC(band, 32,  int32_t, and)
+    OP_AVX_BIT_FUNC(band, 32, uint32_t, and)
+    OP_AVX_BIT_FUNC(band, 64,  int64_t, and)
+    OP_AVX_BIT_FUNC(band, 64, uint64_t, and)
+
+    OP_AVX_FLOAT_FUNC(and)
+    OP_AVX_DOUBLE_FUNC(and)
+
+/*************************************************************************
+ * Bitwise OR
+ *************************************************************************/
+    OP_AVX_BIT_FUNC(bor, 8,    int8_t, or)
+    OP_AVX_BIT_FUNC(bor, 8,   uint8_t, or)
+    OP_AVX_BIT_FUNC(bor, 16,  int16_t, or)
+    OP_AVX_BIT_FUNC(bor, 16, uint16_t, or)
+    OP_AVX_BIT_FUNC(bor, 32,  int32_t, or)
+    OP_AVX_BIT_FUNC(bor, 32, uint32_t, or)
+    OP_AVX_BIT_FUNC(bor, 64,  int64_t, or)
+    OP_AVX_BIT_FUNC(bor, 64, uint64_t, or)
+
+    OP_AVX_FLOAT_FUNC(or)
+    OP_AVX_DOUBLE_FUNC(or)
+
+/*************************************************************************
+ * Bitwise XOR
+ *************************************************************************/
+    OP_AVX_BIT_FUNC(bxor, 8,    int8_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 8,   uint8_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 16,  int16_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 16, uint16_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 32,  int32_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 32, uint32_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 64,  int64_t, xor)
+    OP_AVX_BIT_FUNC(bxor, 64, uint64_t, xor)
+
+    OP_AVX_FLOAT_FUNC(xor)
+    OP_AVX_DOUBLE_FUNC(xor)
+
+/*
+ *  This is a three buffer (2 input and 1 output) version of the reduction
+ *  routines, needed for some optimizations.
+ */
+#define OP_AVX_FUNC_3BUFF(name, type_sign, type_size, type, op)\
+        static void ompi_op_avx_3buff_##name##_##type(void * restrict in1,   \
+                void * restrict in2, void * restrict out, int *count, \
+                struct ompi_datatype_t **dtype, \
+                struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step;                                                          \
+    switch(type_size) {                                                \
+        case 8:  \
+                 step = 64; \
+        break; \
+        case 16: \
+                 step = 32; \
+        break; \
+        case 32: \
+                 step = 16; \
+        break; \
+        case 64: \
+                 step = 8;  \
+    }\
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512i vecA =  _mm512_loadu_si512((in1+i));\
+        __m512i vecB =  _mm512_loadu_si512((in2+i));\
+        __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
+        _mm512_storeu_si512((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in1+round)); \
+        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (in2+round)); \
+        __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
+        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+    }\
+}
+
+#define OP_AVX_BIT_FUNC_3BUFF(name, type_size, type, op) \
+        static void ompi_op_avx_3buff_##op##_##type(void *in1, void *in2, void *out, int *count, \
+                struct ompi_datatype_t **dtype, \
+                struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step;                                                          \
+    switch(type_size) {                                                \
+        case 8:  \
+                 step = 64; \
+        break; \
+        case 16: \
+                 step = 32; \
+        break; \
+        case 32: \
+                 step = 16; \
+        break; \
+        case 64: \
+                 step = 8;  \
+    }\
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512i vecA =  _mm512_loadu_si512((in1+i));\
+        __m512i vecB =  _mm512_loadu_si512((in2+i));\
+        __m512i res = _mm512_##op##_si512(vecA, vecB); \
+        _mm512_storeu_si512((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in1+round)); \
+        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (in2+round)); \
+        __m512i res =  _mm512_##op##_si512(vecA, vecB); \
+        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+    }\
+}
+
+#define OP_AVX_FLOAT_FUNC_3BUFF(op) \
+        static void ompi_op_avx_3buff_##op##_float(void *in1, void *in2, void *out, int *count, \
+                struct ompi_datatype_t **dtype, \
+                struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step = 16;                                                          \
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512 vecA =  _mm512_load_ps((in1+i));\
+        __m512 vecB =  _mm512_load_ps((in2+i));\
+        __m512 res = _mm512_##op##_ps(vecA, vecB); \
+        _mm512_store_ps((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512 vecA = _mm512_maskz_load_ps(left_over, (in1+round)); \
+        __m512 vecB = _mm512_maskz_load_ps(left_over, (in2+round)); \
+        __m512 res = _mm512_##op##_ps(vecA, vecB); \
+        _mm512_mask_store_ps((out+round), left_over, res); \
+    }\
+}
+
+#define OP_AVX_DOUBLE_FUNC_3BUFF(op) \
+        static void ompi_op_avx_3buff_##op##_double(void *in1, void *in2, void *out, int *count, \
+                struct ompi_datatype_t **dtype, \
+                struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int step = 8;                                                          \
+    int size = *count/step; \
+    int i; \
+    int round = size*64; \
+    for (i = 0; i < round; i+=64) { \
+        __m512d vecA =  _mm512_load_pd((in1+i));\
+        __m512d vecB =  _mm512_load_pd((in2+i));\
+        __m512d res = _mm512_##op##_pd(vecA, vecB); \
+        _mm512_store_pd((out+i), res); \
+    }\
+    uint64_t left_over = *count - (size*step); \
+    if(left_over!=0){ \
+        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
+        left_over = left_over64 >>(64-left_over); \
+        __m512d vecA = _mm512_maskz_load_pd(left_over, (in1+round)); \
+        __m512d vecB = _mm512_maskz_load_pd(left_over, (in2+round)); \
+        __m512d res = _mm512_##op##_pd(vecA, vecB); \
+        _mm512_mask_store_pd((out+round), left_over, res); \
+    }\
+}
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+    OP_AVX_FUNC_3BUFF(max, i, 8,    int8_t, max)
+    OP_AVX_FUNC_3BUFF(max, u, 8,   uint8_t, max)
+    OP_AVX_FUNC_3BUFF(max, i, 16,  int16_t, max)
+    OP_AVX_FUNC_3BUFF(max, u, 16, uint16_t, max)
+    OP_AVX_FUNC_3BUFF(max, i, 32,  int32_t, max)
+    OP_AVX_FUNC_3BUFF(max, u, 32, uint32_t, max)
+    OP_AVX_FUNC_3BUFF(max, i, 64,  int64_t, max)
+    OP_AVX_FUNC_3BUFF(max, u, 64, uint64_t, max)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3BUFF(max)
+    OP_AVX_DOUBLE_FUNC_3BUFF(max)
+
+/*************************************************************************
+ * Min
+ *************************************************************************/
+    OP_AVX_FUNC_3BUFF(min, i, 8,    int8_t, min)
+    OP_AVX_FUNC_3BUFF(min, u, 8,   uint8_t, min)
+    OP_AVX_FUNC_3BUFF(min, i, 16,  int16_t, min)
+    OP_AVX_FUNC_3BUFF(min, u, 16, uint16_t, min)
+    OP_AVX_FUNC_3BUFF(min, i, 32,  int32_t, min)
+    OP_AVX_FUNC_3BUFF(min, u, 32, uint32_t, min)
+    OP_AVX_FUNC_3BUFF(min, i, 64,  int64_t, min)
+    OP_AVX_FUNC_3BUFF(min, u, 64, uint64_t, min)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3BUFF(min)
+    OP_AVX_DOUBLE_FUNC_3BUFF(min)
+
+/*************************************************************************
+ * Sum
+ *************************************************************************/
+    OP_AVX_FUNC_3BUFF(sum, i, 8,    int8_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 8,   uint8_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 16,  int16_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 16, uint16_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 32,  int32_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 32, uint32_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 64,  int64_t, add)
+    OP_AVX_FUNC_3BUFF(sum, i, 64, uint64_t, add)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3BUFF(add)
+    OP_AVX_DOUBLE_FUNC_3BUFF(add)
+
+/*************************************************************************
+ * Product
+ *************************************************************************/
+    OP_AVX_FUNC_3BUFF(prod, i, 16,  int16_t, mullo)
+    OP_AVX_FUNC_3BUFF(prod, i, 16, uint16_t, mullo)
+    OP_AVX_FUNC_3BUFF(prod, i, 32,  int32_t, mullo)
+    OP_AVX_FUNC_3BUFF(prod, i ,32, uint32_t, mullo)
+    OP_AVX_FUNC_3BUFF(prod, i, 64,  int64_t, mullo)
+    OP_AVX_FUNC_3BUFF(prod, i, 64, uint64_t, mullo)
+
+    /* Floating point */
+    OP_AVX_FLOAT_FUNC_3BUFF(mul)
+    OP_AVX_DOUBLE_FUNC_3BUFF(mul)
+
+/*************************************************************************
+ * Bitwise AND
+ *************************************************************************/
+    OP_AVX_BIT_FUNC_3BUFF(band, 8,    int8_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 8,   uint8_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 16,  int16_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 16, uint16_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 32,  int32_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 32, uint32_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 64,  int64_t, and)
+    OP_AVX_BIT_FUNC_3BUFF(band, 64, uint64_t, and)
+
+    OP_AVX_FLOAT_FUNC_3BUFF(and)
+    OP_AVX_DOUBLE_FUNC_3BUFF(and)
+
+/*************************************************************************
+ * Bitwise OR
+ *************************************************************************/
+    OP_AVX_BIT_FUNC_3BUFF(bor, 8,    int8_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 8,   uint8_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 16,  int16_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 16, uint16_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 32,  int32_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 32, uint32_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 64,  int64_t, or)
+    OP_AVX_BIT_FUNC_3BUFF(bor, 64, uint64_t, or)
+
+    OP_AVX_FLOAT_FUNC_3BUFF(or)
+    OP_AVX_DOUBLE_FUNC_3BUFF(or)
+
+/*************************************************************************
+ * Bitwise XOR
+ *************************************************************************/
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 8,    int8_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 8,   uint8_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 16,  int16_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 32,  int32_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 64,  int64_t, xor)
+    OP_AVX_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor)
+
+    OP_AVX_FLOAT_FUNC_3BUFF(xor)
+    OP_AVX_DOUBLE_FUNC_3BUFF(xor)
+
+
+/** C integer ***********************************************************/
+#define C_INTEGER(name, ftype)                                              \
+    [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_avx_##ftype##_##name##_int8_t,     \
+    [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_avx_##ftype##_##name##_uint8_t,   \
+    [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_avx_##ftype##_##name##_int16_t,   \
+    [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_avx_##ftype##_##name##_uint16_t, \
+    [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_avx_##ftype##_##name##_int32_t,   \
+    [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_avx_##ftype##_##name##_uint32_t, \
+    [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_avx_##ftype##_##name##_int64_t,   \
+    [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_avx_##ftype##_##name##_uint64_t
+
+
+/** Floating point, including all the Fortran reals *********************/
+#define FLOAT(name, ftype) ompi_op_avx_##ftype##_##name##_float
+#define DOUBLE(name, ftype) ompi_op_avx_##ftype##_##name##_double
+
+#define FLOATING_POINT(name, ftype)                                                            \
+    [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL, \
+    [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype),                                              \
+    [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype)
+
+#define C_INTEGER_PROD(name, ftype)                                           \
+    [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_avx_##ftype##_##name##_int16_t,   \
+    [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_avx_##ftype##_##name##_uint16_t, \
+    [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_avx_##ftype##_##name##_int32_t,   \
+    [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_avx_##ftype##_##name##_uint32_t, \
+    [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_avx_##ftype##_##name##_int64_t,   \
+    [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_avx_##ftype##_##name##_uint64_t
+
+
+/*
+ * MPI_OP_NULL
+ * All types
+ */
+#define FLAGS_NO_FLOAT \
+        (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE)
+#define FLAGS \
+        (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \
+         OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE)
+
+ompi_op_base_handler_fn_t ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
+{
+    /* Corresponds to MPI_OP_NULL */
+    [OMPI_OP_BASE_FORTRAN_NULL] = {
+        /* Leaving this empty puts in NULL for all entries */
+        NULL,
+    },
+    /* Corresponds to MPI_MAX */
+    [OMPI_OP_BASE_FORTRAN_MAX] = {
+        C_INTEGER(max, 2buff),
+        FLOATING_POINT(max, 2buff),
+    },
+    /* Corresponds to MPI_MIN */
+    [OMPI_OP_BASE_FORTRAN_MIN] = {
+        C_INTEGER(min, 2buff),
+        FLOATING_POINT(min, 2buff),
+    },
+    /* Corresponds to MPI_SUM */
+    [OMPI_OP_BASE_FORTRAN_SUM] = {
+        C_INTEGER(sum, 2buff),
+        FLOATING_POINT(add, 2buff),
+    },
+    /* Corresponds to MPI_PROD */
+    [OMPI_OP_BASE_FORTRAN_PROD] = {
+        C_INTEGER_PROD(prod, 2buff),
+        FLOATING_POINT(mul, 2buff),
+    },
+    /* Corresponds to MPI_LAND */
+    [OMPI_OP_BASE_FORTRAN_LAND] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BAND */
+    [OMPI_OP_BASE_FORTRAN_BAND] = {
+        C_INTEGER(band, 2buff),
+    },
+    /* Corresponds to MPI_LOR */
+    [OMPI_OP_BASE_FORTRAN_LOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BOR */
+    [OMPI_OP_BASE_FORTRAN_BOR] = {
+        C_INTEGER(bor, 2buff),
+    },
+    /* Corresponds to MPI_LXOR */
+    [OMPI_OP_BASE_FORTRAN_LXOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BXOR */
+    [OMPI_OP_BASE_FORTRAN_BXOR] = {
+        C_INTEGER(bxor, 2buff),
+    },
+    /* Corresponds to MPI_REPLACE */
+    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
+        /* (MPI_ACCUMULATE is handled differently than the other
+           reductions, so just zero out its function
+           impementations here to ensure that users don't invoke
+           MPI_REPLACE with any reduction operations other than
+           ACCUMULATE) */
+        NULL,
+    },
+
+};
+
+ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
+{
+    /* Corresponds to MPI_OP_NULL */
+    [OMPI_OP_BASE_FORTRAN_NULL] = {
+        /* Leaving this empty puts in NULL for all entries */
+        NULL,
+    },
+    /* Corresponds to MPI_MAX */
+    [OMPI_OP_BASE_FORTRAN_MAX] = {
+        C_INTEGER(max, 3buff),
+        FLOATING_POINT(max, 3buff),
+    },
+    /* Corresponds to MPI_MIN */
+    [OMPI_OP_BASE_FORTRAN_MIN] = {
+        C_INTEGER(min, 3buff),
+        FLOATING_POINT(min, 3buff),
+    },
+    /* Corresponds to MPI_SUM */
+    [OMPI_OP_BASE_FORTRAN_SUM] = {
+        C_INTEGER(sum, 3buff),
+        FLOATING_POINT(add, 3buff),
+    },
+    /* Corresponds to MPI_PROD */
+    [OMPI_OP_BASE_FORTRAN_PROD] = {
+        C_INTEGER_PROD(prod, 3buff),
+        FLOATING_POINT(mul, 3buff),
+    },
+    /* Corresponds to MPI_LAND */
+    [OMPI_OP_BASE_FORTRAN_LAND] ={
+        NULL,
+    },
+    /* Corresponds to MPI_BAND */
+    [OMPI_OP_BASE_FORTRAN_BAND] = {
+        C_INTEGER(and, 3buff),
+    },
+    /* Corresponds to MPI_LOR */
+    [OMPI_OP_BASE_FORTRAN_LOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BOR */
+    [OMPI_OP_BASE_FORTRAN_BOR] = {
+        C_INTEGER(or, 3buff),
+    },
+    /* Corresponds to MPI_LXOR */
+    [OMPI_OP_BASE_FORTRAN_LXOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BXOR */
+    [OMPI_OP_BASE_FORTRAN_BXOR] = {
+        C_INTEGER(xor, 3buff),
+    },
+    /* Corresponds to MPI_REPLACE */
+    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
+        /* MPI_ACCUMULATE is handled differently than the other
+           reductions, so just zero out its function
+           impementations here to ensure that users don't invoke
+           MPI_REPLACE with any reduction operations other than
+           ACCUMULATE */
+        NULL,
+    },
+};
diff --git a/ompi/mca/op/intel_avx_op/op_avx_functions.h b/ompi/mca/op/intel_avx_op/op_avx_functions.h
new file mode 100644
index 00000000000..b76b45b0419
--- /dev/null
+++ b/ompi/mca/op/intel_avx_op/op_avx_functions.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/intel_avx_op/op_avx.h"
+
+BEGIN_C_DECLS
+
+OMPI_DECLSPEC extern ompi_op_base_handler_fn_t
+ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t
+ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+END_C_DECLS
+
diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
new file mode 100644
index 00000000000..b9c3050e293
--- /dev/null
+++ b/test/datatype/Reduce_local_float.c
@@ -0,0 +1,277 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <stdbool.h>
+#include <stdint.h>
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif /* __ARM_FEATURE_SVE */
+
+#include "mpi.h"
+
+#define ARRAYSIZE  1024*1024
+
+float in_float[ARRAYSIZE];
+float inout_float[ARRAYSIZE];
+
+int8_t in_uint8[ARRAYSIZE];
+int8_t inout_uint8[ARRAYSIZE];
+
+uint16_t in_uint16[ARRAYSIZE];
+uint16_t inout_uint16[ARRAYSIZE];
+
+uint32_t in_uint32[ARRAYSIZE];
+uint32_t inout_uint32[ARRAYSIZE];
+
+uint64_t in_uint64[ARRAYSIZE];
+uint64_t inout_uint64[ARRAYSIZE];
+
+double in_double[ARRAYSIZE];
+double inout_double[ARRAYSIZE];
+
+
+int main(int argc, char **argv) {
+
+    char *num_elem = argv[1];
+    int count = atoi(num_elem);
+    char *type = argv[2];
+    char *elem_size = argv[3];
+    int elem_size1  = atoi(elem_size);
+    char *op = argv[4];
+
+    printf("Sum %d elems, option %c \n",count, *type);
+    int i;
+
+    for (i=0; i<count; i++)
+    {
+        in_float[i] = 1000.0+1;
+        inout_float[i] = 100.0+2;
+
+        in_double[i] = 10.0+1;
+        inout_double[i] = 1.0+2;
+
+        in_uint8[i] = 2;
+        inout_uint8[i] = 3;
+
+        in_uint16[i] = 2;
+        inout_uint16[i] = 3;
+
+        in_uint32[i] = 2;
+        inout_uint32[i] = 3;
+
+        in_uint64[i] = 2;
+        inout_uint64[i] = 3;
+    }
+    int rank, size, len;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    double tstart, tend;
+
+    if(*type=='i') {
+        if(strcmp(op, "sum") == 0){
+            printf("#Local Reduce SUM: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_SUM);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_SUM);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_SUM);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_SUM);
+             tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "max") == 0){
+            printf("#Local Reduce MAX: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_MAX);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_MAX);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_MAX);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_MAX);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "min") == 0) {
+            printf("#Local Reduce MIN: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_MIN);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_MIN);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_MIN);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_MIN);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "band") == 0) {
+            printf("#Local Reduce Logical: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BAND);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BAND);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BAND);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BAND);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "bor") == 0) {
+            printf("#Local Reduce Logical: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BOR);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BOR);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BOR);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "bxor") == 0) {
+            printf("#Local Reduce Logical: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BXOR);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BXOR);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BXOR);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BXOR);
+            tend = MPI_Wtime();
+        }
+
+
+        if(strcmp(op, "mul") == 0) {
+            printf("#Local Reduce Logical: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_PROD);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_PROD);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_PROD);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD);
+            tend = MPI_Wtime();
+        }
+    }
+
+
+    if(*type=='f') {
+        if(strcmp(op, "sum") == 0){
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_SUM);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "max") == 0){
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_MAX);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "min") == 0) {
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_MIN);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "mul") == 0) {
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_PROD);
+            tend = MPI_Wtime();
+        }
+    }
+
+    if(*type=='d') {
+        if(strcmp(op, "sum") == 0){
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_SUM);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "max") == 0){
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_MAX);
+            tend = MPI_Wtime();
+        }
+
+        if(strcmp(op, "min") == 0) {
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_MIN);
+            tend = MPI_Wtime();
+        }
+
+         if(strcmp(op, "mul") == 0) {
+             tstart = MPI_Wtime();
+             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_PROD);
+             tend = MPI_Wtime();
+         }
+    }
+/*
+    printf("\n ================\n");
+    for (i=0; i<count; i++)
+    {
+        printf(" float: %f", inout_float[i]);
+        if((i+1)%16==0)
+            printf("\n");
+    }
+
+    printf("\n ================\n");
+    for (i=0; i<count; i++)
+    {
+        printf(" out8: %d", inout_uint8[i]);
+        if((i+1)%64==0)
+            printf("\n");
+    }
+    printf("\n ================\n");
+    for (i=0; i<count; i++)
+    {
+        printf(" out16: %d",inout_uint16[i]);
+        if((i+1)%32==0)
+            printf("\n");
+    }
+    printf("\n ================\n");
+    for (i=0; i<count; i++)
+    {
+        printf(" out32: %d",inout_uint32[i]);
+        if((i+1)%16==0)
+            printf("\n");
+    }
+
+    printf("\n ================\n");
+    for (i=0; i<count; i++)
+    {
+        printf(" out64: %d",inout_uint64[i]);
+        if((i+1)%8==0)
+            printf("\n");
+    }
+
+    printf("\n ================\n");
+    for (i=0; i<count; i++)
+    {
+        printf(" float: %f", inout_double[i]);
+        if((i+1)%8==0)
+            printf("\n");
+    }
+*/
+    printf("#Local Reduce time %.6f seconds\n", tend-tstart);
+    MPI_Finalize();
+    return 0;
+}
+

From db633a2e2cfcbb35aacbd244ca7e01aa4d2a3b52 Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Fri, 25 Oct 2019 16:57:31 -0400
Subject: [PATCH 02/13] clean the code and fixed module OBJ_RETAIN bug

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 ompi/mca/op/intel_avx_op/op_avx_component.c | 45 ++++++++--------
 ompi/mca/op/intel_avx_op/op_avx_functions.c | 60 ++-------------------
 test/datatype/Reduce_local_float.c          | 26 ++++++---
 3 files changed, 45 insertions(+), 86 deletions(-)

diff --git a/ompi/mca/op/intel_avx_op/op_avx_component.c b/ompi/mca/op/intel_avx_op/op_avx_component.c
index ddb9d6c759c..d7be36076df 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_component.c
+++ b/ompi/mca/op/intel_avx_op/op_avx_component.c
@@ -63,7 +63,6 @@ ompi_op_avx_component_t mca_op_avx_component = {
  */
 static int avx_component_open(void)
 {
-    opal_output(ompi_op_base_framework.framework_output, "avx component open");
 
     /* A first level check to see if avx is even available in this
        process.  E.g., you may want to do a first-order check to see
@@ -84,7 +83,6 @@ static int avx_component_open(void)
  */
 static int avx_component_close(void)
 {
-    opal_output(ompi_op_base_framework.framework_output, "avx component close");
 
     /* If avx was opened successfully, close it (i.e., release any
        resources that may have been allocated on this component).
@@ -96,23 +94,19 @@ static int avx_component_close(void)
     return OMPI_SUCCESS;
 }
 
-static char *avx_component_version;
-
 /*
  * Register MCA params.
  */
 static int avx_component_register(void)
 {
 
-    opal_output(ompi_op_base_framework.framework_output, "avx component register");
-
     /* Additionally, since this component is simulating hardware,
        let's make MCA params that determine whethere a) the hardware
        is available, and b) whether double precision floating point
        types are supported.  This allows you to change the behavior of
        this component at run-time (by setting these MCA params at
        run-time), simulating different kinds of hardware. */
-    mca_op_avx_component.hardware_available = true;
+    mca_op_avx_component.hardware_available = false;
     (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
                                            "hardware_available",
                                            "Whether the hardware is available or not",
@@ -139,8 +133,6 @@ static int avx_component_register(void)
 static int avx_component_init_query(bool enable_progress_threads,
                                         bool enable_mpi_thread_multiple)
 {
-    opal_output(ompi_op_base_framework.framework_output, "avx component init query");
-
     if (mca_op_avx_component.hardware_available && !enable_mpi_thread_multiple) {
         return OMPI_SUCCESS;
     }
@@ -155,14 +147,11 @@ static struct ompi_op_base_module_1_0_0_t *
     avx_component_op_query(struct ompi_op_t *op, int *priority)
 {
     ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t);
-
-    opal_output(ompi_op_base_framework.framework_output, "avx component op query");
-
+    printf("\n %p", module);
     /* Sanity check -- although the framework should never invoke the
        _component_op_query() on non-intrinsic MPI_Op's, we'll put a
        check here just to be sure. */
     if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
-        opal_output(0, "avx component op query: not an intrinsic MPI_Op -- skipping");
         return NULL;
     }
 
@@ -170,52 +159,59 @@ static struct ompi_op_base_module_1_0_0_t *
     switch (op->o_f_to_c_index) {
     case OMPI_OP_BASE_FORTRAN_MAX:
         /* Corresponds to MPI_MAX */
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick MAX");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_MAX][i];
+            OBJ_RETAIN(module);
             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_MIN:
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick MIN");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_MIN][i];
+            OBJ_RETAIN(module);
             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_SUM:
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick SUM");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
-             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
+            OBJ_RETAIN(module);
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_PROD:
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick PRO2BUF");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_PROD][i];
+            OBJ_RETAIN(module);
             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_BOR:
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick BOR");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
-             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
+            OBJ_RETAIN(module);
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_BAND:
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick BAND");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
-             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
+            OBJ_RETAIN(module);
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_BXOR:
-        opal_output(ompi_op_base_framework.framework_output, "avx component op pick BXOR");
         for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
             module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BXOR][i];
+            OBJ_RETAIN(module);
             module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i];
+            OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_LAND:
@@ -233,8 +229,9 @@ static struct ompi_op_base_module_1_0_0_t *
     case OMPI_OP_BASE_FORTRAN_MINLOC:
         module= NULL;
         break;
+    default:
+        module= NULL;
     }
-
     /* If we got a module from above, we'll return it.  Otherwise,
        we'll return NULL, indicating that this component does not want
        to be considered for selection for this MPI_Op.  Note that the
diff --git a/ompi/mca/op/intel_avx_op/op_avx_functions.c b/ompi/mca/op/intel_avx_op/op_avx_functions.c
index f5b168a59d6..464304f15b4 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_functions.c
+++ b/ompi/mca/op/intel_avx_op/op_avx_functions.c
@@ -40,20 +40,7 @@
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step;                                                          \
-    switch(type_size) {                                                \
-        case 8:  \
-                 step = 64; \
-        break; \
-        case 16: \
-                 step = 32; \
-        break; \
-        case 32: \
-                 step = 16; \
-        break; \
-        case 64: \
-                 step = 8;  \
-    }\
+    int step = 512 / type_size;                                        \
     int size = *count/step; \
     int i; \
     int round = size*64; \
@@ -85,20 +72,7 @@
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step;                                                          \
-    switch(type_size) {                                                \
-        case 8:  \
-                 step = 64; \
-        break; \
-        case 16: \
-                 step = 32; \
-        break; \
-        case 32: \
-                 step = 16; \
-        break; \
-        case 64: \
-                 step = 8;  \
-    }\
+    int step = 512 / type_size; \
     int size = *count/step; \
     int i; \
     int round = size*64; \
@@ -289,20 +263,7 @@
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step;                                                          \
-    switch(type_size) {                                                \
-        case 8:  \
-                 step = 64; \
-        break; \
-        case 16: \
-                 step = 32; \
-        break; \
-        case 32: \
-                 step = 16; \
-        break; \
-        case 64: \
-                 step = 8;  \
-    }\
+    int step = 512 / type_size; \
     int size = *count/step; \
     int i; \
     int round = size*64; \
@@ -328,20 +289,7 @@
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step;                                                          \
-    switch(type_size) {                                                \
-        case 8:  \
-                 step = 64; \
-        break; \
-        case 16: \
-                 step = 32; \
-        break; \
-        case 32: \
-                 step = 16; \
-        break; \
-        case 64: \
-                 step = 8;  \
-    }\
+    int step = 512 / type_size; \
     int size = *count/step; \
     int i; \
     int round = size*64; \
diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
index b9c3050e293..9077405ed17 100644
--- a/test/datatype/Reduce_local_float.c
+++ b/test/datatype/Reduce_local_float.c
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
     int elem_size1  = atoi(elem_size);
     char *op = argv[4];
 
-    printf("Sum %d elems, option %c \n",count, *type);
+    printf("%s  %d elems, option %c \n",*op, count, *type);
     int i;
 
     for (i=0; i<count; i++)
@@ -51,7 +51,7 @@ int main(int argc, char **argv) {
         in_double[i] = 10.0+1;
         inout_double[i] = 1.0+2;
 
-        in_uint8[i] = 2;
+        in_uint8[i] = 4;
         inout_uint8[i] = 3;
 
         in_uint16[i] = 2;
@@ -113,11 +113,11 @@ int main(int argc, char **argv) {
             tend = MPI_Wtime();
         }
 
-        if(strcmp(op, "band") == 0) {
+        if(strcmp(op, "land") == 0) {
             printf("#Local Reduce Logical: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
-                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BAND);
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_LAND);
             if (elem_size1 == 16)
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BAND);
             if (elem_size1 == 32)
@@ -169,8 +169,21 @@ int main(int argc, char **argv) {
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD);
             tend = MPI_Wtime();
         }
-    }
 
+        if(strcmp(op, "band") == 0) {
+            printf("#Local Reduce Logical: %d \n", count);
+            tstart = MPI_Wtime();
+            if (elem_size1 == 8)
+                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BAND);
+            if (elem_size1 == 16)
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_PROD);
+            if (elem_size1 == 32)
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_PROD);
+            if (elem_size1 == 64)
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD);
+            tend = MPI_Wtime();
+        }
+    }
 
     if(*type=='f') {
         if(strcmp(op, "sum") == 0){
@@ -231,7 +244,7 @@ int main(int argc, char **argv) {
         if((i+1)%16==0)
             printf("\n");
     }
-
+*/
     printf("\n ================\n");
     for (i=0; i<count; i++)
     {
@@ -239,6 +252,7 @@ int main(int argc, char **argv) {
         if((i+1)%64==0)
             printf("\n");
     }
+/*
     printf("\n ================\n");
     for (i=0; i<count; i++)
     {

From 721e352169232603d0f10b3592661928d8469024 Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Fri, 25 Oct 2019 17:22:16 -0400
Subject: [PATCH 03/13] minor clean

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 ompi/mca/op/intel_avx_op/op_avx_component.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ompi/mca/op/intel_avx_op/op_avx_component.c b/ompi/mca/op/intel_avx_op/op_avx_component.c
index d7be36076df..9e6a571929d 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_component.c
+++ b/ompi/mca/op/intel_avx_op/op_avx_component.c
@@ -147,7 +147,6 @@ static struct ompi_op_base_module_1_0_0_t *
     avx_component_op_query(struct ompi_op_t *op, int *priority)
 {
     ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t);
-    printf("\n %p", module);
     /* Sanity check -- although the framework should never invoke the
        _component_op_query() on non-intrinsic MPI_Op's, we'll put a
        check here just to be sure. */

From fbc8305a12ae031a2e576268b380bb09b40f64e0 Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Fri, 25 Oct 2019 17:51:36 -0400
Subject: [PATCH 04/13] update test example

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 test/datatype/Reduce_local_float.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
index 9077405ed17..e33db456a03 100644
--- a/test/datatype/Reduce_local_float.c
+++ b/test/datatype/Reduce_local_float.c
@@ -40,7 +40,6 @@ int main(int argc, char **argv) {
     int elem_size1  = atoi(elem_size);
     char *op = argv[4];
 
-    printf("%s  %d elems, option %c \n",*op, count, *type);
     int i;
 
     for (i=0; i<count; i++)
@@ -244,7 +243,7 @@ int main(int argc, char **argv) {
         if((i+1)%16==0)
             printf("\n");
     }
-*/
+
     printf("\n ================\n");
     for (i=0; i<count; i++)
     {
@@ -252,7 +251,7 @@ int main(int argc, char **argv) {
         if((i+1)%64==0)
             printf("\n");
     }
-/*
+
     printf("\n ================\n");
     for (i=0; i<count; i++)
     {

From 65644d7133213b6fab2a8807fbf64b4acdbebe1c Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Tue, 5 Nov 2019 16:41:23 -0500
Subject: [PATCH 05/13] Small improvement to the AVX code, for small cases. The
 float add operator is now validated. Trying to figure out the fastest
 implementation.

Add default avx512 flags to compile seamlesly

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
 ompi/mca/op/intel_avx_op/Makefile.am        |  13 +-
 ompi/mca/op/intel_avx_op/op_avx_component.c |   4 +-
 ompi/mca/op/intel_avx_op/op_avx_functions.c | 196 +++++++++++++++-----
 3 files changed, 158 insertions(+), 55 deletions(-)

diff --git a/ompi/mca/op/intel_avx_op/Makefile.am b/ompi/mca/op/intel_avx_op/Makefile.am
index 8bf31fc58c2..d4497a7e717 100644
--- a/ompi/mca/op/intel_avx_op/Makefile.am
+++ b/ompi/mca/op/intel_avx_op/Makefile.am
@@ -9,9 +9,8 @@
 # $HEADER$
 #
 
-# This is an avx op component.  This Makefile.am is a typical
-# avx of how to integrate into Open MPI's Automake-based build
-# system.
+# This component provide support for the Advanced Vector Extensions (AVX)
+# available in recent versions of x86 processors.
 #
 # See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
 # for more details on how to make Open MPI components.
@@ -19,10 +18,8 @@
 # First, list all .h and .c sources.  It is necessary to list all .h
 # files so that they will be picked up in the distribution tarball.
 
-sources = \
-    op_avx.h \
-    op_avx_component.c \
-	op_avx_functions.c
+sources = op_avx_component.c op_avx_functions.c \
+	op_avx_functions.h op_avx.h
 
 # Open MPI components can be compiled two ways:
 #
@@ -60,6 +57,7 @@ endif
 mcacomponentdir = $(ompilibdir)
 mcacomponent_LTLIBRARIES = $(component)
 mca_op_avx_la_SOURCES = $(component_sources)
+mca_op_avx_la_CFLAGS = -mavx -march=skylake-avx512
 mca_op_avx_la_LDFLAGS = -module -avoid-version
 mca_op_avx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
 
@@ -70,4 +68,5 @@ mca_op_avx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
 
 noinst_LTLIBRARIES = $(lib)
 libmca_op_avx_la_SOURCES = $(lib_sources)
+libmca_op_avx_la_CFLAGS = -mavx -march=skylake-avx512
 libmca_op_avx_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/op/intel_avx_op/op_avx_component.c b/ompi/mca/op/intel_avx_op/op_avx_component.c
index 9e6a571929d..3c673c0adbb 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_component.c
+++ b/ompi/mca/op/intel_avx_op/op_avx_component.c
@@ -35,8 +35,6 @@ static struct ompi_op_base_module_1_0_0_t *
 static int avx_component_register(void);
 
 ompi_op_avx_component_t mca_op_avx_component = {
-    /* First, the mca_base_component_t struct containing meta
-       information about the component itself */
     {
         .opc_version = {
             OMPI_OP_BASE_VERSION_1_0_0,
@@ -133,7 +131,7 @@ static int avx_component_register(void)
 static int avx_component_init_query(bool enable_progress_threads,
                                         bool enable_mpi_thread_multiple)
 {
-    if (mca_op_avx_component.hardware_available && !enable_mpi_thread_multiple) {
+    if (mca_op_avx_component.hardware_available) {
         return OMPI_SUCCESS;
     }
     return OMPI_ERR_NOT_SUPPORTED;
diff --git a/ompi/mca/op/intel_avx_op/op_avx_functions.c b/ompi/mca/op/intel_avx_op/op_avx_functions.c
index 464304f15b4..6f7a9f74bf7 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_functions.c
+++ b/ompi/mca/op/intel_avx_op/op_avx_functions.c
@@ -22,8 +22,9 @@
 #include "ompi/mca/op/intel_avx_op/op_avx.h"
 #include "ompi/mca/op/intel_avx_op/op_avx_functions.h"
 
+#include <xmmintrin.h>  /* SIMD v2 */
+#include <pmmintrin.h>  /* SIMD v3 */
 #include <immintrin.h>
-
 /*
  * Since all the functions in this file are essentially identical, we
  * use a macro to substitute in names and types.  The core operation
@@ -36,7 +37,7 @@
  *
  */
 #define OP_AVX_FUNC(name, type_sign, type_size, type, op) \
-    static void ompi_op_avx_2buff_##name##_##type(void *in, void *out, int *count, \
+    static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -44,6 +45,8 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    type* in = (type*)_in; \
+    type* out = (type*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512i vecA =  _mm512_loadu_si512((in+i));\
         __m512i vecB =  _mm512_loadu_si512((out+i));\
@@ -68,7 +71,7 @@
  *
  */
 #define OP_AVX_BIT_FUNC(name, type_size, type, op) \
-    static void ompi_op_avx_2buff_##name##_##type(void *in, void *out, int *count, \
+    static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -76,6 +79,8 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    type* in = (type*)_in; \
+    type* out = (type*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512i vecA =  _mm512_loadu_si512((in+i));\
         __m512i vecB =  _mm512_loadu_si512((out+i));\
@@ -93,34 +98,79 @@
     }\
 }
 
-#define OP_AVX_FLOAT_FUNC(op) \
-    static void ompi_op_avx_2buff_##op##_float(void *in, void *out, int *count, \
-            struct ompi_datatype_t **dtype, \
-            struct ompi_op_base_module_1_0_0_t *module) \
-{                                                                      \
-    int step = 16;                                                          \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
-    for (i = 0; i < round; i+=64) { \
-        __m512 vecA =  _mm512_load_ps((in+i));\
-        __m512 vecB =  _mm512_load_ps((out+i));\
-        __m512 res = _mm512_##op##_ps(vecA, vecB); \
-        _mm512_store_ps((out+i), res); \
-    }\
-    uint64_t left_over = *count - (size*step); \
-    if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512 vecA = _mm512_maskz_load_ps(left_over, (in+round)); \
-        __m512 vecB = _mm512_maskz_load_ps(left_over, (out+round)); \
-        __m512 res = _mm512_##op##_ps(vecA, vecB); \
-        _mm512_mask_store_ps((out+round), left_over, res); \
-    }\
+#if 0
+#define OP_AVX_FLOAT_FUNC(op)						\
+  static void ompi_op_avx_2buff_##op##_float(void *in, void *out, int *count, \
+					     struct ompi_datatype_t **dtype, \
+					     struct ompi_op_base_module_1_0_0_t *module) \
+  {									\
+    int step = 16;							\
+    int size = *count/step;						\
+    int i;								\
+    int round = size*64;						\
+    for (i = 0; i < round; i+=64) {					\
+      __m512 vecA =  _mm512_load_ps((in+i));				\
+      __m512 vecB =  _mm512_load_ps((out+i));				\
+      __m512 res = _mm512_##op##_ps(vecA, vecB);			\
+	_mm512_store_ps((out+i), res);					\
+    }									\
+    uint64_t left_over = *count - (size*step);				\
+    if(left_over!=0) {							\
+      uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF;			\
+      left_over = left_over64 >>(64-left_over);				\
+      __m512 vecA = _mm512_maskz_load_ps(left_over, (in+round));	\
+      __m512 vecB = _mm512_maskz_load_ps(left_over, (out+round));	\
+      __m512 res = _mm512_##op##_ps(vecA, vecB);			\
+	_mm512_mask_store_ps((out+round), left_over, res);		\
+    }									\
+  }
+
+#else
+#define OP_AVX_FLOAT_FUNC(op)				\
+static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
+					   struct ompi_datatype_t **dtype, \
+					   struct ompi_op_base_module_1_0_0_t *module) \
+{									\
+  int types_per_step = 512 / (8 * sizeof(float));			\
+  int left_over = *count;						\
+  float* in = (float*)_in;						\
+  float* out = (float*)_out;						\
+  for (; left_over >= types_per_step; left_over -= types_per_step) {	\
+    __m512 vecA =  _mm512_load_ps(in);					\
+    __m512 vecB =  _mm512_load_ps(out);					\
+    in += types_per_step;						\
+    __m512 res = _mm512_##op##_ps(vecA, vecB);				\
+      _mm512_store_ps(out, res);					\
+      out += types_per_step;						\
+  }									\
+  if( 0 != left_over ) {						\
+    types_per_step >>= 1;  /* 256 / (8 * sizeof(float));	*/	\
+    if( left_over >= types_per_step ) {					\
+      __m256 vecA =  _mm256_load_ps(in);				\
+      __m256 vecB =  _mm256_load_ps(out);				\
+      __m256 res = _mm256_##op##_ps(vecA, vecB);			\
+	_mm256_store_ps(out, res);					\
+	in += types_per_step;						\
+	out += types_per_step;						\
+	left_over -= types_per_step;					\
+    }									\
+  }									\
+  if( 0 != left_over ) {						\
+    switch(left_over) {							\
+    case 7: out[6] += in[6];						\
+    case 6: out[5] += in[5];						\
+    case 5: out[4] += in[4];						\
+    case 4: out[3] += in[3];						\
+    case 3: out[2] += in[2];						\
+    case 2: out[1] += in[1];						\
+    case 1: out[0] += in[0];						\
+    }									\
+  }									\
 }
+#endif
 
 #define OP_AVX_DOUBLE_FUNC(op) \
-    static void ompi_op_avx_2buff_##op##_double(void *in, void *out, int *count, \
+    static void ompi_op_avx_2buff_##op##_double(void *_in, void *_out, int *count, \
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -128,6 +178,8 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    double* in = (double*)_in; \
+    double* out = (double*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512d vecA =  _mm512_load_pd((in+i));\
         __m512d vecB =  _mm512_load_pd((out+i));\
@@ -192,6 +244,48 @@
 
     /* Floating point */
     OP_AVX_FLOAT_FUNC(add)
+#if 0
+static void ompi_op_avx_2buff_add_float(void *_in, void *_out, int *count,
+					struct ompi_datatype_t **dtype,
+					struct ompi_op_base_module_1_0_0_t *module) 
+{
+  int types_per_step = 512 / (8 * sizeof(float));
+  int left_over = *count;
+  float* in = (float*)_in;
+  float* out = (float*)_out;
+  for (; left_over >= types_per_step; left_over -= types_per_step) {
+    __m512 vecA =  _mm512_load_ps(in);
+    __m512 vecB =  _mm512_load_ps(out);
+    in += types_per_step;
+    __m512 res = _mm512_add_ps(vecA, vecB);
+      _mm512_store_ps(out, res);
+      out += types_per_step;
+  }
+  if( 0 != left_over ) {
+    types_per_step >>= 1;  /* 256 / (8 * sizeof(float)); */
+    if( left_over >= types_per_step ) {
+      __m256 vecA =  _mm256_load_ps(in);
+      __m256 vecB =  _mm256_load_ps(out);
+      __m256 res = _mm256_add_ps(vecA, vecB);
+	_mm256_store_ps(out, res);
+	in += types_per_step;
+	out += types_per_step;
+	left_over -= types_per_step;
+    }
+    if( 0 != left_over ) {
+      switch(left_over) {
+      case 7: out[6] += in[6];
+      case 6: out[5] += in[5];
+      case 5: out[4] += in[4];
+      case 4: out[3] += in[3];
+      case 3: out[2] += in[2];
+      case 2: out[1] += in[1];
+      case 1: out[0] += in[0];
+      }
+    }
+  }
+}
+#endif
     OP_AVX_DOUBLE_FUNC(add)
 
 /*************************************************************************
@@ -220,8 +314,8 @@
     OP_AVX_BIT_FUNC(band, 64,  int64_t, and)
     OP_AVX_BIT_FUNC(band, 64, uint64_t, and)
 
-    OP_AVX_FLOAT_FUNC(and)
-    OP_AVX_DOUBLE_FUNC(and)
+    // not defined - OP_AVX_FLOAT_FUNC(and)
+    // not defined - OP_AVX_DOUBLE_FUNC(and)
 
 /*************************************************************************
  * Bitwise OR
@@ -235,8 +329,8 @@
     OP_AVX_BIT_FUNC(bor, 64,  int64_t, or)
     OP_AVX_BIT_FUNC(bor, 64, uint64_t, or)
 
-    OP_AVX_FLOAT_FUNC(or)
-    OP_AVX_DOUBLE_FUNC(or)
+    // not defined - OP_AVX_FLOAT_FUNC(or)
+    // not defined - OP_AVX_DOUBLE_FUNC(or)
 
 /*************************************************************************
  * Bitwise XOR
@@ -250,16 +344,16 @@
     OP_AVX_BIT_FUNC(bxor, 64,  int64_t, xor)
     OP_AVX_BIT_FUNC(bxor, 64, uint64_t, xor)
 
-    OP_AVX_FLOAT_FUNC(xor)
-    OP_AVX_DOUBLE_FUNC(xor)
+    // not defined - OP_AVX_FLOAT_FUNC(xor)
+    // not defined - OP_AVX_DOUBLE_FUNC(xor)
 
 /*
  *  This is a three buffer (2 input and 1 output) version of the reduction
  *  routines, needed for some optimizations.
  */
 #define OP_AVX_FUNC_3BUFF(name, type_sign, type_size, type, op)\
-        static void ompi_op_avx_3buff_##name##_##type(void * restrict in1,   \
-                void * restrict in2, void * restrict out, int *count, \
+        static void ompi_op_avx_3buff_##name##_##type(void * restrict _in1,   \
+                void * restrict _in2, void * restrict _out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -267,6 +361,9 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    type* in1 = (type*)_in1; \
+    type* in2 = (type*)_in2; \
+    type* out = (type*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512i vecA =  _mm512_loadu_si512((in1+i));\
         __m512i vecB =  _mm512_loadu_si512((in2+i));\
@@ -285,7 +382,7 @@
 }
 
 #define OP_AVX_BIT_FUNC_3BUFF(name, type_size, type, op) \
-        static void ompi_op_avx_3buff_##op##_##type(void *in1, void *in2, void *out, int *count, \
+        static void ompi_op_avx_3buff_##op##_##type(void *_in1, void *_in2, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -293,6 +390,9 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    type* in1 = (type*)_in1; \
+    type* in2 = (type*)_in2; \
+    type* out = (type*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512i vecA =  _mm512_loadu_si512((in1+i));\
         __m512i vecB =  _mm512_loadu_si512((in2+i));\
@@ -311,7 +411,7 @@
 }
 
 #define OP_AVX_FLOAT_FUNC_3BUFF(op) \
-        static void ompi_op_avx_3buff_##op##_float(void *in1, void *in2, void *out, int *count, \
+        static void ompi_op_avx_3buff_##op##_float(void *_in1, void *_in2, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -319,6 +419,9 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    float* in1 = (float*)_in1; \
+    float* in2 = (float*)_in2; \
+    float* out = (float*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512 vecA =  _mm512_load_ps((in1+i));\
         __m512 vecB =  _mm512_load_ps((in2+i));\
@@ -337,7 +440,7 @@
 }
 
 #define OP_AVX_DOUBLE_FUNC_3BUFF(op) \
-        static void ompi_op_avx_3buff_##op##_double(void *in1, void *in2, void *out, int *count, \
+        static void ompi_op_avx_3buff_##op##_double(void *_in1, void *_in2, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
@@ -345,6 +448,9 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
+    double* in1 = (double*)_in1; \
+    double* in2 = (double*)_in2; \
+    double* out = (double*)_out; \
     for (i = 0; i < round; i+=64) { \
         __m512d vecA =  _mm512_load_pd((in1+i));\
         __m512d vecB =  _mm512_load_pd((in2+i));\
@@ -436,8 +542,8 @@
     OP_AVX_BIT_FUNC_3BUFF(band, 64,  int64_t, and)
     OP_AVX_BIT_FUNC_3BUFF(band, 64, uint64_t, and)
 
-    OP_AVX_FLOAT_FUNC_3BUFF(and)
-    OP_AVX_DOUBLE_FUNC_3BUFF(and)
+    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(and)
+    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(and)
 
 /*************************************************************************
  * Bitwise OR
@@ -451,8 +557,8 @@
     OP_AVX_BIT_FUNC_3BUFF(bor, 64,  int64_t, or)
     OP_AVX_BIT_FUNC_3BUFF(bor, 64, uint64_t, or)
 
-    OP_AVX_FLOAT_FUNC_3BUFF(or)
-    OP_AVX_DOUBLE_FUNC_3BUFF(or)
+    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(or)
+    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(or)
 
 /*************************************************************************
  * Bitwise XOR
@@ -466,8 +572,8 @@
     OP_AVX_BIT_FUNC_3BUFF(bxor, 64,  int64_t, xor)
     OP_AVX_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor)
 
-    OP_AVX_FLOAT_FUNC_3BUFF(xor)
-    OP_AVX_DOUBLE_FUNC_3BUFF(xor)
+    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(xor)
+    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(xor)
 
 
 /** C integer ***********************************************************/

From 6e1dbcac7a58c4b838c299c2e4681f9c504bd188 Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Thu, 7 Nov 2019 18:35:45 -0500
Subject: [PATCH 06/13] Reshape the AVX component to automatically activate.
 During configure it detects if the target architecture is x86_64 to enable
 itself. Then during query it detects processor capabilities using cpuid and
 disable itself if AVX2 support is not found.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
 ompi/mca/op/Makefile.am                       |  2 +
 ompi/mca/op/{intel_avx_op => avx}/Makefile.am |  8 +-
 ompi/mca/op/avx/configure.m4                  | 64 ++++++++++++
 ompi/mca/op/{intel_avx_op => avx}/op_avx.h    |  0
 .../{intel_avx_op => avx}/op_avx_component.c  | 99 +++++++++++++------
 .../{intel_avx_op => avx}/op_avx_functions.c  |  6 +-
 .../{intel_avx_op => avx}/op_avx_functions.h  |  2 +-
 7 files changed, 143 insertions(+), 38 deletions(-)
 rename ompi/mca/op/{intel_avx_op => avx}/Makefile.am (93%)
 create mode 100644 ompi/mca/op/avx/configure.m4
 rename ompi/mca/op/{intel_avx_op => avx}/op_avx.h (100%)
 rename ompi/mca/op/{intel_avx_op => avx}/op_avx_component.c (79%)
 rename ompi/mca/op/{intel_avx_op => avx}/op_avx_functions.c (99%)
 rename ompi/mca/op/{intel_avx_op => avx}/op_avx_functions.h (93%)

diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am
index 8c392f1dbec..c4d47f9e64a 100644
--- a/ompi/mca/op/Makefile.am
+++ b/ompi/mca/op/Makefile.am
@@ -17,6 +17,8 @@
 # $HEADER$
 #
 
+AM_CPPFLAGS = $(LTDLINCL)
+
 # main library setup
 noinst_LTLIBRARIES = libmca_op.la
 libmca_op_la_SOURCES =
diff --git a/ompi/mca/op/intel_avx_op/Makefile.am b/ompi/mca/op/avx/Makefile.am
similarity index 93%
rename from ompi/mca/op/intel_avx_op/Makefile.am
rename to ompi/mca/op/avx/Makefile.am
index d4497a7e717..3be0f3e4028 100644
--- a/ompi/mca/op/intel_avx_op/Makefile.am
+++ b/ompi/mca/op/avx/Makefile.am
@@ -18,6 +18,8 @@
 # First, list all .h and .c sources.  It is necessary to list all .h
 # files so that they will be picked up in the distribution tarball.
 
+AM_CPPFLAGS=$(op_avx_CPPFLAGS)
+
 sources = op_avx_component.c op_avx_functions.c \
 	op_avx_functions.h op_avx.h
 
@@ -37,7 +39,7 @@ sources = op_avx_component.c op_avx_functions.c \
 # MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
 # which way this component should be built.
 
-if MCA_BUILD_ompi_op_intel_avx_op_DSO
+if MCA_BUILD_ompi_op_avx_DSO
 lib =
 lib_sources =
 component = mca_op_avx.la
@@ -57,7 +59,7 @@ endif
 mcacomponentdir = $(ompilibdir)
 mcacomponent_LTLIBRARIES = $(component)
 mca_op_avx_la_SOURCES = $(component_sources)
-mca_op_avx_la_CFLAGS = -mavx -march=skylake-avx512
+mca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS)
 mca_op_avx_la_LDFLAGS = -module -avoid-version
 mca_op_avx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
 
@@ -68,5 +70,5 @@ mca_op_avx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
 
 noinst_LTLIBRARIES = $(lib)
 libmca_op_avx_la_SOURCES = $(lib_sources)
-libmca_op_avx_la_CFLAGS = -mavx -march=skylake-avx512
+libmca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS)
 libmca_op_avx_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
new file mode 100644
index 00000000000..18c3cad3734
--- /dev/null
+++ b/ompi/mca/op/avx/configure.m4
@@ -0,0 +1,64 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2019      The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_avx_CONFIG([action-if-can-compile],
+#		          [action-if-cant-compile])
+# ------------------------------------------------
+# We can always build, unless we were explicitly disabled.
+AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
+    OPAL_VAR_SCOPE_PUSH([op_avx_support op_avx_cflags_save op_avx_CPPFLAGS])
+    op_avx_support=0
+
+    AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
+          [AC_LANG_PUSH([C])
+
+           # Check for AVX512 support with default flags
+           op_avx_CPPFLAGS=""
+           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
+           AC_LINK_IFELSE(
+               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                [[
+    __m512 vA, vB;
+    _mm512_add_ps(vA, vB);
+                                ]])],
+               [op_avx_support=1
+                AC_MSG_RESULT([yes])],
+               [AC_MSG_RESULT([no])])
+
+           AS_IF([test $op_avx_support -eq 0],
+                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
+                  op_avx_cflags_save="$CFLAGS"
+                  CFLAGS="$CFLAGS -march=skylake-avx512"
+                  AC_LINK_IFELSE(
+	              [AC_LANG_PROGRAM([[#include <immintrin.h>]],
+                                       [[
+    __m512 vA, vB;
+    _mm512_add_ps(vA, vB);
+                                       ]])],
+                      [op_avx_support=1
+	               op_avx_CPPFLAGS="-march=skylake-avx512"
+                       AC_MSG_RESULT([yes])],
+                      [AC_MSG_RESULT([no])])])
+           CFLAGS="$op_avx_cflags_save"
+           AC_LANG_POP([C])])
+
+    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
+		       [$op_avx_support],
+		       [Whetever AVX512 is supported in the current compilation context])
+    AS_IF([test $op_avx_support -eq 1],
+    	  [$1],
+	  [$2])
+    AC_SUBST([op_avx_CPPFLAGS])
+
+    OPAL_VAR_SCOPE_POP
+])dnl
diff --git a/ompi/mca/op/intel_avx_op/op_avx.h b/ompi/mca/op/avx/op_avx.h
similarity index 100%
rename from ompi/mca/op/intel_avx_op/op_avx.h
rename to ompi/mca/op/avx/op_avx.h
diff --git a/ompi/mca/op/intel_avx_op/op_avx_component.c b/ompi/mca/op/avx/op_avx_component.c
similarity index 79%
rename from ompi/mca/op/intel_avx_op/op_avx_component.c
rename to ompi/mca/op/avx/op_avx_component.c
index 3c673c0adbb..de8545f9579 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_component.c
+++ b/ompi/mca/op/avx/op_avx_component.c
@@ -23,8 +23,8 @@
 #include "ompi/op/op.h"
 #include "ompi/mca/op/op.h"
 #include "ompi/mca/op/base/base.h"
-#include "ompi/mca/op/intel_avx_op/op_avx.h"
-#include "ompi/mca/op/intel_avx_op/op_avx_functions.h"
+#include "ompi/mca/op/avx/op_avx.h"
+#include "ompi/mca/op/avx/op_avx_functions.h"
 
 static int avx_component_open(void);
 static int avx_component_close(void);
@@ -34,6 +34,61 @@ static struct ompi_op_base_module_1_0_0_t *
     avx_component_op_query(struct ompi_op_t *op, int *priority);
 static int avx_component_register(void);
 
+/**
+ * A slightly modified code from
+ * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+ */
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
+
+#include <immintrin.h>
+
+int has_intel_AVX512_features(void)
+{
+    const unsigned long avx512_features = _FEATURE_AVX2;
+
+    return _may_i_use_cpu_feature( avx512_features );
+}
+#else /* non-Intel compiler */
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
+{
+#if defined(_MSC_VER)
+    __cpuidex(abcd, eax, ecx);
+#else
+    uint32_t ebx, edx;
+#if defined( __i386__ ) && defined ( __PIC__ )
+    /* in case of PIC under 32-bit EBX cannot be clobbered */
+    __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
+	      "+a" (eax), "=c" (ecx), "=d" (edx) );
+#else
+    __asm__ ( "cpuid" : "=b" (ebx),
+	      "+a" (eax), "=c" (ecx), "=d" (edx) );
+#endif  /* defined( __i386__ ) && defined ( __PIC__ ) */
+    abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
+#endif
+}
+
+int has_intel_AVX512_features(void)
+{
+  uint32_t abcd[4];
+  uint32_t osxsave_mask = (1 << 27);  // OSX.
+  uint32_t avx2_mask = (1 << 5);  // AVX2
+
+  run_cpuid( 1, 0, abcd );
+  // OS supports extended processor state management ?
+  if ( (abcd[2] & osxsave_mask) != osxsave_mask )
+    return 0;
+
+  run_cpuid( 7, 0, abcd );
+  return !!((abcd[1] & avx2_mask) != avx2_mask);
+}
+#endif /* non-Intel compiler */
+
 ompi_op_avx_component_t mca_op_avx_component = {
     {
         .opc_version = {
@@ -72,8 +127,7 @@ static int avx_component_open(void)
        component won't even be shown in ompi_info output (which is
        probably not what you want).
     */
-
-    return OMPI_SUCCESS;
+  return OMPI_SUCCESS;
 }
 
 /*
@@ -95,24 +149,9 @@ static int avx_component_close(void)
 /*
  * Register MCA params.
  */
-static int avx_component_register(void)
+static int
+avx_component_register(void)
 {
-
-    /* Additionally, since this component is simulating hardware,
-       let's make MCA params that determine whethere a) the hardware
-       is available, and b) whether double precision floating point
-       types are supported.  This allows you to change the behavior of
-       this component at run-time (by setting these MCA params at
-       run-time), simulating different kinds of hardware. */
-    mca_op_avx_component.hardware_available = false;
-    (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
-                                           "hardware_available",
-                                           "Whether the hardware is available or not",
-                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
-                                           OPAL_INFO_LVL_9,
-                                           MCA_BASE_VAR_SCOPE_READONLY,
-                                           &mca_op_avx_component.hardware_available);
-
     mca_op_avx_component.double_supported = true;
     (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
                                            "double_supported",
@@ -128,21 +167,21 @@ static int avx_component_register(void)
 /*
  * Query whether this component wants to be used in this process.
  */
-static int avx_component_init_query(bool enable_progress_threads,
-                                        bool enable_mpi_thread_multiple)
+static int
+avx_component_init_query(bool enable_progress_threads,
+			 bool enable_mpi_thread_multiple)
 {
-    if (mca_op_avx_component.hardware_available) {
-        return OMPI_SUCCESS;
-    }
-    return OMPI_ERR_NOT_SUPPORTED;
+    if( !has_intel_AVX512_features() )
+        return OMPI_ERR_NOT_SUPPORTED;
+    return OMPI_SUCCESS;
 }
 
 
 /*
  * Query whether this component can be used for a specific op
  */
-static struct ompi_op_base_module_1_0_0_t *
-    avx_component_op_query(struct ompi_op_t *op, int *priority)
+static struct ompi_op_base_module_1_0_0_t*
+avx_component_op_query(struct ompi_op_t *op, int *priority)
 {
     ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t);
     /* Sanity check -- although the framework should never invoke the
@@ -152,7 +191,7 @@ static struct ompi_op_base_module_1_0_0_t *
         return NULL;
     }
 
-    int i=0;
+    int i = 0;
     switch (op->o_f_to_c_index) {
     case OMPI_OP_BASE_FORTRAN_MAX:
         /* Corresponds to MPI_MAX */
diff --git a/ompi/mca/op/intel_avx_op/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
similarity index 99%
rename from ompi/mca/op/intel_avx_op/op_avx_functions.c
rename to ompi/mca/op/avx/op_avx_functions.c
index 6f7a9f74bf7..abac5cac534 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -19,11 +19,9 @@
 #include "ompi/op/op.h"
 #include "ompi/mca/op/op.h"
 #include "ompi/mca/op/base/base.h"
-#include "ompi/mca/op/intel_avx_op/op_avx.h"
-#include "ompi/mca/op/intel_avx_op/op_avx_functions.h"
+#include "ompi/mca/op/avx/op_avx.h"
+#include "ompi/mca/op/avx/op_avx_functions.h"
 
-#include <xmmintrin.h>  /* SIMD v2 */
-#include <pmmintrin.h>  /* SIMD v3 */
 #include <immintrin.h>
 /*
  * Since all the functions in this file are essentially identical, we
diff --git a/ompi/mca/op/intel_avx_op/op_avx_functions.h b/ompi/mca/op/avx/op_avx_functions.h
similarity index 93%
rename from ompi/mca/op/intel_avx_op/op_avx_functions.h
rename to ompi/mca/op/avx/op_avx_functions.h
index b76b45b0419..621f8538d92 100644
--- a/ompi/mca/op/intel_avx_op/op_avx_functions.h
+++ b/ompi/mca/op/avx/op_avx_functions.h
@@ -16,7 +16,7 @@
 #endif
 
 #include "ompi/mca/op/op.h"
-#include "ompi/mca/op/intel_avx_op/op_avx.h"
+#include "ompi/mca/op/avx/op_avx.h"
 
 BEGIN_C_DECLS
 

From d279b49d851be067f75dd9387cf50e0b90212300 Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Thu, 7 Nov 2019 18:37:47 -0500
Subject: [PATCH 07/13] Fix a typos in the detection of rdtsc.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
 config/opal_config_asm.m4 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/opal_config_asm.m4 b/config/opal_config_asm.m4
index b90c4aa61bd..a11682ce24e 100644
--- a/config/opal_config_asm.m4
+++ b/config/opal_config_asm.m4
@@ -1374,7 +1374,7 @@ AC_MSG_ERROR([Can not continue.])
 
     # Check for RDTSCP support
     result=0
-    AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"],
+    AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"],
           [AC_MSG_CHECKING([for RDTSCP assembly support])
            AC_LANG_PUSH([C])
            AC_TRY_RUN([[

From 8a28dee7802e6969765fb765a02796990f103172 Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Fri, 8 Nov 2019 11:15:01 -0500
Subject: [PATCH 08/13] Integrate the AVX component into the OMPI build system.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
 ompi/mca/op/avx/Makefile.am  | 21 ++++++++-------------
 ompi/mca/op/avx/configure.m4 | 23 +++++++++++++----------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am
index 3be0f3e4028..3a02d7510fb 100644
--- a/ompi/mca/op/avx/Makefile.am
+++ b/ompi/mca/op/avx/Makefile.am
@@ -40,15 +40,11 @@ sources = op_avx_component.c op_avx_functions.c \
 # which way this component should be built.
 
 if MCA_BUILD_ompi_op_avx_DSO
-lib =
-lib_sources =
-component = mca_op_avx.la
-component_sources = $(sources)
+component_noinst =
+component_install = mca_op_avx.la
 else
-lib = libmca_op_avx.la
-lib_sources = $(sources)
-component =
-component_sources =
+component_install =
+component_noinst = libmca_op_avx.la
 endif
 
 # Specific information for DSO builds.
@@ -57,18 +53,17 @@ endif
 # $prefix/lib/openmpi).
 
 mcacomponentdir = $(ompilibdir)
-mcacomponent_LTLIBRARIES = $(component)
-mca_op_avx_la_SOURCES = $(component_sources)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_op_avx_la_SOURCES = $(sources)
 mca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS)
 mca_op_avx_la_LDFLAGS = -module -avoid-version
-mca_op_avx_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
 
 # Specific information for static builds.
 #
 # Note that we *must* "noinst"; the upper-layer Makefile.am's will
 # slurp in the resulting .la library into libmpi.
 
-noinst_LTLIBRARIES = $(lib)
-libmca_op_avx_la_SOURCES = $(lib_sources)
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_op_avx_la_SOURCES = $(sources)
 libmca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS)
 libmca_op_avx_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4
index 18c3cad3734..ab21f057933 100644
--- a/ompi/mca/op/avx/configure.m4
+++ b/ompi/mca/op/avx/configure.m4
@@ -12,12 +12,12 @@
 #
 
 # MCA_ompi_op_avx_CONFIG([action-if-can-compile],
-#		          [action-if-cant-compile])
+#		         [action-if-cant-compile])
 # ------------------------------------------------
 # We can always build, unless we were explicitly disabled.
 AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
-    OPAL_VAR_SCOPE_PUSH([op_avx_support op_avx_cflags_save op_avx_CPPFLAGS])
     op_avx_support=0
+    OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save op_avx_CPPFLAGS])
 
     AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
           [AC_LANG_PUSH([C])
@@ -29,7 +29,7 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
                [AC_LANG_PROGRAM([[#include <immintrin.h>]],
                                 [[
     __m512 vA, vB;
-    _mm512_add_ps(vA, vB);
+    _mm512_add_ps(vA, vB)
                                 ]])],
                [op_avx_support=1
                 AC_MSG_RESULT([yes])],
@@ -43,22 +43,25 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
 	              [AC_LANG_PROGRAM([[#include <immintrin.h>]],
                                        [[
     __m512 vA, vB;
-    _mm512_add_ps(vA, vB);
+    _mm512_add_ps(vA, vB)
                                        ]])],
                       [op_avx_support=1
 	               op_avx_CPPFLAGS="-march=skylake-avx512"
                        AC_MSG_RESULT([yes])],
-                      [AC_MSG_RESULT([no])])])
-           CFLAGS="$op_avx_cflags_save"
-           AC_LANG_POP([C])])
+                      [AC_MSG_RESULT([no])])
+                  CFLAGS="$op_avx_cflags_save"
+                 ])
+           AC_LANG_POP([C])
+          ])
 
     AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
 		       [$op_avx_support],
 		       [Whetever AVX512 is supported in the current compilation context])
+    AC_SUBST([op_avx_CPPFLAGS])
+    OPAL_VAR_SCOPE_POP
     AS_IF([test $op_avx_support -eq 1],
-    	  [$1],
+    	  [AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])
+           [$1]],
 	  [$2])
-    AC_SUBST([op_avx_CPPFLAGS])
 
-    OPAL_VAR_SCOPE_POP
 ])dnl

From 615692fc00851ccb134bb4c8c0e6b4f6cbb3cdd9 Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Fri, 8 Nov 2019 18:44:02 -0500
Subject: [PATCH 09/13] Cleanups on the initialization path.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
 ompi/mca/op/avx/op_avx.h           |  3 --
 ompi/mca/op/avx/op_avx_component.c | 70 ++++--------------------------
 ompi/mca/op/avx/op_avx_functions.c | 46 +-------------------
 3 files changed, 11 insertions(+), 108 deletions(-)

diff --git a/ompi/mca/op/avx/op_avx.h b/ompi/mca/op/avx/op_avx.h
index 5ec4b3f9e00..927a48db947 100644
--- a/ompi/mca/op/avx/op_avx.h
+++ b/ompi/mca/op/avx/op_avx.h
@@ -37,9 +37,6 @@ typedef struct {
        avxs; replace them with whatever is relevant for your
        component. */
 
-    /** A simple boolean indicating that the hardware is available. */
-    bool hardware_available;
-
     /** A simple boolean indicating whether double precision is
         supported. */
     bool double_supported;
diff --git a/ompi/mca/op/avx/op_avx_component.c b/ompi/mca/op/avx/op_avx_component.c
index de8545f9579..5113a6cf437 100644
--- a/ompi/mca/op/avx/op_avx_component.c
+++ b/ompi/mca/op/avx/op_avx_component.c
@@ -183,7 +183,7 @@ avx_component_init_query(bool enable_progress_threads,
 static struct ompi_op_base_module_1_0_0_t*
 avx_component_op_query(struct ompi_op_t *op, int *priority)
 {
-    ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t);
+    ompi_op_base_module_t *module = NULL;
     /* Sanity check -- although the framework should never invoke the
        _component_op_query() on non-intrinsic MPI_Op's, we'll put a
        check here just to be sure. */
@@ -191,82 +191,30 @@ avx_component_op_query(struct ompi_op_t *op, int *priority)
         return NULL;
     }
 
-    int i = 0;
     switch (op->o_f_to_c_index) {
     case OMPI_OP_BASE_FORTRAN_MAX:
-        /* Corresponds to MPI_MAX */
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_MAX][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i];
-            OBJ_RETAIN(module);
-        }
-        break;
     case OMPI_OP_BASE_FORTRAN_MIN:
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_MIN][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i];
-            OBJ_RETAIN(module);
-        }
-        break;
     case OMPI_OP_BASE_FORTRAN_SUM:
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i];
-            OBJ_RETAIN(module);
-        }
-        break;
     case OMPI_OP_BASE_FORTRAN_PROD:
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_PROD][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i];
-            OBJ_RETAIN(module);
-        }
-        break;
     case OMPI_OP_BASE_FORTRAN_BOR:
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i];
-            OBJ_RETAIN(module);
-        }
-        break;
     case OMPI_OP_BASE_FORTRAN_BAND:
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i];
-            OBJ_RETAIN(module);
-        }
-        break;
     case OMPI_OP_BASE_FORTRAN_BXOR:
-        for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
-            module->opm_fns[i] = ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_BXOR][i];
-            OBJ_RETAIN(module);
-            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i];
-            OBJ_RETAIN(module);
+        module = OBJ_NEW(ompi_op_base_module_t);
+        for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+	    module->opm_fns[i] = ompi_op_avx_functions[op->o_f_to_c_index][i];
+	    OBJ_RETAIN(module);
+            module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[op->o_f_to_c_index][i];
+	    OBJ_RETAIN(module);
         }
         break;
     case OMPI_OP_BASE_FORTRAN_LAND:
-        module = NULL;
-        break;
     case OMPI_OP_BASE_FORTRAN_LOR:
-        module = NULL;
-        break;
     case OMPI_OP_BASE_FORTRAN_LXOR:
-        module = NULL;
-        break;
     case OMPI_OP_BASE_FORTRAN_MAXLOC:
-        module = NULL;
-        break;
     case OMPI_OP_BASE_FORTRAN_MINLOC:
-        module= NULL;
-        break;
+    case OMPI_OP_BASE_FORTRAN_REPLACE:
     default:
-        module= NULL;
+        break;
     }
     /* If we got a module from above, we'll return it.  Otherwise,
        we'll return NULL, indicating that this component does not want
diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
index abac5cac534..98bff8601c4 100644
--- a/ompi/mca/op/avx/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -242,48 +242,6 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 
     /* Floating point */
     OP_AVX_FLOAT_FUNC(add)
-#if 0
-static void ompi_op_avx_2buff_add_float(void *_in, void *_out, int *count,
-					struct ompi_datatype_t **dtype,
-					struct ompi_op_base_module_1_0_0_t *module) 
-{
-  int types_per_step = 512 / (8 * sizeof(float));
-  int left_over = *count;
-  float* in = (float*)_in;
-  float* out = (float*)_out;
-  for (; left_over >= types_per_step; left_over -= types_per_step) {
-    __m512 vecA =  _mm512_load_ps(in);
-    __m512 vecB =  _mm512_load_ps(out);
-    in += types_per_step;
-    __m512 res = _mm512_add_ps(vecA, vecB);
-      _mm512_store_ps(out, res);
-      out += types_per_step;
-  }
-  if( 0 != left_over ) {
-    types_per_step >>= 1;  /* 256 / (8 * sizeof(float)); */
-    if( left_over >= types_per_step ) {
-      __m256 vecA =  _mm256_load_ps(in);
-      __m256 vecB =  _mm256_load_ps(out);
-      __m256 res = _mm256_add_ps(vecA, vecB);
-	_mm256_store_ps(out, res);
-	in += types_per_step;
-	out += types_per_step;
-	left_over -= types_per_step;
-    }
-    if( 0 != left_over ) {
-      switch(left_over) {
-      case 7: out[6] += in[6];
-      case 6: out[5] += in[5];
-      case 5: out[4] += in[4];
-      case 4: out[3] += in[3];
-      case 3: out[2] += in[2];
-      case 2: out[1] += in[1];
-      case 1: out[0] += in[0];
-      }
-    }
-  }
-}
-#endif
     OP_AVX_DOUBLE_FUNC(add)
 
 /*************************************************************************
@@ -669,7 +627,7 @@ ompi_op_base_handler_fn_t ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMP
     [OMPI_OP_BASE_FORTRAN_REPLACE] = {
         /* (MPI_ACCUMULATE is handled differently than the other
            reductions, so just zero out its function
-           impementations here to ensure that users don't invoke
+           implementations here to ensure that users don't invoke
            MPI_REPLACE with any reduction operations other than
            ACCUMULATE) */
         NULL,
@@ -732,7 +690,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN
     [OMPI_OP_BASE_FORTRAN_REPLACE] = {
         /* MPI_ACCUMULATE is handled differently than the other
            reductions, so just zero out its function
-           impementations here to ensure that users don't invoke
+           implementations here to ensure that users don't invoke
            MPI_REPLACE with any reduction operations other than
            ACCUMULATE */
         NULL,

From 85ee66e3ff515762f26c2ac94810bbf409cd2b8c Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Sun, 10 Nov 2019 15:31:15 -0500
Subject: [PATCH 10/13] Correctness check for all types and all operations

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 ompi/mca/op/avx/op_avx_functions.c |  32 +-
 test/datatype/Reduce_local_float.c | 533 ++++++++++++++++++++++++-----
 test/datatype/correctness_check.sh |  28 ++
 3 files changed, 494 insertions(+), 99 deletions(-)
 create mode 100644 test/datatype/correctness_check.sh

diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
index 98bff8601c4..9c945574814 100644
--- a/ompi/mca/op/avx/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -43,22 +43,20 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
-    type* in = (type*)_in; \
-    type* out = (type*)_out; \
-    for (i = 0; i < round; i+=64) { \
-        __m512i vecA =  _mm512_loadu_si512((in+i));\
-        __m512i vecB =  _mm512_loadu_si512((out+i));\
+    for (i = 0; i < round; i+= 64) { \
+        __m512i vecA =  _mm512_loadu_si512((_in+i));\
+        __m512i vecB =  _mm512_loadu_si512((_out+i));\
         __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-        _mm512_storeu_si512((out+i), res); \
+        _mm512_storeu_si512((_out+i), res); \
     }\
     uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
         uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
         left_over = left_over64 >>(64-left_over); \
-        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in+round)); \
-        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (out+round)); \
+        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (_in+round)); \
+        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (_out+round)); \
         __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+        _mm512_mask_storeu_epi##type_size((_out+round), left_over, res); \
     }\
 }
 
@@ -77,22 +75,20 @@
     int size = *count/step; \
     int i; \
     int round = size*64; \
-    type* in = (type*)_in; \
-    type* out = (type*)_out; \
     for (i = 0; i < round; i+=64) { \
-        __m512i vecA =  _mm512_loadu_si512((in+i));\
-        __m512i vecB =  _mm512_loadu_si512((out+i));\
+        __m512i vecA =  _mm512_loadu_si512((_in+i));\
+        __m512i vecB =  _mm512_loadu_si512((_out+i));\
         __m512i res = _mm512_##op##_si512(vecA, vecB); \
-        _mm512_storeu_si512((out+i), res); \
+        _mm512_storeu_si512((_out+i), res); \
     }\
     uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
         uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
         left_over = left_over64 >>(64-left_over); \
-        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in+round)); \
-        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (out+round)); \
+        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (_in+round)); \
+        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (_out+round)); \
         __m512i res =  _mm512_##op##_si512(vecA, vecB); \
-        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+        _mm512_mask_storeu_epi##type_size((_out+round), left_over, res); \
     }\
 }
 
@@ -178,7 +174,7 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
     int round = size*64; \
     double* in = (double*)_in; \
     double* out = (double*)_out; \
-    for (i = 0; i < round; i+=64) { \
+    for (i = 0; i < round; i+=8) { \
         __m512d vecA =  _mm512_load_pd((in+i));\
         __m512d vecB =  _mm512_load_pd((out+i));\
         __m512d res = _mm512_##op##_pd(vecA, vecB); \
diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
index e33db456a03..277846c55ea 100644
--- a/test/datatype/Reduce_local_float.c
+++ b/test/datatype/Reduce_local_float.c
@@ -14,22 +14,27 @@
 
 float in_float[ARRAYSIZE];
 float inout_float[ARRAYSIZE];
+float inout_float_for_check[ARRAYSIZE];
 
 int8_t in_uint8[ARRAYSIZE];
 int8_t inout_uint8[ARRAYSIZE];
+int8_t inout_uint8_for_check[ARRAYSIZE];
 
 uint16_t in_uint16[ARRAYSIZE];
 uint16_t inout_uint16[ARRAYSIZE];
+uint16_t inout_uint16_for_check[ARRAYSIZE];
 
 uint32_t in_uint32[ARRAYSIZE];
 uint32_t inout_uint32[ARRAYSIZE];
+uint32_t inout_uint32_for_check[ARRAYSIZE];
 
 uint64_t in_uint64[ARRAYSIZE];
 uint64_t inout_uint64[ARRAYSIZE];
+uint64_t inout_uint64_for_check[ARRAYSIZE];
 
 double in_double[ARRAYSIZE];
 double inout_double[ARRAYSIZE];
-
+double inout_double_for_check[ARRAYSIZE];
 
 int main(int argc, char **argv) {
 
@@ -45,22 +50,22 @@ int main(int argc, char **argv) {
     for (i=0; i<count; i++)
     {
         in_float[i] = 1000.0+1;
-        inout_float[i] = 100.0+2;
+        inout_float[i] = inout_float_for_check[i] = 100.0+2;
 
         in_double[i] = 10.0+1;
-        inout_double[i] = 1.0+2;
+        inout_double[i] = inout_double_for_check[i] = 1.0+2;
 
-        in_uint8[i] = 4;
-        inout_uint8[i] = 3;
+        in_uint8[i] = 5;
+        inout_uint8[i] = inout_uint8_for_check[i] = 3;
 
-        in_uint16[i] = 2;
-        inout_uint16[i] = 3;
+        in_uint16[i] = 5;
+        inout_uint16[i] = inout_uint16_for_check[i] = 3;
 
-        in_uint32[i] = 2;
-        inout_uint32[i] = 3;
+        in_uint32[i] = 5;
+        inout_uint32[i] = inout_uint32_for_check[i] = 3;
 
-        in_uint64[i] = 2;
-        inout_uint64[i] = 3;
+        in_uint64[i] = 5;
+        inout_uint64[i] = inout_uint64_for_check[i] = 3;
     }
     int rank, size, len;
 
@@ -69,60 +74,180 @@ int main(int argc, char **argv) {
     MPI_Comm_size(MPI_COMM_WORLD, &size);
     double tstart, tend;
 
+    int correctness=1;
+
     if(*type=='i') {
         if(strcmp(op, "sum") == 0){
             printf("#Local Reduce SUM: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
+            {
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_SUM);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!=in_uint8[i] + inout_uint8_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 SUM check success!");
+                else
+                    printf("Integer Size 8 SUM check fail!");
+            }
             if (elem_size1 == 16)
+            {
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_SUM);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i]!=in_uint16[i] + inout_uint16_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 SUM check success!");
+                else
+                    printf("Integer Size 16 SUM check fail!");
+            }
             if (elem_size1 == 32)
+            {
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_SUM);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i]!=in_uint32[i] + inout_uint32_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 SUM check success!");
+                else
+                    printf("Integer Size 32 SUM check fail!");
+            }
             if (elem_size1 == 64)
+            {
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_SUM);
-             tend = MPI_Wtime();
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i]!=in_uint64[i] + inout_uint64_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 SUM check success!");
+                else
+                    printf("Integer Size 64 SUM check fail!");
+            }
+            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "max") == 0){
             printf("#Local Reduce MAX: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
+            {
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_MAX);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!=in_uint8[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 MAX check success!");
+                else
+                    printf("Integer Size 8 MAX check fail!");
+            }
             if (elem_size1 == 16)
+            {
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_MAX);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i]!=in_uint16[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 MAX check success!");
+                else
+                    printf("Integer Size 16 MAX check fail!");
+            }
             if (elem_size1 == 32)
+            {
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_MAX);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i]!=in_uint32[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 MAX check success!");
+                else
+                    printf("Integer Size 32 MAX check fail!");
+            }
             if (elem_size1 == 64)
+            {
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_MAX);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i]!=in_uint64[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 MAX check success!");
+                else
+                    printf("Integer Size 64 MAX check fail!");
+            }
             tend = MPI_Wtime();
         }
-
+        //intentionly reversed in and out
         if(strcmp(op, "min") == 0) {
             printf("#Local Reduce MIN: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
-                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_MIN);
-            if (elem_size1 == 16)
-                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_MIN);
-            if (elem_size1 == 32)
-                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_MIN);
-            if (elem_size1 == 64)
-                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_MIN);
-            tend = MPI_Wtime();
-        }
-
-        if(strcmp(op, "land") == 0) {
-            printf("#Local Reduce Logical: %d \n", count);
-            tstart = MPI_Wtime();
-            if (elem_size1 == 8)
-                MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_LAND);
+            {
+                MPI_Reduce_local(inout_uint8,in_uint8,count, MPI_INT8_T, MPI_MIN);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!=in_uint8[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 MIN check success!");
+                else
+                    printf("Integer Size 8 MIN check fail!");
+            }
             if (elem_size1 == 16)
-                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BAND);
+            {
+                MPI_Reduce_local(inout_uint16,in_uint16,count, MPI_INT16_T, MPI_MIN);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i]!=in_uint16[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 MIN check success!");
+                else
+                    printf("Integer Size 16 MIN check fail!");
+            }
             if (elem_size1 == 32)
-                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BAND);
+            {
+                MPI_Reduce_local(inout_uint32,in_uint32,count, MPI_UINT32_T, MPI_MIN);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i]!=in_uint32[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 MIN check success!");
+                else
+                    printf("Integer Size 32 MIN check fail!");
+            }
             if (elem_size1 == 64)
-                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BAND);
+            {
+                MPI_Reduce_local(inout_uint64,in_uint64,count, MPI_UINT64_T, MPI_MIN);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i]!=in_uint64[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 MIN check success!");
+                else
+                    printf("Integer Size 64 MIN check fail!");
+            }
             tend = MPI_Wtime();
         }
 
@@ -130,13 +255,57 @@ int main(int argc, char **argv) {
             printf("#Local Reduce Logical: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
+            {
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!= (in_uint8[i] | inout_uint8_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 BOR check success!");
+                else
+                    printf("Integer Size 8 BOR check fail!");
+            }
             if (elem_size1 == 16)
+            {
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i] != (in_uint16[i] | inout_uint16_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 BOR check success!");
+                else
+                    printf("Integer Size 16 BOR check fail!");
+            }
             if (elem_size1 == 32)
+            {
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i] != (in_uint32[i] | inout_uint32_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 BOR check success!");
+                else
+                    printf("Integer Size 32 BOR check fail!");
+            }
             if (elem_size1 == 64)
+            {
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i] != (in_uint64[i] | inout_uint64_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 BOR check success!");
+                else
+                    printf("Integer Size 64 BOR check fail!");
+            }
             tend = MPI_Wtime();
         }
 
@@ -144,13 +313,57 @@ int main(int argc, char **argv) {
             printf("#Local Reduce Logical: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
+            {
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BXOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!=(in_uint8[i] ^ inout_uint8_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 BXOR check success!");
+                else
+                    printf("Integer Size 8 BXOR check fail!");
+            }
             if (elem_size1 == 16)
+            {
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BXOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i]!=(in_uint16[i] ^ inout_uint16_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 BXOR check success!");
+                else
+                    printf("Integer Size 16 BXOR check fail!");
+            }
             if (elem_size1 == 32)
+            {
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BXOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i]!=(in_uint32[i] ^ inout_uint32_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 BXOR check success!");
+                else
+                    printf("Integer Size 32 BXOR check fail!");
+            }
             if (elem_size1 == 64)
+            {
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BXOR);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i]!=(in_uint64[i] ^ inout_uint64_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 BXOR check success!");
+                else
+                    printf("Integer Size 64 BXOR check fail!");
+            }
             tend = MPI_Wtime();
         }
 
@@ -159,13 +372,57 @@ int main(int argc, char **argv) {
             printf("#Local Reduce Logical: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
+            {
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_PROD);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!=in_uint8[i] * inout_uint8_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 PROD check success!");
+                else
+                    printf("Integer Size 8 PROD check fail!");
+            }
             if (elem_size1 == 16)
+            {
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_PROD);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i]!=in_uint16[i] * inout_uint16_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 PROD check success!");
+                else
+                    printf("Integer Size 16 PROD check fail!");
+            }
             if (elem_size1 == 32)
+            {
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_PROD);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i]!=in_uint32[i] * inout_uint32_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 PROD check success!");
+                else
+                    printf("Integer Size 32 PROD check fail!");
+            }
             if (elem_size1 == 64)
+            {
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i]!=in_uint64[i] * inout_uint64_for_check[i])
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 PROD check success!");
+                else
+                    printf("Integer Size 64 PROD check fail!");
+            }
             tend = MPI_Wtime();
         }
 
@@ -173,13 +430,57 @@ int main(int argc, char **argv) {
             printf("#Local Reduce Logical: %d \n", count);
             tstart = MPI_Wtime();
             if (elem_size1 == 8)
+            {
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BAND);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint8[i]!=(in_uint8[i] & inout_uint8_for_check[i]) )
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 8 BAND check success!");
+                else
+                    printf("Integer Size 8 BAND check fail!");
+            }
             if (elem_size1 == 16)
-                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_PROD);
+            {
+                MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BAND);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint16[i]!=(in_uint16[i] & inout_uint16_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 16 BAND check success!");
+                else
+                    printf("Integer Size 16 BAND check fail!");
+            }
             if (elem_size1 == 32)
-                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_PROD);
+            {
+                MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BAND);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint32[i]!=(in_uint32[i] & inout_uint32_for_check[i]))
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 32 BAND check success!");
+                else
+                    printf("Integer Size 32 BAND check fail!");
+            }
             if (elem_size1 == 64)
-                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD);
+            {
+                MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BAND);
+                for (i=0; i<count; i++)
+                {
+                    if(inout_uint64[i]!= (in_uint64[i] & inout_uint64_for_check[i]) )
+                        correctness = 0;
+                }
+                if(correctness)
+                    printf("Integer Size 64 BAND check success!");
+                else
+                    printf("Integer Size 64 BAND check fail!");
+            }
             tend = MPI_Wtime();
         }
     }
@@ -188,24 +489,60 @@ int main(int argc, char **argv) {
         if(strcmp(op, "sum") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_SUM);
+            for (i=0; i<count; i++)
+            {
+                if(inout_float[i]!=inout_float_for_check[i]+in_float[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Float Sum check success!");
+            else
+                printf("Float Sum check fail!");
             tend = MPI_Wtime();
         }
 
         if(strcmp(op, "max") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_MAX);
+            for (i=0; i<count; i++)
+            {
+                if(inout_float[i]!=in_float[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Float Max check success!");
+            else
+                printf("Float Max check fail!");
             tend = MPI_Wtime();
         }
 
         if(strcmp(op, "min") == 0) {
             tstart = MPI_Wtime();
-            MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_MIN);
+            MPI_Reduce_local(inout_float,in_float,count, MPI_FLOAT, MPI_MIN);
+            for (i=0; i<count; i++)
+            {
+                if(inout_float[i]!=in_float[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Float Min check success!");
+            else
+                printf("Float Min check fail!");
             tend = MPI_Wtime();
         }
 
         if(strcmp(op, "mul") == 0) {
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_PROD);
+            for (i=0; i<count; i++)
+            {
+                if(inout_float[i]!=in_float[i] * inout_float_for_check[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Float Prod check success!");
+            else
+                printf("Float Prod check fail!");
             tend = MPI_Wtime();
         }
     }
@@ -214,67 +551,101 @@ int main(int argc, char **argv) {
         if(strcmp(op, "sum") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_SUM);
+            for (i=0; i<count; i++)
+            {
+                if(inout_double[i]!=inout_double_for_check[i]+in_double[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Double Sum check success!");
+            else
+                printf("Double Sum check fail!");
             tend = MPI_Wtime();
         }
 
         if(strcmp(op, "max") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_MAX);
+            for (i=0; i<count; i++)
+            {
+                if(inout_double[i]!=in_double[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Double Max check success!");
+            else
+                printf("Double Max check fail!");
             tend = MPI_Wtime();
         }
 
         if(strcmp(op, "min") == 0) {
             tstart = MPI_Wtime();
-            MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_MIN);
+            MPI_Reduce_local(inout_double,in_double,count, MPI_DOUBLE, MPI_MIN);
+            for (i=0; i<count; i++)
+            {
+                if(inout_double[i]!=in_double[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Double Min check success!");
+            else
+                printf("Double Min check fail!");
             tend = MPI_Wtime();
         }
-
-         if(strcmp(op, "mul") == 0) {
-             tstart = MPI_Wtime();
-             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_PROD);
+        if(strcmp(op, "mul") == 0) {
+            tstart = MPI_Wtime();
+            MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_PROD);
+            for (i=0; i<count; i++)
+            {
+                if(inout_double[i]!=inout_double_for_check[i]*in_double[i])
+                    correctness = 0;
+            }
+            if(correctness)
+                printf("Double Prod check success!");
+            else
+                 printf("Double Prod check fail!");
              tend = MPI_Wtime();
          }
     }
-/*
-    printf("\n ================\n");
-    for (i=0; i<count; i++)
-    {
-        printf(" float: %f", inout_float[i]);
-        if((i+1)%16==0)
-            printf("\n");
-    }
-
-    printf("\n ================\n");
-    for (i=0; i<count; i++)
-    {
-        printf(" out8: %d", inout_uint8[i]);
-        if((i+1)%64==0)
-            printf("\n");
-    }
-
-    printf("\n ================\n");
-    for (i=0; i<count; i++)
-    {
-        printf(" out16: %d",inout_uint16[i]);
-        if((i+1)%32==0)
-            printf("\n");
-    }
-    printf("\n ================\n");
-    for (i=0; i<count; i++)
-    {
-        printf(" out32: %d",inout_uint32[i]);
-        if((i+1)%16==0)
-            printf("\n");
-    }
-
-    printf("\n ================\n");
-    for (i=0; i<count; i++)
-    {
-        printf(" out64: %d",inout_uint64[i]);
-        if((i+1)%8==0)
-            printf("\n");
-    }
-
+    /*
+       printf("\n ================\n");
+       for (i=0; i<count; i++)
+       {
+       printf(" float: %f", inout_float[i]);
+       if((i+1)%16==0)
+       printf("\n");
+       }
+
+       printf("\n ================\n");
+       for (i=0; i<count; i++)
+       {
+       printf(" out8: %d", inout_uint8[i]);
+       if((i+1)%64==0)
+       printf("\n");
+       }
+
+       printf("\n ================\n");
+       for (i=0; i<count; i++)
+       {
+       printf(" out16: %d",inout_uint16[i]);
+       if((i+1)%32==0)
+       printf("\n");
+       }
+       printf("\n ================\n");
+       for (i=0; i<count; i++)
+       {
+       printf(" out32: %d",inout_uint32[i]);
+       if((i+1)%16==0)
+       printf("\n");
+       }
+
+       printf("\n ================\n");
+       for (i=0; i<count; i++)
+       {
+       printf(" out64: %d",inout_uint64[i]);
+       if((i+1)%8==0)
+       printf("\n");
+       }
     printf("\n ================\n");
     for (i=0; i<count; i++)
     {
@@ -283,7 +654,7 @@ int main(int argc, char **argv) {
             printf("\n");
     }
 */
-    printf("#Local Reduce time %.6f seconds\n", tend-tstart);
+    printf("\n#Local Reduce time %.6f seconds\n", tend-tstart);
     MPI_Finalize();
     return 0;
 }
diff --git a/test/datatype/correctness_check.sh b/test/datatype/correctness_check.sh
new file mode 100644
index 00000000000..551800d016a
--- /dev/null
+++ b/test/datatype/correctness_check.sh
@@ -0,0 +1,28 @@
+echo "ompi version with AVX512 -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. args4 operation"
+echo "/home/zhongdong/opt/git/avx512_reduction/bin/mpirun  -mca   op_sve_hardware_available 0  -mca   op_avx_hardware_available 0   -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1048576  i 8 max"
+
+echo "=========Integer type all operations & all sizes========"
+echo ""
+for op in max min sum mul band bor bxor
+do
+    echo ""
+    echo "===Operation  $op test==="
+    for size in 8 16 32 64
+    do
+        /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 i $size $op 
+    done
+done
+
+echo "=======Float type all operations========"
+echo ""
+for op in max min sum mul
+do
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 f 32 $op 
+done
+
+echo "========Double type all operations========="
+echo ""
+for op in max min sum mul
+do
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 d 32 $op 
+done

From 7c1c0634382c5e05a298d5875e5b406105264b62 Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Fri, 15 Nov 2019 18:56:41 -0500
Subject: [PATCH 11/13] Correctness check for all types and all operations with
 mm256 & duff device code

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 ompi/mca/op/avx/op_avx_functions.c | 212 +++++++++++++++++++++--------
 test/datatype/Reduce_local_float.c | 144 ++++++++++----------
 test/datatype/correctness_check.sh |  46 ++++++-
 3 files changed, 275 insertions(+), 127 deletions(-)

diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
index 9c945574814..3fbce355aa3 100644
--- a/ompi/mca/op/avx/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -39,24 +39,64 @@
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 512 / type_size;                                        \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
-    for (i = 0; i < round; i+= 64) { \
-        __m512i vecA =  _mm512_loadu_si512((_in+i));\
-        __m512i vecB =  _mm512_loadu_si512((_out+i));\
+    int types_per_step = 512 / (8 * sizeof(type));          \
+    int left_over = *count; \
+    type* in = (type*)_in; \
+    type* out = (type*)_out; \
+    for (; left_over >= types_per_step; left_over -= types_per_step) { \
+        __m512i vecA =  _mm512_loadu_si512(in);\
+        __m512i vecB =  _mm512_loadu_si512(out);\
+        in += types_per_step; \
         __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-        _mm512_storeu_si512((_out+i), res); \
+        _mm512_storeu_si512((out), res); \
+        out += types_per_step; \
     }\
-    uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (_in+round)); \
-        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (_out+round)); \
-        __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-        _mm512_mask_storeu_epi##type_size((_out+round), left_over, res); \
+        types_per_step >>= 1; \
+        if( left_over >= types_per_step ) {                 \
+            __m256i vecA = _mm256_loadu_si256(in); \
+            __m256i vecB = _mm256_loadu_si256(out); \
+            in += types_per_step;                       \
+            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
+            _mm256_storeu_si256(out, res); \
+            out += types_per_step;                        \
+            left_over -= types_per_step; \
+        }\
+        if( 0 != left_over ) {                        \
+            switch(left_over) {                         \
+                case 31: out[30] = current_func(out[30],in[30]) ;                        \
+                case 30: out[29] = current_func(out[29],in[29]) ;                        \
+                case 29: out[28] = current_func(out[28],in[28]) ;                        \
+                case 28: out[27] = current_func(out[27],in[27]) ;                        \
+                case 27: out[26] = current_func(out[26],in[26]) ;                        \
+                case 26: out[25] = current_func(out[25],in[25]) ;                        \
+                case 25: out[24] = current_func(out[24],in[24]) ;                        \
+                case 24: out[23] = current_func(out[23],in[23]) ;                        \
+                case 23: out[22] = current_func(out[22],in[22]) ;                        \
+                case 22: out[21] = current_func(out[21],in[21]) ;                        \
+                case 21: out[20] = current_func(out[20],in[20]) ;                        \
+                case 20: out[19] = current_func(out[19],in[19]) ;                        \
+                case 19: out[18] = current_func(out[18],in[18]) ;                        \
+                case 18: out[17] = current_func(out[17],in[17]) ;                        \
+                case 17: out[16] = current_func(out[16],in[16]) ;                        \
+                case 16: out[15] = current_func(out[15],in[15]) ;                        \
+                case 15: out[14] = current_func(out[14],in[14]) ;                        \
+                case 14: out[13] = current_func(out[13],in[13]) ;                        \
+                case 13: out[12] = current_func(out[12],in[12]) ;                        \
+                case 12: out[11] = current_func(out[11],in[11]) ;                        \
+                case 11: out[10] = current_func(out[10],in[10]) ;                        \
+                case 10: out[9] = current_func(out[9],in[9]) ;                        \
+                case 9: out[8] = current_func(out[8],in[8]) ;                        \
+                case 8: out[7] = current_func(out[7],in[7]) ;                        \
+                case 7: out[6] = current_func(out[6],in[6]) ;                        \
+                case 6: out[5] = current_func(out[5],in[5]) ;                        \
+                case 5: out[4] = current_func(out[4],in[4]) ;                        \
+                case 4: out[3] = current_func(out[3],in[3]) ;                        \
+                case 3: out[2] = current_func(out[2],in[2]) ;                        \
+                case 2: out[1] = current_func(out[1],in[1]) ;                        \
+                case 1: out[0] = current_func(out[0],in[0]) ;                        \
+            }\
+        }\
     }\
 }
 
@@ -71,24 +111,64 @@
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 512 / type_size; \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
-    for (i = 0; i < round; i+=64) { \
-        __m512i vecA =  _mm512_loadu_si512((_in+i));\
-        __m512i vecB =  _mm512_loadu_si512((_out+i));\
+    int types_per_step = 512 / (8 * sizeof(type)); \
+    int left_over = *count; \
+    type* in = (type*)_in; \
+    type* out = (type*)_out; \
+    for (; left_over >= types_per_step; left_over -= types_per_step) { \
+        __m512i vecA =  _mm512_loadu_si512(in);\
+        __m512i vecB =  _mm512_loadu_si512(out);\
+        in += types_per_step;                       \
         __m512i res = _mm512_##op##_si512(vecA, vecB); \
-        _mm512_storeu_si512((_out+i), res); \
+        _mm512_storeu_si512(out, res); \
+        out += types_per_step;                        \
     }\
-    uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (_in+round)); \
-        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (_out+round)); \
-        __m512i res =  _mm512_##op##_si512(vecA, vecB); \
-        _mm512_mask_storeu_epi##type_size((_out+round), left_over, res); \
+        types_per_step >>= 1; \
+        if( left_over >= types_per_step ) {                 \
+            __m256i vecA = _mm256_loadu_si256(in); \
+            __m256i vecB = _mm256_loadu_si256(out); \
+            in += types_per_step;                       \
+            __m256i res =  _mm256_##op##_si256(vecA, vecB); \
+            _mm256_storeu_si256(out, res); \
+            out += types_per_step;                        \
+            left_over -= types_per_step; \
+        }\
+        if( 0 != left_over ) {                        \
+            switch(left_over) {                         \
+                case 31: out[30] = current_func(out[30],in[30]) ;                        \
+                case 30: out[29] = current_func(out[29],in[29]) ;                        \
+                case 29: out[28] = current_func(out[28],in[28]) ;                        \
+                case 28: out[27] = current_func(out[27],in[27]) ;                        \
+                case 27: out[26] = current_func(out[26],in[26]) ;                        \
+                case 26: out[25] = current_func(out[25],in[25]) ;                        \
+                case 25: out[24] = current_func(out[24],in[24]) ;                        \
+                case 24: out[23] = current_func(out[23],in[23]) ;                        \
+                case 23: out[22] = current_func(out[22],in[22]) ;                        \
+                case 22: out[21] = current_func(out[21],in[21]) ;                        \
+                case 21: out[20] = current_func(out[20],in[20]) ;                        \
+                case 20: out[19] = current_func(out[19],in[19]) ;                        \
+                case 19: out[18] = current_func(out[18],in[18]) ;                        \
+                case 18: out[17] = current_func(out[17],in[17]) ;                        \
+                case 17: out[16] = current_func(out[16],in[16]) ;                        \
+                case 16: out[15] = current_func(out[15],in[15]) ;                        \
+                case 15: out[14] = current_func(out[14],in[14]) ;                        \
+                case 14: out[13] = current_func(out[13],in[13]) ;                        \
+                case 13: out[12] = current_func(out[12],in[12]) ;                        \
+                case 12: out[11] = current_func(out[11],in[11]) ;                        \
+                case 11: out[10] = current_func(out[10],in[10]) ;                        \
+                case 10: out[9] = current_func(out[9],in[9]) ;                        \
+                case 9: out[8] = current_func(out[8],in[8]) ;                        \
+                case 8: out[7] = current_func(out[7],in[7]) ;                        \
+                case 7: out[6] = current_func(out[6],in[6]) ;                        \
+                case 6: out[5] = current_func(out[5],in[5]) ;                        \
+                case 5: out[4] = current_func(out[4],in[4]) ;                        \
+                case 4: out[3] = current_func(out[3],in[3]) ;                        \
+                case 3: out[2] = current_func(out[2],in[2]) ;                        \
+                case 2: out[1] = current_func(out[1],in[1]) ;                        \
+                case 1: out[0] = current_func(out[0],in[0]) ;                        \
+            }\
+        }\
     }\
 }
 
@@ -151,15 +231,15 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
   }									\
   if( 0 != left_over ) {						\
     switch(left_over) {							\
-    case 7: out[6] += in[6];						\
-    case 6: out[5] += in[5];						\
-    case 5: out[4] += in[4];						\
-    case 4: out[3] += in[3];						\
-    case 3: out[2] += in[2];						\
-    case 2: out[1] += in[1];						\
-    case 1: out[0] += in[0];						\
-    }									\
-  }									\
+        case 7: out[6] = current_func(out[6],in[6]) ;                        \
+        case 6: out[5] = current_func(out[5],in[5]) ;                        \
+        case 5: out[4] = current_func(out[4],in[4]) ;                        \
+        case 4: out[3] = current_func(out[3],in[3]) ;                        \
+        case 3: out[2] = current_func(out[2],in[2]) ;                        \
+        case 2: out[1] = current_func(out[1],in[1]) ;                        \
+        case 1: out[0] = current_func(out[0],in[0]) ;                        \
+    }\
+  }\
 }
 #endif
 
@@ -168,26 +248,36 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
             struct ompi_datatype_t **dtype, \
             struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 8;                                                          \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
+    int types_per_step = 512 / (8 * sizeof(double));           \
+    int left_over = *count; \
     double* in = (double*)_in; \
     double* out = (double*)_out; \
-    for (i = 0; i < round; i+=8) { \
-        __m512d vecA =  _mm512_load_pd((in+i));\
-        __m512d vecB =  _mm512_load_pd((out+i));\
+    for (; left_over >= types_per_step; left_over -= types_per_step) {    \
+        __m512d vecA =  _mm512_load_pd(in);\
+        __m512d vecB =  _mm512_load_pd(out);\
+        in += types_per_step;                       \
         __m512d res = _mm512_##op##_pd(vecA, vecB); \
-        _mm512_store_pd((out+i), res); \
+        _mm512_store_pd((out), res); \
+        out += types_per_step;                        \
     }\
-    uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512d vecA = _mm512_maskz_load_pd(left_over, (in+round)); \
-        __m512d vecB = _mm512_maskz_load_pd(left_over, (out+round)); \
-        __m512d res = _mm512_##op##_pd(vecA, vecB); \
-        _mm512_mask_store_pd((out+round), left_over, res); \
+        types_per_step >>= 1;  /* 256 / (8 * sizeof(double));    */  \
+            if( left_over >= types_per_step ) {                 \
+                __m256 vecA =  _mm256_load_pd(in);                \
+                __m256 vecB =  _mm256_load_pd(out);               \
+                __m256 res = _mm256_##op##_pd(vecA, vecB);            \
+                _mm256_store_pd(out, res);                  \
+                in += types_per_step;                       \
+                out += types_per_step;                      \
+                left_over -= types_per_step;                    \
+            }\
+    }\
+    if( 0 != left_over ) {                        \
+        switch(left_over) {                         \
+            case 3: out[2] = current_func(out[2],in[2]) ;                        \
+            case 2: out[1] = current_func(out[1],in[1]) ;                        \
+            case 1: out[0] = current_func(out[0],in[0]) ;                        \
+        }\
     }\
 }
 
@@ -195,6 +285,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Max
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? (a) : (b))
     OP_AVX_FUNC(max, i, 8,    int8_t, max)
     OP_AVX_FUNC(max, u, 8,   uint8_t, max)
     OP_AVX_FUNC(max, i, 16,  int16_t, max)
@@ -211,6 +303,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Min
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) < (b) ? (a) : (b))
     OP_AVX_FUNC(min, i, 8,    int8_t, min)
     OP_AVX_FUNC(min, u, 8,   uint8_t, min)
     OP_AVX_FUNC(min, i, 16,  int16_t, min)
@@ -227,6 +321,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Sum
  ************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) + (b))
     OP_AVX_FUNC(sum, i, 8,    int8_t, add)
     OP_AVX_FUNC(sum, i, 8,   uint8_t, add)
     OP_AVX_FUNC(sum, i, 16,  int16_t, add)
@@ -243,6 +339,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Product
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) * (b))
     OP_AVX_FUNC(prod, i, 16,  int16_t, mullo)
     OP_AVX_FUNC(prod, i, 16, uint16_t, mullo)
     OP_AVX_FUNC(prod, i, 32,  int32_t, mullo)
@@ -257,6 +355,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Bitwise AND
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) & (b))
     OP_AVX_BIT_FUNC(band, 8,    int8_t, and)
     OP_AVX_BIT_FUNC(band, 8,   uint8_t, and)
     OP_AVX_BIT_FUNC(band, 16,  int16_t, and)
@@ -272,6 +372,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Bitwise OR
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) | (b))
     OP_AVX_BIT_FUNC(bor, 8,    int8_t, or)
     OP_AVX_BIT_FUNC(bor, 8,   uint8_t, or)
     OP_AVX_BIT_FUNC(bor, 16,  int16_t, or)
@@ -287,6 +389,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Bitwise XOR
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) ^ (b))
     OP_AVX_BIT_FUNC(bxor, 8,    int8_t, xor)
     OP_AVX_BIT_FUNC(bxor, 8,   uint8_t, xor)
     OP_AVX_BIT_FUNC(bxor, 16,  int16_t, xor)
diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
index 277846c55ea..a9546ebcf1d 100644
--- a/test/datatype/Reduce_local_float.c
+++ b/test/datatype/Reduce_local_float.c
@@ -89,9 +89,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 SUM check success!");
+                    printf("Integer Size 8 SUM check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 SUM check fail!");
+                    printf("Integer Size 8 SUM check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -102,9 +102,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 SUM check success!");
+                    printf("Integer Size 16 SUM check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 SUM check fail!");
+                    printf("Integer Size 16 SUM check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -115,9 +115,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 SUM check success!");
+                    printf("Integer Size 32 SUM check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 SUM check fail!");
+                    printf("Integer Size 32 SUM check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -128,9 +128,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 SUM check success!");
+                    printf("Integer Size 64 SUM check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 SUM check fail!");
+                    printf("Integer Size 64 SUM check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -147,9 +147,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 MAX check success!");
+                    printf("Integer Size 8 MAX check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 MAX check fail!");
+                    printf("Integer Size 8 MAX check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -160,9 +160,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 MAX check success!");
+                    printf("Integer Size 16 MAX check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 MAX check fail!");
+                    printf("Integer Size 16 MAX check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -173,9 +173,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 MAX check success!");
+                    printf("Integer Size 32 MAX check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 MAX check fail!");
+                    printf("Integer Size 32 MAX check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -186,9 +186,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 MAX check success!");
+                    printf("Integer Size 64 MAX check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 MAX check fail!");
+                    printf("Integer Size 64 MAX check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -205,9 +205,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 MIN check success!");
+                    printf("Integer Size 8 MIN check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 MIN check fail!");
+                    printf("Integer Size 8 MIN check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -218,9 +218,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 MIN check success!");
+                    printf("Integer Size 16 MIN check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 MIN check fail!");
+                    printf("Integer Size 16 MIN check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -231,9 +231,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 MIN check success!");
+                    printf("Integer Size 32 MIN check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 MIN check fail!");
+                    printf("Integer Size 32 MIN check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -244,9 +244,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 MIN check success!");
+                    printf("Integer Size 64 MIN check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 MIN check fail!");
+                    printf("Integer Size 64 MIN check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -263,9 +263,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 BOR check success!");
+                    printf("Integer Size 8 BOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 BOR check fail!");
+                    printf("Integer Size 8 BOR check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -276,9 +276,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 BOR check success!");
+                    printf("Integer Size 16 BOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 BOR check fail!");
+                    printf("Integer Size 16 BOR check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -289,9 +289,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 BOR check success!");
+                    printf("Integer Size 32 BOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 BOR check fail!");
+                    printf("Integer Size 32 BOR check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -302,9 +302,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 BOR check success!");
+                    printf("Integer Size 64 BOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 BOR check fail!");
+                    printf("Integer Size 64 BOR check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -321,9 +321,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 BXOR check success!");
+                    printf("Integer Size 8 BXOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 BXOR check fail!");
+                    printf("Integer Size 8 BXOR check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -334,9 +334,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 BXOR check success!");
+                    printf("Integer Size 16 BXOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 BXOR check fail!");
+                    printf("Integer Size 16 BXOR check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -347,9 +347,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 BXOR check success!");
+                    printf("Integer Size 32 BXOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 BXOR check fail!");
+                    printf("Integer Size 32 BXOR check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -360,9 +360,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 BXOR check success!");
+                    printf("Integer Size 64 BXOR check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 BXOR check fail!");
+                    printf("Integer Size 64 BXOR check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -380,9 +380,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 PROD check success!");
+                    printf("Integer Size 8 PROD check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 PROD check fail!");
+                    printf("Integer Size 8 PROD check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -393,9 +393,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 PROD check success!");
+                    printf("Integer Size 16 PROD check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 PROD check fail!");
+                    printf("Integer Size 16 PROD check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -406,9 +406,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 PROD check success!");
+                    printf("Integer Size 32 PROD check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 PROD check fail!");
+                    printf("Integer Size 32 PROD check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -419,9 +419,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 PROD check success!");
+                    printf("Integer Size 64 PROD check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 PROD check fail!");
+                    printf("Integer Size 64 PROD check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -438,9 +438,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 8 BAND check success!");
+                    printf("Integer Size 8 BAND check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 8 BAND check fail!");
+                    printf("Integer Size 8 BAND check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 16)
             {
@@ -451,9 +451,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 16 BAND check success!");
+                    printf("Integer Size 16 BAND check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 16 BAND check fail!");
+                    printf("Integer Size 16 BAND check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 32)
             {
@@ -464,9 +464,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 32 BAND check success!");
+                    printf("Integer Size 32 BAND check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 32 BAND check fail!");
+                    printf("Integer Size 32 BAND check \033[1;31m fail\033[0m!");
             }
             if (elem_size1 == 64)
             {
@@ -477,9 +477,9 @@ int main(int argc, char **argv) {
                         correctness = 0;
                 }
                 if(correctness)
-                    printf("Integer Size 64 BAND check success!");
+                    printf("Integer Size 64 BAND check \033[1;32m success!\033[0m");
                 else
-                    printf("Integer Size 64 BAND check fail!");
+                    printf("Integer Size 64 BAND check \033[1;31m fail\033[0m!");
             }
             tend = MPI_Wtime();
         }
@@ -495,9 +495,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Float Sum check success!");
+                printf("Float Sum check \033[1;32m success!\033[0m");
             else
-                printf("Float Sum check fail!");
+                printf("Float Sum check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
 
@@ -510,9 +510,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Float Max check success!");
+                printf("Float Max check \033[1;32m success!\033[0m");
             else
-                printf("Float Max check fail!");
+                printf("Float Max check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
 
@@ -525,9 +525,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Float Min check success!");
+                printf("Float Min check \033[1;32m success!\033[0m");
             else
-                printf("Float Min check fail!");
+                printf("Float Min check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
 
@@ -540,9 +540,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Float Prod check success!");
+                printf("Float Prod check \033[1;32m success!\033[0m");
             else
-                printf("Float Prod check fail!");
+                printf("Float Prod check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
     }
@@ -557,9 +557,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Double Sum check success!");
+                printf("Double Sum check \033[1;32m success!\033[0m");
             else
-                printf("Double Sum check fail!");
+                printf("Double Sum check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
 
@@ -572,9 +572,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Double Max check success!");
+                printf("Double Max check \033[1;32m success!\033[0m");
             else
-                printf("Double Max check fail!");
+                printf("Double Max check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
 
@@ -587,9 +587,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Double Min check success!");
+                printf("Double Min check \033[1;32m success!\033[0m");
             else
-                printf("Double Min check fail!");
+                printf("Double Min check \033[1;31m fail\033[0m!");
             tend = MPI_Wtime();
         }
         if(strcmp(op, "mul") == 0) {
@@ -601,9 +601,9 @@ int main(int argc, char **argv) {
                     correctness = 0;
             }
             if(correctness)
-                printf("Double Prod check success!");
+                printf("Double Prod check \033[1;32m success!\033[0m");
             else
-                 printf("Double Prod check fail!");
+                 printf("Double Prod check \033[1;31m fail\033[0m!");
              tend = MPI_Wtime();
          }
     }
diff --git a/test/datatype/correctness_check.sh b/test/datatype/correctness_check.sh
index 551800d016a..f7b89da106c 100644
--- a/test/datatype/correctness_check.sh
+++ b/test/datatype/correctness_check.sh
@@ -1,6 +1,14 @@
 echo "ompi version with AVX512 -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. args4 operation"
 echo "/home/zhongdong/opt/git/avx512_reduction/bin/mpirun  -mca   op_sve_hardware_available 0  -mca   op_avx_hardware_available 0   -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1048576  i 8 max"
 
+Orange= "\033[0;33m"
+Blue= '\033[0;34m'
+Purple= '\033[0;35m'
+
+NC='\033[0m'
+
+
+
 echo "=========Integer type all operations & all sizes========"
 echo ""
 for op in max min sum mul band bor bxor
@@ -9,20 +17,56 @@ do
     echo "===Operation  $op test==="
     for size in 8 16 32 64
     do
+        echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N "
         /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 i $size $op 
+         echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) <  Total_num_bits < 512*N "
+         /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 127 i $size $op
+         echo -e "Test \e[1;35m  duff device code \e[m  512*N <  Total_num_bits < 512*N + 256  "
+         /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 130 i $size $op
     done
 done
 
 echo "=======Float type all operations========"
 echo ""
+echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N "
 for op in max min sum mul
 do
     /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 f 32 $op 
 done
 
+echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) <  Total_num_bits < 512*N "
+## 28= 16+8 + 4
+for op in max min sum mul
+do
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 28 f 32 $op
+done
+
+echo -e "Test \e[1;35m  duff device code \e[m  512*N <  Total_num_bits < 512*N + 256  "
+##40=16+4
+for op in max min sum mul
+do
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 20 f 32 $op
+done
+
 echo "========Double type all operations========="
 echo ""
+
+echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N "
+for op in max min sum mul
+do
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 d 64 $op
+done
+
+echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) <  Total_num_bits < 512*N "
+## 20= 8 +8 + 3
+for op in max min sum mul
+do
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 19 d 64 $op
+done
+
+echo -e "Test \e[1;35m  duff device code \e[m  512*N <  Total_num_bits < 512*N + 256  "
+##12=8+2
 for op in max min sum mul
 do
-    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1024 d 32 $op 
+    /home/zhongdong/opt/git/avx512_reduction/bin/mpirun -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 10 d 64 $op
 done

From e60af1eff437b63db48663ecb5cb7f6073b6b8f5 Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Tue, 3 Dec 2019 13:49:40 -0500
Subject: [PATCH 12/13] add special case for mul int8

---
 ompi/mca/op/avx/op_avx_functions.c | 136 ++++++++++++++++++++++++++---
 test/datatype/Reduce_local_float.c | 121 ++++++++++++++++++-------
 2 files changed, 213 insertions(+), 44 deletions(-)

diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
index 3fbce355aa3..b2ed2e8bbfc 100644
--- a/ompi/mca/op/avx/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -100,6 +100,64 @@
     }\
 }
 
+/* special case for int8 mul */
+#define OP_AVX_MUL(name, type_sign, type_size, type, op) \
+        static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \
+                            struct ompi_datatype_t **dtype, \
+                            struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int types_per_step = 256 / (8 * sizeof(type));          \
+    int left_over = *count; \
+    type* in = (type*)_in; \
+    type* out = (type*)_out; \
+    int store_mask = 0xFFFFFFFF;\
+    for (; left_over >= types_per_step; left_over -= types_per_step) { \
+        __m256i vecA_tmp =  _mm256_loadu_si256(in);\
+        __m256i vecB_tmp =  _mm256_loadu_si256(out);\
+        in += types_per_step; \
+        __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
+        __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \
+        __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \
+        _mm512_mask_cvtepi16_storeu_epi8((out), store_mask, res);\
+        out += types_per_step; \
+        }\
+    if(left_over!=0){ \
+        switch(left_over) {                         \
+            case 31: out[30] = current_func(out[30],in[30]) ;                        \
+            case 30: out[29] = current_func(out[29],in[29]) ;                        \
+            case 29: out[28] = current_func(out[28],in[28]) ;                        \
+            case 28: out[27] = current_func(out[27],in[27]) ;                        \
+            case 27: out[26] = current_func(out[26],in[26]) ;                        \
+            case 26: out[25] = current_func(out[25],in[25]) ;                        \
+            case 25: out[24] = current_func(out[24],in[24]) ;                        \
+            case 24: out[23] = current_func(out[23],in[23]) ;                        \
+            case 23: out[22] = current_func(out[22],in[22]) ;                        \
+            case 22: out[21] = current_func(out[21],in[21]) ;                        \
+            case 21: out[20] = current_func(out[20],in[20]) ;                        \
+            case 20: out[19] = current_func(out[19],in[19]) ;                        \
+            case 19: out[18] = current_func(out[18],in[18]) ;                        \
+            case 18: out[17] = current_func(out[17],in[17]) ;                        \
+            case 17: out[16] = current_func(out[16],in[16]) ;                        \
+            case 16: out[15] = current_func(out[15],in[15]) ;                        \
+            case 15: out[14] = current_func(out[14],in[14]) ;                        \
+            case 14: out[13] = current_func(out[13],in[13]) ;                        \
+            case 13: out[12] = current_func(out[12],in[12]) ;                        \
+            case 12: out[11] = current_func(out[11],in[11]) ;                        \
+            case 11: out[10] = current_func(out[10],in[10]) ;                        \
+            case 10: out[9] = current_func(out[9],in[9]) ;                        \
+            case 9: out[8] = current_func(out[8],in[8]) ;                        \
+            case 8: out[7] = current_func(out[7],in[7]) ;                        \
+            case 7: out[6] = current_func(out[6],in[6]) ;                        \
+            case 6: out[5] = current_func(out[5],in[5]) ;                        \
+            case 5: out[4] = current_func(out[4],in[4]) ;                        \
+            case 4: out[3] = current_func(out[3],in[3]) ;                        \
+            case 3: out[2] = current_func(out[2],in[2]) ;                        \
+            case 2: out[1] = current_func(out[1],in[1]) ;                        \
+            case 1: out[0] = current_func(out[0],in[0]) ;                        \
+        }\
+    }\
+}
+
 /*
  *  This macro is for bit-wise operations (out op in).
  *
@@ -341,6 +399,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
  *************************************************************************/
 #undef current_func
 #define current_func(a, b) ((a) * (b))
+    OP_AVX_MUL(prod, i, 8, int8_t, mullo)
+    OP_AVX_MUL(prod, i, 8, uint8_t, mullo)
     OP_AVX_FUNC(prod, i, 16,  int16_t, mullo)
     OP_AVX_FUNC(prod, i, 16, uint16_t, mullo)
     OP_AVX_FUNC(prod, i, 32,  int32_t, mullo)
@@ -437,6 +497,67 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
     }\
 }
 
+/* special case for int8 mul */
+#define OP_AVX_MUL_3BUFF(name, type_sign, type_size, type, op) \
+        static void ompi_op_avx_3buff_##name##_##type(void * restrict _in1,   \
+                void * restrict _in2, void * restrict _out, int *count, \
+                struct ompi_datatype_t **dtype, \
+                struct ompi_op_base_module_1_0_0_t *module) \
+{                                                                      \
+    int types_per_step = 256 / (8 * sizeof(type));          \
+    int left_over = *count; \
+    type* in1 = (type*)_in1; \
+    type* in2 = (type*)_in2; \
+    type* out = (type*)_out; \
+    int store_mask = 0xFFFFFFFF;\
+    for (; left_over >= types_per_step; left_over -= types_per_step) { \
+        __m256i vecA_tmp =  _mm256_loadu_si256(in1);\
+        __m256i vecB_tmp =  _mm256_loadu_si256(in2);\
+        in1 += types_per_step; \
+        in2 += types_per_step; \
+        __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \
+        __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \
+        __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \
+        _mm512_mask_cvtepi16_storeu_epi8((out), store_mask, res);\
+        out += types_per_step; \
+    }\
+    if(left_over!=0){ \
+        switch(left_over) {                         \
+            case 31: out[30] = current_func(in1[30],in2[30]) ;                        \
+            case 30: out[29] = current_func(in1[29],in2[29]) ;                        \
+            case 29: out[28] = current_func(in1[28],in2[28]) ;                        \
+            case 28: out[27] = current_func(in1[27],in2[27]) ;                        \
+            case 27: out[26] = current_func(in1[26],in2[26]) ;                        \
+            case 26: out[25] = current_func(in1[25],in2[25]) ;                        \
+            case 25: out[24] = current_func(in1[24],in2[24]) ;                        \
+            case 24: out[23] = current_func(in1[23],in2[23]) ;                        \
+            case 23: out[22] = current_func(in1[22],in2[22]) ;                        \
+            case 22: out[21] = current_func(in1[21],in2[21]) ;                        \
+            case 21: out[20] = current_func(in1[20],in2[20]) ;                        \
+            case 20: out[19] = current_func(in1[19],in2[19]) ;                        \
+            case 19: out[18] = current_func(in1[18],in2[18]) ;                        \
+            case 18: out[17] = current_func(in1[17],in2[17]) ;                        \
+            case 17: out[16] = current_func(in1[16],in2[16]) ;                        \
+            case 16: out[15] = current_func(in1[15],in2[15]) ;                        \
+            case 15: out[14] = current_func(in1[14],in2[14]) ;                        \
+            case 14: out[13] = current_func(in1[13],in2[13]) ;                        \
+            case 13: out[12] = current_func(in1[12],in2[12]) ;                        \
+            case 12: out[11] = current_func(in1[11],in2[11]) ;                        \
+            case 11: out[10] = current_func(in1[10],in2[10]) ;                        \
+            case 10: out[9] = current_func(in1[9],in2[9]) ;                        \
+            case 9: out[8] = current_func(in1[8],in2[8]) ;                        \
+            case 8: out[7] = current_func(in1[7],in2[7]) ;                        \
+            case 7: out[6] = current_func(in1[6],in2[6]) ;                        \
+            case 6: out[5] = current_func(in1[5],in2[5]) ;                        \
+            case 5: out[4] = current_func(in1[4],in2[4]) ;                        \
+            case 4: out[3] = current_func(in1[3],in2[3]) ;                        \
+            case 3: out[2] = current_func(in1[2],in2[2]) ;                        \
+            case 2: out[1] = current_func(in1[1],in2[1]) ;                        \
+            case 1: out[0] = current_func(in1[0],in2[0]) ;                        \
+        } \
+    }\
+}
+
 #define OP_AVX_BIT_FUNC_3BUFF(name, type_size, type, op) \
         static void ompi_op_avx_3buff_##op##_##type(void *_in1, void *_in2, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
@@ -575,6 +696,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Product
  *************************************************************************/
+    OP_AVX_MUL_3BUFF(prod, i, 8, int8_t, mullo)
+    OP_AVX_MUL_3BUFF(prod, i, 8, uint8_t, mullo)
     OP_AVX_FUNC_3BUFF(prod, i, 16,  int16_t, mullo)
     OP_AVX_FUNC_3BUFF(prod, i, 16, uint16_t, mullo)
     OP_AVX_FUNC_3BUFF(prod, i, 32,  int32_t, mullo)
@@ -653,15 +776,6 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
     [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype),                                              \
     [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype)
 
-#define C_INTEGER_PROD(name, ftype)                                           \
-    [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_avx_##ftype##_##name##_int16_t,   \
-    [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_avx_##ftype##_##name##_uint16_t, \
-    [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_avx_##ftype##_##name##_int32_t,   \
-    [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_avx_##ftype##_##name##_uint32_t, \
-    [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_avx_##ftype##_##name##_int64_t,   \
-    [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_avx_##ftype##_##name##_uint64_t
-
-
 /*
  * MPI_OP_NULL
  * All types
@@ -696,7 +810,7 @@ ompi_op_base_handler_fn_t ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMP
     },
     /* Corresponds to MPI_PROD */
     [OMPI_OP_BASE_FORTRAN_PROD] = {
-        C_INTEGER_PROD(prod, 2buff),
+        C_INTEGER(prod, 2buff),
         FLOATING_POINT(mul, 2buff),
     },
     /* Corresponds to MPI_LAND */
@@ -759,7 +873,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN
     },
     /* Corresponds to MPI_PROD */
     [OMPI_OP_BASE_FORTRAN_PROD] = {
-        C_INTEGER_PROD(prod, 3buff),
+        C_INTEGER(prod, 3buff),
         FLOATING_POINT(mul, 3buff),
     },
     /* Corresponds to MPI_LAND */
diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
index a9546ebcf1d..fabcd0bc0a9 100644
--- a/test/datatype/Reduce_local_float.c
+++ b/test/datatype/Reduce_local_float.c
@@ -4,6 +4,7 @@
 #include <sys/time.h>
 #include <stdbool.h>
 #include <stdint.h>
+#include <unistd.h>
 #ifdef __ARM_FEATURE_SVE
 #include <arm_sve.h>
 #endif /* __ARM_FEATURE_SVE */
@@ -79,10 +80,11 @@ int main(int argc, char **argv) {
     if(*type=='i') {
         if(strcmp(op, "sum") == 0){
             printf("#Local Reduce SUM: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_SUM);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!=in_uint8[i] + inout_uint8_for_check[i])
@@ -95,7 +97,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_SUM);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i]!=in_uint16[i] + inout_uint16_for_check[i])
@@ -108,7 +112,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_SUM);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i]!=in_uint32[i] + inout_uint32_for_check[i])
@@ -121,7 +127,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_SUM);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i]!=in_uint64[i] + inout_uint64_for_check[i])
@@ -132,15 +140,15 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 SUM check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "max") == 0){
             printf("#Local Reduce MAX: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_MAX);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!=in_uint8[i])
@@ -153,7 +161,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_MAX);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i]!=in_uint16[i])
@@ -166,7 +176,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_MAX);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i]!=in_uint32[i])
@@ -179,7 +191,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_MAX);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i]!=in_uint64[i])
@@ -190,15 +204,15 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 MAX check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
         //intentionly reversed in and out
         if(strcmp(op, "min") == 0) {
             printf("#Local Reduce MIN: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(inout_uint8,in_uint8,count, MPI_INT8_T, MPI_MIN);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!=in_uint8[i])
@@ -211,7 +225,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(inout_uint16,in_uint16,count, MPI_INT16_T, MPI_MIN);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i]!=in_uint16[i])
@@ -224,7 +240,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(inout_uint32,in_uint32,count, MPI_UINT32_T, MPI_MIN);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i]!=in_uint32[i])
@@ -237,7 +255,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(inout_uint64,in_uint64,count, MPI_UINT64_T, MPI_MIN);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i]!=in_uint64[i])
@@ -248,15 +268,15 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 MIN check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "bor") == 0) {
             printf("#Local Reduce Logical: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!= (in_uint8[i] | inout_uint8_for_check[i]))
@@ -269,7 +289,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i] != (in_uint16[i] | inout_uint16_for_check[i]))
@@ -282,7 +304,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i] != (in_uint32[i] | inout_uint32_for_check[i]))
@@ -295,7 +319,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i] != (in_uint64[i] | inout_uint64_for_check[i]))
@@ -306,15 +332,15 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 BOR check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "bxor") == 0) {
             printf("#Local Reduce Logical: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BXOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!=(in_uint8[i] ^ inout_uint8_for_check[i]))
@@ -327,7 +353,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BXOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i]!=(in_uint16[i] ^ inout_uint16_for_check[i]))
@@ -340,7 +368,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BXOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i]!=(in_uint32[i] ^ inout_uint32_for_check[i]))
@@ -353,7 +383,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BXOR);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i]!=(in_uint64[i] ^ inout_uint64_for_check[i]))
@@ -364,16 +396,16 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 BXOR check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
 
 
         if(strcmp(op, "mul") == 0) {
             printf("#Local Reduce Logical: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_PROD);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!=in_uint8[i] * inout_uint8_for_check[i])
@@ -386,7 +418,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_PROD);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i]!=in_uint16[i] * inout_uint16_for_check[i])
@@ -399,7 +433,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_PROD);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i]!=in_uint32[i] * inout_uint32_for_check[i])
@@ -412,7 +448,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i]!=in_uint64[i] * inout_uint64_for_check[i])
@@ -423,15 +461,15 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 PROD check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "band") == 0) {
             printf("#Local Reduce Logical: %d \n", count);
-            tstart = MPI_Wtime();
             if (elem_size1 == 8)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_INT8_T, MPI_BAND);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint8[i]!=(in_uint8[i] & inout_uint8_for_check[i]) )
@@ -444,7 +482,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 16)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint16,inout_uint16,count, MPI_INT16_T, MPI_BAND);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint16[i]!=(in_uint16[i] & inout_uint16_for_check[i]))
@@ -457,7 +497,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 32)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BAND);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint32[i]!=(in_uint32[i] & inout_uint32_for_check[i]))
@@ -470,7 +512,9 @@ int main(int argc, char **argv) {
             }
             if (elem_size1 == 64)
             {
+                tstart = MPI_Wtime();
                 MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_BAND);
+                tend = MPI_Wtime();
                 for (i=0; i<count; i++)
                 {
                     if(inout_uint64[i]!= (in_uint64[i] & inout_uint64_for_check[i]) )
@@ -481,7 +525,6 @@ int main(int argc, char **argv) {
                 else
                     printf("Integer Size 64 BAND check \033[1;31m fail\033[0m!");
             }
-            tend = MPI_Wtime();
         }
     }
 
@@ -489,6 +532,7 @@ int main(int argc, char **argv) {
         if(strcmp(op, "sum") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_SUM);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_float[i]!=inout_float_for_check[i]+in_float[i])
@@ -498,12 +542,12 @@ int main(int argc, char **argv) {
                 printf("Float Sum check \033[1;32m success!\033[0m");
             else
                 printf("Float Sum check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "max") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_MAX);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_float[i]!=in_float[i])
@@ -513,12 +557,12 @@ int main(int argc, char **argv) {
                 printf("Float Max check \033[1;32m success!\033[0m");
             else
                 printf("Float Max check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "min") == 0) {
             tstart = MPI_Wtime();
             MPI_Reduce_local(inout_float,in_float,count, MPI_FLOAT, MPI_MIN);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_float[i]!=in_float[i])
@@ -528,12 +572,12 @@ int main(int argc, char **argv) {
                 printf("Float Min check \033[1;32m success!\033[0m");
             else
                 printf("Float Min check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "mul") == 0) {
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_float,inout_float,count, MPI_FLOAT, MPI_PROD);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_float[i]!=in_float[i] * inout_float_for_check[i])
@@ -543,7 +587,6 @@ int main(int argc, char **argv) {
                 printf("Float Prod check \033[1;32m success!\033[0m");
             else
                 printf("Float Prod check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
     }
 
@@ -551,6 +594,7 @@ int main(int argc, char **argv) {
         if(strcmp(op, "sum") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_SUM);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_double[i]!=inout_double_for_check[i]+in_double[i])
@@ -560,12 +604,12 @@ int main(int argc, char **argv) {
                 printf("Double Sum check \033[1;32m success!\033[0m");
             else
                 printf("Double Sum check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "max") == 0){
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_MAX);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_double[i]!=in_double[i])
@@ -575,12 +619,12 @@ int main(int argc, char **argv) {
                 printf("Double Max check \033[1;32m success!\033[0m");
             else
                 printf("Double Max check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
 
         if(strcmp(op, "min") == 0) {
             tstart = MPI_Wtime();
             MPI_Reduce_local(inout_double,in_double,count, MPI_DOUBLE, MPI_MIN);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_double[i]!=in_double[i])
@@ -590,11 +634,11 @@ int main(int argc, char **argv) {
                 printf("Double Min check \033[1;32m success!\033[0m");
             else
                 printf("Double Min check \033[1;31m fail\033[0m!");
-            tend = MPI_Wtime();
         }
         if(strcmp(op, "mul") == 0) {
             tstart = MPI_Wtime();
             MPI_Reduce_local(in_double,inout_double,count, MPI_DOUBLE, MPI_PROD);
+            tend = MPI_Wtime();
             for (i=0; i<count; i++)
             {
                 if(inout_double[i]!=inout_double_for_check[i]*in_double[i])
@@ -603,9 +647,8 @@ int main(int argc, char **argv) {
             if(correctness)
                 printf("Double Prod check \033[1;32m success!\033[0m");
             else
-                 printf("Double Prod check \033[1;31m fail\033[0m!");
-             tend = MPI_Wtime();
-         }
+                printf("Double Prod check \033[1;31m fail\033[0m!");
+        }
     }
     /*
        printf("\n ================\n");
@@ -646,16 +689,28 @@ int main(int argc, char **argv) {
        if((i+1)%8==0)
        printf("\n");
        }
-    printf("\n ================\n");
-    for (i=0; i<count; i++)
-    {
-        printf(" float: %f", inout_double[i]);
-        if((i+1)%8==0)
-            printf("\n");
-    }
-*/
-    printf("\n#Local Reduce time %.6f seconds\n", tend-tstart);
+       printf("\n ================\n");
+       for (i=0; i<count; i++)
+       {
+       printf(" float: %f", inout_double[i]);
+       if((i+1)%8==0)
+       printf("\n");
+       }
+       */
+    //tstart = MPI_Wtime();  
+    //memcpy(in_uint8,inout_uint8, count);
+    //memcpy(in_float, inout_float, count);
+    //memcpy(in_double, inout_double, count);
+    tend = MPI_Wtime();
+    printf("PERF count  %d  time %.6f seconds\n",count, tend-tstart);
     MPI_Finalize();
+#define L1size sysconf(_SC_LEVEL1_DCACHE_SIZE)
+#define L2size sysconf(_SC_LEVEL2_CACHE_SIZE)
+#define L3size sysconf(_SC_LEVEL2_CACHE_SIZE)
+    void cache_flush(){
+        char *cache = (char*)calloc(L1size+L2size+L3size, sizeof(char));
+        free(cache);
+    }
     return 0;
 }
 

From 54a9ea0cacc82705758c0c52a7300eb94b4e2c3c Mon Sep 17 00:00:00 2001
From: dongzhong <zhongdong0321@hotmail.com>
Date: Tue, 3 Dec 2019 14:34:34 -0500
Subject: [PATCH 13/13] Update all 3buff functions

Signed-off-by: dongzhong <zhongdong0321@hotmail.com>
---
 ompi/mca/op/avx/op_avx_functions.c | 258 +++++++++++++++++++++--------
 test/datatype/Reduce_local_float.c |   2 +-
 2 files changed, 192 insertions(+), 68 deletions(-)

diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c
index b2ed2e8bbfc..231696f12ca 100644
--- a/ompi/mca/op/avx/op_avx_functions.c
+++ b/ompi/mca/op/avx/op_avx_functions.c
@@ -473,27 +473,67 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 512 / type_size; \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
+    int types_per_step = 512 / (8 * sizeof(type));          \
+    int left_over = *count; \
     type* in1 = (type*)_in1; \
     type* in2 = (type*)_in2; \
     type* out = (type*)_out; \
-    for (i = 0; i < round; i+=64) { \
-        __m512i vecA =  _mm512_loadu_si512((in1+i));\
-        __m512i vecB =  _mm512_loadu_si512((in2+i));\
+    for (; left_over >= types_per_step; left_over -= types_per_step) { \
+        __m512i vecA =  _mm512_loadu_si512(in1);\
+        __m512i vecB =  _mm512_loadu_si512(in2);\
+        in1 += types_per_step; \
+        in2 += types_per_step; \
         __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-        _mm512_storeu_si512((out+i), res); \
+        _mm512_storeu_si512((out), res); \
+        out += types_per_step; \
     }\
-    uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in1+round)); \
-        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (in2+round)); \
-        __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \
-        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+        types_per_step >>= 1; \
+        if( left_over >= types_per_step ) {                 \
+            __m256i vecA = _mm256_loadu_si256(in1); \
+            __m256i vecB = _mm256_loadu_si256(in2); \
+            in1 += types_per_step;                       \
+            in2 += types_per_step; \
+            __m256i res =  _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \
+            _mm256_storeu_si256(out, res); \
+            out += types_per_step;                        \
+            left_over -= types_per_step; \
+        }\
+        if( 0 != left_over ) {                        \
+            switch(left_over) {                         \
+            case 31: out[30] = current_func(in1[30],in2[30]) ;                        \
+            case 30: out[29] = current_func(in1[29],in2[29]) ;                        \
+            case 29: out[28] = current_func(in1[28],in2[28]) ;                        \
+            case 28: out[27] = current_func(in1[27],in2[27]) ;                        \
+            case 27: out[26] = current_func(in1[26],in2[26]) ;                        \
+            case 26: out[25] = current_func(in1[25],in2[25]) ;                        \
+            case 25: out[24] = current_func(in1[24],in2[24]) ;                        \
+            case 24: out[23] = current_func(in1[23],in2[23]) ;                        \
+            case 23: out[22] = current_func(in1[22],in2[22]) ;                        \
+            case 22: out[21] = current_func(in1[21],in2[21]) ;                        \
+            case 21: out[20] = current_func(in1[20],in2[20]) ;                        \
+            case 20: out[19] = current_func(in1[19],in2[19]) ;                        \
+            case 19: out[18] = current_func(in1[18],in2[18]) ;                        \
+            case 18: out[17] = current_func(in1[17],in2[17]) ;                        \
+            case 17: out[16] = current_func(in1[16],in2[16]) ;                        \
+            case 16: out[15] = current_func(in1[15],in2[15]) ;                        \
+            case 15: out[14] = current_func(in1[14],in2[14]) ;                        \
+            case 14: out[13] = current_func(in1[13],in2[13]) ;                        \
+            case 13: out[12] = current_func(in1[12],in2[12]) ;                        \
+            case 12: out[11] = current_func(in1[11],in2[11]) ;                        \
+            case 11: out[10] = current_func(in1[10],in2[10]) ;                        \
+            case 10: out[9] = current_func(in1[9],in2[9]) ;                        \
+            case 9: out[8] = current_func(in1[8],in2[8]) ;                        \
+            case 8: out[7] = current_func(in1[7],in2[7]) ;                        \
+            case 7: out[6] = current_func(in1[6],in2[6]) ;                        \
+            case 6: out[5] = current_func(in1[5],in2[5]) ;                        \
+            case 5: out[4] = current_func(in1[4],in2[4]) ;                        \
+            case 4: out[3] = current_func(in1[3],in2[3]) ;                        \
+            case 3: out[2] = current_func(in1[2],in2[2]) ;                        \
+            case 2: out[1] = current_func(in1[1],in2[1]) ;                        \
+            case 1: out[0] = current_func(in1[0],in2[0]) ;                        \
+            } \
+        }\
     }\
 }
 
@@ -563,27 +603,67 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 512 / type_size; \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
+    int types_per_step = 512 / (8 * sizeof(type)); \
+    int left_over = *count; \
     type* in1 = (type*)_in1; \
     type* in2 = (type*)_in2; \
     type* out = (type*)_out; \
-    for (i = 0; i < round; i+=64) { \
-        __m512i vecA =  _mm512_loadu_si512((in1+i));\
-        __m512i vecB =  _mm512_loadu_si512((in2+i));\
+    for (; left_over >= types_per_step; left_over -= types_per_step) { \
+        __m512i vecA =  _mm512_loadu_si512(in1);\
+        __m512i vecB =  _mm512_loadu_si512(in2);\
+        in1 += types_per_step;                       \
+        in2 += types_per_step;                       \
         __m512i res = _mm512_##op##_si512(vecA, vecB); \
-        _mm512_storeu_si512((out+i), res); \
+        _mm512_storeu_si512(out, res); \
+        out += types_per_step;                        \
     }\
-    uint64_t left_over = *count - (size*step); \
     if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512i vecA = _mm512_maskz_loadu_epi##type_size(left_over, (in1+round)); \
-        __m512i vecB = _mm512_maskz_loadu_epi##type_size(left_over, (in2+round)); \
-        __m512i res =  _mm512_##op##_si512(vecA, vecB); \
-        _mm512_mask_storeu_epi##type_size((out+round), left_over, res); \
+        types_per_step >>= 1; \
+        if( left_over >= types_per_step ) {                 \
+            __m256i vecA = _mm256_loadu_si256(in1); \
+            __m256i vecB = _mm256_loadu_si256(in2); \
+            in1 += types_per_step;                       \
+            in2 += types_per_step;                       \
+            __m256i res =  _mm256_##op##_si256(vecA, vecB); \
+            _mm256_storeu_si256(out, res); \
+            out += types_per_step;                        \
+            left_over -= types_per_step; \
+        }\
+    }\
+    if( 0 != left_over ) {                        \
+        switch(left_over) {                         \
+            case 31: out[30] = current_func(in1[30],in2[30]) ;                        \
+            case 30: out[29] = current_func(in1[29],in2[29]) ;                        \
+            case 29: out[28] = current_func(in1[28],in2[28]) ;                        \
+            case 28: out[27] = current_func(in1[27],in2[27]) ;                        \
+            case 27: out[26] = current_func(in1[26],in2[26]) ;                        \
+            case 26: out[25] = current_func(in1[25],in2[25]) ;                        \
+            case 25: out[24] = current_func(in1[24],in2[24]) ;                        \
+            case 24: out[23] = current_func(in1[23],in2[23]) ;                        \
+            case 23: out[22] = current_func(in1[22],in2[22]) ;                        \
+            case 22: out[21] = current_func(in1[21],in2[21]) ;                        \
+            case 21: out[20] = current_func(in1[20],in2[20]) ;                        \
+            case 20: out[19] = current_func(in1[19],in2[19]) ;                        \
+            case 19: out[18] = current_func(in1[18],in2[18]) ;                        \
+            case 18: out[17] = current_func(in1[17],in2[17]) ;                        \
+            case 17: out[16] = current_func(in1[16],in2[16]) ;                        \
+            case 16: out[15] = current_func(in1[15],in2[15]) ;                        \
+            case 15: out[14] = current_func(in1[14],in2[14]) ;                        \
+            case 14: out[13] = current_func(in1[13],in2[13]) ;                        \
+            case 13: out[12] = current_func(in1[12],in2[12]) ;                        \
+            case 12: out[11] = current_func(in1[11],in2[11]) ;                        \
+            case 11: out[10] = current_func(in1[10],in2[10]) ;                        \
+            case 10: out[9] = current_func(in1[9],in2[9]) ;                        \
+            case 9: out[8] = current_func(in1[8],in2[8]) ;                        \
+            case 8: out[7] = current_func(in1[7],in2[7]) ;                        \
+            case 7: out[6] = current_func(in1[6],in2[6]) ;                        \
+            case 6: out[5] = current_func(in1[5],in2[5]) ;                        \
+            case 5: out[4] = current_func(in1[4],in2[4]) ;                        \
+            case 4: out[3] = current_func(in1[3],in2[3]) ;                        \
+            case 3: out[2] = current_func(in1[2],in2[2]) ;                        \
+            case 2: out[1] = current_func(in1[1],in2[1]) ;                        \
+            case 1: out[0] = current_func(in1[0],in2[0]) ;                        \
+        } \
     }\
 }
 
@@ -592,27 +672,43 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 16;                                                          \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
-    float* in1 = (float*)_in1; \
-    float* in2 = (float*)_in2; \
-    float* out = (float*)_out; \
-    for (i = 0; i < round; i+=64) { \
-        __m512 vecA =  _mm512_load_ps((in1+i));\
-        __m512 vecB =  _mm512_load_ps((in2+i));\
-        __m512 res = _mm512_##op##_ps(vecA, vecB); \
-        _mm512_store_ps((out+i), res); \
-    }\
-    uint64_t left_over = *count - (size*step); \
-    if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512 vecA = _mm512_maskz_load_ps(left_over, (in1+round)); \
-        __m512 vecB = _mm512_maskz_load_ps(left_over, (in2+round)); \
-        __m512 res = _mm512_##op##_ps(vecA, vecB); \
-        _mm512_mask_store_ps((out+round), left_over, res); \
+    int types_per_step = 512 / (8 * sizeof(float));            \
+    int left_over = *count;                        \
+    float* in1 = (float*)_in1;                        \
+    float* in2 = (float*)_in2;                        \
+    float* out = (float*)_out;                        \
+    for (; left_over >= types_per_step; left_over -= types_per_step) {    \
+        __m512 vecA =  _mm512_load_ps(in1);                    \
+        __m512 vecB =  _mm512_load_ps(in2);                    \
+        in1 += types_per_step;                        \
+        in2 += types_per_step;                        \
+        __m512 res = _mm512_##op##_ps(vecA, vecB);                \
+        _mm512_store_ps(out, res);                    \
+        out += types_per_step;                        \
+    }                                    \
+    if( 0 != left_over ) {                        \
+        types_per_step >>= 1;  /* 256 / (8 * sizeof(float));    */    \
+        if( left_over >= types_per_step ) {                    \
+            __m256 vecA =  _mm256_load_ps(in1);                \
+            __m256 vecB =  _mm256_load_ps(in2);                \
+            __m256 res = _mm256_##op##_ps(vecA, vecB);            \
+            _mm256_store_ps(out, res);                    \
+            in1 += types_per_step;                        \
+            in2 += types_per_step;                        \
+            out += types_per_step;                        \
+            left_over -= types_per_step;                    \
+        }                                    \
+    }                                    \
+    if( 0 != left_over ) {                        \
+        switch(left_over) {                            \
+            case 7: out[6] = current_func(in1[6],in2[6]) ;                        \
+            case 6: out[5] = current_func(in1[5],in2[5]) ;                        \
+            case 5: out[4] = current_func(in1[4],in2[4]) ;                        \
+            case 4: out[3] = current_func(in1[3],in2[3]) ;                        \
+            case 3: out[2] = current_func(in1[2],in2[2]) ;                        \
+            case 2: out[1] = current_func(in1[1],in2[1]) ;                        \
+            case 1: out[0] = current_func(in1[0],in2[0]) ;                        \
+        }\
     }\
 }
 
@@ -621,33 +717,48 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
                 struct ompi_datatype_t **dtype, \
                 struct ompi_op_base_module_1_0_0_t *module) \
 {                                                                      \
-    int step = 8;                                                          \
-    int size = *count/step; \
-    int i; \
-    int round = size*64; \
+    int types_per_step = 512 / (8 * sizeof(double));           \
+    int left_over = *count; \
     double* in1 = (double*)_in1; \
     double* in2 = (double*)_in2; \
     double* out = (double*)_out; \
-    for (i = 0; i < round; i+=64) { \
-        __m512d vecA =  _mm512_load_pd((in1+i));\
-        __m512d vecB =  _mm512_load_pd((in2+i));\
+    for (; left_over >= types_per_step; left_over -= types_per_step) {    \
+        __m512d vecA =  _mm512_load_pd((in1));\
+        __m512d vecB =  _mm512_load_pd((in2));\
+        in1 += types_per_step;                        \
+        in2 += types_per_step;                        \
         __m512d res = _mm512_##op##_pd(vecA, vecB); \
-        _mm512_store_pd((out+i), res); \
+        _mm512_store_pd((out), res); \
+        out += types_per_step;                        \
     }\
-    uint64_t left_over = *count - (size*step); \
-    if(left_over!=0){ \
-        uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \
-        left_over = left_over64 >>(64-left_over); \
-        __m512d vecA = _mm512_maskz_load_pd(left_over, (in1+round)); \
-        __m512d vecB = _mm512_maskz_load_pd(left_over, (in2+round)); \
-        __m512d res = _mm512_##op##_pd(vecA, vecB); \
-        _mm512_mask_store_pd((out+round), left_over, res); \
+    if( 0 != left_over ) {                        \
+        types_per_step >>= 1;  /* 256 / (8 * sizeof(double));    */    \
+        if( left_over >= types_per_step ) {                    \
+            __m256 vecA =  _mm256_load_ps(in1);                \
+            __m256 vecB =  _mm256_load_ps(in2);                \
+            __m256 res = _mm256_##op##_ps(vecA, vecB);            \
+            _mm256_store_ps(out, res);                    \
+            in1 += types_per_step;                        \
+            in2 += types_per_step;                        \
+            out += types_per_step;                        \
+            left_over -= types_per_step;                    \
+            }                                    \
+    }                                    \
+    if( 0 != left_over ) {                        \
+        switch(left_over) {                            \
+            case 3: out[2] = current_func(in1[2],in2[2]) ;                        \
+            case 2: out[1] = current_func(in1[1],in2[1]) ;                        \
+            case 1: out[0] = current_func(in1[0],in2[0]) ;                        \
+        }\
     }\
 }
 
 /*************************************************************************
  * Max
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? (a) : (b))
+
     OP_AVX_FUNC_3BUFF(max, i, 8,    int8_t, max)
     OP_AVX_FUNC_3BUFF(max, u, 8,   uint8_t, max)
     OP_AVX_FUNC_3BUFF(max, i, 16,  int16_t, max)
@@ -664,6 +775,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Min
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) < (b) ? (a) : (b))
     OP_AVX_FUNC_3BUFF(min, i, 8,    int8_t, min)
     OP_AVX_FUNC_3BUFF(min, u, 8,   uint8_t, min)
     OP_AVX_FUNC_3BUFF(min, i, 16,  int16_t, min)
@@ -680,6 +793,9 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Sum
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) + (b))
+
     OP_AVX_FUNC_3BUFF(sum, i, 8,    int8_t, add)
     OP_AVX_FUNC_3BUFF(sum, i, 8,   uint8_t, add)
     OP_AVX_FUNC_3BUFF(sum, i, 16,  int16_t, add)
@@ -696,6 +812,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Product
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) * (b))
     OP_AVX_MUL_3BUFF(prod, i, 8, int8_t, mullo)
     OP_AVX_MUL_3BUFF(prod, i, 8, uint8_t, mullo)
     OP_AVX_FUNC_3BUFF(prod, i, 16,  int16_t, mullo)
@@ -712,6 +830,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Bitwise AND
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) & (b))
     OP_AVX_BIT_FUNC_3BUFF(band, 8,    int8_t, and)
     OP_AVX_BIT_FUNC_3BUFF(band, 8,   uint8_t, and)
     OP_AVX_BIT_FUNC_3BUFF(band, 16,  int16_t, and)
@@ -727,6 +847,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Bitwise OR
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) | (b))
     OP_AVX_BIT_FUNC_3BUFF(bor, 8,    int8_t, or)
     OP_AVX_BIT_FUNC_3BUFF(bor, 8,   uint8_t, or)
     OP_AVX_BIT_FUNC_3BUFF(bor, 16,  int16_t, or)
@@ -742,6 +864,8 @@ static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \
 /*************************************************************************
  * Bitwise XOR
  *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) ^ (b))
     OP_AVX_BIT_FUNC_3BUFF(bxor, 8,    int8_t, xor)
     OP_AVX_BIT_FUNC_3BUFF(bxor, 8,   uint8_t, xor)
     OP_AVX_BIT_FUNC_3BUFF(bxor, 16,  int16_t, xor)
diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c
index fabcd0bc0a9..c31020e04e6 100644
--- a/test/datatype/Reduce_local_float.c
+++ b/test/datatype/Reduce_local_float.c
@@ -57,7 +57,7 @@ int main(int argc, char **argv) {
         inout_double[i] = inout_double_for_check[i] = 1.0+2;
 
         in_uint8[i] = 5;
-        inout_uint8[i] = inout_uint8_for_check[i] = 3;
+        inout_uint8[i] = inout_uint8_for_check[i] = -3;
 
         in_uint16[i] = 5;
         inout_uint16[i] = inout_uint16_for_check[i] = 3;