diff --git a/config/opal_config_asm.m4 b/config/opal_config_asm.m4 index b90c4aa61bd..a11682ce24e 100644 --- a/config/opal_config_asm.m4 +++ b/config/opal_config_asm.m4 @@ -1374,7 +1374,7 @@ AC_MSG_ERROR([Can not continue.]) # Check for RDTSCP support result=0 - AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"], + AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"], [AC_MSG_CHECKING([for RDTSCP assembly support]) AC_LANG_PUSH([C]) AC_TRY_RUN([[ diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..c4d47f9e64a 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/avx/Makefile.am b/ompi/mca/op/avx/Makefile.am new file mode 100644 index 00000000000..3a02d7510fb --- /dev/null +++ b/ompi/mca/op/avx/Makefile.am @@ -0,0 +1,69 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This component provide support for the Advanced Vector Extensions (AVX) +# available in recent versions of x86 processors. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +AM_CPPFLAGS=$(op_avx_CPPFLAGS) + +sources = op_avx_component.c op_avx_functions.c \ + op_avx_functions.h op_avx.h + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. 
As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_avx_DSO +component_noinst = +component_install = mca_op_avx.la +else +component_install = +component_noinst = libmca_op_avx.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_avx_la_SOURCES = $(sources) +mca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS) +mca_op_avx_la_LDFLAGS = -module -avoid-version + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. + +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_avx_la_SOURCES = $(sources) +libmca_op_avx_la_CFLAGS = $(op_avx_CPPFLAGS) +libmca_op_avx_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/avx/configure.m4 b/ompi/mca/op/avx/configure.m4 new file mode 100644 index 00000000000..ab21f057933 --- /dev/null +++ b/ompi/mca/op/avx/configure.m4 @@ -0,0 +1,67 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_avx_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. 
AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
    # The component Makefile must always be generated -- even when the
    # component turns out not to be buildable -- so that "make dist"
    # and out-of-tree builds keep working.  (The original only called
    # AC_CONFIG_FILES when op_avx_support was 1.)
    AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])

    op_avx_support=0
    OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save op_avx_CPPFLAGS])

    AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
          [AC_LANG_PUSH([C])

           # Check for AVX512 support with default flags
           op_avx_CPPFLAGS=""
           AC_MSG_CHECKING([for AVX512 support (no additional flags)])
           AC_LINK_IFELSE(
               [AC_LANG_PROGRAM([[#include <immintrin.h>]],
                       [[
    __m512 vA, vB;
    _mm512_add_ps(vA, vB);
                       ]])],
               [op_avx_support=1
                AC_MSG_RESULT([yes])],
               [AC_MSG_RESULT([no])])

           # Retry with an explicit -march that enables the AVX-512 ISA
           AS_IF([test $op_avx_support -eq 0],
                 [AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
                  op_avx_cflags_save="$CFLAGS"
                  CFLAGS="$CFLAGS -march=skylake-avx512"
                  AC_LINK_IFELSE(
                      [AC_LANG_PROGRAM([[#include <immintrin.h>]],
                              [[
    __m512 vA, vB;
    _mm512_add_ps(vA, vB);
                              ]])],
                      [op_avx_support=1
                       op_avx_CPPFLAGS="-march=skylake-avx512"
                       AC_MSG_RESULT([yes])],
                      [AC_MSG_RESULT([no])])
                  CFLAGS="$op_avx_cflags_save"
                 ])
           AC_LANG_POP([C])
          ])

    AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
                       [$op_avx_support],
                       [Whether AVX512 is supported in the current compilation context])
    AC_SUBST([op_avx_CPPFLAGS])
    OPAL_VAR_SCOPE_POP
    AS_IF([test $op_avx_support -eq 1],
          [$1],
          [$2])
])dnl
diff --git a/ompi/mca/op/avx/op_avx.h b/ompi/mca/op/avx/op_avx.h
new file mode 100644
index 00000000000..927a48db947
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx.h
@@ -0,0 +1,58 @@
/*
 * Copyright (c) 2019 The University of Tennessee and The University
 *                    of Tennessee Research Foundation.  All rights
 *                    reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_OP_AVX_EXPORT_H
#define MCA_OP_AVX_EXPORT_H

#include "ompi_config.h"

#include "ompi/mca/mca.h"
#include "opal/class/opal_object.h"

#include "ompi/mca/op/op.h"

BEGIN_C_DECLS

/**
 * Derive a struct from the base op component struct, allowing us to
 * cache some component-specific information on our well-known
 * component struct.
 */
typedef struct {
    /** The base op component struct */
    ompi_op_base_component_1_0_0_t super;

    /* Component-specific cached information lives here, on the
       component itself, instead of in scattered global variables. */

    /** A simple boolean indicating whether double precision is
        supported. */
    bool double_supported;
} ompi_op_avx_component_t;

/**
 * Globally exported variable.  Note that it is an *avx* component
 * (defined above), which has the ompi_op_base_component_t as its
 * first member.  Hence, the MCA/op framework will find the data that
 * it expects in the first memory locations, but then the component
 * itself can cache additional information after that that can be used
 * by both the component and modules.
 */
OMPI_DECLSPEC extern ompi_op_avx_component_t
    mca_op_avx_component;

END_C_DECLS

#endif /* MCA_OP_AVX_EXPORT_H */
diff --git a/ompi/mca/op/avx/op_avx_component.c b/ompi/mca/op/avx/op_avx_component.c
new file mode 100644
index 00000000000..5113a6cf437
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx_component.c
@@ -0,0 +1,231 @@
/*
 * Copyright (c) 2019 The University of Tennessee and The University
 *                    of Tennessee Research Foundation.  All rights
 *                    reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/** @file
 *
 * This is the "avx" component source code.
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/avx/op_avx.h" +#include "ompi/mca/op/avx/op_avx_functions.h" + +static int avx_component_open(void); +static int avx_component_close(void); +static int avx_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + avx_component_op_query(struct ompi_op_t *op, int *priority); +static int avx_component_register(void); + +/** + * A slightly modified code from + * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300) + +#include + +int has_intel_AVX512_features(void) +{ + const unsigned long avx512_features = _FEATURE_AVX2; + + return _may_i_use_cpu_feature( avx512_features ); +} +#else /* non-Intel compiler */ +#include + +#if defined(_MSC_VER) +#include +#endif + +void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + uint32_t ebx, edx; +#if defined( __i386__ ) && defined ( __PIC__ ) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), + "+a" (eax), "=c" (ecx), "=d" (edx) ); +#else + __asm__ ( "cpuid" : "=b" (ebx), + "+a" (eax), "=c" (ecx), "=d" (edx) ); +#endif /* defined( __i386__ ) && defined ( __PIC__ ) */ + abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; +#endif +} + +int has_intel_AVX512_features(void) +{ + uint32_t abcd[4]; + uint32_t osxsave_mask = (1 << 27); // OSX. + uint32_t avx2_mask = (1 << 5); // AVX2 + + run_cpuid( 1, 0, abcd ); + // OS supports extended processor state management ? 
+ if ( (abcd[2] & osxsave_mask) != osxsave_mask ) + return 0; + + run_cpuid( 7, 0, abcd ); + return !!((abcd[1] & avx2_mask) != avx2_mask); +} +#endif /* non-Intel compiler */ + +ompi_op_avx_component_t mca_op_avx_component = { + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "avx", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = avx_component_open, + .mca_close_component = avx_component_close, + .mca_register_component_params = avx_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = avx_component_init_query, + .opc_op_query = avx_component_op_query, + }, +}; + +/* + * Component open + */ +static int avx_component_open(void) +{ + + /* A first level check to see if avx is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). + */ + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int avx_component_close(void) +{ + + /* If avx was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. 
+ */ +static int +avx_component_register(void) +{ + mca_op_avx_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_avx_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. + */ +static int +avx_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if( !has_intel_AVX512_features() ) + return OMPI_ERR_NOT_SUPPORTED; + return OMPI_SUCCESS; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t* +avx_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = NULL; + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. 
*/ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + module = OBJ_NEW(ompi_op_base_module_t); + for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_avx_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + case OMPI_OP_BASE_FORTRAN_LOR: + case OMPI_OP_BASE_FORTRAN_LXOR: + case OMPI_OP_BASE_FORTRAN_MAXLOC: + case OMPI_OP_BASE_FORTRAN_MINLOC: + case OMPI_OP_BASE_FORTRAN_REPLACE: + default: + break; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *avx* component pointer + (vs. a *base* component pointer -- where an *avx* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. */ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c new file mode 100644 index 00000000000..231696f12ca --- /dev/null +++ b/ompi/mca/op/avx/op_avx_functions.c @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/avx/op_avx.h" +#include "ompi/mca/op/avx/op_avx_functions.h" + +#include +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). + * + * Support ops: max, min, for signed/unsigned 8,16,32,64 + * sum, for integer 8,16,32,64 + * + */ +#define OP_AVX_FUNC(name, type_sign, type_size, type, op) \ + static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in);\ + __m512i vecB = _mm512_loadu_si512(out);\ + in += types_per_step; \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((out), res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in); \ + __m256i vecB = _mm256_loadu_si256(out); \ + in += types_per_step; \ + __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: 
out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ + }\ +} + +/* special case for int8 mul */ +#define OP_AVX_MUL(name, type_sign, type_size, type, op) \ + static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 256 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + int store_mask = 0xFFFFFFFF;\ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m256i 
vecA_tmp = _mm256_loadu_si256(in);\ + __m256i vecB_tmp = _mm256_loadu_si256(out);\ + in += types_per_step; \ + __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ + __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \ + __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \ + _mm512_mask_cvtepi16_storeu_epi8((out), store_mask, res);\ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + switch(left_over) { \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + 
case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ +} + +/* + * This macro is for bit-wise operations (out op in). + * + * Support ops: or, xor, and of 512 bits (representing integer data) + * + */ +#define OP_AVX_BIT_FUNC(name, type_size, type, op) \ + static void ompi_op_avx_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in);\ + __m512i vecB = _mm512_loadu_si512(out);\ + in += types_per_step; \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512(out, res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in); \ + __m256i vecB = _mm256_loadu_si256(out); \ + in += types_per_step; \ + __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = 
current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ + }\ +} + +#if 0 +#define OP_AVX_FLOAT_FUNC(op) \ + static void ompi_op_avx_2buff_##op##_float(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ + { \ + int step = 16; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ + for (i = 0; i < round; i+=64) { \ + __m512 vecA = _mm512_load_ps((in+i)); \ + __m512 vecB = _mm512_load_ps((out+i)); \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_store_ps((out+i), res); \ + } \ + uint64_t left_over = *count - (size*step); \ + if(left_over!=0) { \ + uint64_t left_over64 = 0xFFFFFFFFFFFFFFFF; \ + left_over = left_over64 >>(64-left_over); \ + __m512 vecA = _mm512_maskz_load_ps(left_over, (in+round)); \ + __m512 vecB = _mm512_maskz_load_ps(left_over, (out+round)); \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_mask_store_ps((out+round), left_over, res); \ + } \ + } + +#else +#define OP_AVX_FLOAT_FUNC(op) \ +static void ompi_op_avx_2buff_##op##_float(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + 
struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(float)); \ + int left_over = *count; \ + float* in = (float*)_in; \ + float* out = (float*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512 vecA = _mm512_load_ps(in); \ + __m512 vecB = _mm512_load_ps(out); \ + in += types_per_step; \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_store_ps(out, res); \ + out += types_per_step; \ + } \ + if( 0 != left_over ) { \ + types_per_step >>= 1; /* 256 / (8 * sizeof(float)); */ \ + if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_ps(in); \ + __m256 vecB = _mm256_load_ps(out); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ + _mm256_store_ps(out, res); \ + in += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + } \ + } \ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ +} +#endif + +#define OP_AVX_DOUBLE_FUNC(op) \ + static void ompi_op_avx_2buff_##op##_double(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(double)); \ + int left_over = *count; \ + double* in = (double*)_in; \ + double* out = (double*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512d vecA = _mm512_load_pd(in);\ + __m512d vecB = _mm512_load_pd(out);\ + in += types_per_step; \ + __m512d res = _mm512_##op##_pd(vecA, vecB); \ + _mm512_store_pd((out), res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; /* 256 / (8 * sizeof(double)); */ \ 
+ if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_pd(in); \ + __m256 vecB = _mm256_load_pd(out); \ + __m256 res = _mm256_##op##_pd(vecA, vecB); \ + _mm256_store_pd(out, res); \ + in += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + }\ +} + + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) + OP_AVX_FUNC(max, i, 8, int8_t, max) + OP_AVX_FUNC(max, u, 8, uint8_t, max) + OP_AVX_FUNC(max, i, 16, int16_t, max) + OP_AVX_FUNC(max, u, 16, uint16_t, max) + OP_AVX_FUNC(max, i, 32, int32_t, max) + OP_AVX_FUNC(max, u, 32, uint32_t, max) + OP_AVX_FUNC(max, i, 64, int64_t, max) + OP_AVX_FUNC(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(max) + OP_AVX_DOUBLE_FUNC(max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) + OP_AVX_FUNC(min, i, 8, int8_t, min) + OP_AVX_FUNC(min, u, 8, uint8_t, min) + OP_AVX_FUNC(min, i, 16, int16_t, min) + OP_AVX_FUNC(min, u, 16, uint16_t, min) + OP_AVX_FUNC(min, i, 32, int32_t, min) + OP_AVX_FUNC(min, u, 32, uint32_t, min) + OP_AVX_FUNC(min, i, 64, int64_t, min) + OP_AVX_FUNC(min, u, 64, uint64_t, min) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(min) + OP_AVX_DOUBLE_FUNC(min) + +/************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_AVX_FUNC(sum, i, 8, int8_t, add) + OP_AVX_FUNC(sum, i, 8, uint8_t, add) + OP_AVX_FUNC(sum, i, 16, int16_t, add) + OP_AVX_FUNC(sum, i, 16, uint16_t, add) + OP_AVX_FUNC(sum, i, 32, int32_t, add) + OP_AVX_FUNC(sum, i, 32, uint32_t, add) + OP_AVX_FUNC(sum, i, 64, int64_t, add) + OP_AVX_FUNC(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(add) + OP_AVX_DOUBLE_FUNC(add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_AVX_MUL(prod, i, 8, int8_t, mullo) + OP_AVX_MUL(prod, i, 8, uint8_t, mullo) + OP_AVX_FUNC(prod, i, 16, int16_t, mullo) + OP_AVX_FUNC(prod, i, 16, uint16_t, mullo) + OP_AVX_FUNC(prod, i, 32, int32_t, mullo) + OP_AVX_FUNC(prod, i ,32, uint32_t, mullo) + OP_AVX_FUNC(prod, i, 64, int64_t, mullo) + OP_AVX_FUNC(prod, i, 64, uint64_t, mullo) + + /* Floating point */ + OP_AVX_FLOAT_FUNC(mul) + OP_AVX_DOUBLE_FUNC(mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_AVX_BIT_FUNC(band, 8, int8_t, and) + OP_AVX_BIT_FUNC(band, 8, uint8_t, and) + 
OP_AVX_BIT_FUNC(band, 16, int16_t, and) + OP_AVX_BIT_FUNC(band, 16, uint16_t, and) + OP_AVX_BIT_FUNC(band, 32, int32_t, and) + OP_AVX_BIT_FUNC(band, 32, uint32_t, and) + OP_AVX_BIT_FUNC(band, 64, int64_t, and) + OP_AVX_BIT_FUNC(band, 64, uint64_t, and) + + // not defined - OP_AVX_FLOAT_FUNC(and) + // not defined - OP_AVX_DOUBLE_FUNC(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_AVX_BIT_FUNC(bor, 8, int8_t, or) + OP_AVX_BIT_FUNC(bor, 8, uint8_t, or) + OP_AVX_BIT_FUNC(bor, 16, int16_t, or) + OP_AVX_BIT_FUNC(bor, 16, uint16_t, or) + OP_AVX_BIT_FUNC(bor, 32, int32_t, or) + OP_AVX_BIT_FUNC(bor, 32, uint32_t, or) + OP_AVX_BIT_FUNC(bor, 64, int64_t, or) + OP_AVX_BIT_FUNC(bor, 64, uint64_t, or) + + // not defined - OP_AVX_FLOAT_FUNC(or) + // not defined - OP_AVX_DOUBLE_FUNC(or) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_AVX_BIT_FUNC(bxor, 8, int8_t, xor) + OP_AVX_BIT_FUNC(bxor, 8, uint8_t, xor) + OP_AVX_BIT_FUNC(bxor, 16, int16_t, xor) + OP_AVX_BIT_FUNC(bxor, 16, uint16_t, xor) + OP_AVX_BIT_FUNC(bxor, 32, int32_t, xor) + OP_AVX_BIT_FUNC(bxor, 32, uint32_t, xor) + OP_AVX_BIT_FUNC(bxor, 64, int64_t, xor) + OP_AVX_BIT_FUNC(bxor, 64, uint64_t, xor) + + // not defined - OP_AVX_FLOAT_FUNC(xor) + // not defined - OP_AVX_DOUBLE_FUNC(xor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. 
+ */ +#define OP_AVX_FUNC_3BUFF(name, type_sign, type_size, type, op)\ + static void ompi_op_avx_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in1);\ + __m512i vecB = _mm512_loadu_si512(in2);\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm512_storeu_si512((out), res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in1); \ + __m256i vecB = _mm256_loadu_si256(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256i res = _mm256_##op##_ep##type_sign##type_size(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(in1[30],in2[30]) ; \ + case 30: out[29] = current_func(in1[29],in2[29]) ; \ + case 29: out[28] = current_func(in1[28],in2[28]) ; \ + case 28: out[27] = current_func(in1[27],in2[27]) ; \ + case 27: out[26] = current_func(in1[26],in2[26]) ; \ + case 26: out[25] = current_func(in1[25],in2[25]) ; \ + case 25: out[24] = current_func(in1[24],in2[24]) ; \ + case 24: out[23] = current_func(in1[23],in2[23]) ; \ + case 23: out[22] = current_func(in1[22],in2[22]) ; \ + case 22: out[21] = current_func(in1[21],in2[21]) ; \ + case 21: out[20] = current_func(in1[20],in2[20]) ; \ + case 20: out[19] = current_func(in1[19],in2[19]) ; \ + case 19: out[18] = current_func(in1[18],in2[18]) ; \ + case 18: out[17] = 
current_func(in1[17],in2[17]) ; \ + case 17: out[16] = current_func(in1[16],in2[16]) ; \ + case 16: out[15] = current_func(in1[15],in2[15]) ; \ + case 15: out[14] = current_func(in1[14],in2[14]) ; \ + case 14: out[13] = current_func(in1[13],in2[13]) ; \ + case 13: out[12] = current_func(in1[12],in2[12]) ; \ + case 12: out[11] = current_func(in1[11],in2[11]) ; \ + case 11: out[10] = current_func(in1[10],in2[10]) ; \ + case 10: out[9] = current_func(in1[9],in2[9]) ; \ + case 9: out[8] = current_func(in1[8],in2[8]) ; \ + case 8: out[7] = current_func(in1[7],in2[7]) ; \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + } \ + }\ + }\ +} + +/* special case for int8 mul */ +#define OP_AVX_MUL_3BUFF(name, type_sign, type_size, type, op) \ + static void ompi_op_avx_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 256 / (8 * sizeof(type)); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + int store_mask = 0xFFFFFFFF;\ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m256i vecA_tmp = _mm256_loadu_si256(in1);\ + __m256i vecB_tmp = _mm256_loadu_si256(in2);\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i vecA = _mm512_cvtepi8_epi16(vecA_tmp); \ + __m512i vecB = _mm512_cvtepi8_epi16(vecB_tmp); \ + __m512i res = _mm512_##op##_ep##type_sign##16(vecA, vecB); \ + _mm512_mask_cvtepi16_storeu_epi8((out), store_mask, res);\ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + switch(left_over) { \ + case 
31: out[30] = current_func(in1[30],in2[30]) ; \ + case 30: out[29] = current_func(in1[29],in2[29]) ; \ + case 29: out[28] = current_func(in1[28],in2[28]) ; \ + case 28: out[27] = current_func(in1[27],in2[27]) ; \ + case 27: out[26] = current_func(in1[26],in2[26]) ; \ + case 26: out[25] = current_func(in1[25],in2[25]) ; \ + case 25: out[24] = current_func(in1[24],in2[24]) ; \ + case 24: out[23] = current_func(in1[23],in2[23]) ; \ + case 23: out[22] = current_func(in1[22],in2[22]) ; \ + case 22: out[21] = current_func(in1[21],in2[21]) ; \ + case 21: out[20] = current_func(in1[20],in2[20]) ; \ + case 20: out[19] = current_func(in1[19],in2[19]) ; \ + case 19: out[18] = current_func(in1[18],in2[18]) ; \ + case 18: out[17] = current_func(in1[17],in2[17]) ; \ + case 17: out[16] = current_func(in1[16],in2[16]) ; \ + case 16: out[15] = current_func(in1[15],in2[15]) ; \ + case 15: out[14] = current_func(in1[14],in2[14]) ; \ + case 14: out[13] = current_func(in1[13],in2[13]) ; \ + case 13: out[12] = current_func(in1[12],in2[12]) ; \ + case 12: out[11] = current_func(in1[11],in2[11]) ; \ + case 11: out[10] = current_func(in1[10],in2[10]) ; \ + case 10: out[9] = current_func(in1[9],in2[9]) ; \ + case 9: out[8] = current_func(in1[8],in2[8]) ; \ + case 8: out[7] = current_func(in1[7],in2[7]) ; \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + } \ + }\ +} + +#define OP_AVX_BIT_FUNC_3BUFF(name, type_size, type, op) \ + static void ompi_op_avx_3buff_##op##_##type(void *_in1, void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(type)); \ + int left_over = *count; \ 
+ type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512i vecA = _mm512_loadu_si512(in1);\ + __m512i vecB = _mm512_loadu_si512(in2);\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512i res = _mm512_##op##_si512(vecA, vecB); \ + _mm512_storeu_si512(out, res); \ + out += types_per_step; \ + }\ + if(left_over!=0){ \ + types_per_step >>= 1; \ + if( left_over >= types_per_step ) { \ + __m256i vecA = _mm256_loadu_si256(in1); \ + __m256i vecB = _mm256_loadu_si256(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m256i res = _mm256_##op##_si256(vecA, vecB); \ + _mm256_storeu_si256(out, res); \ + out += types_per_step; \ + left_over -= types_per_step; \ + }\ + }\ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 31: out[30] = current_func(in1[30],in2[30]) ; \ + case 30: out[29] = current_func(in1[29],in2[29]) ; \ + case 29: out[28] = current_func(in1[28],in2[28]) ; \ + case 28: out[27] = current_func(in1[27],in2[27]) ; \ + case 27: out[26] = current_func(in1[26],in2[26]) ; \ + case 26: out[25] = current_func(in1[25],in2[25]) ; \ + case 25: out[24] = current_func(in1[24],in2[24]) ; \ + case 24: out[23] = current_func(in1[23],in2[23]) ; \ + case 23: out[22] = current_func(in1[22],in2[22]) ; \ + case 22: out[21] = current_func(in1[21],in2[21]) ; \ + case 21: out[20] = current_func(in1[20],in2[20]) ; \ + case 20: out[19] = current_func(in1[19],in2[19]) ; \ + case 19: out[18] = current_func(in1[18],in2[18]) ; \ + case 18: out[17] = current_func(in1[17],in2[17]) ; \ + case 17: out[16] = current_func(in1[16],in2[16]) ; \ + case 16: out[15] = current_func(in1[15],in2[15]) ; \ + case 15: out[14] = current_func(in1[14],in2[14]) ; \ + case 14: out[13] = current_func(in1[13],in2[13]) ; \ + case 13: out[12] = current_func(in1[12],in2[12]) ; \ + case 12: out[11] = current_func(in1[11],in2[11]) ; \ + case 11: out[10] = 
current_func(in1[10],in2[10]) ; \ + case 10: out[9] = current_func(in1[9],in2[9]) ; \ + case 9: out[8] = current_func(in1[8],in2[8]) ; \ + case 8: out[7] = current_func(in1[7],in2[7]) ; \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + } \ + }\ +} + +#define OP_AVX_FLOAT_FUNC_3BUFF(op) \ + static void ompi_op_avx_3buff_##op##_float(void *_in1, void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(float)); \ + int left_over = *count; \ + float* in1 = (float*)_in1; \ + float* in2 = (float*)_in2; \ + float* out = (float*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512 vecA = _mm512_load_ps(in1); \ + __m512 vecB = _mm512_load_ps(in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512 res = _mm512_##op##_ps(vecA, vecB); \ + _mm512_store_ps(out, res); \ + out += types_per_step; \ + } \ + if( 0 != left_over ) { \ + types_per_step >>= 1; /* 256 / (8 * sizeof(float)); */ \ + if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_ps(in1); \ + __m256 vecB = _mm256_load_ps(in2); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ + _mm256_store_ps(out, res); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + } \ + } \ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 7: out[6] = current_func(in1[6],in2[6]) ; \ + case 6: out[5] = current_func(in1[5],in2[5]) ; \ + case 5: out[4] = current_func(in1[4],in2[4]) ; \ + case 4: out[3] = current_func(in1[3],in2[3]) ; \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: 
out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + }\ + }\ +} + +#define OP_AVX_DOUBLE_FUNC_3BUFF(op) \ + static void ompi_op_avx_3buff_##op##_double(void *_in1, void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = 512 / (8 * sizeof(double)); \ + int left_over = *count; \ + double* in1 = (double*)_in1; \ + double* in2 = (double*)_in2; \ + double* out = (double*)_out; \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + __m512d vecA = _mm512_load_pd((in1));\ + __m512d vecB = _mm512_load_pd((in2));\ + in1 += types_per_step; \ + in2 += types_per_step; \ + __m512d res = _mm512_##op##_pd(vecA, vecB); \ + _mm512_store_pd((out), res); \ + out += types_per_step; \ + }\ + if( 0 != left_over ) { \ + types_per_step >>= 1; /* 256 / (8 * sizeof(double)); */ \ + if( left_over >= types_per_step ) { \ + __m256 vecA = _mm256_load_ps(in1); \ + __m256 vecB = _mm256_load_ps(in2); \ + __m256 res = _mm256_##op##_ps(vecA, vecB); \ + _mm256_store_ps(out, res); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + out += types_per_step; \ + left_over -= types_per_step; \ + } \ + } \ + if( 0 != left_over ) { \ + switch(left_over) { \ + case 3: out[2] = current_func(in1[2],in2[2]) ; \ + case 2: out[1] = current_func(in1[1],in2[1]) ; \ + case 1: out[0] = current_func(in1[0],in2[0]) ; \ + }\ + }\ +} + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) + + OP_AVX_FUNC_3BUFF(max, i, 8, int8_t, max) + OP_AVX_FUNC_3BUFF(max, u, 8, uint8_t, max) + OP_AVX_FUNC_3BUFF(max, i, 16, int16_t, max) + OP_AVX_FUNC_3BUFF(max, u, 16, uint16_t, max) + OP_AVX_FUNC_3BUFF(max, i, 32, int32_t, max) + OP_AVX_FUNC_3BUFF(max, u, 32, uint32_t, max) + OP_AVX_FUNC_3BUFF(max, i, 64, int64_t, max) + OP_AVX_FUNC_3BUFF(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_AVX_FLOAT_FUNC_3BUFF(max) + OP_AVX_DOUBLE_FUNC_3BUFF(max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_AVX_FUNC_3BUFF(min, i, 8, int8_t, min) + OP_AVX_FUNC_3BUFF(min, u, 8, uint8_t, min) + OP_AVX_FUNC_3BUFF(min, i, 16, int16_t, min) + OP_AVX_FUNC_3BUFF(min, u, 16, uint16_t, min) + OP_AVX_FUNC_3BUFF(min, i, 32, int32_t, min) + OP_AVX_FUNC_3BUFF(min, u, 32, uint32_t, min) + OP_AVX_FUNC_3BUFF(min, i, 64, int64_t, min) + OP_AVX_FUNC_3BUFF(min, u, 64, uint64_t, min) + + /* Floating point */ + OP_AVX_FLOAT_FUNC_3BUFF(min) + OP_AVX_DOUBLE_FUNC_3BUFF(min) + +/************************************************************************* + * Sum + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + + OP_AVX_FUNC_3BUFF(sum, i, 8, int8_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 8, uint8_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 16, int16_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 16, uint16_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 32, int32_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 32, uint32_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 64, int64_t, add) + OP_AVX_FUNC_3BUFF(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_AVX_FLOAT_FUNC_3BUFF(add) + OP_AVX_DOUBLE_FUNC_3BUFF(add) + +/************************************************************************* + * Product + 
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) * (b))
    /* 8-bit products go through the widen-to-16-bit OP_AVX_MUL_3BUFF path
     * (there is no byte multiply in AVX-512).
     * NOTE(review): _mm512_mullo_epi64 (the 64-bit rows) requires AVX-512DQ
     * — confirm the component checks that feature before selecting them. */
    OP_AVX_MUL_3BUFF(prod, i, 8, int8_t, mullo)
    OP_AVX_MUL_3BUFF(prod, i, 8, uint8_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 16, int16_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 16, uint16_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 32, int32_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 32, uint32_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 64, int64_t, mullo)
    OP_AVX_FUNC_3BUFF(prod, i, 64, uint64_t, mullo)

    /* Floating point */
    OP_AVX_FLOAT_FUNC_3BUFF(mul)
    OP_AVX_DOUBLE_FUNC_3BUFF(mul)

/*************************************************************************
 * Bitwise AND
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) & (b))
    /* NOTE(review): OP_AVX_BIT_FUNC_3BUFF names the generated function after
     * its last argument (and/or/xor), not the first (band/bor/bxor); the
     * dispatch tables index the and/or/xor names — keep them in sync. */
    OP_AVX_BIT_FUNC_3BUFF(band, 8, int8_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 8, uint8_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 16, int16_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 16, uint16_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 32, int32_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 32, uint32_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 64, int64_t, and)
    OP_AVX_BIT_FUNC_3BUFF(band, 64, uint64_t, and)

    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(and)
    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(and)

/*************************************************************************
 * Bitwise OR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) | (b))
    OP_AVX_BIT_FUNC_3BUFF(bor, 8, int8_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 8, uint8_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 16, int16_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 16, uint16_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 32, int32_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 32, uint32_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 64, int64_t, or)
    OP_AVX_BIT_FUNC_3BUFF(bor, 64, uint64_t, or)

    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(or)
    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(or)

/*************************************************************************
 * Bitwise XOR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) ^ (b))
    OP_AVX_BIT_FUNC_3BUFF(bxor, 8, int8_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 8, uint8_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 16, int16_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 32, int32_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 64, int64_t, xor)
    OP_AVX_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor)

    // not defined - OP_AVX_FLOAT_FUNC_3BUFF(xor)
    // not defined - OP_AVX_DOUBLE_FUNC_3BUFF(xor)


/** C integer ***********************************************************/
/* Expands to one designated-initializer row per fixed-width integer type,
 * pointing at the ompi_op_avx_<ftype>_<name>_<type> handlers above. */
#define C_INTEGER(name, ftype)                                          \
    [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_avx_##ftype##_##name##_int8_t,   \
    [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_avx_##ftype##_##name##_uint8_t, \
    [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_avx_##ftype##_##name##_int16_t, \
    [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_avx_##ftype##_##name##_uint16_t, \
    [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_avx_##ftype##_##name##_int32_t, \
    [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_avx_##ftype##_##name##_uint32_t, \
    [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_avx_##ftype##_##name##_int64_t, \
    [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_avx_##ftype##_##name##_uint64_t


/** Floating point, including all the Fortran reals *********************/
#define FLOAT(name, ftype) ompi_op_avx_##ftype##_##name##_float
#define DOUBLE(name, ftype) ompi_op_avx_##ftype##_##name##_double

/* SHORT_FLOAT is explicitly NULL: no AVX handler is provided for it. */
#define FLOATING_POINT(name, ftype)                                     \
    [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL,                             \
    [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype),                     \
    [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype)

/*
 * MPI_OP_NULL
 * All types
 */
#define FLAGS_NO_FLOAT \
    (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE)
#define FLAGS \
    (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \
     OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE)

/*
 * Dispatch table for the two-buffer (inout) reduction path: one handler per
 * (MPI op, datatype) pair.  Rows/entries left NULL fall back to the base
 * op implementation.
 */
ompi_op_base_handler_fn_t ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
{
    /* Corresponds to MPI_OP_NULL */
    [OMPI_OP_BASE_FORTRAN_NULL] = {
        /* Leaving this empty puts in NULL for all entries */
        NULL,
    },
    /* Corresponds to MPI_MAX */
    [OMPI_OP_BASE_FORTRAN_MAX] = {
        C_INTEGER(max, 2buff),
        FLOATING_POINT(max, 2buff),
    },
    /* Corresponds to MPI_MIN */
    [OMPI_OP_BASE_FORTRAN_MIN] = {
        C_INTEGER(min, 2buff),
        FLOATING_POINT(min, 2buff),
    },
    /* Corresponds to MPI_SUM */
    [OMPI_OP_BASE_FORTRAN_SUM] = {
        C_INTEGER(sum, 2buff),
        FLOATING_POINT(add, 2buff),
    },
    /* Corresponds to MPI_PROD */
    [OMPI_OP_BASE_FORTRAN_PROD] = {
        C_INTEGER(prod, 2buff),
        FLOATING_POINT(mul, 2buff),
    },
    /* Corresponds to MPI_LAND: no AVX handlers (logical ops stay scalar) */
    [OMPI_OP_BASE_FORTRAN_LAND] = {
        NULL,
    },
    /* Corresponds to MPI_BAND */
    [OMPI_OP_BASE_FORTRAN_BAND] = {
        C_INTEGER(band, 2buff),
    },
    /* Corresponds to MPI_LOR */
    [OMPI_OP_BASE_FORTRAN_LOR] = {
        NULL,
    },
    /* Corresponds to MPI_BOR */
    [OMPI_OP_BASE_FORTRAN_BOR] = {
        C_INTEGER(bor, 2buff),
    },
    /* Corresponds to MPI_LXOR */
    [OMPI_OP_BASE_FORTRAN_LXOR] = {
        NULL,
    },
    /* Corresponds to MPI_BXOR */
    [OMPI_OP_BASE_FORTRAN_BXOR] = {
        C_INTEGER(bxor, 2buff),
    },
    /* Corresponds to MPI_REPLACE */
    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
        /* (MPI_ACCUMULATE is handled differently than the other
           reductions, so just zero out its function
           implementations here to ensure that users don't invoke
           MPI_REPLACE with any reduction operations other than
           ACCUMULATE) */
        NULL,
    },

};

/*
 * Same layout for the three-buffer (in1/in2/out) path.
 * NOTE(review): the bitwise rows index C_INTEGER(and/or/xor, 3buff) rather
 * than band/bor/bxor because OP_AVX_BIT_FUNC_3BUFF names its functions after
 * the intrinsic suffix — the 2buff table above uses band/bor/bxor; keep the
 * two naming schemes in sync with their generating macros.
 */
ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
{
    /* Corresponds to MPI_OP_NULL */
    [OMPI_OP_BASE_FORTRAN_NULL] = {
        /* Leaving this empty puts in NULL for all entries */
        NULL,
    },
    /* Corresponds to MPI_MAX */
    [OMPI_OP_BASE_FORTRAN_MAX] = {
        C_INTEGER(max, 3buff),
        FLOATING_POINT(max, 3buff),
    },
    /* Corresponds to MPI_MIN */
    [OMPI_OP_BASE_FORTRAN_MIN] = {
        C_INTEGER(min, 3buff),
        FLOATING_POINT(min, 3buff),
    },
    /* Corresponds to MPI_SUM */
    [OMPI_OP_BASE_FORTRAN_SUM] = {
        C_INTEGER(sum, 3buff),
        FLOATING_POINT(add, 3buff),
    },
    /* Corresponds to MPI_PROD */
    [OMPI_OP_BASE_FORTRAN_PROD] = {
        C_INTEGER(prod, 3buff),
        FLOATING_POINT(mul, 3buff),
    },
    /* Corresponds to MPI_LAND */
    [OMPI_OP_BASE_FORTRAN_LAND] = {
        NULL,
    },
    /* Corresponds to MPI_BAND */
    [OMPI_OP_BASE_FORTRAN_BAND] = {
        C_INTEGER(and, 3buff),
    },
    /* Corresponds to MPI_LOR */
    [OMPI_OP_BASE_FORTRAN_LOR] = {
        NULL,
    },
    /* Corresponds to MPI_BOR */
    [OMPI_OP_BASE_FORTRAN_BOR] = {
        C_INTEGER(or, 3buff),
    },
    /* Corresponds to MPI_LXOR */
    [OMPI_OP_BASE_FORTRAN_LXOR] = {
        NULL,
    },
    /* Corresponds to MPI_BXOR */
    [OMPI_OP_BASE_FORTRAN_BXOR] = {
        C_INTEGER(xor, 3buff),
    },
    /* Corresponds to MPI_REPLACE */
    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
        /* MPI_ACCUMULATE is handled differently than the other
           reductions, so just zero out its function
           implementations here to ensure that users don't invoke
           MPI_REPLACE with any reduction operations other than
           ACCUMULATE */
        NULL,
    },
};
diff --git a/ompi/mca/op/avx/op_avx_functions.h b/ompi/mca/op/avx/op_avx_functions.h
new file mode 100644
index 00000000000..621f8538d92
--- /dev/null
+++ b/ompi/mca/op/avx/op_avx_functions.h
@@ -0,0 +1,29 @@
/*
 * Copyright (c) 2019      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/avx/op_avx.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_avx_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_avx_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c new file mode 100644 index 00000000000..c31020e04e6 --- /dev/null +++ b/test/datatype/Reduce_local_float.c @@ -0,0 +1,716 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 1024*1024 + +float in_float[ARRAYSIZE]; +float inout_float[ARRAYSIZE]; +float inout_float_for_check[ARRAYSIZE]; + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +uint16_t in_uint16[ARRAYSIZE]; +uint16_t inout_uint16[ARRAYSIZE]; +uint16_t inout_uint16_for_check[ARRAYSIZE]; + +uint32_t in_uint32[ARRAYSIZE]; +uint32_t inout_uint32[ARRAYSIZE]; +uint32_t inout_uint32_for_check[ARRAYSIZE]; + +uint64_t in_uint64[ARRAYSIZE]; +uint64_t inout_uint64[ARRAYSIZE]; +uint64_t inout_uint64_for_check[ARRAYSIZE]; + +double in_double[ARRAYSIZE]; +double inout_double[ARRAYSIZE]; +double inout_double_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i