From 36d52d4e3268526547706ec713a5ba411dc99d35 Mon Sep 17 00:00:00 2001 From: dongzhong Date: Fri, 11 Oct 2019 11:12:07 -0400 Subject: [PATCH 01/13] rewrite arm sve ops Signed-off-by: dongzhong --- ompi/mca/op/arm_sve_op/Makefile.am | 73 +++ ompi/mca/op/arm_sve_op/op_sve.h | 61 +++ ompi/mca/op/arm_sve_op/op_sve_component.c | 246 ++++++++++ ompi/mca/op/arm_sve_op/op_sve_functions.c | 517 ++++++++++++++++++++++ ompi/mca/op/arm_sve_op/op_sve_functions.h | 29 ++ test/datatype/Reduce_local_float.c | 290 ++++++++++++ 6 files changed, 1216 insertions(+) create mode 100644 ompi/mca/op/arm_sve_op/Makefile.am create mode 100644 ompi/mca/op/arm_sve_op/op_sve.h create mode 100644 ompi/mca/op/arm_sve_op/op_sve_component.c create mode 100644 ompi/mca/op/arm_sve_op/op_sve_functions.c create mode 100644 ompi/mca/op/arm_sve_op/op_sve_functions.h create mode 100644 test/datatype/Reduce_local_float.c diff --git a/ompi/mca/op/arm_sve_op/Makefile.am b/ompi/mca/op/arm_sve_op/Makefile.am new file mode 100644 index 00000000000..9b2aca888a4 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/Makefile.am @@ -0,0 +1,73 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an sve op component. This Makefile.am is a typical +# sve of how to integrate into Open MPI's Automake-based build +# system. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_sve.h \ + op_sve_component.c \ + op_sve_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. 
As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_arm_sve_op_DSO +lib = +lib_sources = +component = mca_op_sve.la +component_sources = $(sources) +else +lib = libmca_op_sve.la +lib_sources = $(sources) +component = +component_sources = +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_op_sve_la_SOURCES = $(component_sources) +mca_op_sve_la_LDFLAGS = -module -avoid-version +mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. + +noinst_LTLIBRARIES = $(lib) +libmca_op_sve_la_SOURCES = $(lib_sources) +libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/arm_sve_op/op_sve.h b/ompi/mca/op/arm_sve_op/op_sve.h new file mode 100644 index 00000000000..cad9935ae30 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_SVE_EXPORT_H +#define MCA_OP_SVE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is sve-component-specific cached information. We + tend to use this scheme (caching information on the sve + component itself) instead of lots of individual global + variables for the component. The following data fields are + sves; replace them with whatever is relevant for your + component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_sve_component_t; + +/** + * Globally exported variable. Note that it is a *sve* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_sve_component_t + mca_op_sve_component; + +END_C_DECLS + +#endif /* MCA_OP_SVE_EXPORT_H */ diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/arm_sve_op/op_sve_component.c new file mode 100644 index 00000000000..c60df1a56d9 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve_component.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "sve" component source code. + * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/arm_sve_op/op_sve.h" +#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" + +static int sve_component_open(void); +static int sve_component_close(void); +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority); +static int sve_component_register(void); + +ompi_op_sve_component_t mca_op_sve_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "sve", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = sve_component_open, + .mca_close_component = sve_component_close, + .mca_register_component_params = sve_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = sve_component_init_query, + .opc_op_query = sve_component_op_query, + }, +}; + +/* + * Component open + */ +static int sve_component_open(void) +{ + + /* A first level check to see if sve is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). 
+ */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int sve_component_close(void) +{ + + /* If sve was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int sve_component_register(void) +{ + + /* Additionally, since this component is simulating hardware, + let's make MCA params that determine whethere a) the hardware + is available, and b) whether double precision floating point + types are supported. This allows you to change the behavior of + this component at run-time (by setting these MCA params at + run-time), simulating different kinds of hardware. */ + mca_op_sve_component.hardware_available = false; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "hardware_available", + "Whether the hardware is available or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.hardware_available); + + mca_op_sve_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_sve_component.hardware_available && !enable_mpi_thread_multiple) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t); + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + int i=0; + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + /* Corresponds to MPI_MAX */ + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_MIN: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_SUM: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_PROD: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; + 
OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_BOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_BAND: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_BXOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LXOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MAXLOC: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MINLOC: + module= NULL; + break; + default: + module= NULL; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *sve* component pointer + (vs. a *base* component pointer -- where an *sve* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. 
*/ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c new file mode 100644 index 00000000000..c01c663e0fb --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/arm_sve_op/op_sve.h" +#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). + * + * Support ops: max, min, for signed/unsigned 8,16,32,64 + * sum, for integer 8,16,32,64 + * + */ +#define OP_SVE_FUNC(name, type_sign, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + printf("op: %s %s \n ", #op, #type_size);\ + int size = *count/step; \ + int i; \ + int round = size*64; \ + svbool_t Pg = svptrue_b##type_size(); \ +} + +/* + * This macro is for bit-wise operations (out op in). 
+ * + * Support ops: or, xor, and of 512 bits (representing integer data) + * + */ +#define OP_SVE_BIT_FUNC(name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_FLOAT_FUNC(op) \ + static void ompi_op_sve_2buff_##op##_float(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 16; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_DOUBLE_FUNC(op) \ + static void ompi_op_sve_2buff_##op##_double(void *in, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 8; \ + int size = *count/step; \ + int i; \ +} + + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC(max, i, 8, int8_t, max) + OP_SVE_FUNC(max, u, 8, uint8_t, max) + OP_SVE_FUNC(max, i, 16, int16_t, max) + OP_SVE_FUNC(max, u, 16, uint16_t, max) + OP_SVE_FUNC(max, i, 32, int32_t, max) + OP_SVE_FUNC(max, u, 32, uint32_t, max) + OP_SVE_FUNC(max, i, 64, int64_t, max) + OP_SVE_FUNC(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(max) + OP_SVE_DOUBLE_FUNC(max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC(min, i, 8, int8_t, min) + OP_SVE_FUNC(min, u, 8, uint8_t, min) + OP_SVE_FUNC(min, i, 16, int16_t, min) + OP_SVE_FUNC(min, u, 16, uint16_t, min) + OP_SVE_FUNC(min, i, 32, int32_t, min) + OP_SVE_FUNC(min, u, 32, uint32_t, min) + OP_SVE_FUNC(min, i, 64, int64_t, min) + OP_SVE_FUNC(min, u, 64, 
uint64_t, min) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(min) + OP_SVE_DOUBLE_FUNC(min) + +/************************************************************************* + * Sum + ************************************************************************/ + OP_SVE_FUNC(sum, i, 8, int8_t, add) + OP_SVE_FUNC(sum, i, 8, uint8_t, add) + OP_SVE_FUNC(sum, i, 16, int16_t, add) + OP_SVE_FUNC(sum, i, 16, uint16_t, add) + OP_SVE_FUNC(sum, i, 32, int32_t, add) + OP_SVE_FUNC(sum, i, 32, uint32_t, add) + OP_SVE_FUNC(sum, i, 64, int64_t, add) + OP_SVE_FUNC(sum, i, 64, uint64_t, add) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(add) + OP_SVE_DOUBLE_FUNC(add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC(prod, i, 16, int16_t, mullo) + OP_SVE_FUNC(prod, i, 16, uint16_t, mullo) + OP_SVE_FUNC(prod, i, 32, int32_t, mullo) + OP_SVE_FUNC(prod, i ,32, uint32_t, mullo) + OP_SVE_FUNC(prod, i, 64, int64_t, mullo) + OP_SVE_FUNC(prod, i, 64, uint64_t, mullo) + + /* Floating point */ + OP_SVE_FLOAT_FUNC(mul) + OP_SVE_DOUBLE_FUNC(mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_BIT_FUNC(band, 8, int8_t, and) + OP_SVE_BIT_FUNC(band, 8, uint8_t, and) + OP_SVE_BIT_FUNC(band, 16, int16_t, and) + OP_SVE_BIT_FUNC(band, 16, uint16_t, and) + OP_SVE_BIT_FUNC(band, 32, int32_t, and) + OP_SVE_BIT_FUNC(band, 32, uint32_t, and) + OP_SVE_BIT_FUNC(band, 64, int64_t, and) + OP_SVE_BIT_FUNC(band, 64, uint64_t, and) + + OP_SVE_FLOAT_FUNC(and) + OP_SVE_DOUBLE_FUNC(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_BIT_FUNC(bor, 8, int8_t, or) + OP_SVE_BIT_FUNC(bor, 8, uint8_t, or) + OP_SVE_BIT_FUNC(bor, 16, int16_t, or) 
+ OP_SVE_BIT_FUNC(bor, 16, uint16_t, or) + OP_SVE_BIT_FUNC(bor, 32, int32_t, or) + OP_SVE_BIT_FUNC(bor, 32, uint32_t, or) + OP_SVE_BIT_FUNC(bor, 64, int64_t, or) + OP_SVE_BIT_FUNC(bor, 64, uint64_t, or) + + OP_SVE_FLOAT_FUNC(or) + OP_SVE_DOUBLE_FUNC(or) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + OP_SVE_BIT_FUNC(bxor, 8, int8_t, xor) + OP_SVE_BIT_FUNC(bxor, 8, uint8_t, xor) + OP_SVE_BIT_FUNC(bxor, 16, int16_t, xor) + OP_SVE_BIT_FUNC(bxor, 16, uint16_t, xor) + OP_SVE_BIT_FUNC(bxor, 32, int32_t, xor) + OP_SVE_BIT_FUNC(bxor, 32, uint32_t, xor) + OP_SVE_BIT_FUNC(bxor, 64, int64_t, xor) + OP_SVE_BIT_FUNC(bxor, 64, uint64_t, xor) + + OP_SVE_FLOAT_FUNC(xor) + OP_SVE_DOUBLE_FUNC(xor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_SVE_FUNC_3BUFF(name, type_sign, type_size, type, op)\ + static void ompi_op_sve_3buff_##name##_##type(void * restrict in1, \ + void * restrict in2, void * restrict out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_BIT_FUNC_3BUFF(name, type_size, type, op) \ + static void ompi_op_sve_3buff_##op##_##type(void *in1, void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 512 / type_size; \ + int size = *count/step; \ + int i; \ +} + +#define OP_SVE_FLOAT_FUNC_3BUFF(op) \ + static void ompi_op_sve_3buff_##op##_float(void *in1, void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 16; \ + int size = *count/step; \ + int i; \ + int round = size*64; \ +} + +#define OP_SVE_DOUBLE_FUNC_3BUFF(op) \ + 
static void ompi_op_sve_3buff_##op##_double(void *in1, void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int step = 8; \ + int size = *count/step; \ + int i; \ +} + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC_3BUFF(max, i, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, u, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, i, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, u, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, i, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, u, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, i, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, u, 64, uint64_t, max) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(max) + OP_SVE_DOUBLE_FUNC_3BUFF(max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC_3BUFF(min, i, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, u, 8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, i, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, u, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, i, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, u, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, i, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, u, 64, uint64_t, min) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(min) + OP_SVE_DOUBLE_FUNC_3BUFF(min) + +/************************************************************************* + * Sum + *************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, i, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, i, 64, 
uint64_t, add) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(add) + OP_SVE_DOUBLE_FUNC_3BUFF(add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC_3BUFF(prod, i, 16, int16_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 16, uint16_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 32, int32_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i ,32, uint32_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 64, int64_t, mullo) + OP_SVE_FUNC_3BUFF(prod, i, 64, uint64_t, mullo) + + /* Floating point */ + OP_SVE_FLOAT_FUNC_3BUFF(mul) + OP_SVE_DOUBLE_FUNC_3BUFF(mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_BIT_FUNC_3BUFF(band, 8, int8_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 8, uint8_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 16, int16_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 16, uint16_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 32, int32_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 32, uint32_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 64, int64_t, and) + OP_SVE_BIT_FUNC_3BUFF(band, 64, uint64_t, and) + + OP_SVE_FLOAT_FUNC_3BUFF(and) + OP_SVE_DOUBLE_FUNC_3BUFF(and) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_BIT_FUNC_3BUFF(bor, 8, int8_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 8, uint8_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 16, int16_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 16, uint16_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 32, int32_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 32, uint32_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 64, int64_t, or) + OP_SVE_BIT_FUNC_3BUFF(bor, 64, uint64_t, or) + + OP_SVE_FLOAT_FUNC_3BUFF(or) + OP_SVE_DOUBLE_FUNC_3BUFF(or) + +/************************************************************************* + * Bitwise XOR + 
*************************************************************************/ + OP_SVE_BIT_FUNC_3BUFF(bxor, 8, int8_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 8, uint8_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 16, int16_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 32, int32_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 64, int64_t, xor) + OP_SVE_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor) + + OP_SVE_FLOAT_FUNC_3BUFF(xor) + OP_SVE_DOUBLE_FUNC_3BUFF(xor) + + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_sve_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_sve_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_sve_##ftype##_##name##_uint64_t + + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_double + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL, \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +#define C_INTEGER_PROD(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + 
[OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_sve_##ftype##_##name##_uint64_t + + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(add, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_PROD(prod, 2buff), + FLOATING_POINT(mul, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + impementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, 
+ +}; + +ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(add, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_PROD(prod, 3buff), + FLOATING_POINT(mul, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(and, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(or, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(xor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + impementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/arm_sve_op/op_sve_functions.h new file mode 100644 index 00000000000..0c621788842 --- /dev/null +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/arm_sve_op/op_sve.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c new file mode 100644 index 00000000000..e33db456a03 --- /dev/null +++ b/test/datatype/Reduce_local_float.c @@ -0,0 +1,290 @@ +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 1024*1024 + +float in_float[ARRAYSIZE]; +float inout_float[ARRAYSIZE]; + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; + +uint16_t in_uint16[ARRAYSIZE]; +uint16_t inout_uint16[ARRAYSIZE]; + +uint32_t in_uint32[ARRAYSIZE]; +uint32_t inout_uint32[ARRAYSIZE]; + +uint64_t in_uint64[ARRAYSIZE]; +uint64_t inout_uint64[ARRAYSIZE]; + +double in_double[ARRAYSIZE]; +double inout_double[ARRAYSIZE]; + + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i Date: Thu, 31 Oct 2019 18:41:02 -0400 Subject: [PATCH 02/13] sve ops working Signed-off-by: dong zhong --- ompi/mca/op/arm_sve_op/op_sve.h | 3 + ompi/mca/op/arm_sve_op/op_sve_component.c | 3 + ompi/mca/op/arm_sve_op/op_sve_functions.c | 513 +++++++++++----------- ompi/mca/op/arm_sve_op/op_sve_functions.h | 3 + test/datatype/Reduce_local_float.c | 2 +- 5 files changed, 270 insertions(+), 254 deletions(-) diff --git a/ompi/mca/op/arm_sve_op/op_sve.h 
b/ompi/mca/op/arm_sve_op/op_sve.h index cad9935ae30..95b19b95e04 100644 --- a/ompi/mca/op/arm_sve_op/op_sve.h +++ b/ompi/mca/op/arm_sve_op/op_sve.h @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/arm_sve_op/op_sve_component.c index c60df1a56d9..e08bbae35d9 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_component.c +++ b/ompi/mca/op/arm_sve_op/op_sve_component.c @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c index c01c663e0fb..3233ac66e44 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,323 +36,331 @@ * * This macro is for (out op in). 
* - * Support ops: max, min, for signed/unsigned 8,16,32,64 - * sum, for integer 8,16,32,64 - * */ -#define OP_SVE_FUNC(name, type_sign, type_size, type, op) \ - static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ +#define OP_SVE_FUNC(name, type_name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type_name(void *in, void *out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - int step = 512 / type_size; \ - printf("op: %s %s \n ", #op, #type_size);\ - int size = *count/step; \ - int i; \ - int round = size*64; \ + uint64_t i; \ + uint64_t step = 0; \ svbool_t Pg = svptrue_b##type_size(); \ + switch(type_size) { \ + case 8: \ + step = svcntb(); \ + break; \ + case 16: \ + step = svcnth(); \ + break; \ + case 32: \ + step = svcntw(); \ + break; \ + case 64: \ + step = svcntd(); \ + } \ + uint64_t round = *count; \ + uint64_t remain = *count % step; \ + for(i=0; i< round; i=i+step) \ + { \ + sv##type vsrc = svld1(Pg, (type *)in+i); \ + sv##type vdst = svld1(Pg, (type *)out+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ + \ + if (remain !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, remain); \ + sv##type vsrc = svld1(Pg, (type *)in+i); \ + sv##type vdst = svld1(Pg, (type *)out+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ } -/* - * This macro is for bit-wise operations (out op in). 
- * - * Support ops: or, xor, and of 512 bits (representing integer data) - * - */ -#define OP_SVE_BIT_FUNC(name, type_size, type, op) \ - static void ompi_op_sve_2buff_##name##_##type(void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 512 / type_size; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ -} - -#define OP_SVE_FLOAT_FUNC(op) \ - static void ompi_op_sve_2buff_##op##_float(void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 16; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ -} - -#define OP_SVE_DOUBLE_FUNC(op) \ - static void ompi_op_sve_2buff_##op##_double(void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 8; \ - int size = *count/step; \ - int i; \ -} - - /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC(max, i, 8, int8_t, max) - OP_SVE_FUNC(max, u, 8, uint8_t, max) - OP_SVE_FUNC(max, i, 16, int16_t, max) - OP_SVE_FUNC(max, u, 16, uint16_t, max) - OP_SVE_FUNC(max, i, 32, int32_t, max) - OP_SVE_FUNC(max, u, 32, uint32_t, max) - OP_SVE_FUNC(max, i, 64, int64_t, max) - OP_SVE_FUNC(max, u, 64, uint64_t, max) + OP_SVE_FUNC(max, int8_t , 8, int8_t, max) + OP_SVE_FUNC(max, uint8_t, 8, uint8_t, max) + OP_SVE_FUNC(max, int16_t, 16, int16_t, max) + OP_SVE_FUNC(max, uint16_t, 16, uint16_t, max) + OP_SVE_FUNC(max, int32_t, 32, int32_t, max) + OP_SVE_FUNC(max, uint32_t, 32, uint32_t, max) + OP_SVE_FUNC(max, int64_t, 64, int64_t, max) + OP_SVE_FUNC(max, uint64_t, 64, uint64_t, max) /* Floating point */ - OP_SVE_FLOAT_FUNC(max) - OP_SVE_DOUBLE_FUNC(max) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(max, short_float, 16, float16_t, max) +#elif 
defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(max, short_float, 16, float16_t, max) +#endif + OP_SVE_FUNC(max, float, 32, float32_t, max) + OP_SVE_FUNC(max, double, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC(min, i, 8, int8_t, min) - OP_SVE_FUNC(min, u, 8, uint8_t, min) - OP_SVE_FUNC(min, i, 16, int16_t, min) - OP_SVE_FUNC(min, u, 16, uint16_t, min) - OP_SVE_FUNC(min, i, 32, int32_t, min) - OP_SVE_FUNC(min, u, 32, uint32_t, min) - OP_SVE_FUNC(min, i, 64, int64_t, min) - OP_SVE_FUNC(min, u, 64, uint64_t, min) + OP_SVE_FUNC(min, int8_t , 8, int8_t, min) + OP_SVE_FUNC(min, uint8_t, 8, uint8_t, min) + OP_SVE_FUNC(min, int16_t, 16, int16_t, min) + OP_SVE_FUNC(min, uint16_t, 16, uint16_t, min) + OP_SVE_FUNC(min, int32_t, 32, int32_t, min) + OP_SVE_FUNC(min, uint32_t, 32, uint32_t, min) + OP_SVE_FUNC(min, int64_t, 64, int64_t, min) + OP_SVE_FUNC(min, uint64_t, 64, uint64_t, min) /* Floating point */ - OP_SVE_FLOAT_FUNC(min) - OP_SVE_DOUBLE_FUNC(min) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(min, short_float, 16, float16_t, min) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(min, short_float, 16, float16_t, min) +#endif + OP_SVE_FUNC(min, float, 32, float32_t, min) + OP_SVE_FUNC(min, double, 64, float64_t, min) /************************************************************************* * Sum ************************************************************************/ - OP_SVE_FUNC(sum, i, 8, int8_t, add) - OP_SVE_FUNC(sum, i, 8, uint8_t, add) - OP_SVE_FUNC(sum, i, 16, int16_t, add) - OP_SVE_FUNC(sum, i, 16, uint16_t, add) - OP_SVE_FUNC(sum, i, 32, int32_t, add) - OP_SVE_FUNC(sum, i, 32, uint32_t, add) - OP_SVE_FUNC(sum, i, 64, int64_t, add) - OP_SVE_FUNC(sum, i, 64, uint64_t, add) - + OP_SVE_FUNC(sum, int8_t , 8, int8_t, add) + OP_SVE_FUNC(sum, uint8_t, 8, uint8_t, add) + OP_SVE_FUNC(sum, int16_t, 16, int16_t, add) + 
OP_SVE_FUNC(sum, uint16_t, 16, uint16_t, add) + OP_SVE_FUNC(sum, int32_t, 32, int32_t, add) + OP_SVE_FUNC(sum, uint32_t, 32, uint32_t, add) + OP_SVE_FUNC(sum, int64_t, 64, int64_t, add) + OP_SVE_FUNC(sum, uint64_t, 64, uint64_t, add) /* Floating point */ - OP_SVE_FLOAT_FUNC(add) - OP_SVE_DOUBLE_FUNC(add) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(sum, short_float, 16, float16_t, add) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(sum, short_float, 16, float16_t, add) +#endif + OP_SVE_FUNC(sum, float, 32, float32_t, add) + OP_SVE_FUNC(sum, double, 64, float64_t, add) + /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC(prod, i, 16, int16_t, mullo) - OP_SVE_FUNC(prod, i, 16, uint16_t, mullo) - OP_SVE_FUNC(prod, i, 32, int32_t, mullo) - OP_SVE_FUNC(prod, i ,32, uint32_t, mullo) - OP_SVE_FUNC(prod, i, 64, int64_t, mullo) - OP_SVE_FUNC(prod, i, 64, uint64_t, mullo) + OP_SVE_FUNC(prod, int8_t, 8, int8_t, mul) + OP_SVE_FUNC(prod, uint8_t, 8, uint8_t, mul) + OP_SVE_FUNC(prod, int16_t, 16, int16_t, mul) + OP_SVE_FUNC(prod, uint16_t, 16, uint16_t, mul) + OP_SVE_FUNC(prod, int32_t, 32, int32_t, mul) + OP_SVE_FUNC(prod, uint32_t, 32, uint32_t, mul) + OP_SVE_FUNC(prod, int64_t, 64, int64_t, mul) + OP_SVE_FUNC(prod, uint64_t, 64, uint64_t, mul) /* Floating point */ - OP_SVE_FLOAT_FUNC(mul) - OP_SVE_DOUBLE_FUNC(mul) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +#endif + OP_SVE_FUNC(prod, float, 32, float32_t, mul) + OP_SVE_FUNC(prod, double, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_BIT_FUNC(band, 8, int8_t, and) - OP_SVE_BIT_FUNC(band, 8, uint8_t, and) - 
OP_SVE_BIT_FUNC(band, 16, int16_t, and) - OP_SVE_BIT_FUNC(band, 16, uint16_t, and) - OP_SVE_BIT_FUNC(band, 32, int32_t, and) - OP_SVE_BIT_FUNC(band, 32, uint32_t, and) - OP_SVE_BIT_FUNC(band, 64, int64_t, and) - OP_SVE_BIT_FUNC(band, 64, uint64_t, and) - - OP_SVE_FLOAT_FUNC(and) - OP_SVE_DOUBLE_FUNC(and) + OP_SVE_FUNC(band, int8_t, 8, int8_t, and) + OP_SVE_FUNC(band, uint8_t, 8, uint8_t, and) + OP_SVE_FUNC(band, int16_t, 16, int16_t, and) + OP_SVE_FUNC(band, uint16_t, 16, uint16_t, and) + OP_SVE_FUNC(band, int32_t, 32, int32_t, and) + OP_SVE_FUNC(band, uint32_t, 32, uint32_t, and) + OP_SVE_FUNC(band, int64_t, 64, int64_t, and) + OP_SVE_FUNC(band, uint64_t, 64, uint64_t, and) /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_BIT_FUNC(bor, 8, int8_t, or) - OP_SVE_BIT_FUNC(bor, 8, uint8_t, or) - OP_SVE_BIT_FUNC(bor, 16, int16_t, or) - OP_SVE_BIT_FUNC(bor, 16, uint16_t, or) - OP_SVE_BIT_FUNC(bor, 32, int32_t, or) - OP_SVE_BIT_FUNC(bor, 32, uint32_t, or) - OP_SVE_BIT_FUNC(bor, 64, int64_t, or) - OP_SVE_BIT_FUNC(bor, 64, uint64_t, or) - - OP_SVE_FLOAT_FUNC(or) - OP_SVE_DOUBLE_FUNC(or) + OP_SVE_FUNC(bor, int8_t, 8, int8_t, orr) + OP_SVE_FUNC(bor, uint8_t, 8, uint8_t, orr) + OP_SVE_FUNC(bor, int16_t, 16, int16_t, orr) + OP_SVE_FUNC(bor, uint16_t, 16, uint16_t, orr) + OP_SVE_FUNC(bor, int32_t, 32, int32_t, orr) + OP_SVE_FUNC(bor, uint32_t, 32, uint32_t, orr) + OP_SVE_FUNC(bor, int64_t, 64, int64_t, orr) + OP_SVE_FUNC(bor, uint64_t, 64, uint64_t, orr) /************************************************************************* * Bitwise XOR *************************************************************************/ - OP_SVE_BIT_FUNC(bxor, 8, int8_t, xor) - OP_SVE_BIT_FUNC(bxor, 8, uint8_t, xor) - OP_SVE_BIT_FUNC(bxor, 16, int16_t, xor) - OP_SVE_BIT_FUNC(bxor, 16, uint16_t, xor) - OP_SVE_BIT_FUNC(bxor, 32, int32_t, xor) - OP_SVE_BIT_FUNC(bxor, 32, uint32_t, 
xor) - OP_SVE_BIT_FUNC(bxor, 64, int64_t, xor) - OP_SVE_BIT_FUNC(bxor, 64, uint64_t, xor) - - OP_SVE_FLOAT_FUNC(xor) - OP_SVE_DOUBLE_FUNC(xor) + OP_SVE_FUNC(bxor, int8_t, 8, int8_t, eor) + OP_SVE_FUNC(bxor, uint8_t, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, int16_t, 16, int16_t, eor) + OP_SVE_FUNC(bxor, uint16_t, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, int32_t, 32, int32_t, eor) + OP_SVE_FUNC(bxor, uint32_t, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, int64_t, 64, int64_t, eor) + OP_SVE_FUNC(bxor, uint64_t, 64, uint64_t, eor) + /* * This is a three buffer (2 input and 1 output) version of the reduction * routines, needed for some optimizations. */ -#define OP_SVE_FUNC_3BUFF(name, type_sign, type_size, type, op)\ - static void ompi_op_sve_3buff_##name##_##type(void * restrict in1, \ +#define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ + static void ompi_op_sve_3buff_##name##_##type_name(void * restrict in1, \ void * restrict in2, void * restrict out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - int step = 512 / type_size; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ -} - -#define OP_SVE_BIT_FUNC_3BUFF(name, type_size, type, op) \ - static void ompi_op_sve_3buff_##op##_##type(void *in1, void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 512 / type_size; \ - int size = *count/step; \ - int i; \ -} - -#define OP_SVE_FLOAT_FUNC_3BUFF(op) \ - static void ompi_op_sve_3buff_##op##_float(void *in1, void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 16; \ - int size = *count/step; \ - int i; \ - int round = size*64; \ + uint64_t i; \ + uint64_t step = 0; \ + svbool_t Pg = svptrue_b##type_size(); \ + switch(type_size) { \ + case 8: \ + step = svcntb(); \ + break; \ + case 16: \ + step = svcnth(); \ + break; \ + case 32: \ + 
step = svcntw(); \ + break; \ + case 64: \ + step = svcntd(); \ + } \ + uint64_t round = *count; \ + uint64_t remain = *count % step; \ + for(i=0; i< round; i=i+step) \ + { \ + sv##type vsrc = svld1(Pg, (type *)in1+i); \ + sv##type vdst = svld1(Pg, (type *)in2+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ + if (remain !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, remain); \ + sv##type vsrc = svld1(Pg, (type *)in1+i); \ + sv##type vdst = svld1(Pg, (type *)in2+i); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, (type *)out+i,vdst); \ + } \ } -#define OP_SVE_DOUBLE_FUNC_3BUFF(op) \ - static void ompi_op_sve_3buff_##op##_double(void *in1, void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ -{ \ - int step = 8; \ - int size = *count/step; \ - int i; \ -} /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC_3BUFF(max, i, 8, int8_t, max) - OP_SVE_FUNC_3BUFF(max, u, 8, uint8_t, max) - OP_SVE_FUNC_3BUFF(max, i, 16, int16_t, max) - OP_SVE_FUNC_3BUFF(max, u, 16, uint16_t, max) - OP_SVE_FUNC_3BUFF(max, i, 32, int32_t, max) - OP_SVE_FUNC_3BUFF(max, u, 32, uint32_t, max) - OP_SVE_FUNC_3BUFF(max, i, 64, int64_t, max) - OP_SVE_FUNC_3BUFF(max, u, 64, uint64_t, max) + OP_SVE_FUNC_3BUFF(max, int8_t , 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, uint8_t, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, int16_t, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, uint16_t, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, int32_t, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, uint32_t, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, int64_t, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, uint64_t, 64, uint64_t, max) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(max) - OP_SVE_DOUBLE_FUNC_3BUFF(max) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) +#elif 
defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) +#endif + OP_SVE_FUNC_3BUFF(max, float, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, double, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC_3BUFF(min, i, 8, int8_t, min) - OP_SVE_FUNC_3BUFF(min, u, 8, uint8_t, min) - OP_SVE_FUNC_3BUFF(min, i, 16, int16_t, min) - OP_SVE_FUNC_3BUFF(min, u, 16, uint16_t, min) - OP_SVE_FUNC_3BUFF(min, i, 32, int32_t, min) - OP_SVE_FUNC_3BUFF(min, u, 32, uint32_t, min) - OP_SVE_FUNC_3BUFF(min, i, 64, int64_t, min) - OP_SVE_FUNC_3BUFF(min, u, 64, uint64_t, min) + OP_SVE_FUNC_3BUFF(min, int8_t , 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, uint8_t, 8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, int16_t, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, uint16_t, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, int32_t, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, uint32_t, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, int64_t, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, uint64_t, 64, uint64_t, min) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(min) - OP_SVE_DOUBLE_FUNC_3BUFF(min) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) +#endif + OP_SVE_FUNC_3BUFF(min, float, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, double, 64, float64_t, min) /************************************************************************* * Sum *************************************************************************/ - OP_SVE_FUNC_3BUFF(sum, i, 8, int8_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 8, uint8_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 16, int16_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 16, uint16_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 32, int32_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 32, uint32_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 64, 
int64_t, add) - OP_SVE_FUNC_3BUFF(sum, i, 64, uint64_t, add) - + OP_SVE_FUNC_3BUFF(sum, int8_t , 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, uint8_t, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, int16_t, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, uint16_t, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, int32_t, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, uint32_t, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, int64_t, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, uint64_t, 64, uint64_t, add) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(add) - OP_SVE_DOUBLE_FUNC_3BUFF(add) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) +#endif + OP_SVE_FUNC_3BUFF(sum, float, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, double, 64, float64_t, add) /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC_3BUFF(prod, i, 16, int16_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 16, uint16_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 32, int32_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i ,32, uint32_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 64, int64_t, mullo) - OP_SVE_FUNC_3BUFF(prod, i, 64, uint64_t, mullo) + OP_SVE_FUNC_3BUFF(prod, int8_t, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint8_t, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, int16_t, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint16_t, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, int32_t, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint32_t, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, int64_t, 64, int64_t, mul) + OP_SVE_FUNC_3BUFF(prod, uint64_t, 64, uint64_t, mul) /* Floating point */ - OP_SVE_FLOAT_FUNC_3BUFF(mul) - OP_SVE_DOUBLE_FUNC_3BUFF(mul) +#if defined(HAVE_SHORT_FLOAT) + OP_SVE_FUNC_3BUFF(prod, short_float, 16, float16_t, mul) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) + OP_SVE_FUNC_3BUFF(prod, 
short_float, 16, float16_t, mul) +#endif + OP_SVE_FUNC_3BUFF(prod, float, 32, float32_t, mul) + OP_SVE_FUNC_3BUFF(prod, double, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_BIT_FUNC_3BUFF(band, 8, int8_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 8, uint8_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 16, int16_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 16, uint16_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 32, int32_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 32, uint32_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 64, int64_t, and) - OP_SVE_BIT_FUNC_3BUFF(band, 64, uint64_t, and) - - OP_SVE_FLOAT_FUNC_3BUFF(and) - OP_SVE_DOUBLE_FUNC_3BUFF(and) + OP_SVE_FUNC_3BUFF(band, int8_t, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, uint8_t, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, int16_t, 16, int16_t, and) + OP_SVE_FUNC_3BUFF(band, uint16_t, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, int32_t, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, uint32_t, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, int64_t, 64, int64_t, and) + OP_SVE_FUNC_3BUFF(band, uint64_t, 64, uint64_t, and) /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_BIT_FUNC_3BUFF(bor, 8, int8_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 8, uint8_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 16, int16_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 16, uint16_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 32, int32_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 32, uint32_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 64, int64_t, or) - OP_SVE_BIT_FUNC_3BUFF(bor, 64, uint64_t, or) - - OP_SVE_FLOAT_FUNC_3BUFF(or) - OP_SVE_DOUBLE_FUNC_3BUFF(or) - -/************************************************************************* + OP_SVE_FUNC_3BUFF(bor, int8_t, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint8_t, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, int16_t, 16, 
int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint16_t, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, int32_t, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint32_t, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, int64_t, 64, int64_t, orr) + OP_SVE_FUNC_3BUFF(bor, uint64_t, 64, uint64_t, orr) + + /************************************************************************* * Bitwise XOR *************************************************************************/ - OP_SVE_BIT_FUNC_3BUFF(bxor, 8, int8_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 8, uint8_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 16, int16_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 16, uint16_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 32, int32_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 32, uint32_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 64, int64_t, xor) - OP_SVE_BIT_FUNC_3BUFF(bxor, 64, uint64_t, xor) - - OP_SVE_FLOAT_FUNC_3BUFF(xor) - OP_SVE_DOUBLE_FUNC_3BUFF(xor) - + OP_SVE_FUNC_3BUFF(bxor, int8_t, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint8_t, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, int16_t, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint16_t, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, int32_t, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint32_t, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, int64_t, 64, int64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, uint64_t, 64, uint64_t, eor) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ @@ -364,23 +375,19 @@ /** Floating point, including all the Fortran reals *********************/ +#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) +#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_short_float +#else +#define SHORT_FLOAT(name, ftype) NULL +#endif #define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float #define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_double -#define FLOATING_POINT(name, ftype) \ - [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = NULL, \ - [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ 
+#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) -#define C_INTEGER_PROD(name, ftype) \ - [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ - [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ - [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ - [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ - [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ - [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_sve_##ftype##_##name##_uint64_t - - /* * MPI_OP_NULL * All types @@ -411,12 +418,12 @@ ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMP /* Corresponds to MPI_SUM */ [OMPI_OP_BASE_FORTRAN_SUM] = { C_INTEGER(sum, 2buff), - FLOATING_POINT(add, 2buff), + FLOATING_POINT(sum, 2buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { - C_INTEGER_PROD(prod, 2buff), - FLOATING_POINT(mul, 2buff), + C_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] = { @@ -474,12 +481,12 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN /* Corresponds to MPI_SUM */ [OMPI_OP_BASE_FORTRAN_SUM] = { C_INTEGER(sum, 3buff), - FLOATING_POINT(add, 3buff), + FLOATING_POINT(sum, 3buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { - C_INTEGER_PROD(prod, 3buff), - FLOATING_POINT(mul, 3buff), + C_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] ={ @@ -487,7 +494,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN }, /* Corresponds to MPI_BAND */ [OMPI_OP_BASE_FORTRAN_BAND] = { - C_INTEGER(and, 3buff), + C_INTEGER(band, 3buff), }, /* Corresponds to MPI_LOR */ [OMPI_OP_BASE_FORTRAN_LOR] = { @@ -495,7 
+502,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN }, /* Corresponds to MPI_BOR */ [OMPI_OP_BASE_FORTRAN_BOR] = { - C_INTEGER(or, 3buff), + C_INTEGER(bor, 3buff), }, /* Corresponds to MPI_LXOR */ [OMPI_OP_BASE_FORTRAN_LXOR] = { @@ -503,7 +510,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN }, /* Corresponds to MPI_BXOR */ [OMPI_OP_BASE_FORTRAN_BXOR] = { - C_INTEGER(xor, 3buff), + C_INTEGER(bxor, 3buff), }, /* Corresponds to MPI_REPLACE */ [OMPI_OP_BASE_FORTRAN_REPLACE] = { diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/arm_sve_op/op_sve_functions.h index 0c621788842..6c7b4c7df71 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.h +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.h @@ -2,6 +2,9 @@ * Copyright (c) 2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c index e33db456a03..73ef7467b02 100644 --- a/test/datatype/Reduce_local_float.c +++ b/test/datatype/Reduce_local_float.c @@ -50,7 +50,7 @@ int main(int argc, char **argv) { in_double[i] = 10.0+1; inout_double[i] = 1.0+2; - in_uint8[i] = 4; + in_uint8[i] = 5; inout_uint8[i] = 3; in_uint16[i] = 2; From 075bb80b95a525faf48fcbcdb43a33a45d6fd142 Mon Sep 17 00:00:00 2001 From: dong zhong Date: Mon, 4 Nov 2019 11:17:00 -0500 Subject: [PATCH 03/13] Add readme and install scripts Signed-off-by: dong zhong --- ARM_SVE_README | 30 ++++++++++++++++++++++++++++++ arm_install.sh | 11 +++++++++++ 2 files changed, 41 insertions(+) create mode 100644 ARM_SVE_README create mode 100644 arm_install.sh diff --git a/ARM_SVE_README b/ARM_SVE_README new file mode 100644 index 00000000000..a4670d9a8df --- /dev/null +++ b/ARM_SVE_README @@ -0,0 +1,30 @@ +Configuration and installation 
details please check script arm_install.sh
+
+Run test: (Test example takes 4 args)
+arg1 : elements count for operation
+arg2 : elements type could be : i (integer), f (float), d (double)
+arg3: type size in bits, only applies when you set arg2 to i. eg: i 8 will be converted to int8; i 16 to int16
+arg4: operation type. Could be : max, min, sum , mul, band , bor, bxor
+If you want to use the SVE module for MPI ops, you need to pass mca params as : -mca op sve -mca op_sve_hardware_available 1
+=======
+Example for test
+$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 33 i 8 min
+
+If you don't need armie you can remove the ARMIE part in the command line as :
+$PATH_To_BIN//mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 /ompi/test/datatype/Reduce_local_float 33 i 8 min
+
+How do we evaluate the performance?
+======
+Logical:
+
+Start_time;
+MPI_reduce_local(...);
+End_time;
+
+Reduce_time = End_time - Start_time;
+
+Possible issues (this happened on thunder2 machine):
+======
+Reason for "-mca pml ob1" : on Arm machine the default pml module will cause a problem with armie (instruction not supported, I don't know why), but with ob1 it works. 
+ + diff --git a/arm_install.sh b/arm_install.sh new file mode 100644 index 00000000000..0b8afef93b0 --- /dev/null +++ b/arm_install.sh @@ -0,0 +1,11 @@ +mkdir build + +./autogen.pl >/dev/null + +./configure --prefix=$PWD/build --enable-mpirun-prefix-by-default --enable-debug CC=armclang CFLAGS="-march=armv8-a+sve" CXX=armclang++ FC=armflang >/dev/null + +./config.status >/dev/null +make -j 128 install >/dev/null + +## compile the test code, test code under ompi/test/datapyte/Reduce_local_float.c +./build/bin/mpicc -g -O3 -march=armv8-a+sve -o ./test/datatype/Reduce_local_float ./test/datatype/Reduce_local_float.c From e7491d8e2cc97ad194ee05337d6cc0d404494096 Mon Sep 17 00:00:00 2001 From: dong zhong Date: Sun, 1 Dec 2019 17:30:35 -0500 Subject: [PATCH 04/13] improvement of code, add correctness_check scripts --- ompi/mca/op/arm_sve_op/op_sve.h | 2 +- ompi/mca/op/arm_sve_op/op_sve_component.c | 49 +- ompi/mca/op/arm_sve_op/op_sve_functions.c | 409 ++++++++--------- ompi/mca/op/arm_sve_op/op_sve_functions.h | 2 +- test/datatype/Reduce_local_float.c | 531 ++++++++++++++++++---- test/datatype/correctness_check.sh | 52 +++ 6 files changed, 701 insertions(+), 344 deletions(-) create mode 100644 test/datatype/correctness_check.sh diff --git a/ompi/mca/op/arm_sve_op/op_sve.h b/ompi/mca/op/arm_sve_op/op_sve.h index 95b19b95e04..21483d1fa56 100644 --- a/ompi/mca/op/arm_sve_op/op_sve.h +++ b/ompi/mca/op/arm_sve_op/op_sve.h @@ -3,7 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * - * Copyright (c) 2019 ARM Ltd. All rights reserved. + * Copyright (c) 2019 Arm Ltd. All rights reserved. 
* * $COPYRIGHT$ * diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/arm_sve_op/op_sve_component.c index e08bbae35d9..df8281e5ba1 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_component.c +++ b/ompi/mca/op/arm_sve_op/op_sve_component.c @@ -136,7 +136,7 @@ static int sve_component_register(void) static int sve_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple) { - if (mca_op_sve_component.hardware_available && !enable_mpi_thread_multiple) { + if (mca_op_sve_component.hardware_available) { return OMPI_SUCCESS; } return OMPI_ERR_NOT_SUPPORTED; @@ -160,59 +160,16 @@ static struct ompi_op_base_module_1_0_0_t * int i=0; switch (op->o_f_to_c_index) { case OMPI_OP_BASE_FORTRAN_MAX: - /* Corresponds to MPI_MAX */ - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MAX][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_MIN: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_MIN][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_SUM: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_SUM][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_PROD: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_PROD][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_BOR: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; 
++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BOR][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_BAND: - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; - OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BAND][i]; - OBJ_RETAIN(module); - } - break; case OMPI_OP_BASE_FORTRAN_BXOR: for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + module->opm_fns[i] = ompi_op_sve_functions[op->o_f_to_c_index][i]; OBJ_RETAIN(module); - module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_BXOR][i]; + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[op->o_f_to_c_index][i]; OBJ_RETAIN(module); } break; diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c index 3233ac66e44..8b2281271ea 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -3,7 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * - * Copyright (c) 2019 ARM Ltd. All rights reserved. + * Copyright (c) 2019 Arm Ltd. All rights reserved. 
* * $COPYRIGHT$ * @@ -38,329 +38,306 @@ * */ #define OP_SVE_FUNC(name, type_name, type_size, type, op) \ - static void ompi_op_sve_2buff_##name##_##type_name(void *in, void *out, int *count, \ + static void ompi_op_sve_2buff_##name##_##type(void *_in, void *_out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - uint64_t i; \ - uint64_t step = 0; \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ svbool_t Pg = svptrue_b##type_size(); \ - switch(type_size) { \ - case 8: \ - step = svcntb(); \ - break; \ - case 16: \ - step = svcnth(); \ - break; \ - case 32: \ - step = svcntw(); \ - break; \ - case 64: \ - step = svcntd(); \ - } \ - uint64_t round = *count; \ - uint64_t remain = *count % step; \ - for(i=0; i< round; i=i+step) \ - { \ - sv##type vsrc = svld1(Pg, (type *)in+i); \ - sv##type vdst = svld1(Pg, (type *)out+i); \ - vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + in += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ } \ \ - if (remain !=0){ \ - Pg = svwhilelt_b##type_size##_u64(0, remain); \ - sv##type vsrc = svld1(Pg, (type *)in+i); \ - sv##type vdst = svld1(Pg, (type *)out+i); \ - vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ } \ } /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC(max, int8_t , 8, int8_t, max) - OP_SVE_FUNC(max, uint8_t, 8, uint8_t, max) - 
OP_SVE_FUNC(max, int16_t, 16, int16_t, max) - OP_SVE_FUNC(max, uint16_t, 16, uint16_t, max) - OP_SVE_FUNC(max, int32_t, 32, int32_t, max) - OP_SVE_FUNC(max, uint32_t, 32, uint32_t, max) - OP_SVE_FUNC(max, int64_t, 64, int64_t, max) - OP_SVE_FUNC(max, uint64_t, 64, uint64_t, max) + OP_SVE_FUNC(max, b, 8, int8_t, max) + OP_SVE_FUNC(max, b, 8, uint8_t, max) + OP_SVE_FUNC(max, h, 16, int16_t, max) + OP_SVE_FUNC(max, h, 16, uint16_t, max) + OP_SVE_FUNC(max, w, 32, int32_t, max) + OP_SVE_FUNC(max, w, 32, uint32_t, max) + OP_SVE_FUNC(max, d, 64, int64_t, max) + OP_SVE_FUNC(max, d, 64, uint64_t, max) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(max, short_float, 16, float16_t, max) + OP_SVE_FUNC(max, h, 16, float16_t, max) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(max, short_float, 16, float16_t, max) + OP_SVE_FUNC(max, h, 16, float16_t, max) #endif - OP_SVE_FUNC(max, float, 32, float32_t, max) - OP_SVE_FUNC(max, double, 64, float64_t, max) + OP_SVE_FUNC(max, w, 32, float32_t, max) + OP_SVE_FUNC(max, d, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC(min, int8_t , 8, int8_t, min) - OP_SVE_FUNC(min, uint8_t, 8, uint8_t, min) - OP_SVE_FUNC(min, int16_t, 16, int16_t, min) - OP_SVE_FUNC(min, uint16_t, 16, uint16_t, min) - OP_SVE_FUNC(min, int32_t, 32, int32_t, min) - OP_SVE_FUNC(min, uint32_t, 32, uint32_t, min) - OP_SVE_FUNC(min, int64_t, 64, int64_t, min) - OP_SVE_FUNC(min, uint64_t, 64, uint64_t, min) + OP_SVE_FUNC(min, b, 8, int8_t, min) + OP_SVE_FUNC(min, b, 8, uint8_t, min) + OP_SVE_FUNC(min, h, 16, int16_t, min) + OP_SVE_FUNC(min, h, 16, uint16_t, min) + OP_SVE_FUNC(min, w, 32, int32_t, min) + OP_SVE_FUNC(min, w, 32, uint32_t, min) + OP_SVE_FUNC(min, d, 64, int64_t, min) + OP_SVE_FUNC(min, d, 64, uint64_t, min) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(min, short_float, 16, 
float16_t, min) + OP_SVE_FUNC(min, h, 16, float16_t, min) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(min, short_float, 16, float16_t, min) + OP_SVE_FUNC(min, h, 16, float16_t, min) #endif - OP_SVE_FUNC(min, float, 32, float32_t, min) - OP_SVE_FUNC(min, double, 64, float64_t, min) + OP_SVE_FUNC(min, w, 32, float32_t, min) + OP_SVE_FUNC(min, d, 64, float64_t, min) -/************************************************************************* + /************************************************************************* * Sum ************************************************************************/ - OP_SVE_FUNC(sum, int8_t , 8, int8_t, add) - OP_SVE_FUNC(sum, uint8_t, 8, uint8_t, add) - OP_SVE_FUNC(sum, int16_t, 16, int16_t, add) - OP_SVE_FUNC(sum, uint16_t, 16, uint16_t, add) - OP_SVE_FUNC(sum, int32_t, 32, int32_t, add) - OP_SVE_FUNC(sum, uint32_t, 32, uint32_t, add) - OP_SVE_FUNC(sum, int64_t, 64, int64_t, add) - OP_SVE_FUNC(sum, uint64_t, 64, uint64_t, add) + OP_SVE_FUNC(sum, b, 8, int8_t, add) + OP_SVE_FUNC(sum, b, 8, uint8_t, add) + OP_SVE_FUNC(sum, h, 16, int16_t, add) + OP_SVE_FUNC(sum, h, 16, uint16_t, add) + OP_SVE_FUNC(sum, w, 32, int32_t, add) + OP_SVE_FUNC(sum, w, 32, uint32_t, add) + OP_SVE_FUNC(sum, d, 64, int64_t, add) + OP_SVE_FUNC(sum, d, 64, uint64_t, add) + /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC(sum, h, 16, float16_t, add) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC(sum, h, 16, float16_t, add) #endif - OP_SVE_FUNC(sum, float, 32, float32_t, add) - OP_SVE_FUNC(sum, double, 64, float64_t, add) - + OP_SVE_FUNC(sum, w, 32, float32_t, add) + OP_SVE_FUNC(sum, d, 64, float64_t, add) /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC(prod, int8_t, 8, int8_t, mul) - OP_SVE_FUNC(prod, uint8_t, 
8, uint8_t, mul) - OP_SVE_FUNC(prod, int16_t, 16, int16_t, mul) - OP_SVE_FUNC(prod, uint16_t, 16, uint16_t, mul) - OP_SVE_FUNC(prod, int32_t, 32, int32_t, mul) - OP_SVE_FUNC(prod, uint32_t, 32, uint32_t, mul) - OP_SVE_FUNC(prod, int64_t, 64, int64_t, mul) - OP_SVE_FUNC(prod, uint64_t, 64, uint64_t, mul) + OP_SVE_FUNC(prod, b, 8, int8_t, mul) + OP_SVE_FUNC(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC(prod, h, 16, int16_t, mul) + OP_SVE_FUNC(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC(prod, w, 32, int32_t, mul) + OP_SVE_FUNC(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC(prod, d, 64, int64_t, mul) +OP_SVE_FUNC(prod, d, 64, uint64_t, mul) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC(prod, h, 16, float16_t, mul) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC(prod, h, 16, float16_t, mul) #endif - OP_SVE_FUNC(prod, float, 32, float32_t, mul) - OP_SVE_FUNC(prod, double, 64, float64_t, mul) + OP_SVE_FUNC(prod, w, 32, float32_t, mul) +OP_SVE_FUNC(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_FUNC(band, int8_t, 8, int8_t, and) - OP_SVE_FUNC(band, uint8_t, 8, uint8_t, and) - OP_SVE_FUNC(band, int16_t, 16, int16_t, and) - OP_SVE_FUNC(band, uint16_t, 16, uint16_t, and) - OP_SVE_FUNC(band, int32_t, 32, int32_t, and) - OP_SVE_FUNC(band, uint32_t, 32, uint32_t, and) - OP_SVE_FUNC(band, int64_t, 64, int64_t, and) - OP_SVE_FUNC(band, uint64_t, 64, uint64_t, and) + OP_SVE_FUNC(band, b, 8, int8_t, and) + OP_SVE_FUNC(band, b, 8, uint8_t, and) + OP_SVE_FUNC(band, h, 16, int16_t, and) + OP_SVE_FUNC(band, h, 16, uint16_t, and) + OP_SVE_FUNC(band, w, 32, int32_t, and) + OP_SVE_FUNC(band, w, 32, uint32_t, and) + OP_SVE_FUNC(band, d, 64, int64_t, and) +OP_SVE_FUNC(band, d, 64, uint64_t, and) 
-/************************************************************************* + /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_FUNC(bor, int8_t, 8, int8_t, orr) - OP_SVE_FUNC(bor, uint8_t, 8, uint8_t, orr) - OP_SVE_FUNC(bor, int16_t, 16, int16_t, orr) - OP_SVE_FUNC(bor, uint16_t, 16, uint16_t, orr) - OP_SVE_FUNC(bor, int32_t, 32, int32_t, orr) - OP_SVE_FUNC(bor, uint32_t, 32, uint32_t, orr) - OP_SVE_FUNC(bor, int64_t, 64, int64_t, orr) - OP_SVE_FUNC(bor, uint64_t, 64, uint64_t, orr) + OP_SVE_FUNC(bor, b, 8, int8_t, orr) + OP_SVE_FUNC(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC(bor, h, 16, int16_t, orr) + OP_SVE_FUNC(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC(bor, w, 32, int32_t, orr) + OP_SVE_FUNC(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC(bor, d, 64, int64_t, orr) +OP_SVE_FUNC(bor, d, 64, uint64_t, orr) /************************************************************************* * Bitwise XOR *************************************************************************/ - OP_SVE_FUNC(bxor, int8_t, 8, int8_t, eor) - OP_SVE_FUNC(bxor, uint8_t, 8, uint8_t, eor) - OP_SVE_FUNC(bxor, int16_t, 16, int16_t, eor) - OP_SVE_FUNC(bxor, uint16_t, 16, uint16_t, eor) - OP_SVE_FUNC(bxor, int32_t, 32, int32_t, eor) - OP_SVE_FUNC(bxor, uint32_t, 32, uint32_t, eor) - OP_SVE_FUNC(bxor, int64_t, 64, int64_t, eor) - OP_SVE_FUNC(bxor, uint64_t, 64, uint64_t, eor) - + OP_SVE_FUNC(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) /* * This is a three buffer (2 input and 1 output) version of the reduction * routines, needed for some optimizations. 
*/ #define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ - static void ompi_op_sve_3buff_##name##_##type_name(void * restrict in1, \ - void * restrict in2, void * restrict out, int *count, \ + static void ompi_op_sve_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ struct ompi_datatype_t **dtype, \ struct ompi_op_base_module_1_0_0_t *module) \ { \ - uint64_t i; \ - uint64_t step = 0; \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ svbool_t Pg = svptrue_b##type_size(); \ - switch(type_size) { \ - case 8: \ - step = svcntb(); \ - break; \ - case 16: \ - step = svcnth(); \ - break; \ - case 32: \ - step = svcntw(); \ - break; \ - case 64: \ - step = svcntd(); \ - } \ - uint64_t round = *count; \ - uint64_t remain = *count % step; \ - for(i=0; i< round; i=i+step) \ - { \ - sv##type vsrc = svld1(Pg, (type *)in1+i); \ - sv##type vdst = svld1(Pg, (type *)in2+i); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ } \ - if (remain !=0){ \ - Pg = svwhilelt_b##type_size##_u64(0, remain); \ - sv##type vsrc = svld1(Pg, (type *)in1+i); \ - sv##type vdst = svld1(Pg, (type *)in2+i); \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, (type *)out+i,vdst); \ + svst1(Pg, out,vdst); \ } \ } - /************************************************************************* * Max *************************************************************************/ - OP_SVE_FUNC_3BUFF(max, int8_t , 8, int8_t, max) - 
OP_SVE_FUNC_3BUFF(max, uint8_t, 8, uint8_t, max) - OP_SVE_FUNC_3BUFF(max, int16_t, 16, int16_t, max) - OP_SVE_FUNC_3BUFF(max, uint16_t, 16, uint16_t, max) - OP_SVE_FUNC_3BUFF(max, int32_t, 32, int32_t, max) - OP_SVE_FUNC_3BUFF(max, uint32_t, 32, uint32_t, max) - OP_SVE_FUNC_3BUFF(max, int64_t, 64, int64_t, max) - OP_SVE_FUNC_3BUFF(max, uint64_t, 64, uint64_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(max, short_float, 16, float16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) #endif - OP_SVE_FUNC_3BUFF(max, float, 32, float32_t, max) - OP_SVE_FUNC_3BUFF(max, double, 64, float64_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) /************************************************************************* * Min *************************************************************************/ - OP_SVE_FUNC_3BUFF(min, int8_t , 8, int8_t, min) - OP_SVE_FUNC_3BUFF(min, uint8_t, 8, uint8_t, min) - OP_SVE_FUNC_3BUFF(min, int16_t, 16, int16_t, min) - OP_SVE_FUNC_3BUFF(min, uint16_t, 16, uint16_t, min) - OP_SVE_FUNC_3BUFF(min, int32_t, 32, int32_t, min) - OP_SVE_FUNC_3BUFF(min, uint32_t, 32, uint32_t, min) - OP_SVE_FUNC_3BUFF(min, int64_t, 64, int64_t, min) - OP_SVE_FUNC_3BUFF(min, uint64_t, 64, uint64_t, min) + OP_SVE_FUNC_3BUFF(min, b, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, b, 8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, uint16_t, 
min) + OP_SVE_FUNC_3BUFF(min, w, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(min, short_float, 16, float16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) #endif - OP_SVE_FUNC_3BUFF(min, float, 32, float32_t, min) - OP_SVE_FUNC_3BUFF(min, double, 64, float64_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) -/************************************************************************* + /************************************************************************* * Sum - *************************************************************************/ - OP_SVE_FUNC_3BUFF(sum, int8_t , 8, int8_t, add) - OP_SVE_FUNC_3BUFF(sum, uint8_t, 8, uint8_t, add) - OP_SVE_FUNC_3BUFF(sum, int16_t, 16, int16_t, add) - OP_SVE_FUNC_3BUFF(sum, uint16_t, 16, uint16_t, add) - OP_SVE_FUNC_3BUFF(sum, int32_t, 32, int32_t, add) - OP_SVE_FUNC_3BUFF(sum, uint32_t, 32, uint32_t, add) - OP_SVE_FUNC_3BUFF(sum, int64_t, 64, int64_t, add) - OP_SVE_FUNC_3BUFF(sum, uint64_t, 64, uint64_t, add) + ************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, b, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, b, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) + /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - 
OP_SVE_FUNC_3BUFF(sum, short_float, 16, float16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) #endif - OP_SVE_FUNC_3BUFF(sum, float, 32, float32_t, add) - OP_SVE_FUNC_3BUFF(sum, double, 64, float64_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) /************************************************************************* * Product *************************************************************************/ - OP_SVE_FUNC_3BUFF(prod, int8_t, 8, int8_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint8_t, 8, uint8_t, mul) - OP_SVE_FUNC_3BUFF(prod, int16_t, 16, int16_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint16_t, 16, uint16_t, mul) - OP_SVE_FUNC_3BUFF(prod, int32_t, 32, int32_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint32_t, 32, uint32_t, mul) - OP_SVE_FUNC_3BUFF(prod, int64_t, 64, int64_t, mul) - OP_SVE_FUNC_3BUFF(prod, uint64_t, 64, uint64_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) +OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(prod, short_float, 16, float16_t, mul) +OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) #endif - OP_SVE_FUNC_3BUFF(prod, float, 32, float32_t, mul) - OP_SVE_FUNC_3BUFF(prod, double, 64, float64_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) +OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ - OP_SVE_FUNC_3BUFF(band, int8_t, 8, int8_t, 
and) - OP_SVE_FUNC_3BUFF(band, uint8_t, 8, uint8_t, and) - OP_SVE_FUNC_3BUFF(band, int16_t, 16, int16_t, and) - OP_SVE_FUNC_3BUFF(band, uint16_t, 16, uint16_t, and) - OP_SVE_FUNC_3BUFF(band, int32_t, 32, int32_t, and) - OP_SVE_FUNC_3BUFF(band, uint32_t, 32, uint32_t, and) - OP_SVE_FUNC_3BUFF(band, int64_t, 64, int64_t, and) - OP_SVE_FUNC_3BUFF(band, uint64_t, 64, uint64_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, int16_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) +OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) -/************************************************************************* + /************************************************************************* * Bitwise OR *************************************************************************/ - OP_SVE_FUNC_3BUFF(bor, int8_t, 8, int8_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint8_t, 8, uint8_t, orr) - OP_SVE_FUNC_3BUFF(bor, int16_t, 16, int16_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint16_t, 16, uint16_t, orr) - OP_SVE_FUNC_3BUFF(bor, int32_t, 32, int32_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint32_t, 32, uint32_t, orr) - OP_SVE_FUNC_3BUFF(bor, int64_t, 64, int64_t, orr) - OP_SVE_FUNC_3BUFF(bor, uint64_t, 64, uint64_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) +OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) - /************************************************************************* +/************************************************************************* * Bitwise XOR 
*************************************************************************/ - OP_SVE_FUNC_3BUFF(bxor, int8_t, 8, int8_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint8_t, 8, uint8_t, eor) - OP_SVE_FUNC_3BUFF(bxor, int16_t, 16, int16_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint16_t, 16, uint16_t, eor) - OP_SVE_FUNC_3BUFF(bxor, int32_t, 32, int32_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint32_t, 32, uint32_t, eor) - OP_SVE_FUNC_3BUFF(bxor, int64_t, 64, int64_t, eor) - OP_SVE_FUNC_3BUFF(bxor, uint64_t, 64, uint64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ @@ -376,12 +353,12 @@ /** Floating point, including all the Fortran reals *********************/ #if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) -#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_short_float +#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float16_t #else #define SHORT_FLOAT(name, ftype) NULL #endif -#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float -#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_double +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t #define FLOATING_POINT(name, ftype) \ [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/arm_sve_op/op_sve_functions.h index 6c7b4c7df71..d5e050afb21 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.h +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.h @@ -3,7 +3,7 @@ * of Tennessee Research 
Foundation. All rights * reserved. * - * Copyright (c) 2019 ARM Ltd. All rights reserved. + * Copyright (c) 2019 Arm Ltd. All rights reserved. * * $COPYRIGHT$ * diff --git a/test/datatype/Reduce_local_float.c b/test/datatype/Reduce_local_float.c index 73ef7467b02..a9546ebcf1d 100644 --- a/test/datatype/Reduce_local_float.c +++ b/test/datatype/Reduce_local_float.c @@ -14,22 +14,27 @@ float in_float[ARRAYSIZE]; float inout_float[ARRAYSIZE]; +float inout_float_for_check[ARRAYSIZE]; int8_t in_uint8[ARRAYSIZE]; int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; uint16_t in_uint16[ARRAYSIZE]; uint16_t inout_uint16[ARRAYSIZE]; +uint16_t inout_uint16_for_check[ARRAYSIZE]; uint32_t in_uint32[ARRAYSIZE]; uint32_t inout_uint32[ARRAYSIZE]; +uint32_t inout_uint32_for_check[ARRAYSIZE]; uint64_t in_uint64[ARRAYSIZE]; uint64_t inout_uint64[ARRAYSIZE]; +uint64_t inout_uint64_for_check[ARRAYSIZE]; double in_double[ARRAYSIZE]; double inout_double[ARRAYSIZE]; - +double inout_double_for_check[ARRAYSIZE]; int main(int argc, char **argv) { @@ -45,22 +50,22 @@ int main(int argc, char **argv) { for (i=0; i Date: Sun, 1 Dec 2019 18:01:38 -0500 Subject: [PATCH 05/13] improvement of code, add duff device Signed-off-by: dong zhong --- ompi/mca/op/arm_sve_op/op_sve_functions.c | 283 +++++++++++++++++++++- test/datatype/correctness_check.sh | 77 +++--- 2 files changed, 317 insertions(+), 43 deletions(-) diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/arm_sve_op/op_sve_functions.c index 8b2281271ea..c9e3bf7b137 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/arm_sve_op/op_sve_functions.c @@ -57,17 +57,272 @@ } \ \ if (left_over !=0){ \ - Pg = svwhilelt_b##type_size##_u64(0, left_over); \ - sv##type vsrc = svld1(Pg, in); \ - sv##type vdst = svld1(Pg, out); \ - vdst=sv##op##_z(Pg,vdst,vsrc); \ - svst1(Pg, out,vdst); \ + switch(left_over) { \ + case 256: out[255] = current_func(out[255],in[255]) ; \ + case 255: out[254] = 
current_func(out[254],in[254]) ; \ + case 254: out[253] = current_func(out[253],in[253]) ; \ + case 253: out[252] = current_func(out[252],in[252]) ; \ + case 252: out[251] = current_func(out[251],in[251]) ; \ + case 251: out[250] = current_func(out[250],in[250]) ; \ + case 250: out[249] = current_func(out[249],in[249]) ; \ + case 249: out[248] = current_func(out[248],in[248]) ; \ + case 248: out[247] = current_func(out[247],in[247]) ; \ + case 247: out[246] = current_func(out[246],in[246]) ; \ + case 246: out[245] = current_func(out[245],in[245]) ; \ + case 245: out[244] = current_func(out[244],in[244]) ; \ + case 244: out[243] = current_func(out[243],in[243]) ; \ + case 243: out[242] = current_func(out[242],in[242]) ; \ + case 242: out[241] = current_func(out[241],in[241]) ; \ + case 241: out[240] = current_func(out[240],in[240]) ; \ + case 240: out[239] = current_func(out[239],in[239]) ; \ + case 239: out[238] = current_func(out[238],in[238]) ; \ + case 238: out[237] = current_func(out[237],in[237]) ; \ + case 237: out[236] = current_func(out[236],in[236]) ; \ + case 236: out[235] = current_func(out[235],in[235]) ; \ + case 235: out[234] = current_func(out[234],in[234]) ; \ + case 234: out[233] = current_func(out[233],in[233]) ; \ + case 233: out[232] = current_func(out[232],in[232]) ; \ + case 232: out[231] = current_func(out[231],in[231]) ; \ + case 231: out[230] = current_func(out[230],in[230]) ; \ + case 230: out[229] = current_func(out[229],in[229]) ; \ + case 229: out[228] = current_func(out[228],in[228]) ; \ + case 228: out[227] = current_func(out[227],in[227]) ; \ + case 227: out[226] = current_func(out[226],in[226]) ; \ + case 226: out[225] = current_func(out[225],in[225]) ; \ + case 225: out[224] = current_func(out[224],in[224]) ; \ + case 224: out[223] = current_func(out[223],in[223]) ; \ + case 223: out[222] = current_func(out[222],in[222]) ; \ + case 222: out[221] = current_func(out[221],in[221]) ; \ + case 221: out[220] = 
current_func(out[220],in[220]) ; \ + case 220: out[219] = current_func(out[219],in[219]) ; \ + case 219: out[218] = current_func(out[218],in[218]) ; \ + case 218: out[217] = current_func(out[217],in[217]) ; \ + case 217: out[216] = current_func(out[216],in[216]) ; \ + case 216: out[215] = current_func(out[215],in[215]) ; \ + case 215: out[214] = current_func(out[214],in[214]) ; \ + case 214: out[213] = current_func(out[213],in[213]) ; \ + case 213: out[212] = current_func(out[212],in[212]) ; \ + case 212: out[211] = current_func(out[211],in[211]) ; \ + case 211: out[210] = current_func(out[210],in[210]) ; \ + case 210: out[209] = current_func(out[209],in[209]) ; \ + case 209: out[208] = current_func(out[208],in[208]) ; \ + case 208: out[207] = current_func(out[207],in[207]) ; \ + case 207: out[206] = current_func(out[206],in[206]) ; \ + case 206: out[205] = current_func(out[205],in[205]) ; \ + case 205: out[204] = current_func(out[204],in[204]) ; \ + case 204: out[203] = current_func(out[203],in[203]) ; \ + case 203: out[202] = current_func(out[202],in[202]) ; \ + case 202: out[201] = current_func(out[201],in[201]) ; \ + case 201: out[200] = current_func(out[200],in[200]) ; \ + case 200: out[199] = current_func(out[199],in[199]) ; \ + case 199: out[198] = current_func(out[198],in[198]) ; \ + case 198: out[197] = current_func(out[197],in[197]) ; \ + case 197: out[196] = current_func(out[196],in[196]) ; \ + case 196: out[195] = current_func(out[195],in[195]) ; \ + case 195: out[194] = current_func(out[194],in[194]) ; \ + case 194: out[193] = current_func(out[193],in[193]) ; \ + case 193: out[192] = current_func(out[192],in[192]) ; \ + case 192: out[191] = current_func(out[191],in[191]) ; \ + case 191: out[190] = current_func(out[190],in[190]) ; \ + case 190: out[189] = current_func(out[189],in[189]) ; \ + case 189: out[188] = current_func(out[188],in[188]) ; \ + case 188: out[187] = current_func(out[187],in[187]) ; \ + case 187: out[186] = 
current_func(out[186],in[186]) ; \ + case 186: out[185] = current_func(out[185],in[185]) ; \ + case 185: out[184] = current_func(out[184],in[184]) ; \ + case 184: out[183] = current_func(out[183],in[183]) ; \ + case 183: out[182] = current_func(out[182],in[182]) ; \ + case 182: out[181] = current_func(out[181],in[181]) ; \ + case 181: out[180] = current_func(out[180],in[180]) ; \ + case 180: out[179] = current_func(out[179],in[179]) ; \ + case 179: out[178] = current_func(out[178],in[178]) ; \ + case 178: out[177] = current_func(out[177],in[177]) ; \ + case 177: out[176] = current_func(out[176],in[176]) ; \ + case 176: out[175] = current_func(out[175],in[175]) ; \ + case 175: out[174] = current_func(out[174],in[174]) ; \ + case 174: out[173] = current_func(out[173],in[173]) ; \ + case 173: out[172] = current_func(out[172],in[172]) ; \ + case 172: out[171] = current_func(out[171],in[171]) ; \ + case 171: out[170] = current_func(out[170],in[170]) ; \ + case 170: out[169] = current_func(out[169],in[169]) ; \ + case 169: out[168] = current_func(out[168],in[168]) ; \ + case 168: out[167] = current_func(out[167],in[167]) ; \ + case 167: out[166] = current_func(out[166],in[166]) ; \ + case 166: out[165] = current_func(out[165],in[165]) ; \ + case 165: out[164] = current_func(out[164],in[164]) ; \ + case 164: out[163] = current_func(out[163],in[163]) ; \ + case 163: out[162] = current_func(out[162],in[162]) ; \ + case 162: out[161] = current_func(out[161],in[161]) ; \ + case 161: out[160] = current_func(out[160],in[160]) ; \ + case 160: out[159] = current_func(out[159],in[159]) ; \ + case 159: out[158] = current_func(out[158],in[158]) ; \ + case 158: out[157] = current_func(out[157],in[157]) ; \ + case 157: out[156] = current_func(out[156],in[156]) ; \ + case 156: out[155] = current_func(out[155],in[155]) ; \ + case 155: out[154] = current_func(out[154],in[154]) ; \ + case 154: out[153] = current_func(out[153],in[153]) ; \ + case 153: out[152] = 
current_func(out[152],in[152]) ; \ + case 152: out[151] = current_func(out[151],in[151]) ; \ + case 151: out[150] = current_func(out[150],in[150]) ; \ + case 150: out[149] = current_func(out[149],in[149]) ; \ + case 149: out[148] = current_func(out[148],in[148]) ; \ + case 148: out[147] = current_func(out[147],in[147]) ; \ + case 147: out[146] = current_func(out[146],in[146]) ; \ + case 146: out[145] = current_func(out[145],in[145]) ; \ + case 145: out[144] = current_func(out[144],in[144]) ; \ + case 144: out[143] = current_func(out[143],in[143]) ; \ + case 143: out[142] = current_func(out[142],in[142]) ; \ + case 142: out[141] = current_func(out[141],in[141]) ; \ + case 141: out[140] = current_func(out[140],in[140]) ; \ + case 140: out[139] = current_func(out[139],in[139]) ; \ + case 139: out[138] = current_func(out[138],in[138]) ; \ + case 138: out[137] = current_func(out[137],in[137]) ; \ + case 137: out[136] = current_func(out[136],in[136]) ; \ + case 136: out[135] = current_func(out[135],in[135]) ; \ + case 135: out[134] = current_func(out[134],in[134]) ; \ + case 134: out[133] = current_func(out[133],in[133]) ; \ + case 133: out[132] = current_func(out[132],in[132]) ; \ + case 132: out[131] = current_func(out[131],in[131]) ; \ + case 131: out[130] = current_func(out[130],in[130]) ; \ + case 130: out[129] = current_func(out[129],in[129]) ; \ + case 129: out[128] = current_func(out[128],in[128]) ; \ + case 128: out[127] = current_func(out[127],in[127]) ; \ + case 127: out[126] = current_func(out[126],in[126]) ; \ + case 126: out[125] = current_func(out[125],in[125]) ; \ + case 125: out[124] = current_func(out[124],in[124]) ; \ + case 124: out[123] = current_func(out[123],in[123]) ; \ + case 123: out[122] = current_func(out[122],in[122]) ; \ + case 122: out[121] = current_func(out[121],in[121]) ; \ + case 121: out[120] = current_func(out[120],in[120]) ; \ + case 120: out[119] = current_func(out[119],in[119]) ; \ + case 119: out[118] = 
current_func(out[118],in[118]) ; \ + case 118: out[117] = current_func(out[117],in[117]) ; \ + case 117: out[116] = current_func(out[116],in[116]) ; \ + case 116: out[115] = current_func(out[115],in[115]) ; \ + case 115: out[114] = current_func(out[114],in[114]) ; \ + case 114: out[113] = current_func(out[113],in[113]) ; \ + case 113: out[112] = current_func(out[112],in[112]) ; \ + case 112: out[111] = current_func(out[111],in[111]) ; \ + case 111: out[110] = current_func(out[110],in[110]) ; \ + case 110: out[109] = current_func(out[109],in[109]) ; \ + case 109: out[108] = current_func(out[108],in[108]) ; \ + case 108: out[107] = current_func(out[107],in[107]) ; \ + case 107: out[106] = current_func(out[106],in[106]) ; \ + case 106: out[105] = current_func(out[105],in[105]) ; \ + case 105: out[104] = current_func(out[104],in[104]) ; \ + case 104: out[103] = current_func(out[103],in[103]) ; \ + case 103: out[102] = current_func(out[102],in[102]) ; \ + case 102: out[101] = current_func(out[101],in[101]) ; \ + case 101: out[100] = current_func(out[100],in[100]) ; \ + case 100: out[99] = current_func(out[99],in[99]) ; \ + case 99: out[98] = current_func(out[98],in[98]) ; \ + case 98: out[97] = current_func(out[97],in[97]) ; \ + case 97: out[96] = current_func(out[96],in[96]) ; \ + case 96: out[95] = current_func(out[95],in[95]) ; \ + case 95: out[94] = current_func(out[94],in[94]) ; \ + case 94: out[93] = current_func(out[93],in[93]) ; \ + case 93: out[92] = current_func(out[92],in[92]) ; \ + case 92: out[91] = current_func(out[91],in[91]) ; \ + case 91: out[90] = current_func(out[90],in[90]) ; \ + case 90: out[89] = current_func(out[89],in[89]) ; \ + case 89: out[88] = current_func(out[88],in[88]) ; \ + case 88: out[87] = current_func(out[87],in[87]) ; \ + case 87: out[86] = current_func(out[86],in[86]) ; \ + case 86: out[85] = current_func(out[85],in[85]) ; \ + case 85: out[84] = current_func(out[84],in[84]) ; \ + case 84: out[83] = current_func(out[83],in[83]) ; \ + 
case 83: out[82] = current_func(out[82],in[82]) ; \ + case 82: out[81] = current_func(out[81],in[81]) ; \ + case 81: out[80] = current_func(out[80],in[80]) ; \ + case 80: out[79] = current_func(out[79],in[79]) ; \ + case 79: out[78] = current_func(out[78],in[78]) ; \ + case 78: out[77] = current_func(out[77],in[77]) ; \ + case 77: out[76] = current_func(out[76],in[76]) ; \ + case 76: out[75] = current_func(out[75],in[75]) ; \ + case 75: out[74] = current_func(out[74],in[74]) ; \ + case 74: out[73] = current_func(out[73],in[73]) ; \ + case 73: out[72] = current_func(out[72],in[72]) ; \ + case 72: out[71] = current_func(out[71],in[71]) ; \ + case 71: out[70] = current_func(out[70],in[70]) ; \ + case 70: out[69] = current_func(out[69],in[69]) ; \ + case 69: out[68] = current_func(out[68],in[68]) ; \ + case 68: out[67] = current_func(out[67],in[67]) ; \ + case 67: out[66] = current_func(out[66],in[66]) ; \ + case 66: out[65] = current_func(out[65],in[65]) ; \ + case 65: out[64] = current_func(out[64],in[64]) ; \ + case 64: out[63] = current_func(out[63],in[63]) ; \ + case 63: out[62] = current_func(out[62],in[62]) ; \ + case 62: out[61] = current_func(out[61],in[61]) ; \ + case 61: out[60] = current_func(out[60],in[60]) ; \ + case 60: out[59] = current_func(out[59],in[59]) ; \ + case 59: out[58] = current_func(out[58],in[58]) ; \ + case 58: out[57] = current_func(out[57],in[57]) ; \ + case 57: out[56] = current_func(out[56],in[56]) ; \ + case 56: out[55] = current_func(out[55],in[55]) ; \ + case 55: out[54] = current_func(out[54],in[54]) ; \ + case 54: out[53] = current_func(out[53],in[53]) ; \ + case 53: out[52] = current_func(out[52],in[52]) ; \ + case 52: out[51] = current_func(out[51],in[51]) ; \ + case 51: out[50] = current_func(out[50],in[50]) ; \ + case 50: out[49] = current_func(out[49],in[49]) ; \ + case 49: out[48] = current_func(out[48],in[48]) ; \ + case 48: out[47] = current_func(out[47],in[47]) ; \ + case 47: out[46] = current_func(out[46],in[46]) ; \ + 
case 46: out[45] = current_func(out[45],in[45]) ; \ + case 45: out[44] = current_func(out[44],in[44]) ; \ + case 44: out[43] = current_func(out[43],in[43]) ; \ + case 43: out[42] = current_func(out[42],in[42]) ; \ + case 42: out[41] = current_func(out[41],in[41]) ; \ + case 41: out[40] = current_func(out[40],in[40]) ; \ + case 40: out[39] = current_func(out[39],in[39]) ; \ + case 39: out[38] = current_func(out[38],in[38]) ; \ + case 38: out[37] = current_func(out[37],in[37]) ; \ + case 37: out[36] = current_func(out[36],in[36]) ; \ + case 36: out[35] = current_func(out[35],in[35]) ; \ + case 35: out[34] = current_func(out[34],in[34]) ; \ + case 34: out[33] = current_func(out[33],in[33]) ; \ + case 33: out[32] = current_func(out[32],in[32]) ; \ + case 32: out[31] = current_func(out[31],in[31]) ; \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 
9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ } \ } /************************************************************************* * Max *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) OP_SVE_FUNC(max, b, 8, int8_t, max) OP_SVE_FUNC(max, b, 8, uint8_t, max) OP_SVE_FUNC(max, h, 16, int16_t, max) @@ -89,6 +344,8 @@ /************************************************************************* * Min *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) OP_SVE_FUNC(min, b, 8, int8_t, min) OP_SVE_FUNC(min, b, 8, uint8_t, min) OP_SVE_FUNC(min, h, 16, int16_t, min) @@ -110,6 +367,8 @@ /************************************************************************* * Sum ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) OP_SVE_FUNC(sum, b, 8, int8_t, add) OP_SVE_FUNC(sum, b, 8, uint8_t, add) OP_SVE_FUNC(sum, h, 16, int16_t, add) @@ -131,6 +390,8 @@ /************************************************************************* * Product *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) OP_SVE_FUNC(prod, b, 8, int8_t, mul) OP_SVE_FUNC(prod, b, 8, uint8_t, mul) OP_SVE_FUNC(prod, h, 16, int16_t, mul) @@ -138,7 +399,7 @@ OP_SVE_FUNC(prod, w, 32, int32_t, mul) OP_SVE_FUNC(prod, w, 32, uint32_t, mul) OP_SVE_FUNC(prod, d, 64, int64_t, mul) -OP_SVE_FUNC(prod, d, 64, uint64_t, mul) + OP_SVE_FUNC(prod, d, 64, uint64_t, mul) /* Floating point */ #if defined(HAVE_SHORT_FLOAT) @@ -152,6 +413,8 @@ OP_SVE_FUNC(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) OP_SVE_FUNC(band, b, 8, int8_t, and) OP_SVE_FUNC(band, b, 8, uint8_t, and) OP_SVE_FUNC(band, h, 16, int16_t, and) @@ -164,6 +427,8 @@ OP_SVE_FUNC(band, d, 64, uint64_t, and) /************************************************************************* * Bitwise OR *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) OP_SVE_FUNC(bor, b, 8, int8_t, orr) OP_SVE_FUNC(bor, b, 8, uint8_t, orr) OP_SVE_FUNC(bor, h, 16, int16_t, orr) @@ -176,6 +441,8 @@ OP_SVE_FUNC(bor, d, 64, uint64_t, orr) 
/************************************************************************* * Bitwise XOR *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) OP_SVE_FUNC(bxor, b, 8, int8_t, eor) OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) OP_SVE_FUNC(bxor, h, 16, int16_t, eor) @@ -430,7 +697,7 @@ ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMP [OMPI_OP_BASE_FORTRAN_REPLACE] = { /* (MPI_ACCUMULATE is handled differently than the other reductions, so just zero out its function - impementations here to ensure that users don't invoke + implementations here to ensure that users don't invoke MPI_REPLACE with any reduction operations other than ACCUMULATE) */ NULL, @@ -493,7 +760,7 @@ ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN [OMPI_OP_BASE_FORTRAN_REPLACE] = { /* MPI_ACCUMULATE is handled differently than the other reductions, so just zero out its function - impementations here to ensure that users don't invoke + implementations here to ensure that users don't invoke MPI_REPLACE with any reduction operations other than ACCUMULATE */ NULL, diff --git a/test/datatype/correctness_check.sh b/test/datatype/correctness_check.sh index 4ac4f08cba0..cf9f09a801e 100644 --- a/test/datatype/correctness_check.sh +++ b/test/datatype/correctness_check.sh @@ -1,5 +1,5 @@ -echo "ompi version with AVX512 -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. args4 operation" -echo "/home/zhongdong/opt/git/avx512_reduction/bin/mpirun -mca op_sve_hardware_available 0 -mca op_avx_hardware_available 0 -np 1 /home/zhongdong/Downloads/git/ARM/arm_sve_reduction/test/datatype/Reduce_local_float 1048576 i 8 max" +echo "ompi version with SVE -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. 
args4 operation" +echo "your_path/mpirun -mca op sve -mca op_sve_hardware_available 1 -np 1 /your_test_path/Reduce_local_float 1048576 i 8 max" Orange= "\033[0;33m" Blue= '\033[0;34m' @@ -7,46 +7,53 @@ Purple= '\033[0;35m' NC='\033[0m' -echo "=========Integer type all operations & all sizes========" -echo "" -for op in max min sum mul band bor bxor +# test all vector size +for vector_len in 128 256 512 1024 2048 do + + echo "=========Integer type all operations & all sizes========" echo "" - echo "===Operation $op test===" - for size in 8 16 32 64 + for op in max min sum mul band bor bxor do - echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N " - /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 i $size $op - echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) < Total_num_bits < 512*N " - echo -e "Test \e[1;35m duff device code \e[m 512*N < Total_num_bits < 512*N + 256 " + echo "" + echo "===Operation $op test===" + for size in 8 16 32 64 + do + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 i $size $op + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient 
libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 i $size $op + done done -done - -echo "=======Float type all operations========" -echo "" -echo -e "Test \e[1;33m __mm512 instruction for loop \e[m Total_num_bits = 512*N " -for op in max min sum mul -do - /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 f 32 $op -done -echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) < Total_num_bits < 512*N " -## 28= 16+8 + 4 + echo "=======Float type all operations========" + echo "" + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 f 32 $op + done -echo -e "Test \e[1;35m duff device code \e[m 512*N < Total_num_bits < 512*N + 256 " -##40=16+4 + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 f 32 $op + done -echo "========Double type all operations=========" -echo "" -echo -e "Test \e[1;33m __mm512 instruction 
for loop \e[m Total_num_bits = 512*N " -for op in max min sum mul -do - /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 d 64 $op -done + echo "========Double type all operations=========" + echo "" -echo -e "Test \e[1;34m __mm256 instruction for loop$ \e[m (512*(N-1) + 256) < Total_num_bits < 512*N " -## 20= 8 +8 + 3 + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 d 64 $op + done -echo -e "Test \e[1;35m duff device code \e[m 512*N < Total_num_bits < 512*N + 256 " -##12=8+2 + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 d 64 $op + done +done +##echo -e "Test \e[1;35m duff device code \e[m 2048*N < Total_num_bits < 2048*N + 256 " From 0914f2aa9ef9dff34897bf49eb24e4d710d12061 Mon Sep 17 00:00:00 2001 From: dong zhong Date: Wed, 11 Dec 2019 00:25:32 -0500 Subject: [PATCH 06/13] Add test script and plot script Signed-off-by: dong zhong --- test/datatype/Reduce_uint8.c | 62 ++++++++++++++++++++++ 
test/datatype/SVE_MPI_Op.py | 91 +++++++++++++++++++++++++++++++++ test/datatype/sve_uint8_test.sh | 21 ++++++++ 3 files changed, 174 insertions(+) create mode 100644 test/datatype/Reduce_uint8.c create mode 100644 test/datatype/SVE_MPI_Op.py create mode 100644 test/datatype/sve_uint8_test.sh diff --git a/test/datatype/Reduce_uint8.c b/test/datatype/Reduce_uint8.c new file mode 100644 index 00000000000..35570d8fe23 --- /dev/null +++ b/test/datatype/Reduce_uint8.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" + +#define ARRAYSIZE 32*1024*1024 + +int8_t in_uint8[ARRAYSIZE]; +int8_t inout_uint8[ARRAYSIZE]; +int8_t inout_uint8_for_check[ARRAYSIZE]; + +int main(int argc, char **argv) { + + char *num_elem = argv[1]; + int count = atoi(num_elem); + char *type = argv[2]; + char *elem_size = argv[3]; + int elem_size1 = atoi(elem_size); + char *op = argv[4]; + + int i; + + for (i=0; i Date: Wed, 11 Dec 2019 00:51:37 -0500 Subject: [PATCH 07/13] Add test README Signed-off-by: dong zhong --- test/datatype/HOW_TO_TEST_README.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 test/datatype/HOW_TO_TEST_README.txt diff --git a/test/datatype/HOW_TO_TEST_README.txt b/test/datatype/HOW_TO_TEST_README.txt new file mode 100644 index 00000000000..4d6025c95ee --- /dev/null +++ b/test/datatype/HOW_TO_TEST_README.txt @@ -0,0 +1,17 @@ +(1) Reduce_uint8.c : test code for MPI_SUM operation with type uint8. + compile as : $path/mpicc -march=armv8-a+sve -O3 -o Reduce_uint8 Reduce_uint8.c  +(2) sve_uint8_test.sh: A shell script that will generate results with time information for MPI_SUM operation with different message size.  +(3) SVE_MPI_Op.py: A python script which you can use to generate plot. + +ALL YOU NEED TO DO IS: +(1) Change the path of your mpirun and test binary in sve_uint8_test.sh. 
+(2) Run sve_uint8_test.sh + This will generate 3 types of output files with names + a. sve-sum-vectorlength.txt ##MPI_SUM with SVE-enabled operation + b. no-sve-sum-vectorlength.txt ##MPI_SUM without SVE-enabled operation + c. sve-cpy-vectorlength.txt ## memcpy +(3) run python scripts as + $python SVE_MPI_Op.py sve-sum-vectorlength.txt no-sve-sum-vectorlength.txt + make sure to put sve-sum-vectorlength.txt before no-sve-sum-vectorlength.txt + example : SVE_MPI_Op.py sve-sum-128.txt no-sve-sum-128.txt + From 86f3965b57a040a7d0b7d6cf607eb11f74831525 Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Wed, 11 Dec 2019 01:28:52 -0500 Subject: [PATCH 08/13] Change to 30 runs --- test/datatype/sve_uint8_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/datatype/sve_uint8_test.sh b/test/datatype/sve_uint8_test.sh index b7f857db157..ed0cc151592 100644 --- a/test/datatype/sve_uint8_test.sh +++ b/test/datatype/sve_uint8_test.sh @@ -9,7 +9,7 @@ do echo "" echo "" echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " - for (( i=1; i<11; i++ )) + for (( i=1; i<31; i++ )) do for val in 1024 4096 16384 65536 262144 1048576 4194304 16777216 33554432 do From 72195f8cd1c236ece32f889f4e7f3646ddfbc43c Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 14:27:42 -0500 Subject: [PATCH 09/13] update makefile --- ompi/mca/op/Makefile.am | 4 +++- ompi/mca/op/arm_sve_op/Makefile.am | 21 +++++++++------------ test/datatype/Makefile.am | 8 +++++++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..e942d1a8c21 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. 
-# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/arm_sve_op/Makefile.am b/ompi/mca/op/arm_sve_op/Makefile.am index 9b2aca888a4..7a0d2162581 100644 --- a/ompi/mca/op/arm_sve_op/Makefile.am +++ b/ompi/mca/op/arm_sve_op/Makefile.am @@ -22,6 +22,7 @@ sources = \ op_sve.h \ op_sve_component.c \ + op_sve_functions.h \ op_sve_functions.c # Open MPI components can be compiled two ways: @@ -41,15 +42,11 @@ sources = \ # which way this component should be built. if MCA_BUILD_ompi_op_arm_sve_op_DSO -lib = -lib_sources = -component = mca_op_sve.la -component_sources = $(sources) +component_noinst = +component_install = mca_op_sve.la else -lib = libmca_op_sve.la -lib_sources = $(sources) -component = -component_sources = +component_install = +component_noinst = component_noinst endif # Specific information for DSO builds. @@ -58,8 +55,8 @@ endif # $prefix/lib/openmpi). mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_op_sve_la_SOURCES = $(component_sources) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_sve_la_SOURCES = $(sources) mca_op_sve_la_LDFLAGS = -module -avoid-version mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la @@ -68,6 +65,6 @@ mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la # Note that we *must* "noinst"; the upper-layer Makefile.am's will # slurp in the resulting .la library into libmpi. 
-noinst_LTLIBRARIES = $(lib) -libmca_op_sve_la_SOURCES = $(lib_sources) +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_sve_la_SOURCES = $(sources) libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 4366724a523..19c0d6489b3 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -16,7 +16,7 @@ if PROJECT_OMPI MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data - MPI_CHECKS = to_self + MPI_CHECKS = to_self reduce_local endif TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS) @@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) unpack_hetero_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +reduce_local_SOURCES = reduce_local.c +reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +reduce_local_LDADD = \ + $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + distclean: rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile From f632d687907c08584a9d2d5c29183df648698f6b Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 16:57:17 -0500 Subject: [PATCH 10/13] rename sve folder --- ompi/mca/op/{arm_sve_op => sve}/Makefile.am | 2 +- ompi/mca/op/sve/configure.m4 | 21 +++++++++++++++++++ ompi/mca/op/{arm_sve_op => sve}/op_sve.h | 0 .../op/{arm_sve_op => sve}/op_sve_component.c | 4 ++-- .../op/{arm_sve_op => sve}/op_sve_functions.c | 4 ++-- .../op/{arm_sve_op => sve}/op_sve_functions.h | 2 +- 6 files changed, 27 insertions(+), 6 deletions(-) rename ompi/mca/op/{arm_sve_op => sve}/Makefile.am (98%) create mode 100644 ompi/mca/op/sve/configure.m4 rename ompi/mca/op/{arm_sve_op => sve}/op_sve.h (100%) rename ompi/mca/op/{arm_sve_op => sve}/op_sve_component.c (98%) rename ompi/mca/op/{arm_sve_op => sve}/op_sve_functions.c (99%) rename ompi/mca/op/{arm_sve_op => sve}/op_sve_functions.h (94%) diff --git 
a/ompi/mca/op/arm_sve_op/Makefile.am b/ompi/mca/op/sve/Makefile.am similarity index 98% rename from ompi/mca/op/arm_sve_op/Makefile.am rename to ompi/mca/op/sve/Makefile.am index 7a0d2162581..1e70e2fead8 100644 --- a/ompi/mca/op/arm_sve_op/Makefile.am +++ b/ompi/mca/op/sve/Makefile.am @@ -41,7 +41,7 @@ sources = \ # MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate # which way this component should be built. -if MCA_BUILD_ompi_op_arm_sve_op_DSO +if MCA_BUILD_ompi_op_sve_DSO component_noinst = component_install = mca_op_sve.la else diff --git a/ompi/mca/op/sve/configure.m4 b/ompi/mca/op/sve/configure.m4 new file mode 100644 index 00000000000..9f71d56c951 --- /dev/null +++ b/ompi/mca/op/sve/configure.m4 @@ -0,0 +1,21 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_sve_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. 
+AC_DEFUN([MCA_ompi_op_sve_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/sve/Makefile]) + [$1], +])dnl diff --git a/ompi/mca/op/arm_sve_op/op_sve.h b/ompi/mca/op/sve/op_sve.h similarity index 100% rename from ompi/mca/op/arm_sve_op/op_sve.h rename to ompi/mca/op/sve/op_sve.h diff --git a/ompi/mca/op/arm_sve_op/op_sve_component.c b/ompi/mca/op/sve/op_sve_component.c similarity index 98% rename from ompi/mca/op/arm_sve_op/op_sve_component.c rename to ompi/mca/op/sve/op_sve_component.c index df8281e5ba1..16393aefe23 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_component.c +++ b/ompi/mca/op/sve/op_sve_component.c @@ -26,8 +26,8 @@ #include "ompi/op/op.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" -#include "ompi/mca/op/arm_sve_op/op_sve.h" -#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" static int sve_component_open(void); static int sve_component_close(void); diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c similarity index 99% rename from ompi/mca/op/arm_sve_op/op_sve_functions.c rename to ompi/mca/op/sve/op_sve_functions.c index c9e3bf7b137..265f41d9988 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.c +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -22,8 +22,8 @@ #include "ompi/op/op.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" -#include "ompi/mca/op/arm_sve_op/op_sve.h" -#include "ompi/mca/op/arm_sve_op/op_sve_functions.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" #ifdef __ARM_FEATURE_SVE #include diff --git a/ompi/mca/op/arm_sve_op/op_sve_functions.h b/ompi/mca/op/sve/op_sve_functions.h similarity index 94% rename from ompi/mca/op/arm_sve_op/op_sve_functions.h rename to ompi/mca/op/sve/op_sve_functions.h index d5e050afb21..00db651cfac 100644 --- a/ompi/mca/op/arm_sve_op/op_sve_functions.h +++ b/ompi/mca/op/sve/op_sve_functions.h @@ -19,7 +19,7 @@ #endif 
#include "ompi/mca/op/op.h" -#include "ompi/mca/op/arm_sve_op/op_sve.h" +#include "ompi/mca/op/sve/op_sve.h" BEGIN_C_DECLS From 3d9a27926f04f6fa627c7cdc5ed2ef6025dbdadd Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 16:27:03 -0500 Subject: [PATCH 11/13] remove short_float, rewrite duff device Signed-off-by: Dong Zhong --- ompi/mca/op/sve/op_sve_functions.c | 364 ++++------------------------- 1 file changed, 45 insertions(+), 319 deletions(-) diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c index 265f41d9988..1b14affe336 100644 --- a/ompi/mca/op/sve/op_sve_functions.c +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -56,265 +56,45 @@ out += types_per_step; \ } \ \ - if (left_over !=0){ \ - switch(left_over) { \ - case 256: out[255] = current_func(out[255],in[255]) ; \ - case 255: out[254] = current_func(out[254],in[254]) ; \ - case 254: out[253] = current_func(out[253],in[253]) ; \ - case 253: out[252] = current_func(out[252],in[252]) ; \ - case 252: out[251] = current_func(out[251],in[251]) ; \ - case 251: out[250] = current_func(out[250],in[250]) ; \ - case 250: out[249] = current_func(out[249],in[249]) ; \ - case 249: out[248] = current_func(out[248],in[248]) ; \ - case 248: out[247] = current_func(out[247],in[247]) ; \ - case 247: out[246] = current_func(out[246],in[246]) ; \ - case 246: out[245] = current_func(out[245],in[245]) ; \ - case 245: out[244] = current_func(out[244],in[244]) ; \ - case 244: out[243] = current_func(out[243],in[243]) ; \ - case 243: out[242] = current_func(out[242],in[242]) ; \ - case 242: out[241] = current_func(out[241],in[241]) ; \ - case 241: out[240] = current_func(out[240],in[240]) ; \ - case 240: out[239] = current_func(out[239],in[239]) ; \ - case 239: out[238] = current_func(out[238],in[238]) ; \ - case 238: out[237] = current_func(out[237],in[237]) ; \ - case 237: out[236] = current_func(out[236],in[236]) ; \ - case 236: out[235] = current_func(out[235],in[235]) ; \ - case 235: 
out[234] = current_func(out[234],in[234]) ; \ - case 234: out[233] = current_func(out[233],in[233]) ; \ - case 233: out[232] = current_func(out[232],in[232]) ; \ - case 232: out[231] = current_func(out[231],in[231]) ; \ - case 231: out[230] = current_func(out[230],in[230]) ; \ - case 230: out[229] = current_func(out[229],in[229]) ; \ - case 229: out[228] = current_func(out[228],in[228]) ; \ - case 228: out[227] = current_func(out[227],in[227]) ; \ - case 227: out[226] = current_func(out[226],in[226]) ; \ - case 226: out[225] = current_func(out[225],in[225]) ; \ - case 225: out[224] = current_func(out[224],in[224]) ; \ - case 224: out[223] = current_func(out[223],in[223]) ; \ - case 223: out[222] = current_func(out[222],in[222]) ; \ - case 222: out[221] = current_func(out[221],in[221]) ; \ - case 221: out[220] = current_func(out[220],in[220]) ; \ - case 220: out[219] = current_func(out[219],in[219]) ; \ - case 219: out[218] = current_func(out[218],in[218]) ; \ - case 218: out[217] = current_func(out[217],in[217]) ; \ - case 217: out[216] = current_func(out[216],in[216]) ; \ - case 216: out[215] = current_func(out[215],in[215]) ; \ - case 215: out[214] = current_func(out[214],in[214]) ; \ - case 214: out[213] = current_func(out[213],in[213]) ; \ - case 213: out[212] = current_func(out[212],in[212]) ; \ - case 212: out[211] = current_func(out[211],in[211]) ; \ - case 211: out[210] = current_func(out[210],in[210]) ; \ - case 210: out[209] = current_func(out[209],in[209]) ; \ - case 209: out[208] = current_func(out[208],in[208]) ; \ - case 208: out[207] = current_func(out[207],in[207]) ; \ - case 207: out[206] = current_func(out[206],in[206]) ; \ - case 206: out[205] = current_func(out[205],in[205]) ; \ - case 205: out[204] = current_func(out[204],in[204]) ; \ - case 204: out[203] = current_func(out[203],in[203]) ; \ - case 203: out[202] = current_func(out[202],in[202]) ; \ - case 202: out[201] = current_func(out[201],in[201]) ; \ - case 201: out[200] = 
current_func(out[200],in[200]) ; \ - case 200: out[199] = current_func(out[199],in[199]) ; \ - case 199: out[198] = current_func(out[198],in[198]) ; \ - case 198: out[197] = current_func(out[197],in[197]) ; \ - case 197: out[196] = current_func(out[196],in[196]) ; \ - case 196: out[195] = current_func(out[195],in[195]) ; \ - case 195: out[194] = current_func(out[194],in[194]) ; \ - case 194: out[193] = current_func(out[193],in[193]) ; \ - case 193: out[192] = current_func(out[192],in[192]) ; \ - case 192: out[191] = current_func(out[191],in[191]) ; \ - case 191: out[190] = current_func(out[190],in[190]) ; \ - case 190: out[189] = current_func(out[189],in[189]) ; \ - case 189: out[188] = current_func(out[188],in[188]) ; \ - case 188: out[187] = current_func(out[187],in[187]) ; \ - case 187: out[186] = current_func(out[186],in[186]) ; \ - case 186: out[185] = current_func(out[185],in[185]) ; \ - case 185: out[184] = current_func(out[184],in[184]) ; \ - case 184: out[183] = current_func(out[183],in[183]) ; \ - case 183: out[182] = current_func(out[182],in[182]) ; \ - case 182: out[181] = current_func(out[181],in[181]) ; \ - case 181: out[180] = current_func(out[180],in[180]) ; \ - case 180: out[179] = current_func(out[179],in[179]) ; \ - case 179: out[178] = current_func(out[178],in[178]) ; \ - case 178: out[177] = current_func(out[177],in[177]) ; \ - case 177: out[176] = current_func(out[176],in[176]) ; \ - case 176: out[175] = current_func(out[175],in[175]) ; \ - case 175: out[174] = current_func(out[174],in[174]) ; \ - case 174: out[173] = current_func(out[173],in[173]) ; \ - case 173: out[172] = current_func(out[172],in[172]) ; \ - case 172: out[171] = current_func(out[171],in[171]) ; \ - case 171: out[170] = current_func(out[170],in[170]) ; \ - case 170: out[169] = current_func(out[169],in[169]) ; \ - case 169: out[168] = current_func(out[168],in[168]) ; \ - case 168: out[167] = current_func(out[167],in[167]) ; \ - case 167: out[166] = 
current_func(out[166],in[166]) ; \ - case 166: out[165] = current_func(out[165],in[165]) ; \ - case 165: out[164] = current_func(out[164],in[164]) ; \ - case 164: out[163] = current_func(out[163],in[163]) ; \ - case 163: out[162] = current_func(out[162],in[162]) ; \ - case 162: out[161] = current_func(out[161],in[161]) ; \ - case 161: out[160] = current_func(out[160],in[160]) ; \ - case 160: out[159] = current_func(out[159],in[159]) ; \ - case 159: out[158] = current_func(out[158],in[158]) ; \ - case 158: out[157] = current_func(out[157],in[157]) ; \ - case 157: out[156] = current_func(out[156],in[156]) ; \ - case 156: out[155] = current_func(out[155],in[155]) ; \ - case 155: out[154] = current_func(out[154],in[154]) ; \ - case 154: out[153] = current_func(out[153],in[153]) ; \ - case 153: out[152] = current_func(out[152],in[152]) ; \ - case 152: out[151] = current_func(out[151],in[151]) ; \ - case 151: out[150] = current_func(out[150],in[150]) ; \ - case 150: out[149] = current_func(out[149],in[149]) ; \ - case 149: out[148] = current_func(out[148],in[148]) ; \ - case 148: out[147] = current_func(out[147],in[147]) ; \ - case 147: out[146] = current_func(out[146],in[146]) ; \ - case 146: out[145] = current_func(out[145],in[145]) ; \ - case 145: out[144] = current_func(out[144],in[144]) ; \ - case 144: out[143] = current_func(out[143],in[143]) ; \ - case 143: out[142] = current_func(out[142],in[142]) ; \ - case 142: out[141] = current_func(out[141],in[141]) ; \ - case 141: out[140] = current_func(out[140],in[140]) ; \ - case 140: out[139] = current_func(out[139],in[139]) ; \ - case 139: out[138] = current_func(out[138],in[138]) ; \ - case 138: out[137] = current_func(out[137],in[137]) ; \ - case 137: out[136] = current_func(out[136],in[136]) ; \ - case 136: out[135] = current_func(out[135],in[135]) ; \ - case 135: out[134] = current_func(out[134],in[134]) ; \ - case 134: out[133] = current_func(out[133],in[133]) ; \ - case 133: out[132] = 
current_func(out[132],in[132]) ; \ - case 132: out[131] = current_func(out[131],in[131]) ; \ - case 131: out[130] = current_func(out[130],in[130]) ; \ - case 130: out[129] = current_func(out[129],in[129]) ; \ - case 129: out[128] = current_func(out[128],in[128]) ; \ - case 128: out[127] = current_func(out[127],in[127]) ; \ - case 127: out[126] = current_func(out[126],in[126]) ; \ - case 126: out[125] = current_func(out[125],in[125]) ; \ - case 125: out[124] = current_func(out[124],in[124]) ; \ - case 124: out[123] = current_func(out[123],in[123]) ; \ - case 123: out[122] = current_func(out[122],in[122]) ; \ - case 122: out[121] = current_func(out[121],in[121]) ; \ - case 121: out[120] = current_func(out[120],in[120]) ; \ - case 120: out[119] = current_func(out[119],in[119]) ; \ - case 119: out[118] = current_func(out[118],in[118]) ; \ - case 118: out[117] = current_func(out[117],in[117]) ; \ - case 117: out[116] = current_func(out[116],in[116]) ; \ - case 116: out[115] = current_func(out[115],in[115]) ; \ - case 115: out[114] = current_func(out[114],in[114]) ; \ - case 114: out[113] = current_func(out[113],in[113]) ; \ - case 113: out[112] = current_func(out[112],in[112]) ; \ - case 112: out[111] = current_func(out[111],in[111]) ; \ - case 111: out[110] = current_func(out[110],in[110]) ; \ - case 110: out[109] = current_func(out[109],in[109]) ; \ - case 109: out[108] = current_func(out[108],in[108]) ; \ - case 108: out[107] = current_func(out[107],in[107]) ; \ - case 107: out[106] = current_func(out[106],in[106]) ; \ - case 106: out[105] = current_func(out[105],in[105]) ; \ - case 105: out[104] = current_func(out[104],in[104]) ; \ - case 104: out[103] = current_func(out[103],in[103]) ; \ - case 103: out[102] = current_func(out[102],in[102]) ; \ - case 102: out[101] = current_func(out[101],in[101]) ; \ - case 101: out[100] = current_func(out[100],in[100]) ; \ - case 100: out[99] = current_func(out[99],in[99]) ; \ - case 99: out[98] = current_func(out[98],in[98]) ; \ 
- case 98: out[97] = current_func(out[97],in[97]) ; \ - case 97: out[96] = current_func(out[96],in[96]) ; \ - case 96: out[95] = current_func(out[95],in[95]) ; \ - case 95: out[94] = current_func(out[94],in[94]) ; \ - case 94: out[93] = current_func(out[93],in[93]) ; \ - case 93: out[92] = current_func(out[92],in[92]) ; \ - case 92: out[91] = current_func(out[91],in[91]) ; \ - case 91: out[90] = current_func(out[90],in[90]) ; \ - case 90: out[89] = current_func(out[89],in[89]) ; \ - case 89: out[88] = current_func(out[88],in[88]) ; \ - case 88: out[87] = current_func(out[87],in[87]) ; \ - case 87: out[86] = current_func(out[86],in[86]) ; \ - case 86: out[85] = current_func(out[85],in[85]) ; \ - case 85: out[84] = current_func(out[84],in[84]) ; \ - case 84: out[83] = current_func(out[83],in[83]) ; \ - case 83: out[82] = current_func(out[82],in[82]) ; \ - case 82: out[81] = current_func(out[81],in[81]) ; \ - case 81: out[80] = current_func(out[80],in[80]) ; \ - case 80: out[79] = current_func(out[79],in[79]) ; \ - case 79: out[78] = current_func(out[78],in[78]) ; \ - case 78: out[77] = current_func(out[77],in[77]) ; \ - case 77: out[76] = current_func(out[76],in[76]) ; \ - case 76: out[75] = current_func(out[75],in[75]) ; \ - case 75: out[74] = current_func(out[74],in[74]) ; \ - case 74: out[73] = current_func(out[73],in[73]) ; \ - case 73: out[72] = current_func(out[72],in[72]) ; \ - case 72: out[71] = current_func(out[71],in[71]) ; \ - case 71: out[70] = current_func(out[70],in[70]) ; \ - case 70: out[69] = current_func(out[69],in[69]) ; \ - case 69: out[68] = current_func(out[68],in[68]) ; \ - case 68: out[67] = current_func(out[67],in[67]) ; \ - case 67: out[66] = current_func(out[66],in[66]) ; \ - case 66: out[65] = current_func(out[65],in[65]) ; \ - case 65: out[64] = current_func(out[64],in[64]) ; \ - case 64: out[63] = current_func(out[63],in[63]) ; \ - case 63: out[62] = current_func(out[62],in[62]) ; \ - case 62: out[61] = current_func(out[61],in[61]) ; \ - 
case 61: out[60] = current_func(out[60],in[60]) ; \ - case 60: out[59] = current_func(out[59],in[59]) ; \ - case 59: out[58] = current_func(out[58],in[58]) ; \ - case 58: out[57] = current_func(out[57],in[57]) ; \ - case 57: out[56] = current_func(out[56],in[56]) ; \ - case 56: out[55] = current_func(out[55],in[55]) ; \ - case 55: out[54] = current_func(out[54],in[54]) ; \ - case 54: out[53] = current_func(out[53],in[53]) ; \ - case 53: out[52] = current_func(out[52],in[52]) ; \ - case 52: out[51] = current_func(out[51],in[51]) ; \ - case 51: out[50] = current_func(out[50],in[50]) ; \ - case 50: out[49] = current_func(out[49],in[49]) ; \ - case 49: out[48] = current_func(out[48],in[48]) ; \ - case 48: out[47] = current_func(out[47],in[47]) ; \ - case 47: out[46] = current_func(out[46],in[46]) ; \ - case 46: out[45] = current_func(out[45],in[45]) ; \ - case 45: out[44] = current_func(out[44],in[44]) ; \ - case 44: out[43] = current_func(out[43],in[43]) ; \ - case 43: out[42] = current_func(out[42],in[42]) ; \ - case 42: out[41] = current_func(out[41],in[41]) ; \ - case 41: out[40] = current_func(out[40],in[40]) ; \ - case 40: out[39] = current_func(out[39],in[39]) ; \ - case 39: out[38] = current_func(out[38],in[38]) ; \ - case 38: out[37] = current_func(out[37],in[37]) ; \ - case 37: out[36] = current_func(out[36],in[36]) ; \ - case 36: out[35] = current_func(out[35],in[35]) ; \ - case 35: out[34] = current_func(out[34],in[34]) ; \ - case 34: out[33] = current_func(out[33],in[33]) ; \ - case 33: out[32] = current_func(out[32],in[32]) ; \ - case 32: out[31] = current_func(out[31],in[31]) ; \ - case 31: out[30] = current_func(out[30],in[30]) ; \ - case 30: out[29] = current_func(out[29],in[29]) ; \ - case 29: out[28] = current_func(out[28],in[28]) ; \ - case 28: out[27] = current_func(out[27],in[27]) ; \ - case 27: out[26] = current_func(out[26],in[26]) ; \ - case 26: out[25] = current_func(out[25],in[25]) ; \ - case 25: out[24] = current_func(out[24],in[24]) ; \ - 
case 24: out[23] = current_func(out[23],in[23]) ; \ - case 23: out[22] = current_func(out[22],in[22]) ; \ - case 22: out[21] = current_func(out[21],in[21]) ; \ - case 21: out[20] = current_func(out[20],in[20]) ; \ - case 20: out[19] = current_func(out[19],in[19]) ; \ - case 19: out[18] = current_func(out[18],in[18]) ; \ - case 18: out[17] = current_func(out[17],in[17]) ; \ - case 17: out[16] = current_func(out[16],in[16]) ; \ - case 16: out[15] = current_func(out[15],in[15]) ; \ - case 15: out[14] = current_func(out[14],in[14]) ; \ - case 14: out[13] = current_func(out[13],in[13]) ; \ - case 13: out[12] = current_func(out[12],in[12]) ; \ - case 12: out[11] = current_func(out[11],in[11]) ; \ - case 11: out[10] = current_func(out[10],in[10]) ; \ - case 10: out[9] = current_func(out[9],in[9]) ; \ - case 9: out[8] = current_func(out[8],in[8]) ; \ - case 8: out[7] = current_func(out[7],in[7]) ; \ - case 7: out[6] = current_func(out[6],in[6]) ; \ - case 6: out[5] = current_func(out[5],in[5]) ; \ - case 5: out[4] = current_func(out[4],in[4]) ; \ - case 4: out[3] = current_func(out[3],in[3]) ; \ - case 3: out[2] = current_func(out[2],in[2]) ; \ - case 2: out[1] = current_func(out[1],in[1]) ; \ - case 1: out[0] = current_func(out[0],in[0]) ; \ - }\ + while( left_over > 0 ) { \ + int how_much = (left_over > 32) ? 
32 : left_over; \ + switch(left_over) { \ + case 32: out[31] = current_func(out[31],in[31]) ; \ + case 31: out[30] = current_func(out[30],in[30]) ; \ + case 30: out[29] = current_func(out[29],in[29]) ; \ + case 29: out[28] = current_func(out[28],in[28]) ; \ + case 28: out[27] = current_func(out[27],in[27]) ; \ + case 27: out[26] = current_func(out[26],in[26]) ; \ + case 26: out[25] = current_func(out[25],in[25]) ; \ + case 25: out[24] = current_func(out[24],in[24]) ; \ + case 24: out[23] = current_func(out[23],in[23]) ; \ + case 23: out[22] = current_func(out[22],in[22]) ; \ + case 22: out[21] = current_func(out[21],in[21]) ; \ + case 21: out[20] = current_func(out[20],in[20]) ; \ + case 20: out[19] = current_func(out[19],in[19]) ; \ + case 19: out[18] = current_func(out[18],in[18]) ; \ + case 18: out[17] = current_func(out[17],in[17]) ; \ + case 17: out[16] = current_func(out[16],in[16]) ; \ + case 16: out[15] = current_func(out[15],in[15]) ; \ + case 15: out[14] = current_func(out[14],in[14]) ; \ + case 14: out[13] = current_func(out[13],in[13]) ; \ + case 13: out[12] = current_func(out[12],in[12]) ; \ + case 12: out[11] = current_func(out[11],in[11]) ; \ + case 11: out[10] = current_func(out[10],in[10]) ; \ + case 10: out[9] = current_func(out[9],in[9]) ; \ + case 9: out[8] = current_func(out[8],in[8]) ; \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ } \ } @@ -332,12 +112,6 @@ OP_SVE_FUNC(max, d, 64, int64_t, max) OP_SVE_FUNC(max, d, 64, uint64_t, max) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(max, h, 16, float16_t, 
max) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(max, h, 16, float16_t, max) -#endif OP_SVE_FUNC(max, w, 32, float32_t, max) OP_SVE_FUNC(max, d, 64, float64_t, max) @@ -355,12 +129,6 @@ OP_SVE_FUNC(min, d, 64, int64_t, min) OP_SVE_FUNC(min, d, 64, uint64_t, min) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(min, h, 16, float16_t, min) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(min, h, 16, float16_t, min) -#endif OP_SVE_FUNC(min, w, 32, float32_t, min) OP_SVE_FUNC(min, d, 64, float64_t, min) @@ -378,12 +146,6 @@ OP_SVE_FUNC(sum, d, 64, int64_t, add) OP_SVE_FUNC(sum, d, 64, uint64_t, add) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC(sum, h, 16, float16_t, add) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC(sum, h, 16, float16_t, add) -#endif OP_SVE_FUNC(sum, w, 32, float32_t, add) OP_SVE_FUNC(sum, d, 64, float64_t, add) @@ -401,14 +163,8 @@ OP_SVE_FUNC(prod, d, 64, int64_t, mul) OP_SVE_FUNC(prod, d, 64, uint64_t, mul) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -OP_SVE_FUNC(prod, h, 16, float16_t, mul) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_SVE_FUNC(prod, h, 16, float16_t, mul) -#endif OP_SVE_FUNC(prod, w, 32, float32_t, mul) -OP_SVE_FUNC(prod, d, 64, float64_t, mul) + OP_SVE_FUNC(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND @@ -498,12 +254,6 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(max, h, 16, float16_t, max) -#endif OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) @@ -519,12 +269,6 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) - /* 
Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(min, h, 16, float16_t, min) -#endif OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) @@ -540,12 +284,6 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) - OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) - OP_SVE_FUNC_3BUFF(sum, h, 16, float16_t, add) -#endif OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) @@ -559,16 +297,10 @@ OP_SVE_FUNC(bxor, d, 64, uint64_t, eor) OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) -OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) - /* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_SVE_FUNC_3BUFF(prod, h, 16, float16_t, mul) -#endif OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) -OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) /************************************************************************* * Bitwise AND @@ -580,7 +312,7 @@ OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) -OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) /************************************************************************* * Bitwise OR @@ -592,7 +324,7 @@ OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) 
OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) -OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) /************************************************************************* * Bitwise XOR @@ -604,7 +336,7 @@ OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) -OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ @@ -619,16 +351,10 @@ OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) /** Floating point, including all the Fortran reals *********************/ -#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) -#define SHORT_FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float16_t -#else -#define SHORT_FLOAT(name, ftype) NULL -#endif #define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t #define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t #define FLOATING_POINT(name, ftype) \ - [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) From 6c3bde95b7cad83b28cc866c9dc8f0f56358c57d Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Fri, 21 Feb 2020 17:08:19 -0500 Subject: [PATCH 12/13] add reduce_local test code Signed-off-by: Dong Zhong --- test/datatype/reduce_local.c | 1149 ++++++++++++++++++++++++++++++++++ 1 file changed, 1149 insertions(+) create mode 100644 test/datatype/reduce_local.c diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c new file mode 100644 index 00000000000..b2fe9b352c3 --- /dev/null +++ b/test/datatype/reduce_local.c @@ -0,0 +1,1149 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2020 The University of Tennessee and The 
University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_FEATURE_SVE +#include +#endif /* __ARM_FEATURE_SVE */ + +#include "mpi.h" +#include "ompi/communicator/communicator.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/datatype/ompi_datatype.h" + +static void print_status(char* op, char* type, int correctness) +{ + if(correctness) + printf("%s %s [\033[1;32msuccess\033[0m]", op, type); + else + printf("%s %s [\033[1;31mfail\033[0m]", op, type); +} + +int main(int argc, char **argv) +{ + static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; + int count, elem_size, rank, size, len, provided, correctness = 1, i; + double tstart, tend; + char *type, *op; + + if(argc < 4 ) { + fprintf(stderr, + "Less arguments than expected (we need at least 3): \n" + " : [i, u, f, d]\n" + " ; [sum, max, min, bor, bxor, mul, band]\n"); + exit(-1); + } + count = atoi(argv[1]); + type = argv[2]; + elem_size = atoi(argv[3]); + op = argv[4]; + + if( count <= 0 ) { + printf("The number of elements should be positive\n"); + exit(-1); + } + if( (0 != (elem_size%8)) || (elem_size <= 0) || (elem_size > 64) ) { + printf("The element type should be 8, 16, 32 or 64\n"); + exit(-2); + } + + in_buf = malloc(count * sizeof(double)); + inout_buf = malloc(count * sizeof(double)); + inout_check_buf = malloc(count * sizeof(double)); + + ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); + + rank = ompi_comm_rank(MPI_COMM_WORLD); + size = ompi_comm_size(MPI_COMM_WORLD); + + if(*type=='i') { + if( 8 == elem_size ) { + int8_t *in_int8 = (int8_t*)in_buf, + *inout_int8 = (int8_t*)inout_buf, + *inout_int8_for_check = (int8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int8[i] = 5; + inout_int8[i] = inout_int8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + 
printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] + inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int8, in_int8, count, MPI_INT8_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != in_int8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] | inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s 
for %s: %d \n", "MPI_BXOR", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] ^ inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8,inout_int8,count, MPI_INT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (int8_t)(in_int8[i] * inout_int8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int8, inout_int8, count, MPI_INT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int8[i] != (in_int8[i] & inout_int8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + int16_t *in_int16 = (int16_t*)in_buf, + *inout_int16 = (int16_t*)inout_buf, + *inout_int16_for_check = (int16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int16[i] = 5; + inout_int16[i] = inout_int16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_int16[i] != 
(int16_t)(in_int16[i] + inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int16, in_int16, count, MPI_INT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != in_int16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] | inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] ^ 
inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (int16_t)(in_int16[i] * inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int16, inout_int16, count, MPI_INT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int16[i] != (in_int16[i] & inout_int16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", correctness); + } + } + if( 32 == elem_size ) { + int32_t *in_int32 = (int32_t*)in_buf, + *inout_int32 = (int32_t*)inout_buf, + *inout_int32_for_check = (int32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int32[i] = 5; + inout_int32[i] = inout_int32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] + inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + 
if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int32, in_int32, count, MPI_INT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != in_int32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] | inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (int32_t)(in_int32[i] * inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, 
"band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] & inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int32, inout_int32, count, MPI_INT32_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int32[i] != (in_int32[i] ^ inout_int32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_int64 = (int64_t*)in_buf, + *inout_int64 = (int64_t*)inout_buf, + *inout_int64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_int64[i] = 5; + inout_int64[i] = inout_int64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] + inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_MAX); + tend = 
MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_int64, in_int64, count, MPI_INT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != in_int64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] | inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] ^ inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64,inout_int64,count, MPI_INT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 
1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (int64_t)(in_int64[i] * inout_int64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_INT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_int64, inout_int64, count, MPI_INT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_int64[i] != (in_int64[i] & inout_int64_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + if(*type=='u') { + if( 8 == elem_size ) { + uint8_t *in_uint8 = (uint8_t*)in_buf, + *inout_uint8 = (uint8_t*)inout_buf, + *inout_uint8_for_check = (uint8_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint8[i] = 5; + inout_uint8[i] = inout_uint8_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (uint8_t)(in_uint8[i] + inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != inout_uint8_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + 
break; + } + } + print_status("MPI_MAX", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "min") ) { //intentionly reversed in and out + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != in_uint8[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] | inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] ^ inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8,inout_uint8,count, MPI_UINT8_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (int8_t)(in_uint8[i] * inout_uint8_for_check[i])) { + if( correctness ) + printf("First error at 
position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT8_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT8_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint8, inout_uint8, count, MPI_UINT8_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint8[i] != (in_uint8[i] & inout_uint8_for_check[i]) ) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT8_T", correctness); + } + } + if( 16 == elem_size ) { + uint16_t *in_uint16 = (uint16_t*)in_buf, + *inout_uint16 = (uint16_t*)inout_buf, + *inout_uint16_for_check = (uint16_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint16[i] = 5; + inout_uint16[i] = inout_uint16_for_check[i] = -3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_SUM); + tend = MPI_Wtime(); + for( i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] + inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != inout_uint16_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", 
"MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != in_uint16[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] | inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] ^ inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (uint16_t)(in_uint16[i] * inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT16_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local 
Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT16_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint16, inout_uint16, count, MPI_UINT16_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint16[i] != (in_uint16[i] & inout_uint16_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT16_T", correctness); + } + } + if( 32 == elem_size ) { + uint32_t *in_uint32 = (uint32_t*)in_buf, + *inout_uint32 = (uint32_t*)inout_buf, + *inout_uint32_for_check = (uint32_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint32[i] = 5; + inout_uint32[i] = inout_uint32_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] + inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != inout_uint32_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_MIN); + tend = MPI_Wtime(); + for( 
correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != in_uint32[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32,inout_uint32,count, MPI_UINT32_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] | inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (uint32_t)(in_uint32[i] * inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] & inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT32_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT32_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint32, inout_uint32, count, MPI_UINT32_T, 
MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint32[i] != (in_uint32[i] ^ inout_uint32_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT32_T", correctness); + } + } + if( 64 == elem_size ) { + int64_t *in_uint64 = (int64_t*)in_buf, + *inout_uint64 = (int64_t*)inout_buf, + *inout_uint64_for_check = (int64_t*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_uint64[i] = 5; + inout_uint64[i] = inout_uint64_for_check[i] = 3; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] + inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_uint64, in_uint64, count, MPI_UINT64_T, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != in_uint64[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + 
break; + } + } + print_status("MPI_MIN", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] | inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "bxor") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BXOR", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BXOR); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] ^ inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BXOR", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64,inout_uint64,count, MPI_UINT64_T, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (int64_t)(in_uint64[i] * inout_uint64_for_check[i])) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_INT64_T", correctness); + } + if( 0 == strcmp(op, "band") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_BAND", "MPI_UINT64_T", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_uint64, inout_uint64, count, MPI_UINT64_T, MPI_BAND); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_uint64[i] != (in_uint64[i] & inout_uint64_for_check[i]) ) { + if( correctness ) 
+ printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_BAND", "MPI_INT64_T", correctness); + } + } + } + + if(*type=='f') { + float *in_float = (float*)in_buf, + *inout_float = (float*)inout_buf, + *inout_float_for_check = (float*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_float[i] = 1000.0+1; + inout_float[i] = inout_float_for_check[i] = 100.0+2; + } + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != inout_float_for_check[i]+in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_float,in_float,count, MPI_FLOAT, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_FLOAT", correctness); + } + + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_FLOAT", count); + tstart = MPI_Wtime(); + 
MPI_Reduce_local(in_float, inout_float, count, MPI_FLOAT, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_float[i] != in_float[i] * inout_float_for_check[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_FLOAT", correctness); + } + } + + if(*type=='d') { + double *in_double = (double*)in_buf, + *inout_double = (double*)inout_buf, + *inout_double_for_check = (double*)inout_check_buf; + for( i = 0; i < count; i++ ) { + in_double[i] = 10.0+1; + inout_double[i] = inout_double_for_check[i] = 1.0+2; + } + + if( 0 == strcmp(op, "sum") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_SUM", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_SUM); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]+in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_SUM", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "max") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MAX", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_MAX); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MAX", "MPI_DOUBLE", correctness); + } + + if( 0 == strcmp(op, "min") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_MIN", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(inout_double, in_double, count, MPI_DOUBLE, MPI_MIN); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != in_double[i]) { + if( correctness ) + printf("First error at position 
%d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_MIN", "MPI_DOUBLE", correctness); + } + if( 0 == strcmp(op, "mul") ) { + printf("#Local Reduce %s for %s: %d \n", "MPI_PROD", "MPI_DOUBLE", count); + tstart = MPI_Wtime(); + MPI_Reduce_local(in_double, inout_double, count, MPI_DOUBLE, MPI_PROD); + tend = MPI_Wtime(); + for( correctness = 1, i = 0; i < count; i++ ) { + if(inout_double[i] != inout_double_for_check[i]*in_double[i]) { + if( correctness ) + printf("First error at position %d\n", i); + correctness = 0; + break; + } + } + print_status("MPI_PROD", "MPI_DOUBLE", correctness); + } + } + //tstart = MPI_Wtime(); + //memcpy(in_uint8,inout_uint8, count); + //memcpy(in_float, inout_float, count); + //memcpy(in_double, inout_double, count); + printf(" count %d time %.6f seconds\n",count, tend-tstart); + ompi_mpi_finalize(); + + free(in_buf); + free(inout_buf); + free(inout_check_buf); + + return correctness ? 0 : -1; +} + From b418d37886a16f7af0f70df610dde40ad30ac974 Mon Sep 17 00:00:00 2001 From: Dong Zhong Date: Mon, 24 Feb 2020 15:09:02 -0500 Subject: [PATCH 13/13] shrink duff device code Signed-off-by: Dong Zhong --- ompi/mca/op/sve/op_sve_functions.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c index 1b14affe336..4e86f5b2471 100644 --- a/ompi/mca/op/sve/op_sve_functions.c +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -57,32 +57,8 @@ } \ \ while( left_over > 0 ) { \ - int how_much = (left_over > 32) ? 32 : left_over; \ + int how_much = (left_over > 8) ? 
8 : left_over; \ switch(left_over) { \ - case 32: out[31] = current_func(out[31],in[31]) ; \ - case 31: out[30] = current_func(out[30],in[30]) ; \ - case 30: out[29] = current_func(out[29],in[29]) ; \ - case 29: out[28] = current_func(out[28],in[28]) ; \ - case 28: out[27] = current_func(out[27],in[27]) ; \ - case 27: out[26] = current_func(out[26],in[26]) ; \ - case 26: out[25] = current_func(out[25],in[25]) ; \ - case 25: out[24] = current_func(out[24],in[24]) ; \ - case 24: out[23] = current_func(out[23],in[23]) ; \ - case 23: out[22] = current_func(out[22],in[22]) ; \ - case 22: out[21] = current_func(out[21],in[21]) ; \ - case 21: out[20] = current_func(out[20],in[20]) ; \ - case 20: out[19] = current_func(out[19],in[19]) ; \ - case 19: out[18] = current_func(out[18],in[18]) ; \ - case 18: out[17] = current_func(out[17],in[17]) ; \ - case 17: out[16] = current_func(out[16],in[16]) ; \ - case 16: out[15] = current_func(out[15],in[15]) ; \ - case 15: out[14] = current_func(out[14],in[14]) ; \ - case 14: out[13] = current_func(out[13],in[13]) ; \ - case 13: out[12] = current_func(out[12],in[12]) ; \ - case 12: out[11] = current_func(out[11],in[11]) ; \ - case 11: out[10] = current_func(out[10],in[10]) ; \ - case 10: out[9] = current_func(out[9],in[9]) ; \ - case 9: out[8] = current_func(out[8],in[8]) ; \ case 8: out[7] = current_func(out[7],in[7]) ; \ case 7: out[6] = current_func(out[6],in[6]) ; \ case 6: out[5] = current_func(out[5],in[5]) ; \