diff --git a/ARM_SVE_README b/ARM_SVE_README new file mode 100644 index 00000000000..a4670d9a8df --- /dev/null +++ b/ARM_SVE_README @@ -0,0 +1,30 @@ +For configuration and installation details please check the script arm_install.sh + +Run test: (Test example takes 4 args) +arg1: elements count for operation +arg2: elements type, could be: i (integer), f (float), d (double) +arg3: type size in bits, only applies when you set arg2 to i. e.g.: i 8 will be converted to int8; i 16 to int16 +arg4: operation type. Could be: max, min, sum, mul, band, bor, bxor +If you want to use the SVE module for MPI ops, you need to pass mca params as: -mca op sve -mca op_sve_hardware_available 1 +======= +Example for test +$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=256 --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 33 i 8 min + +If you don't need armie you can remove the ARMIE part of the command line, as in: +$PATH_To_BIN/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 /ompi/test/datatype/Reduce_local_float 33 i 8 min + +How do we evaluate the performance? +====== +Logic: + +Start_time; +MPI_reduce_local(...); +End_time; + +Reduce_time = End_time - Start_time; + +Possible issues (this happened on a ThunderX2 machine): +====== +Reason for "-mca pml ob1": on the Arm machine the default pml module causes a problem with armie (instruction not supported, reason unknown), but with ob1 it works.
+ + diff --git a/arm_install.sh b/arm_install.sh new file mode 100644 index 00000000000..0b8afef93b0 --- /dev/null +++ b/arm_install.sh @@ -0,0 +1,11 @@ +mkdir build + +./autogen.pl >/dev/null + +./configure --prefix=$PWD/build --enable-mpirun-prefix-by-default --enable-debug CC=armclang CFLAGS="-march=armv8-a+sve" CXX=armclang++ FC=armflang >/dev/null + +./config.status >/dev/null +make -j 128 install >/dev/null + +## compile the test code, test code under ompi/test/datapyte/Reduce_local_float.c +./build/bin/mpicc -g -O3 -march=armv8-a+sve -o ./test/datatype/Reduce_local_float ./test/datatype/Reduce_local_float.c diff --git a/ompi/mca/op/Makefile.am b/ompi/mca/op/Makefile.am index 8c392f1dbec..e942d1a8c21 100644 --- a/ompi/mca/op/Makefile.am +++ b/ompi/mca/op/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2020 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -17,6 +17,8 @@ # $HEADER$ # +AM_CPPFLAGS = $(LTDLINCL) + # main library setup noinst_LTLIBRARIES = libmca_op.la libmca_op_la_SOURCES = diff --git a/ompi/mca/op/sve/Makefile.am b/ompi/mca/op/sve/Makefile.am new file mode 100644 index 00000000000..9f6e747b384 --- /dev/null +++ b/ompi/mca/op/sve/Makefile.am @@ -0,0 +1,70 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an sve op component. This Makefile.am is a typical +# sve of how to integrate into Open MPI's Automake-based build +# system. 
+# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_sve.h \ + op_sve_component.c \ + op_sve_functions.h \ + op_sve_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_sve_DSO +component_noinst = +component_install = mca_op_sve.la +else +component_install = +component_noinst = libmca_op_sve.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_sve_la_SOURCES = $(sources) +mca_op_sve_la_LDFLAGS = -module -avoid-version +mca_op_sve_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. 
+ +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_sve_la_SOURCES = $(sources) +libmca_op_sve_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/sve/configure.m4 b/ompi/mca/op/sve/configure.m4 new file mode 100644 index 00000000000..47138dd44a5 --- /dev/null +++ b/ompi/mca/op/sve/configure.m4 @@ -0,0 +1,34 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_sve_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +# We can always build, unless we were explicitly disabled. +AC_DEFUN([MCA_ompi_op_sve_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/sve/Makefile]) + case "${host}" in + aarch64) + op_sve_support="yes";; + *) + op_sve_support="no";; + esac + [$1] + AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sve_support], + [test "$op_sve_support" = "yes"]) + AC_SUBST(MCA_BUILD_ompi_op_has_sve_support) + + AS_IF([test $op_sve_support == "yes"], + [$1], + [$2]) +])dnl diff --git a/ompi/mca/op/sve/op_sve.h b/ompi/mca/op/sve/op_sve.h new file mode 100644 index 00000000000..21483d1fa56 --- /dev/null +++ b/ompi/mca/op/sve/op_sve.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 Arm Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_SVE_EXPORT_H +#define MCA_OP_SVE_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. 
+ */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is sve-component-specific cached information. We + tend to use this scheme (caching information on the sve + component itself) instead of lots of individual global + variables for the component. The following data fields are + sves; replace them with whatever is relevant for your + component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_sve_component_t; + +/** + * Globally exported variable. Note that it is a *sve* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_sve_component_t + mca_op_sve_component; + +END_C_DECLS + +#endif /* MCA_OP_SVE_EXPORT_H */ diff --git a/ompi/mca/op/sve/op_sve_component.c b/ompi/mca/op/sve/op_sve_component.c new file mode 100644 index 00000000000..16393aefe23 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_component.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * Copyright (c) 2019 ARM Ltd. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "sve" component source code. 
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +static int sve_component_open(void); +static int sve_component_close(void); +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority); +static int sve_component_register(void); + +ompi_op_sve_component_t mca_op_sve_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "sve", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = sve_component_open, + .mca_close_component = sve_component_close, + .mca_register_component_params = sve_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = sve_component_init_query, + .opc_op_query = sve_component_op_query, + }, +}; + +/* + * Component open + */ +static int sve_component_open(void) +{ + + /* A first level check to see if sve is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). 
+ */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int sve_component_close(void) +{ + + /* If sve was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int sve_component_register(void) +{ + + /* Additionally, since this component is simulating hardware, + let's make MCA params that determine whethere a) the hardware + is available, and b) whether double precision floating point + types are supported. This allows you to change the behavior of + this component at run-time (by setting these MCA params at + run-time), simulating different kinds of hardware. */ + mca_op_sve_component.hardware_available = false; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "hardware_available", + "Whether the hardware is available or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.hardware_available); + + mca_op_sve_component.double_supported = true; + (void) mca_base_component_var_register(&mca_op_sve_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_sve_component.double_supported); + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int sve_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_sve_component.hardware_available) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + sve_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = OBJ_NEW(ompi_op_base_module_t); + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + int i=0; + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_sve_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + module->opm_3buff_fns[i] = ompi_op_sve_3buff_functions[op->o_f_to_c_index][i]; + OBJ_RETAIN(module); + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_LXOR: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MAXLOC: + module = NULL; + break; + case OMPI_OP_BASE_FORTRAN_MINLOC: + module= NULL; + break; + default: + module= NULL; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *sve* component pointer + (vs. 
a *base* component pointer -- where an *sve* component +       is a base component plus some other module-specific cached +       information), so we have to cast it to the right pointer type +       before returning. */ +    if (NULL != module) { +        *priority = 50; +    } +    return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/sve/op_sve_functions.c b/ompi/mca/op/sve/op_sve_functions.c new file mode 100644 index 00000000000..4e86f5b2471 --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2019      The University of Tennessee and The University + *                         of Tennessee Research Foundation.  All rights + *                         reserved. + * + * Copyright (c) 2019      Arm Ltd.  All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/sve/op_sve.h" +#include "ompi/mca/op/sve/op_sve_functions.h" + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif /* __ARM_FEATURE_SVE */ + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types.  The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in).
+ * + */ +#define OP_SVE_FUNC(name, type_name, type_size, type, op) \ + static void ompi_op_sve_2buff_##name##_##type(void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in = (type*)_in; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in); \ + sv##type vdst = svld1(Pg, out); \ + in += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + \ + while( left_over > 0 ) { \ + int how_much = (left_over > 8) ? 8 : left_over; \ + switch(left_over) { \ + case 8: out[7] = current_func(out[7],in[7]) ; \ + case 7: out[6] = current_func(out[6],in[6]) ; \ + case 6: out[5] = current_func(out[5],in[5]) ; \ + case 5: out[4] = current_func(out[4],in[4]) ; \ + case 4: out[3] = current_func(out[3],in[3]) ; \ + case 3: out[2] = current_func(out[2],in[2]) ; \ + case 2: out[1] = current_func(out[1],in[1]) ; \ + case 1: out[0] = current_func(out[0],in[0]) ; \ + }\ + left_over -= how_much; \ + out += how_much; \ + in += how_much; \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) + OP_SVE_FUNC(max, b, 8, int8_t, max) + OP_SVE_FUNC(max, b, 8, uint8_t, max) + OP_SVE_FUNC(max, h, 16, int16_t, max) + OP_SVE_FUNC(max, h, 16, uint16_t, max) + OP_SVE_FUNC(max, w, 32, int32_t, max) + OP_SVE_FUNC(max, w, 32, uint32_t, max) + OP_SVE_FUNC(max, d, 64, int64_t, max) + OP_SVE_FUNC(max, d, 64, uint64_t, max) + + OP_SVE_FUNC(max, w, 32, float32_t, max) + OP_SVE_FUNC(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_SVE_FUNC(min, b, 8, int8_t, min) + OP_SVE_FUNC(min, b, 8, uint8_t, min) + OP_SVE_FUNC(min, h, 16, int16_t, min) + OP_SVE_FUNC(min, h, 16, uint16_t, min) + OP_SVE_FUNC(min, w, 32, int32_t, min) + OP_SVE_FUNC(min, w, 32, uint32_t, min) + OP_SVE_FUNC(min, d, 64, int64_t, min) + OP_SVE_FUNC(min, d, 64, uint64_t, min) + + OP_SVE_FUNC(min, w, 32, float32_t, min) + OP_SVE_FUNC(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_SVE_FUNC(sum, b, 8, int8_t, add) + OP_SVE_FUNC(sum, b, 8, uint8_t, add) + OP_SVE_FUNC(sum, h, 16, int16_t, add) + OP_SVE_FUNC(sum, h, 16, uint16_t, add) + OP_SVE_FUNC(sum, w, 32, int32_t, add) + OP_SVE_FUNC(sum, w, 32, uint32_t, add) + OP_SVE_FUNC(sum, d, 64, int64_t, add) + OP_SVE_FUNC(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC(sum, w, 32, float32_t, add) + OP_SVE_FUNC(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_SVE_FUNC(prod, b, 8, int8_t, mul) + OP_SVE_FUNC(prod, b, 8, uint8_t, mul) + 
OP_SVE_FUNC(prod, h, 16, int16_t, mul) + OP_SVE_FUNC(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC(prod, w, 32, int32_t, mul) + OP_SVE_FUNC(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC(prod, d, 64, int64_t, mul) + OP_SVE_FUNC(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC(prod, w, 32, float32_t, mul) + OP_SVE_FUNC(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_SVE_FUNC(band, b, 8, int8_t, and) + OP_SVE_FUNC(band, b, 8, uint8_t, and) + OP_SVE_FUNC(band, h, 16, int16_t, and) + OP_SVE_FUNC(band, h, 16, uint16_t, and) + OP_SVE_FUNC(band, w, 32, int32_t, and) + OP_SVE_FUNC(band, w, 32, uint32_t, and) + OP_SVE_FUNC(band, d, 64, int64_t, and) +OP_SVE_FUNC(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_SVE_FUNC(bor, b, 8, int8_t, orr) + OP_SVE_FUNC(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC(bor, h, 16, int16_t, orr) + OP_SVE_FUNC(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC(bor, w, 32, int32_t, orr) + OP_SVE_FUNC(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC(bor, d, 64, int64_t, orr) +OP_SVE_FUNC(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_SVE_FUNC(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC(bxor, d, 64, int64_t, eor) +OP_SVE_FUNC(bxor, d, 64, uint64_t, 
eor) + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_SVE_FUNC_3BUFF(name, type_name, type_size, type, op) \ + static void ompi_op_sve_3buff_##name##_##type(void * restrict _in1, \ + void * restrict _in2, void * restrict _out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ +{ \ + int types_per_step = svcnt##type_name(); \ + int left_over = *count; \ + type* in1 = (type*)_in1; \ + type* in2 = (type*)_in2; \ + type* out = (type*)_out; \ + svbool_t Pg = svptrue_b##type_size(); \ + for (; left_over >= types_per_step; left_over -= types_per_step) { \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + in1 += types_per_step; \ + in2 += types_per_step; \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + out += types_per_step; \ + } \ + if (left_over !=0){ \ + Pg = svwhilelt_b##type_size##_u64(0, left_over); \ + sv##type vsrc = svld1(Pg, in1); \ + sv##type vdst = svld1(Pg, in2); \ + vdst=sv##op##_z(Pg,vdst,vsrc); \ + svst1(Pg, out,vdst); \ + } \ +} + +/************************************************************************* + * Max + *************************************************************************/ + OP_SVE_FUNC_3BUFF(max, b, 8, int8_t, max) + OP_SVE_FUNC_3BUFF(max, b, 8, uint8_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, int16_t, max) + OP_SVE_FUNC_3BUFF(max, h, 16, uint16_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, int32_t, max) + OP_SVE_FUNC_3BUFF(max, w, 32, uint32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, int64_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, uint64_t, max) + + OP_SVE_FUNC_3BUFF(max, w, 32, float32_t, max) + OP_SVE_FUNC_3BUFF(max, d, 64, float64_t, max) + +/************************************************************************* + * Min + *************************************************************************/ + OP_SVE_FUNC_3BUFF(min, b, 8, int8_t, min) + OP_SVE_FUNC_3BUFF(min, b, 
8, uint8_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, int16_t, min) + OP_SVE_FUNC_3BUFF(min, h, 16, uint16_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, int32_t, min) + OP_SVE_FUNC_3BUFF(min, w, 32, uint32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, int64_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, uint64_t, min) + + OP_SVE_FUNC_3BUFF(min, w, 32, float32_t, min) + OP_SVE_FUNC_3BUFF(min, d, 64, float64_t, min) + + /************************************************************************* + * Sum + ************************************************************************/ + OP_SVE_FUNC_3BUFF(sum, b, 8, int8_t, add) + OP_SVE_FUNC_3BUFF(sum, b, 8, uint8_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, int16_t, add) + OP_SVE_FUNC_3BUFF(sum, h, 16, uint16_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, int32_t, add) + OP_SVE_FUNC_3BUFF(sum, w, 32, uint32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, int64_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, uint64_t, add) + + OP_SVE_FUNC_3BUFF(sum, w, 32, float32_t, add) + OP_SVE_FUNC_3BUFF(sum, d, 64, float64_t, add) + +/************************************************************************* + * Product + *************************************************************************/ + OP_SVE_FUNC_3BUFF(prod, b, 8, int8_t, mul) + OP_SVE_FUNC_3BUFF(prod, b, 8, uint8_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, int16_t, mul) + OP_SVE_FUNC_3BUFF(prod, h, 16, uint16_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, int32_t, mul) + OP_SVE_FUNC_3BUFF(prod, w, 32, uint32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, int64_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, uint64_t, mul) + + OP_SVE_FUNC_3BUFF(prod, w, 32, float32_t, mul) + OP_SVE_FUNC_3BUFF(prod, d, 64, float64_t, mul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + OP_SVE_FUNC_3BUFF(band, b, 8, int8_t, and) + OP_SVE_FUNC_3BUFF(band, b, 8, uint8_t, and) + OP_SVE_FUNC_3BUFF(band, h, 16, int16_t, and) + 
OP_SVE_FUNC_3BUFF(band, h, 16, uint16_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, int32_t, and) + OP_SVE_FUNC_3BUFF(band, w, 32, uint32_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, int64_t, and) + OP_SVE_FUNC_3BUFF(band, d, 64, uint64_t, and) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bor, b, 8, int8_t, orr) + OP_SVE_FUNC_3BUFF(bor, b, 8, uint8_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, int16_t, orr) + OP_SVE_FUNC_3BUFF(bor, h, 16, uint16_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, int32_t, orr) + OP_SVE_FUNC_3BUFF(bor, w, 32, uint32_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, int64_t, orr) + OP_SVE_FUNC_3BUFF(bor, d, 64, uint64_t, orr) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + OP_SVE_FUNC_3BUFF(bxor, b, 8, int8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, b, 8, uint8_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, int16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, h, 16, uint16_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, int32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, w, 32, uint32_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, int64_t, eor) + OP_SVE_FUNC_3BUFF(bxor, d, 64, uint64_t, eor) + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_sve_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_sve_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_sve_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_sve_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_sve_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_sve_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_sve_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = 
ompi_op_sve_##ftype##_##name##_uint64_t + + +/** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) ompi_op_sve_##ftype##_##name##_float32_t +#define DOUBLE(name, ftype) ompi_op_sve_##ftype##_##name##_float64_t + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_handler_fn_t ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FLOATING_POINT(sum, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + 
[OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + +}; + +ompi_op_base_3buff_handler_fn_t ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FLOATING_POINT(sum, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +}; diff --git a/ompi/mca/op/sve/op_sve_functions.h b/ompi/mca/op/sve/op_sve_functions.h new file mode 100644 index 
00000000000..00db651cfac --- /dev/null +++ b/ompi/mca/op/sve/op_sve_functions.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019      The University of Tennessee and The University + *                         of Tennessee Research Foundation.  All rights + *                         reserved. + * + * Copyright (c) 2019      Arm Ltd.  All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif + +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/sve/op_sve.h" + +BEGIN_C_DECLS + +OMPI_DECLSPEC extern ompi_op_base_handler_fn_t +ompi_op_sve_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +OMPI_DECLSPEC extern ompi_op_base_3buff_handler_fn_t +ompi_op_sve_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + diff --git a/test/datatype/HOW_TO_TEST_README.txt b/test/datatype/HOW_TO_TEST_README.txt new file mode 100644 index 00000000000..4d6025c95ee --- /dev/null +++ b/test/datatype/HOW_TO_TEST_README.txt @@ -0,0 +1,17 @@ +(1) Reduce_uint8.c : test code for MPI_SUM operation with type uint8. +    compile as : $path/mpicc -march=armv8-a+sve -O3 -o Reduce_uint8 Reduce_uint8.c +(2) sve_uint8_test.sh: A shell script that will generate results with time information for MPI_SUM operation with different message size. +(3) SVE_MPI_Op.py: A python script which you can use to generate plot. + +ALL YOU NEED TO DO IS: +(1) Change the path of your mpirun and test binary in sve_uint8_test.sh. +(2) Run sve_uint8_test.sh +    This will generate 3 types of output files with names +    a. sve-sum-vectorlength.txt     ##MPI_SUM with SVE-enabled operation +    b. no-sve-sum-vectorlength.txt  ##MPI_SUM without SVE-enabled operation +    c.
sve-cpy-vectorlength.txt ## memcpy +(3) run python scripts as + $python SVE_MPI_Op.py sve-sum-vectorlength.txt no-sve-sum-vectorlength.txt + make sure to put sve-sum-vectorlength.txt before no-sve-sum-vectorlength.txt + example : SVE_MPI_Op.py sve-sum-128.txt no-sve-sum-128.txt + diff --git a/test/datatype/SVE_MPI_Op.py b/test/datatype/SVE_MPI_Op.py new file mode 100644 index 00000000000..5051936e29d --- /dev/null +++ b/test/datatype/SVE_MPI_Op.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +import numpy as np +import matplotlib.pyplot as plt +import sys + +print("\n \n Processing first file \n") + +filename1 = sys.argv[1] +f1 = open(filename1,"r") +lines1 = f1.readlines() +f1.close() + +noMin1 = {} +sortednomin1 = {} + +#Select lines with key +for line in lines1: + if "time" in line: + vals = line.split( ) + noMin1.setdefault((int)(vals[2]), []).append((float)(vals[4])) + +sortednomin1=sorted(noMin1.items()) +num_of_elements1 = [x[0] for x in sortednomin1] +Reduce_latency1 = [x[1] for x in sortednomin1] +print(Reduce_latency1) + +print("\n \n Processing second file \n") + + +filename2 = sys.argv[2] +f2 = open(filename2,"r") +lines2 = f2.readlines() +f2.close() + +noMin2 = {} +sortednomin2 = {} + + +#Select lines with key +for line in lines2: + if "time" in line: + vals = line.split( ) + noMin2.setdefault((int)(vals[2]), []).append((float)(vals[4])) + +sortednomin2=sorted(noMin2.items()) +num_of_elements2 = [x[0] for x in sortednomin2] +Reduce_latency2 = [x[1] for x in sortednomin2] +print(Reduce_latency2) + +Msg=[1,2,3,4,5,6,7,8,9] + +Msg_size=['1k', '4k', '16k', '64k', '256k', '1M', '4M', '16M', '32M'] + +meanlineprops = dict(linewidth=2) +fig, (ax1) = plt.subplots(1, figsize=(6, 4)) + +ax1.grid(True,linestyle='--') + + +bp1 =ax1.boxplot(Reduce_latency1, notch=1, sym='+', vert=1, whis=1.5,patch_artist=True) +bp2 =ax1.boxplot(Reduce_latency2, notch=0, sym='+', vert=1, whis=1.5,patch_artist=True) +#bp3 =ax1.boxplot(cpy, notch=0, sym='+', vert=1, 
whis=1.5,patch_artist=True) +#ax1.plot(Msg, cpy, linestyle='-.', lw=2.5, marker='o',markersize='8', color='b', label="0.1s") + +colors = ['red','red','red','red','red','red','red','red','red'] +for patch, color in zip(bp2['boxes'], colors): + patch.set_facecolor(color) + +colors = ['purple','purple','purple','purple','purple','purple','purple','purple','purple'] +for patch, color in zip(bp1['boxes'], colors): + patch.set_facecolor(color) + +""" +colors = ['green','green','green','green','green','green'] +for patch, color in zip(bp3['boxes'], colors): + patch.set_facecolor(color) +""" +fig.suptitle('MPI_Op: mul; MPI_type: uint8 \n(Configured with optimized, Flushed cache)', x=0.45, y=1.01, va='center',fontsize=15) +fig.text(-0.03, 0.5, 'Reduce time (s)', va='center', rotation='vertical',fontsize=12) +plt.xlabel('Buffer size (Bytes)',fontsize=12) + +ax1.set_xticks(Msg) +ax1.set_xticklabels(Msg_size, minor=False) + + +ax1.legend([bp2["boxes"][0], bp1["boxes"][0]], ['NO_SVE','SVE'], loc='upper left',fontsize=12)#, bbox_to_anchor=(1, 0.5),fontsize=15) + +plt.yscale('log') + +plt.savefig(filename1+"_sum.pdf",dpi=300,bbox_inches='tight') diff --git a/test/datatype/correctness_check.sh b/test/datatype/correctness_check.sh new file mode 100644 index 00000000000..cf9f09a801e --- /dev/null +++ b/test/datatype/correctness_check.sh @@ -0,0 +1,59 @@ +echo "ompi version with SVE -- Usage: arg1: count of elements, args2: 'i'|'f'|'d' : datatype: integer, float, double. args3 size of type. 
args4 operation" +echo "your_path/mpirun -mca op sve -mca op_sve_hardware_available 1 -np 1 /your_test_path/Reduce_local_float 1048576 i 8 max" + +Orange="\033[0;33m" +Blue='\033[0;34m' +Purple='\033[0;35m' + +NC='\033[0m' + +# test all vector size +for vector_len in 128 256 512 1024 2048 +do + + echo "=========Integer type all operations & all sizes========" + echo "" + for op in max min sum mul band bor bxor + do + echo "" + echo "===Operation $op test===" + for size in 8 16 32 64 + do + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 i $size $op + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 i $size $op + done + done + + echo "=======Float type all operations========" + echo "" + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 f 32 $op + done + + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < 
Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 f 32 $op + done + + + echo "========Double type all operations=========" + echo "" + + echo -e "Test \e[1;33m SVE full vector instruction for loop \e[m Total_num_bits = 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1024 d 64 $op + done + + echo -e "Test \e[1;34m SVE partial vector instruction for loop$ \e[m (2048*(N-1) ) < Total_num_bits < 2048*N " + for op in max min sum mul + do + /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/build/bin/mpirun -mca op sve -mca op_sve_hardware_available 1 -mca pml ob1 -np 1 armie -msve-vector-bits=$vector_len --iclient libinscount_emulated.so --unsafe-ldstex -- /ccsopen/home/dzhong/Downloads/github/intel_to_arm/ompi/test/datatype/Reduce_local_float 1040 d 64 $op + done +done +##echo -e "Test \e[1;35m duff device code \e[m 2048*N < Total_num_bits < 2048*N + 256 "