diff --git a/.gitignore b/.gitignore
index 43388e9cb5..f4f5b9210f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 /.vscode
 *.cache
 *.code-workspace
+/tests/opencl/atomics/
\ No newline at end of file
diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index e60cd6ec74..53b02e4fda 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -22,6 +22,9 @@ all:
 	$(MAKE) -C kmeans
 	$(MAKE) -C blackscholes
 	$(MAKE) -C bfs
+	$(MAKE) -C twokernels
+	$(MAKE) -C copybuf
+
 
 run-simx:
 	$(MAKE) -C vecadd run-simx
@@ -43,6 +46,10 @@ run-simx:
 	$(MAKE) -C kmeans run-simx
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C bfs run-simx
+	$(MAKE) -C twokernels run-simx
+	$(MAKE) -C copybuf run-simx
+	
+
 
 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim
@@ -64,6 +71,9 @@ run-rtlsim:
 	$(MAKE) -C kmeans run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C bfs run-rtlsim
+	$(MAKE) -C twokernels run-rtlsim
+	$(MAKE) -C copybuf run-rtlsim
+	
 
 clean:
 	$(MAKE) -C vecadd clean
@@ -85,4 +95,6 @@ clean:
 	$(MAKE) -C guassian clean
 	$(MAKE) -C kmeans clean
 	$(MAKE) -C blackscholes clean
-	$(MAKE) -C bfs clean
\ No newline at end of file
+	$(MAKE) -C bfs clean
+	$(MAKE) -C copybuf clean
+	$(MAKE) -C twokernels clean
\ No newline at end of file
diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk
index f9fe92d89e..757ab7c19d 100644
--- a/tests/opencl/common.mk
+++ b/tests/opencl/common.mk
@@ -43,7 +43,7 @@ VX_CFLAGS  += -Xclang -target-feature -Xclang +zicond
 VX_CFLAGS  += -mllvm -disable-loop-idiom-all	# disable memset/memcpy loop replacement
 #VX_CFLAGS += -mllvm -vortex-branch-divergence=0
 #VX_CFLAGS += -mllvm -debug -mllvm -print-after-all
-
+VX_CFLAGS += -save-temps
 VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_HOME)/kernel/scripts/link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortex.a $(VX_LIBS)
 
 VX_BINTOOL += OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py
@@ -52,6 +52,7 @@ CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
 CXXFLAGS += -pthread
 CXXFLAGS += -I$(POCL_PATH)/include
+CXXFLAGS += -I$(POCL_PATH)/include/hpp
 CXXFLAGS += $(CONFIGS)
 
 POCL_CC_FLAGS += LLVM_PREFIX=$(LLVM_VORTEX) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)"
diff --git a/tests/opencl/copybuf/Makefile b/tests/opencl/copybuf/Makefile
new file mode 100644
index 0000000000..f10a02695a
--- /dev/null
+++ b/tests/opencl/copybuf/Makefile
@@ -0,0 +1,11 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := copybuf
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+
+include ../common.mk
diff --git a/tests/opencl/copybuf/main.cc b/tests/opencl/copybuf/main.cc
new file mode 100644
index 0000000000..76c243a96b
--- /dev/null
+++ b/tests/opencl/copybuf/main.cc
@@ -0,0 +1,77 @@
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/opencl.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main() {
+  cl_int err;
+  cl_uint num_platforms = 0;
+  err = clGetPlatformIDs(0, NULL, &num_platforms);
+  if (err != CL_SUCCESS || num_platforms == 0) {
+    fprintf(stderr, "No OpenCL platform\n");
+    return -1;
+  }
+  cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_uint num_devices = 0;
+  err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  if (err != CL_SUCCESS || num_devices == 0) {
+    fprintf(stderr, "No OpenCL device\n");
+    free(platforms);
+    return -1;
+  }
+  cl_device_id *devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
+  clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_context context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to create context\n");
+    return -1;
+  }
+  cl_command_queue queue = clCreateCommandQueue(context, devices[0], 0, &err);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to create queue\n");
+    return -1;
+  }
+  int src[] = {10, 20, 30, 40, 50};
+  int dst[5] = {0};
+  cl_mem bufSrc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR ,
+                                 sizeof(src), &src, &err);
+  if(err!= CL_SUCCESS){
+    fprintf(stderr, "Failed to create buffer\n");
+    return -1;
+  }
+  cl_mem bufDst = clCreateBuffer(context, CL_MEM_READ_WRITE,
+                                 sizeof(dst), NULL, &err);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to create buffer\n");
+    return -1;
+  }
+  err = clEnqueueCopyBuffer(queue, bufSrc, bufDst, 0, 0, sizeof(src), 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to copy buffer\n");
+    return -1;
+  }
+  err = clEnqueueReadBuffer(queue, bufDst, CL_TRUE, 0, sizeof(dst), dst, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to read buffer\n");
+    return -1;
+  }
+
+  printf("Source Buffer: ");
+  for (int i = 0; i < 5; i++)
+    printf("%d ", src[i]);
+  printf("\nDestination buffer: ");
+  for (int i = 0; i < 5; i++)
+    printf("%d ", dst[i]);
+  printf("\n");
+
+  clReleaseMemObject(bufSrc);
+  clReleaseMemObject(bufDst);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+
+  free(devices);
+  free(platforms);
+
+  return 0;
+}
diff --git a/tests/opencl/twokernels/Makefile b/tests/opencl/twokernels/Makefile
new file mode 100644
index 0000000000..254a9d94ef
--- /dev/null
+++ b/tests/opencl/twokernels/Makefile
@@ -0,0 +1,11 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := twokernels
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+
+include ../common.mk
diff --git a/tests/opencl/twokernels/main.cpp b/tests/opencl/twokernels/main.cpp
new file mode 100644
index 0000000000..23c8340ef6
--- /dev/null
+++ b/tests/opencl/twokernels/main.cpp
@@ -0,0 +1,126 @@
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+
+#include <CL/opencl.hpp>
+#include <iostream>
+#include <vector>
+
+
+
+const char *kernelSrcAdd = R"CLC(
+__kernel void pair_add(__global const int* in, __global int* out) {
+    int gid = get_global_id(0);
+    int i = 2 * gid;
+    out[gid] = in[i]+ in[i + 1];
+}
+)CLC";
+
+
+const char *kernelSrcMul = R"CLC(
+__kernel void pair_mul(__global const int* in, __global int* out) {
+    int gid = get_global_id(0);
+    int i = 2 * gid;
+    out[gid] = in[i]* in[i + 1];
+}
+)CLC";
+
+
+std::vector<int> runPairAdd(cl::Context &context, cl::CommandQueue &queue,
+                            const std::vector<int> &input) {
+  int N = input.size();
+  int outN = N / 2;
+  std::vector<int> output(outN, 0);
+
+  cl::Buffer bufIn(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                   sizeof(int) * N, const_cast<int *>(input.data()));
+  cl::Buffer bufOut(context, CL_MEM_WRITE_ONLY, sizeof(int) * outN);
+
+  cl::Program program(context, kernelSrcAdd);
+  if (program.build({queue.getInfo<CL_QUEUE_DEVICE>()}) != CL_SUCCESS) {
+    std::cerr << "Error of bulding  pair_add:\n"
+              << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(queue.getInfo<CL_QUEUE_DEVICE>())
+              << std::endl;
+    return {};
+  }
+  
+  cl::Kernel kernel(program, "pair_add");
+  kernel.setArg(0, bufIn);
+  kernel.setArg(1, bufOut);
+
+  queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                             cl::NDRange(outN), cl::NullRange);
+  queue.enqueueReadBuffer(bufOut, CL_TRUE, 0, sizeof(int) * outN, output.data());
+  queue.finish();
+  return output;
+}
+
+std::vector<int> runPairMul(cl::Context &context, cl::CommandQueue &queue,
+                            const std::vector<int> &input) {
+  int N = input.size();
+  int outN = N / 2;
+  std::vector<int> output(outN, 0);
+
+  cl::Buffer bufIn(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                   sizeof(int) * N, const_cast<int *>(input.data()));
+  cl::Buffer bufOut(context, CL_MEM_WRITE_ONLY, sizeof(int) * outN);
+
+  cl::Program program(context, kernelSrcMul);
+  if (program.build({queue.getInfo<CL_QUEUE_DEVICE>()}) != CL_SUCCESS) {
+    std::cerr << "Error of building pair_mul:\n"
+              << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(queue.getInfo<CL_QUEUE_DEVICE>())
+              << std::endl;
+    return {};
+  }
+
+  cl::Kernel kernel(program, "pair_mul");
+  kernel.setArg(0, bufIn);
+  kernel.setArg(1, bufOut);
+  queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+                                                                       cl::NDRange(outN), cl::NullRange);
+  queue.enqueueReadBuffer(bufOut, CL_TRUE, 0, sizeof(int) * outN, output.data());
+  queue.finish();
+  return output;
+}
+
+int main() {
+  std::vector<cl::Platform> platforms;
+  cl::Platform::get(&platforms);
+  if (platforms.empty()) {
+    std::cerr << "No OpenCL platforms\n";
+    return 1;
+  }
+
+  std::vector<cl::Device> devices;
+  
+  platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  if (devices.empty()) {
+    std::cerr << "No OpenCL devices\n";
+    return 2;
+  }
+
+  cl::Context context(devices[0]);
+  cl::CommandQueue queue(context, devices[0]);
+
+  std::vector<int> input = {1, 2, 3, 4, 5, 6, 7, 8};
+  auto resultAdd = runPairAdd(context, queue, input);
+  auto resultMul = runPairMul(context, queue, input);
+  
+  
+  
+  
+
+  std::cout << "Input: ";
+  for (auto v : input)
+    std::cout << v << " ";
+  std::cout << "\nResult of pair_add: ";
+  for (auto v : resultAdd)
+    std::cout << v << " ";
+  std::cout << "\nResult of pair_mul: ";
+  for (auto v : resultMul)
+    std::cout << v << " ";
+  std::cout << std::endl;
+
+
+  return 0;
+}