From 80133e667226124f7a28d00283cbdba03a00d832 Mon Sep 17 00:00:00 2001
From: EID <eid@zuerich.ibm.com>
Date: Fri, 31 Aug 2018 18:33:35 +0200
Subject: [PATCH] Update microbenchmarks 04/2020

---
 .gitignore                      |   1 +
 kw/boot/Makefile                |   6 +-
 kw/boot/boot.c                  |   2 +-
 kw/boot/link.ld                 |   2 +-
 kw/libcxl/libcxl.c              |   6 +-
 kw/liboprecomp/Makefile         |   3 +-
 kw/mb/blstm/Makefile            |   2 +-
 kw/mb/blstm/host/Makefile       |   2 +-
 kw/samples/nop/Makefile         |   2 +-
 kw/samples/nop/host/Makefile    |   2 +-
 kw/samples/nop/testset.cfg      |  25 ++++++
 kw/samples/square/Makefile      |   2 +-
 kw/samples/square/host/Makefile |   2 +-
 kw/samples/square/testset.cfg   |  25 ++++++
 mb/cnn/kw/Makefile              |   2 +-
 mb/cnn/kw/host/Makefile         |   2 +-
 mb/cnn/kw/pulp/pulp.c           |  28 +++++--
 mb/sparsesolve/Makefile         |  17 +++-
 mb/sparsesolve/floatm.h         |  30 +++++++
 mb/sparsesolve/matrix.c         |  86 +++++++-------------
 mb/sparsesolve/matrix.h         |   9 +--
 mb/sparsesolve/sparsesolve.c    | 135 +++++++++++++++++++++-----------
 mb/sparsesolve/vector.h         |  69 ++++------------
 mb/svm/baseline/README.md       |   1 -
 testset.cfg                     |  14 ++++
 25 files changed, 287 insertions(+), 188 deletions(-)
 create mode 100644 kw/samples/nop/testset.cfg
 create mode 100644 kw/samples/square/testset.cfg
 create mode 100644 mb/sparsesolve/floatm.h
 create mode 100644 testset.cfg

diff --git a/.gitignore b/.gitignore
index 0993400..6c030ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 *.out
 *.log
 __pycache__
+*.sw[p,o]
diff --git a/kw/boot/Makefile b/kw/boot/Makefile
index 836f42c..e57462b 100644
--- a/kw/boot/Makefile
+++ b/kw/boot/Makefile
@@ -4,11 +4,11 @@
 # Builds the PULP bootloader without the run time. We need a separate link
 # script since the code is loacted at a different static location (boot ROM).
 
-PULP_APP = boot-oprecompkw
+PULP_APP = rom-oprecompkw
 PULP_APP_FC_SRCS = boot.c
 PULP_APP_FC_ASM_SRCS = crt0.S
 PULP_CFLAGS = -O3 -g -fno-jump-tables
-PULP_LDFLAGS = -Tlink.ld -nostdlib
-CONFIG_OPT = options/rt/no-crt0=true options/rt/no-link-script=true options/rt/no-rt=true
+PULP_LDFLAGS = -Tlink.ld -nostdlib -lgcc
+CONFIG_OPT = rt/no-crt0=true rt/no-link-script=true rt/no-rt=true
 
 include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk
diff --git a/kw/boot/boot.c b/kw/boot/boot.c
index ac759a9..4791ccd 100644
--- a/kw/boot/boot.c
+++ b/kw/boot/boot.c
@@ -26,7 +26,7 @@ inline uint64_t host2local(uint64_t host) {
 
 
 // Rudimentary putc/puts for output.
-#define boot_putc_address (STDOUT_BASE_ADDR + STDOUT_PUTC_OFFSET + (hal_core_id()<<3) + (hal_cluster_id()<<7))
+#define boot_putc_address (ARCHI_STDOUT_ADDR + STDOUT_PUTC_OFFSET + (hal_core_id()<<3) + (hal_cluster_id()<<7))
 
 static void boot_putc(unsigned enable, unsigned int c) {
 	if (enable) {
diff --git a/kw/boot/link.ld b/kw/boot/link.ld
index 33ea9d1..5bc895d 100644
--- a/kw/boot/link.ld
+++ b/kw/boot/link.ld
@@ -6,7 +6,7 @@ __DYNAMIC  =  0;
 MEMORY
 {
 	ROM : ORIGIN = 0x1A000000, LENGTH = 0x2000
-	L1  : ORIGIN = 0x10000000, LENGTH = 0x10000
+	L1  : ORIGIN = 0x10000004, LENGTH = 0xFFFC
 	L2  : ORIGIN = 0x1C000000, LENGTH = 0x4000
 }
 
diff --git a/kw/libcxl/libcxl.c b/kw/libcxl/libcxl.c
index b2f57f7..df09b18 100644
--- a/kw/libcxl/libcxl.c
+++ b/kw/libcxl/libcxl.c
@@ -9,7 +9,7 @@
 #include <pthread.h>
 
 #include "libcxl.h"
-#include "gv/gv_launcher.h"
+#include "vp/launcher.h"
 
 
 struct cxl_afu_h {
@@ -119,14 +119,14 @@ afu_open(struct cxl_afu_h *afu) {
 	}
 
 	// Configure the handles into the platform.
-	afu->capi_binding = gv_ioreq_binding(afu->gv, "soc/soc_ico", (void *)((uint64_t)1 << 48), (uint64_t)1 << 48, capi_callback, afu);
+	afu->capi_binding = gv_ioreq_binding(afu->gv, "soc/host_injector", (void *)((uint64_t)1 << 48), (uint64_t)1 << 48, capi_callback, afu);
 	// afu->capi_binding = gv_ioreq_binding(afu->gv, "soc/soc_ico", (void *)0x1a600000, 0x00100000, capi_callback, afu);
 	if (afu->capi_binding == NULL) {
 		fprintf(stderr, "libcxl error: unable to establish CAPI binding (gv_ioreq_binding)\n");
 		goto fail;
 	}
 
-	afu->wed_binding = gv_ioreq_binding(afu->gv, "job_fifo", NULL, 0, NULL, NULL);
+	afu->wed_binding = gv_ioreq_binding(afu->gv, "job_fifo_injector", NULL, 0, NULL, NULL);
 	if (afu->wed_binding == NULL) {
 		fprintf(stderr, "libcxl error: unable to establish WED FIFO binding (gv_ioreq_binding)\n");
 		goto fail;
diff --git a/kw/liboprecomp/Makefile b/kw/liboprecomp/Makefile
index 983a1b4..6700196 100644
--- a/kw/liboprecomp/Makefile
+++ b/kw/liboprecomp/Makefile
@@ -38,4 +38,5 @@ install:
 	install -D build/liboprecomp.a $(PULP_SDK_HOME)/install/ws/lib
 	install -D build/liboprecomp.so $(PULP_SDK_HOME)/install/ws/lib
 	install -D liboprecomp.h $(PULP_SDK_HOME)/install/ws/include
-	plpinfo tree --config-options="gvsoc/load-binary_eval=os.environ.get('PULP_SDK_HOME') + '/install/bin/boot-oprecompkw'" > $(PULP_SDK_HOME)/install/cfg/oprecompkw_config.json
+	mkdir -p $(PULP_SDK_HOME)/install/cfg
+	plpinfo tree --config-options="plt_loader/load-binary_eval=os.environ.get('PULP_SDK_HOME') + '/install/bin/rom-oprecompkw'" --config-options="soc/cluster/pe0/fetch_enable=true" --config-options="chip/boot_from_rom=true" --config-options="loader/boot/mode=rom" > $(PULP_SDK_HOME)/install/cfg/oprecompkw_config.json
diff --git a/kw/mb/blstm/Makefile b/kw/mb/blstm/Makefile
index 0f12f2a..9606865 100644
--- a/kw/mb/blstm/Makefile
+++ b/kw/mb/blstm/Makefile
@@ -6,4 +6,4 @@ all:
 	$(MAKE) -C host
 
 run: all
-	host/build/blstm pulp/build/system.oprecompkw/blstm/blstm
+	host/build/blstm pulp/build/oprecompkw/blstm/blstm
diff --git a/kw/mb/blstm/host/Makefile b/kw/mb/blstm/host/Makefile
index 7e96468..7b3f5c0 100644
--- a/kw/mb/blstm/host/Makefile
+++ b/kw/mb/blstm/host/Makefile
@@ -5,7 +5,7 @@ CFLAGS ?= -O3 -g -Wall -Wextra -std=c99
 CPPFLAGS ?= -O3 -g -Wall -Wextra -std=c++11 -fopenmp
 CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include
 CPPFLAGS += -I$(PULP_SDK_HOME)/install/ws/include -I./include
-LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread
+LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread
 
 all: clean build/neuron build/blstm
 
diff --git a/kw/samples/nop/Makefile b/kw/samples/nop/Makefile
index 19fc89e..ec0bb3f 100644
--- a/kw/samples/nop/Makefile
+++ b/kw/samples/nop/Makefile
@@ -6,4 +6,4 @@ all:
 	$(MAKE) -C host
 
 run: all
-	host/build/nop pulp/build/system.oprecompkw/nop/nop
+	host/build/nop pulp/build/oprecompkw/nop/nop
diff --git a/kw/samples/nop/host/Makefile b/kw/samples/nop/host/Makefile
index 56abd74..34d64b4 100644
--- a/kw/samples/nop/host/Makefile
+++ b/kw/samples/nop/host/Makefile
@@ -3,7 +3,7 @@
 
 CFLAGS ?= -O3 -g -Wall -Wextra
 CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include
-LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread
+LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread
 
 all:: build/nop
 
diff --git a/kw/samples/nop/testset.cfg b/kw/samples/nop/testset.cfg
new file mode 100644
index 0000000..8a8d615
--- /dev/null
+++ b/kw/samples/nop/testset.cfg
@@ -0,0 +1,25 @@
+from plptest import *
+
+TestConfig = c = {}
+
+def check_output(config, output):
+
+  if output.find('Hello from the loaded PULP binary!') == -1:
+    return (False, "Didn't find output string")
+
+  return (True, None)
+
+
+def get_test():
+  return Test(
+    name = 'nop',
+    commands = [
+      Shell('build', 'make all run'),
+      Check('check', check_output)
+    ],
+    timeout=1000000000
+  )
+  
+c['tests'] = [ ]
+
+c['tests'].append(get_test())
diff --git a/kw/samples/square/Makefile b/kw/samples/square/Makefile
index b927841..ebd9db4 100644
--- a/kw/samples/square/Makefile
+++ b/kw/samples/square/Makefile
@@ -6,4 +6,4 @@ all:
 	$(MAKE) -C host
 
 run: all
-	host/build/square pulp/build/system.oprecompkw/square/square
+	host/build/square pulp/build/oprecompkw/square/square
diff --git a/kw/samples/square/host/Makefile b/kw/samples/square/host/Makefile
index 6406125..0fc007f 100644
--- a/kw/samples/square/host/Makefile
+++ b/kw/samples/square/host/Makefile
@@ -3,7 +3,7 @@
 
 CFLAGS ?= -O3 -g -Wall -Wextra -std=c99
 CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include
-LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread
+LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread
 
 all:: build/square
 
diff --git a/kw/samples/square/testset.cfg b/kw/samples/square/testset.cfg
new file mode 100644
index 0000000..ff8e40b
--- /dev/null
+++ b/kw/samples/square/testset.cfg
@@ -0,0 +1,25 @@
+from plptest import *
+
+TestConfig = c = {}
+
+def check_output(config, output):
+
+  if output.find('all correct') == -1:
+    return (False, "Didn't find output string")
+
+  return (True, None)
+
+
+def get_test():
+  return Test(
+    name = 'square',
+    commands = [
+      Shell('build', 'make all run'),
+      Check('check', check_output)
+    ],
+    timeout=1000000
+  )
+  
+c['tests'] = [ ]
+
+c['tests'].append(get_test())
diff --git a/mb/cnn/kw/Makefile b/mb/cnn/kw/Makefile
index 368ec88..3dfb4d9 100644
--- a/mb/cnn/kw/Makefile
+++ b/mb/cnn/kw/Makefile
@@ -6,4 +6,4 @@ all:
 	$(MAKE) -C host
 
 run: all
-	host/build/conv pulp/build/system.oprecompkw/conv/conv
+	host/build/conv pulp/build/oprecompkw/conv/conv
diff --git a/mb/cnn/kw/host/Makefile b/mb/cnn/kw/host/Makefile
index d8622c9..4b61f75 100644
--- a/mb/cnn/kw/host/Makefile
+++ b/mb/cnn/kw/host/Makefile
@@ -3,7 +3,7 @@
 
 CFLAGS ?= -O3 -g -Wall -Wextra -std=c99
 CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include
-LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread
+LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread
 
 all:: build/conv
 
diff --git a/mb/cnn/kw/pulp/pulp.c b/mb/cnn/kw/pulp/pulp.c
index 0ea25ce..06ff842 100644
--- a/mb/cnn/kw/pulp/pulp.c
+++ b/mb/cnn/kw/pulp/pulp.c
@@ -56,7 +56,7 @@ int main(uint64_t wedptr) {
 
 	// Determine the tiling of the input data via a binary search. This is quite
 	// inefficient and should be improved.
-	const uint32_t max_size = 65536;
+	const uint32_t max_size = 65536 - 0x5000;
 	const uint32_t max_volume = (max_size - (16 << 10)) / 4;
 	printf("Determining Tiling\n");
 	printf("  max_volume = %lu\n", max_volume);
@@ -96,6 +96,11 @@ int main(uint64_t wedptr) {
 	float *tile_x = rt_alloc(RT_ALLOC_CL_DATA, bufsz_x);
 	float *tile_w = rt_alloc(RT_ALLOC_CL_DATA, bufsz_w);
 	float *tile_y = rt_alloc(RT_ALLOC_CL_DATA, bufsz_y);
+	if (tile_x == NULL || tile_y == NULL || tile_w == NULL)
+	{
+	  printf("Failed to allocate memory\n");
+	  return -1;
+	}
 
 	// Perform the outer iterations.
 	for (int32_t ko1 = 0; ko1 < wed.KO; ko1 += TKO) {
@@ -123,25 +128,31 @@ int main(uint64_t wedptr) {
 					const int32_t XM1 = m_hi-m_lo;
 
 					printf("Loading x tile ki1=%lu..%lu, n=%lu..%lu (%ld), m=%lu..%lu (%ld)\n", ki1, ki1+KI1, n_lo, n_hi, XN1, m_lo, m_hi, XM1);
+					int merge = 0;
 					for (int32_t ki2 = 0; ki2 < KI1; ++ki2) {
 						for (int32_t n2 = 0; n2 < XN1; ++n2) {
-							plp_dma_memcpy(
+							plp_dma_memcpy_merge(
 								host2local(wed.in_x + (ki1+ki2)*wed.N*wed.M*4 + (n2+n_lo)*wed.M*4 + m_lo*4),
 								(uint32_t)tile_x + ki2*XN1*XM1*4 + n2*XM1*4,
 								XM1*4,
-								PLP_DMA_EXT2LOC
+								PLP_DMA_EXT2LOC,
+								merge
 							);
+							merge = 1;
 						}
 					}
 					plp_dma_barrier();
+					merge = 0;
 					for (int32_t ko2 = 0; ko2 < KO1; ++ko2) {
 						for (int32_t ki2 = 0; ki2 < KI1; ++ki2) {
-							plp_dma_memcpy(
+							plp_dma_memcpy_merge(
 								host2local(wed.in_w + (ko1+ko2)*wed.KI*wed.U*wed.V*4 + (ki1+ki2)*wed.U*wed.V*4),
 								(uint32_t)tile_w + ko2*TKI*wed.U*wed.V*4 + ki2*wed.U*wed.V*4,
 								wed.U*wed.V*4,
-								PLP_DMA_EXT2LOC
+								PLP_DMA_EXT2LOC,
+								merge
 							);
+							merge = 1;
 						}
 					}
 					plp_dma_barrier();
@@ -171,14 +182,17 @@ int main(uint64_t wedptr) {
 				}
 
 				// Write result tile back.
+				int merge = 0;
 				for (int32_t ko2 = 0; ko2 < KO1; ++ko2) {
 					for (int32_t n2 = 0; n2 < N1; ++n2) {
-						plp_dma_memcpy(
+						plp_dma_memcpy_merge(
 							host2local(wed.out_y + (ko1+ko2)*wed.N*wed.M*4 + (n1+n2)*wed.M*4 + m1*4),
 							(uint32_t)tile_y + ko2*N1*M1*4 + n2*M1*4,
 							M1*4,
-							PLP_DMA_LOC2EXT
+							PLP_DMA_LOC2EXT,
+							merge
 						);
+						merge = 1;
 					}
 				}
 			}
diff --git a/mb/sparsesolve/Makefile b/mb/sparsesolve/Makefile
index 341c156..4f54651 100644
--- a/mb/sparsesolve/Makefile
+++ b/mb/sparsesolve/Makefile
@@ -11,7 +11,7 @@ LDFLAGS = -lm -lrt
 
 build: sparsesolve
 
-sparsesolve: sparsesolve.o mmio.o matrix.o cg.o ir.o oprecomp.o
+sparsesolve: sparsesolve.o mmio.o matrix.o oprecomp.o
 	$(CC) $^ $(LDFLAGS) -o $@
 
 clean:
@@ -21,9 +21,20 @@ tags: *.c *.h
 	ctags *.c *.h
 
 test: sparsesolve
-	./sparsesolve data/prepared/mb/sparsesolve/bcsstk01.mtx 1 1e-7 10000 1e-7 100
+	./sparsesolve data/prepared/mb/sparsesolve/bcsstk01.mtx 1 1e-7 10000 1e-7 52
 
-%.o: %.c cg.h vector.h matrix.h
+plots: bcsstk01.pdf gr_30_30.pdf msc10848.pdf
+
+%.pdf: %.eps
+	epstopdf $<
+
+%.eps: sparsesolve.plt %.dat
+	gnuplot -c $^ $* > $@
+
+%.dat: sparsesolve
+	for i in `seq 1 52`; do ./sparsesolve ../data/prepared/mb/sparsesolve/$*.mtx 10000 1e-7 8 $$i ; done | grep -v '#' > $@
+
+%.o: %.c floatm.h vector.h matrix.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 oprecomp.o: $(VPATH)../common/oprecomp.c
diff --git a/mb/sparsesolve/floatm.h b/mb/sparsesolve/floatm.h
new file mode 100644
index 0000000..2ebd7c2
--- /dev/null
+++ b/mb/sparsesolve/floatm.h
@@ -0,0 +1,30 @@
+#ifndef FLOATM
+#define FLOATM double
+#endif
+
+#ifndef DOUBLE
+#define DOUBLE double
+#endif
+
+static inline double double_truncate(uint8_t m, double y)
+{
+    if (m >= 52) return y; // nothing to do
+    if (!isfinite(y)) return y; // NaN and Inf does not need conversion
+    union { double d; uint64_t i; } c;
+    c.d = y;
+    c.i &= ~(((1UL << (52 - m)) - 1UL));
+    return c.d;
+}
+
+static inline float float_truncate(uint8_t m, float y)
+{
+    if (m >= 23) return y; // nothing to do
+    if (!isfinite(y)) return y; // NaN and Inf does not need conversion
+    union { float f; uint32_t i; } c;
+    c.f = y;
+    c.i &= ~(((1U << (23 - m)) - 1U));
+    return c.f;
+}
+
+#define TRUNCATE2(t, m, y) t ## _truncate(m, y)
+#define TRUNCATE(t, m, y) TRUNCATE2(t, m, y)
diff --git a/mb/sparsesolve/matrix.c b/mb/sparsesolve/matrix.c
index 5933c6c..452ea69 100644
--- a/mb/sparsesolve/matrix.c
+++ b/mb/sparsesolve/matrix.c
@@ -8,7 +8,7 @@
 #include "mmio.h"
 #include <tgmath.h> // interferes with mmio.h
 
-#include "cg.h"
+#include "floatm.h"
 #include "matrix.h"
 
 static int compar(const void *pa, const void *pb)
@@ -19,7 +19,8 @@ static int compar(const void *pa, const void *pb)
     if (a->i > b->i) return 1;
     if (a->j < b->j) return -1;
     if (a->j > b->j) return 1;
-    return 0;
+    fprintf(stderr, "Repeated matrix element %i %i\n", a->i, a->j);
+    exit(1);
 }
 
 void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a)
@@ -34,7 +35,7 @@ void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a)
         fprintf(stderr, "Could not process Matrix Market banner\n");
         exit(1);
     }
-    if (!mm_is_matrix(matcode) || !mm_is_sparse(matcode) || mm_is_complex(matcode) || !mm_is_symmetric(matcode)) {
+    if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) {
         fprintf(stderr, "This application does not support the Market Market type: %s\n",
                 mm_typecode_to_str(matcode));
         exit(1);
@@ -48,8 +49,11 @@ void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a)
         fprintf(stderr, "Matrix is not square\n");
         exit(1);
     }
-    struct matrix_coo *coo = ALLOC(struct matrix_coo, 2 * NZ);
-
+    struct matrix_coo *coo = malloc(sizeof(struct matrix_coo) * 2 * NZ);
+    if (coo == NULL) {
+        fprintf(stderr, "Out of memory\n");
+        exit(1);
+    }
     int k = 0;
     for (int l = 0; l < NZ; l++) {
         double real;
@@ -73,37 +77,6 @@ void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a)
     *n = N; *nz = k; *a = coo;
 }
 
-double coo_norm_inf(int n, int nz, struct matrix_coo *coo)
-{
-    double *amax = CALLOC(double, n);
-    for (int i = 0; i < nz; i++) {
-        int row = coo[i].i;
-        int val = coo[i].a;
-        amax[row] += abs(val);
-    }
-    double norm = 0.0;
-    for (int i = 0; i < n; i++) {
-        if (amax[i] > norm) norm = amax[i];
-    }
-    FREE(amax);
-    return norm;
-}
-
-double coo_max_nz(int n, int nz, struct matrix_coo *coo)
-{
-    double *m = CALLOC(double, n);
-    for (int i = 0; i < nz; i++) {
-        int row = coo[i].i;
-        m[row]++;
-    }
-    int r = 0.0;
-    for (int i = 0; i < n; i++) {
-        if (m[i] > r) r = m[i];
-    }
-    FREE(m);
-    return r;
-}
-
 // CSR matrix
 
 struct matrix_csr { struct matrix super; int *i; int *j; DOUBLE *A; };
@@ -118,10 +91,10 @@ void csr_dmult(struct matrix_csr *mat, DOUBLE *x, DOUBLE *y)
     }
 }
 
-void csr_smult(struct matrix_csr *mat, FLOAT *x, FLOAT *y)
+void csr_smult(struct matrix_csr *mat, FLOATM *x, FLOATM *y)
 {
     for (int k = 0; k < mat->super.n; k++) {
-        FLOAT2 t = 0.0;
+        FLOATM t = 0.0;
         for (int l = mat->i[k]; l < mat->i[k + 1]; l++)
             t += mat->A[l] * x[mat->j[l]];
         y[k] = t;
@@ -130,9 +103,9 @@ void csr_smult(struct matrix_csr *mat, FLOAT *x, FLOAT *y)
 
 struct matrix *csr_create(int n, int nz, struct matrix_coo *coo)
 {
-    int *i = ALLOC(int, n + 1);
-    int *j = ALLOC(int, nz);
-    DOUBLE *A = ALLOC(DOUBLE, nz);
+    int *i = malloc(sizeof(int) * (n + 1));
+    int *j = malloc(sizeof(int) * nz);
+    DOUBLE *A = malloc(sizeof(DOUBLE) * nz);
 
     i[0] = 0;
     int l = 0;
@@ -145,13 +118,13 @@ struct matrix *csr_create(int n, int nz, struct matrix_coo *coo)
         i[k + 1] = l;
     }
 
-    struct matrix_csr *mat = ALLOC(struct matrix_csr, 1);
+    struct matrix_csr *mat = malloc(sizeof(struct matrix_csr));
     mat->super.n = n;
     mat->i = i;
     mat->j = j;
     mat->A = A;
     mat->super.dmult = (void (*)(struct matrix *, DOUBLE *, DOUBLE *))csr_dmult;
-    mat->super.smult = (void (*)(struct matrix *, FLOAT *, FLOAT *))csr_smult;
+    mat->super.smult = (void (*)(struct matrix *, FLOATM *, FLOATM *))csr_smult;
     return (struct matrix *)mat;
 }
 
@@ -169,10 +142,10 @@ void dense_dmult(struct matrix_dense *mat, DOUBLE *x, DOUBLE *y)
     }
 }
 
-void dense_smult(uint8_t m, struct matrix_dense *mat, FLOAT *x, FLOAT *y)
+void dense_smult(struct matrix_dense *mat, FLOATM *x, FLOATM *y)
 {
     for (int i = 0; i < mat->super.n; i++) {
-        FLOAT2 t = 0.0;
+        FLOATM t = 0.0;
         for (int j = 0; j < mat->super.n; j++)
             t += mat->A[i * mat->super.n + j] * x[j];
         y[i] = t;
@@ -181,42 +154,43 @@ void dense_smult(uint8_t m, struct matrix_dense *mat, FLOAT *x, FLOAT *y)
 
 struct matrix *dense_create(int n, int nz, struct matrix_coo *coo)
 {
-    DOUBLE *A = CALLOC(DOUBLE, n * n);
+    DOUBLE *A = calloc(n * n, sizeof(DOUBLE));
 
     // row major format
     for (int k = 0; k < nz; k++) A[coo[k].i * n + coo[k].j] = coo[k].a;
 
-    struct matrix_dense *mat = ALLOC(struct matrix_dense, 1);
+    struct matrix_dense *mat = malloc(sizeof(struct matrix_dense));
     mat->super.n = n;
     mat->super.dmult = (void (*)(struct matrix *, DOUBLE *, DOUBLE *))dense_dmult;
-    mat->super.smult = (void (*)(struct matrix *, FLOAT *, FLOAT *))dense_smult;
+    mat->super.smult = (void (*)(struct matrix *, FLOATM *, FLOATM *))dense_smult;
     mat->A = A;
     return (struct matrix *)mat;
 }
 
 // Jacobi preconditioner
 
-struct precond_jacobi { struct matrix super; FLOAT *d; };
+struct precond_jacobi { struct matrix super; int bits; FLOATM *d; };
 
-void jacobi_smult(struct precond_jacobi *pre, FLOAT *x, FLOAT *y)
+void jacobi_smult(struct precond_jacobi *pre, FLOATM *x, FLOATM *y)
 {
     for (int k = 0; k < pre->super.n; k++) {
-       y[k] = x[k] / pre->d[k];
+       y[k] = TRUNCATE(FLOATM, pre->bits, x[k] / pre->d[k]);
     }
 }
 
-struct matrix *jacobi_create(int n, int nz, struct matrix_coo *coo)
+struct matrix *jacobi_create(int bits, int n, int nz, struct matrix_coo *coo)
 {
-    FLOAT *d = ALLOC(FLOAT, n);
+    FLOATM *d = malloc(sizeof(FLOATM) * n);
     for (int k = 0; k < n; k++) d[k] = 0.0;
     for (int k = 0; k < nz; k++)
         if (coo[k].i == coo[k].j)
-            d[coo[k].i] = coo[k].a;
+            d[coo[k].i] = TRUNCATE(FLOATM, bits, coo[k].a);
 
-    struct precond_jacobi *pre = ALLOC(struct precond_jacobi, 1);
+    struct precond_jacobi *pre = malloc(sizeof(struct precond_jacobi));
     pre->super.n = n;
     pre->super.dmult = NULL;
-    pre->super.smult = (void (*)(struct matrix *, FLOAT *, FLOAT *))jacobi_smult;
+    pre->super.smult = (void (*)(struct matrix *, FLOATM *, FLOATM *))jacobi_smult;
+    pre->bits = bits;
     pre->d = d;
     return (struct matrix *)pre;
 }
diff --git a/mb/sparsesolve/matrix.h b/mb/sparsesolve/matrix.h
index 1705e52..81a6db7 100644
--- a/mb/sparsesolve/matrix.h
+++ b/mb/sparsesolve/matrix.h
@@ -4,20 +4,17 @@ struct matrix_coo { int i; int j; double a; };
 
 extern void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **coo);
 
-extern double coo_norm_inf(int n, int nz, struct matrix_coo *coo);
-extern double coo_max_nz(int n, int nz, struct matrix_coo *coo);
-
 struct matrix {
     int n;
     void (*dmult)(struct matrix *, DOUBLE *, DOUBLE *);
-    void (*smult)(struct matrix *, FLOAT *, FLOAT *);
+    void (*smult)(struct matrix *, FLOATM *, FLOATM *);
 };
 
 static inline void matrix_mult(struct matrix *mat, DOUBLE *x, DOUBLE *y) {
     mat->dmult(mat, x, y);
 }
 
-static inline void floatm_mult(struct matrix *mat, FLOAT *x, FLOAT *y) {
+static inline void floatm_mult(struct matrix *mat, FLOATM *x, FLOATM *y) {
     mat->smult(mat, x, y);
 }
 
@@ -25,4 +22,4 @@ extern struct matrix *csr_create(int n, int nz, struct matrix_coo *coo);
 
 extern struct matrix *dense_create(int n, int nz, struct matrix_coo *coo);
 
-extern struct matrix *jacobi_create(int n, int nz, struct matrix_coo *coo);
+extern struct matrix *jacobi_create(int bits, int n, int nz, struct matrix_coo *coo);
diff --git a/mb/sparsesolve/sparsesolve.c b/mb/sparsesolve/sparsesolve.c
index c3e5d3b..7441ee6 100644
--- a/mb/sparsesolve/sparsesolve.c
+++ b/mb/sparsesolve/sparsesolve.c
@@ -3,10 +3,8 @@
 #include <stdint.h>
 #include <tgmath.h>
 #include <float.h>
-#include <stdbool.h>
-#include <string.h>
 
-#include "cg.h"
+#include "floatm.h"
 #include "vector.h"
 #include "matrix.h"
 #include "oprecomp.h"
@@ -18,10 +16,69 @@ void iterative_refinement(int n, struct matrix *A, struct matrix *M, DOUBLE *b,
         int out_maxiter, DOUBLE out_tol, int in_maxiter, DOUBLE in_tol, int step_check,
         int *out_iter, int *in_iter);
 
+// #define USE_DENSE
+// #define USE_PRECOND
+
+int conjugate_gradient(uint8_t bits, int n, struct matrix *A, struct matrix *M, FLOATM *b, FLOATM *x, int maxiter, FLOATM umbral)
+{
+    int iter = 0;
+    FLOATM alpha, beta, rho, tau, tol;
+
+    FLOATM *r = ALLOC(FLOATM, n);
+    FLOATM *p = ALLOC(FLOATM, n);
+    FLOATM *z = ALLOC(FLOATM, n);
+
+    floatm_set(bits, n, 1.0, x);
+    floatm_mult(A, x, r);
+    floatm_xpby(bits, n, b, -1.0, r); // r = b - Ax
+
+    if (M) {
+        floatm_mult(M, r, p);
+        rho = floatm_dot(n, r, p);
+        tol = floatm_norm2(n, r);
+    } else { // small optimization
+        floatm_copy(bits, n, r, p);
+        rho = floatm_dot(n, r, r);
+        tol = sqrt(rho);
+    }
+
+    while ((iter < maxiter) && (tol > umbral)) {
+        // alpha = (r,z) / (Ap,p)
+        floatm_mult(A, p, z);
+        alpha = rho / floatm_dot(n, z, p);
+        // x = x + alpha * p
+        floatm_axpy(bits, n, alpha, p, x);
+        // r = r - alpha * Ap
+        floatm_axpy(bits, n, -alpha, z, r);
+        // apply preconditioner
+        if (M) {
+            floatm_mult(M, r, z);
+            tau = floatm_dot(n, r, z);
+            tol = floatm_norm2(n, r);
+        } else {
+            tau = floatm_dot(n, r, r);
+            tol = sqrt(tau);
+        }
+        // beta = (r,z) / rho
+        beta =  tau / rho;
+        rho = tau;
+        // p = z + beta * p
+        if (M) floatm_xpby(bits, n, z, beta, p);
+        else floatm_xpby(bits, n, r, beta, p);
+        iter++;
+        // printf ("(%d,%20.10e)\n", iter, tol);
+    }
+
+    free(r);
+    free(p);
+    free(z);
+    return iter;
+}
+
 int main (int argc, char *argv[])
 {
     if (argc != 7) {
-        fprintf(stderr, "Missing arguments: algorithm matrix_file out_its out_tol in_its in_tol step_chk\n");
+        fprintf(stderr, "Missing arguments: matrix_file out_its out_tol in_its in_tol mantissa_bits\n");
         return 1;
     }
 
@@ -37,12 +94,10 @@ int main (int argc, char *argv[])
 
     DOUBLE *x = ALLOC(DOUBLE, n);
     DOUBLE *b = ALLOC(DOUBLE, n);
-    DOUBLE *r = ALLOC(DOUBLE, n);
-    DOUBLE *s = ALLOC(DOUBLE, n);
-
-    vector_rand(n, s);
-    // vector_set(n, 1.0 / sqrt(n), s);
-    matrix_mult(A, s, b);
+    DOUBLE *e = ALLOC(DOUBLE, n);
+    FLOATM *r = ALLOC(FLOATM, n);
+    FLOATM *d = ALLOC(FLOATM, n);
+    vector_set(n, 1.0, b);
 
     int out_maxiter = atoi(argv[2]);
     DOUBLE out_tol = atof(argv[3]);
@@ -51,58 +106,50 @@ int main (int argc, char *argv[])
     int in_maxiter = atoi(argv[4]);
     DOUBLE in_tol = atof(argv[5]);
     if (in_tol <= 0.0) in_tol = 1.0e-8;
-    int step_check = atoi(argv[6]);
-
-    DOUBLE max = 0;
-    for (int i = 0; i < nz; i++) {
-        DOUBLE x1 = coo[i].a;
-        DOUBLE x2 = x1;
-        DOUBLE error = (x1 - x2) / x1;
-        if (error > max) max = error;
-    }
+
+    uint8_t bits = atoi(argv[6]);
 
 #ifdef USE_PRECOND
-    struct matrix *M = jacobi_create(n, nz, coo);
+    struct matrix *M = jacobi_create(bits, n, nz, coo);
 #else
     struct matrix *M = NULL;
 #endif
-    DOUBLE norm = coo_norm_inf(n, nz, coo);
     free(coo);
 
-    printf("# algorithm: %s\n", argv[1]);
-    printf("# sizeof FLOAT: %lu\n", sizeof(FLOAT));
-    // printf("# roundoff: %e\n", epsilon(bits));
-    printf("# matrix: %s\n", argv[2]);
-    printf("# problem_size: %d\n", n);
-    printf("# nnz: %d\n", nz);
-    printf("# matrix_norm: %e\n", (double)norm);
-    printf("# matrix_error: %e\n", (double)max);
-    printf("# bnorm: %e\n", (double)vector_norm2(n, b));
+    int out_iter, in_iter;
+    DOUBLE residual;
 
-    vector_rand(n, x);
+    oprecomp_start();
+    do {
 
-    int out_iter = 0, in_iter = 0;
+    out_iter = 0; in_iter = 0;
+    vector_set(n, 0.0, x);
+    vector_copy(bits, n, b, r);
 
-    oprecomp_start();
     do {
+        in_iter += conjugate_gradient(bits, n, A, M, r, d, in_maxiter, in_tol);
+
+        vector_axpy(n, 1.0, d, x); // x = x + d
+        matrix_mult(A, x, e);
+        vector_xpby(n, b, -1.0, e);
+        residual = vector_norm2(n, e);
+        vector_copy(bits, n, e, r); // r = b - Ax
 
-        iterative_refinement(n, A, M, b, x, out_maxiter, out_tol, in_maxiter, in_tol, step_check,
-                             &out_iter, &in_iter);
+        out_iter++;
+        printf("[%i] %e\n", in_iter, residual);
+    } while ((residual > out_tol) && (out_iter < out_maxiter));
 
     } while (oprecomp_iterate());
     oprecomp_stop();
 
-    matrix_mult(A, x, r);
-    vector_xpby(n, b, -1.0, r);
-    DOUBLE residual = vector_norm2(n, r);
-    DOUBLE normalized_residual = residual / (vector_norm2(n, x) * norm);
-
+    printf("# matrix: %s\n", argv[1]);
+    printf("# problem_size: %d\n", n);
     printf("# outer_maxiter: %d\n", out_maxiter);
-    printf("# outer_tolerance: %e\n", (double)out_tol);
+    printf("# outer_tolerance: %e\n", out_tol);
     printf("# inner_maxiter: %d\n", in_maxiter);
-    printf("# inner_tolerance: %e\n", (double)in_tol);
+    printf("# inner_tolerance: %e\n", in_tol);
+    printf("# precision: %d\n", bits);
     printf("# outer_iterations: %d\n", out_iter);
     printf("# inner_iterations: %d\n", in_iter);
-    printf("# residual: %e\n", (double)residual);
-    printf("# normalized_residual: %e\n", (double)normalized_residual);
+    printf("# residual: %e\n", residual);
 }
diff --git a/mb/sparsesolve/vector.h b/mb/sparsesolve/vector.h
index cf2e906..c0e466b 100644
--- a/mb/sparsesolve/vector.h
+++ b/mb/sparsesolve/vector.h
@@ -4,18 +4,6 @@ static inline void vector_set(int n, DOUBLE a, DOUBLE *x) {
     for (int i = 0; i < n; i++) x[i] = a;
 }
 
-static inline void vector_rand(int n, DOUBLE *x) {
-    for (int i = 0; i < n; i++) x[i] = rand() / (double)RAND_MAX;
-}
-
-static inline void vector_copy(int n, DOUBLE *x, DOUBLE *y) {
-    for (int i = 0; i < n; i++) y[i] = x[i];
-}
-
-static inline void vector_axpy(int n, DOUBLE a, DOUBLE *x, DOUBLE *y) {
-    for (int i = 0; i < n; i++) y[i] = y[i] + x[i] * a;
-}
-
 static inline void vector_xpby(int n, DOUBLE *x, DOUBLE b, DOUBLE *y) {
     for (int i = 0; i < n; i++) y[i] = x[i] + b * y[i];
 }
@@ -32,68 +20,41 @@ static inline DOUBLE vector_norm2(int n, DOUBLE *x) {
 
 // limited precision version
 
-static inline void floatm_set(int n, FLOAT a, FLOAT *x) {
-    FLOAT b = a;
+static inline void floatm_set(uint8_t m, int n, FLOATM a, FLOATM *x) {
+    FLOATM b = TRUNCATE(FLOATM, m, a);
     for (int i = 0; i < n; i++) x[i] = b;
 }
 
-static inline void floatm_rand(int n, FLOAT *x) {
-    for (int i = 0; i < n; i++) x[i] = rand() / (double)RAND_MAX;
-}
-
-static inline void floatm_copy(int n, FLOAT *x, FLOAT *y) {
-    for (int i = 0; i < n; i++) y[i] = x[i];
+static inline void floatm_copy(uint8_t m, int n, FLOATM *x, FLOATM *y) {
+    for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, x[i]);
 }
 
-static inline void floatm_axpy(int n, FLOAT a, FLOAT *x, FLOAT *y) {
-    for (int i = 0; i < n; i++) y[i] = y[i] + x[i] * a;
+static inline void floatm_axpy(uint8_t m, int n, FLOATM a, FLOATM *x, FLOATM *y) {
+    for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, y[i] + x[i] * a);
 }
 
-static inline void floatm_xpby(int n, FLOAT *x, FLOAT b, FLOAT *y) {
-    for (int i = 0; i < n; i++) y[i] = x[i] + b * y[i];
+static inline void floatm_xpby(uint8_t m, int n, FLOATM *x, FLOATM b, FLOATM *y) {
+    for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, x[i] + b * y[i]);
 }
 
-// the dot product can be computed with more precision than FLOAT
+// the dot product is computed with full precision
 
-static inline FLOAT2 floatm_dot(int n, FLOAT *x, FLOAT *y) {
-    FLOAT2 r = 0.0;
+static inline FLOATM floatm_dot(int n, FLOATM *x, FLOATM *y) {
+    FLOATM r = 0.0;
     for (int i = 0; i < n; i++) r += y[i] * x[i];
     return r;
 }
 
-static inline FLOAT2 floatm_norm2(int n, FLOAT *x) {
+static inline FLOATM floatm_norm2(int n, FLOATM *x) {
     return sqrt(floatm_dot(n, x, x));
 }
 
-static inline FLOAT2 floatm_axpy_dot(int n, FLOAT a, FLOAT *x, FLOAT *y) {
-    FLOAT2 r = 0.0;
-    for (int i = 0; i < n; i++) {
-        FLOAT2 t = y[i] + x[i] * a;
-        y[i] = t;
-        r += t * t;
-    }
-    return r;
-}
-
-static inline FLOAT2 floatm_diff_norm2(int n, FLOAT *x, FLOAT *y) {
-    FLOAT2 r = 0.0;
-    for (int i = 0; i < n; i++) {
-        FLOAT2 t = x[i] - y[i];
-        r += t * t;
-    }
-    return sqrt(r);
-}
-
 // mixed precision routines
 
-static inline void mixed_copy(int n, DOUBLE *x, FLOAT *y) {
-    for (int i = 0; i < n; i++) y[i] = x[i];
+static inline void vector_copy(uint8_t m, int n, DOUBLE *x, FLOATM *y) {
+    for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, x[i]);
 }
 
-static inline void mixed_axpy(int n, DOUBLE a, FLOAT *x, DOUBLE *y) {
+static inline void vector_axpy(int n, DOUBLE a, FLOATM *x, DOUBLE *y) {
     for (int i = 0; i < n; i++) y[i] += x[i] * a;
 }
-
-static inline void mixed_xpby(int n, DOUBLE *x, DOUBLE b, FLOAT *y) {
-    for (int i = 0; i < n; i++) y[i] = x[i] + b * y[i];
-}
diff --git a/mb/svm/baseline/README.md b/mb/svm/baseline/README.md
index a85ffbf..d2742a0 100644
--- a/mb/svm/baseline/README.md
+++ b/mb/svm/baseline/README.md
@@ -72,7 +72,6 @@ If you use ThunderSVM in your paper, please cite our work ([preprint now availab
 ```
 ## Related websites
 * [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) | [SVM<sup>light</sup>](http://svmlight.joachims.org/) | [OHD-SVM](https://github.com/OrcusCZ/OHD-SVM) | [NVIDIA Machine Learning](http://www.nvidia.com/object/machine-learning.html)
- 
 
 ## Acknowlegement 
 * We acknowledge NVIDIA for their hardware donations.
diff --git a/testset.cfg b/testset.cfg
new file mode 100644
index 0000000..5b28bc6
--- /dev/null
+++ b/testset.cfg
@@ -0,0 +1,14 @@
+from plptest import *
+
+TestConfig = c = {}
+
+quick = Testset(
+  name  = 'oprecomp',
+  files = [
+    'kw/samples/nop/testset.cfg',
+    'kw/samples/square/testset.cfg'
+  ],
+  restrict = 'config.get_str("**/chip/name") == "oprecompkw"'
+)
+
+c['testsets'] = [ quick ]