From 80133e667226124f7a28d00283cbdba03a00d832 Mon Sep 17 00:00:00 2001 From: EID Date: Fri, 31 Aug 2018 18:33:35 +0200 Subject: [PATCH] Update microbenchmarks 04/2020 --- .gitignore | 1 + kw/boot/Makefile | 6 +- kw/boot/boot.c | 2 +- kw/boot/link.ld | 2 +- kw/libcxl/libcxl.c | 6 +- kw/liboprecomp/Makefile | 3 +- kw/mb/blstm/Makefile | 2 +- kw/mb/blstm/host/Makefile | 2 +- kw/samples/nop/Makefile | 2 +- kw/samples/nop/host/Makefile | 2 +- kw/samples/nop/testset.cfg | 25 ++++++ kw/samples/square/Makefile | 2 +- kw/samples/square/host/Makefile | 2 +- kw/samples/square/testset.cfg | 25 ++++++ mb/cnn/kw/Makefile | 2 +- mb/cnn/kw/host/Makefile | 2 +- mb/cnn/kw/pulp/pulp.c | 28 +++++-- mb/sparsesolve/Makefile | 17 +++- mb/sparsesolve/floatm.h | 30 +++++++ mb/sparsesolve/matrix.c | 86 +++++++------------- mb/sparsesolve/matrix.h | 9 +-- mb/sparsesolve/sparsesolve.c | 135 +++++++++++++++++++++----------- mb/sparsesolve/vector.h | 69 ++++------------ mb/svm/baseline/README.md | 1 - testset.cfg | 14 ++++ 25 files changed, 287 insertions(+), 188 deletions(-) create mode 100644 kw/samples/nop/testset.cfg create mode 100644 kw/samples/square/testset.cfg create mode 100644 mb/sparsesolve/floatm.h create mode 100644 testset.cfg diff --git a/.gitignore b/.gitignore index 0993400..6c030ad 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ *.out *.log __pycache__ +*.sw[p,o] diff --git a/kw/boot/Makefile b/kw/boot/Makefile index 836f42c..e57462b 100644 --- a/kw/boot/Makefile +++ b/kw/boot/Makefile @@ -4,11 +4,11 @@ # Builds the PULP bootloader without the run time. We need a separate link # script since the code is loacted at a different static location (boot ROM). -PULP_APP = boot-oprecompkw +PULP_APP = rom-oprecompkw PULP_APP_FC_SRCS = boot.c PULP_APP_FC_ASM_SRCS = crt0.S PULP_CFLAGS = -O3 -g -fno-jump-tables -PULP_LDFLAGS = -Tlink.ld -nostdlib -CONFIG_OPT = options/rt/no-crt0=true options/rt/no-link-script=true options/rt/no-rt=true +PULP_LDFLAGS = -Tlink.ld -nostdlib -lgcc +CONFIG_OPT = rt/no-crt0=true rt/no-link-script=true rt/no-rt=true include $(PULP_SDK_HOME)/install/rules/pulp_rt.mk diff --git a/kw/boot/boot.c b/kw/boot/boot.c index ac759a9..4791ccd 100644 --- a/kw/boot/boot.c +++ b/kw/boot/boot.c @@ -26,7 +26,7 @@ inline uint64_t host2local(uint64_t host) { // Rudimentary putc/puts for output. -#define boot_putc_address (STDOUT_BASE_ADDR + STDOUT_PUTC_OFFSET + (hal_core_id()<<3) + (hal_cluster_id()<<7)) +#define boot_putc_address (ARCHI_STDOUT_ADDR + STDOUT_PUTC_OFFSET + (hal_core_id()<<3) + (hal_cluster_id()<<7)) static void boot_putc(unsigned enable, unsigned int c) { if (enable) { diff --git a/kw/boot/link.ld b/kw/boot/link.ld index 33ea9d1..5bc895d 100644 --- a/kw/boot/link.ld +++ b/kw/boot/link.ld @@ -6,7 +6,7 @@ __DYNAMIC = 0; MEMORY { ROM : ORIGIN = 0x1A000000, LENGTH = 0x2000 - L1 : ORIGIN = 0x10000000, LENGTH = 0x10000 + L1 : ORIGIN = 0x10000004, LENGTH = 0xFFFC L2 : ORIGIN = 0x1C000000, LENGTH = 0x4000 } diff --git a/kw/libcxl/libcxl.c b/kw/libcxl/libcxl.c index b2f57f7..df09b18 100644 --- a/kw/libcxl/libcxl.c +++ b/kw/libcxl/libcxl.c @@ -9,7 +9,7 @@ #include #include "libcxl.h" -#include "gv/gv_launcher.h" +#include "vp/launcher.h" struct cxl_afu_h { @@ -119,14 +119,14 @@ afu_open(struct cxl_afu_h *afu) { } // Configure the handles into the platform. - afu->capi_binding = gv_ioreq_binding(afu->gv, "soc/soc_ico", (void *)((uint64_t)1 << 48), (uint64_t)1 << 48, capi_callback, afu); + afu->capi_binding = gv_ioreq_binding(afu->gv, "soc/host_injector", (void *)((uint64_t)1 << 48), (uint64_t)1 << 48, capi_callback, afu); // afu->capi_binding = gv_ioreq_binding(afu->gv, "soc/soc_ico", (void *)0x1a600000, 0x00100000, capi_callback, afu); if (afu->capi_binding == NULL) { fprintf(stderr, "libcxl error: unable to establish CAPI binding (gv_ioreq_binding)\n"); goto fail; } - afu->wed_binding = gv_ioreq_binding(afu->gv, "job_fifo", NULL, 0, NULL, NULL); + afu->wed_binding = gv_ioreq_binding(afu->gv, "job_fifo_injector", NULL, 0, NULL, NULL); if (afu->wed_binding == NULL) { fprintf(stderr, "libcxl error: unable to establish WED FIFO binding (gv_ioreq_binding)\n"); goto fail; diff --git a/kw/liboprecomp/Makefile b/kw/liboprecomp/Makefile index 983a1b4..6700196 100644 --- a/kw/liboprecomp/Makefile +++ b/kw/liboprecomp/Makefile @@ -38,4 +38,5 @@ install: install -D build/liboprecomp.a $(PULP_SDK_HOME)/install/ws/lib install -D build/liboprecomp.so $(PULP_SDK_HOME)/install/ws/lib install -D liboprecomp.h $(PULP_SDK_HOME)/install/ws/include - plpinfo tree --config-options="gvsoc/load-binary_eval=os.environ.get('PULP_SDK_HOME') + '/install/bin/boot-oprecompkw'" > $(PULP_SDK_HOME)/install/cfg/oprecompkw_config.json + mkdir -p $(PULP_SDK_HOME)/install/cfg + plpinfo tree --config-options="plt_loader/load-binary_eval=os.environ.get('PULP_SDK_HOME') + '/install/bin/rom-oprecompkw'" --config-options="soc/cluster/pe0/fetch_enable=true" --config-options="chip/boot_from_rom=true" --config-options="loader/boot/mode=rom" > $(PULP_SDK_HOME)/install/cfg/oprecompkw_config.json diff --git a/kw/mb/blstm/Makefile b/kw/mb/blstm/Makefile index 0f12f2a..9606865 100644 --- a/kw/mb/blstm/Makefile +++ b/kw/mb/blstm/Makefile @@ -6,4 +6,4 @@ all: $(MAKE) -C host run: all - host/build/blstm pulp/build/system.oprecompkw/blstm/blstm + host/build/blstm pulp/build/oprecompkw/blstm/blstm diff --git a/kw/mb/blstm/host/Makefile b/kw/mb/blstm/host/Makefile index 7e96468..7b3f5c0 100644 --- a/kw/mb/blstm/host/Makefile +++ b/kw/mb/blstm/host/Makefile @@ -5,7 +5,7 @@ CFLAGS ?= -O3 -g -Wall -Wextra -std=c99 CPPFLAGS ?= -O3 -g -Wall -Wextra -std=c++11 -fopenmp CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include CPPFLAGS += -I$(PULP_SDK_HOME)/install/ws/include -I./include -LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread +LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread all: clean build/neuron build/blstm diff --git a/kw/samples/nop/Makefile b/kw/samples/nop/Makefile index 19fc89e..ec0bb3f 100644 --- a/kw/samples/nop/Makefile +++ b/kw/samples/nop/Makefile @@ -6,4 +6,4 @@ all: $(MAKE) -C host run: all - host/build/nop pulp/build/system.oprecompkw/nop/nop + host/build/nop pulp/build/oprecompkw/nop/nop diff --git a/kw/samples/nop/host/Makefile b/kw/samples/nop/host/Makefile index 56abd74..34d64b4 100644 --- a/kw/samples/nop/host/Makefile +++ b/kw/samples/nop/host/Makefile @@ -3,7 +3,7 @@ CFLAGS ?= -O3 -g -Wall -Wextra CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include -LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread +LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread all:: build/nop diff --git a/kw/samples/nop/testset.cfg b/kw/samples/nop/testset.cfg new file mode 100644 index 0000000..8a8d615 --- /dev/null +++ b/kw/samples/nop/testset.cfg @@ -0,0 +1,25 @@ +from plptest import * + +TestConfig = c = {} + +def check_output(config, output): + + if output.find('Hello from the loaded PULP binary!') == -1: + return (False, "Didn't find output string") + + return (True, None) + + +def get_test(): + return Test( + name = 'nop', + commands = [ + Shell('build', 'make all run'), + Check('check', check_output) + ], + timeout=1000000000 + ) + +c['tests'] = [ ] + +c['tests'].append(get_test()) diff --git a/kw/samples/square/Makefile b/kw/samples/square/Makefile index b927841..ebd9db4 100644 --- a/kw/samples/square/Makefile +++ b/kw/samples/square/Makefile @@ -6,4 +6,4 @@ all: $(MAKE) -C host run: all - host/build/square pulp/build/system.oprecompkw/square/square + host/build/square pulp/build/oprecompkw/square/square diff --git a/kw/samples/square/host/Makefile b/kw/samples/square/host/Makefile index 6406125..0fc007f 100644 --- a/kw/samples/square/host/Makefile +++ b/kw/samples/square/host/Makefile @@ -3,7 +3,7 @@ CFLAGS ?= -O3 -g -Wall -Wextra -std=c99 CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include -LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread +LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread all:: build/square diff --git a/kw/samples/square/testset.cfg b/kw/samples/square/testset.cfg new file mode 100644 index 0000000..ff8e40b --- /dev/null +++ b/kw/samples/square/testset.cfg @@ -0,0 +1,25 @@ +from plptest import * + +TestConfig = c = {} + +def check_output(config, output): + + if output.find('all correct') == -1: + return (False, "Didn't find output string") + + return (True, None) + + +def get_test(): + return Test( + name = 'square', + commands = [ + Shell('build', 'make all run'), + Check('check', check_output) + ], + timeout=1000000 + ) + +c['tests'] = [ ] + +c['tests'].append(get_test()) diff --git a/mb/cnn/kw/Makefile b/mb/cnn/kw/Makefile index 368ec88..3dfb4d9 100644 --- a/mb/cnn/kw/Makefile +++ b/mb/cnn/kw/Makefile @@ -6,4 +6,4 @@ all: $(MAKE) -C host run: all - host/build/conv pulp/build/system.oprecompkw/conv/conv + host/build/conv pulp/build/oprecompkw/conv/conv diff --git a/mb/cnn/kw/host/Makefile b/mb/cnn/kw/host/Makefile index d8622c9..4b61f75 100644 --- a/mb/cnn/kw/host/Makefile +++ b/mb/cnn/kw/host/Makefile @@ -3,7 +3,7 @@ CFLAGS ?= -O3 -g -Wall -Wextra -std=c99 CFLAGS += -I$(PULP_SDK_HOME)/install/ws/include -LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lgvlauncher -lpthread +LDFLAGS ?= -L$(PULP_SDK_HOME)/install/ws/lib -loprecomp -lcxl -lpulpvplauncher -lpthread all:: build/conv diff --git a/mb/cnn/kw/pulp/pulp.c b/mb/cnn/kw/pulp/pulp.c index 0ea25ce..06ff842 100644 --- a/mb/cnn/kw/pulp/pulp.c +++ b/mb/cnn/kw/pulp/pulp.c @@ -56,7 +56,7 @@ int main(uint64_t wedptr) { // Determine the tiling of the input data via a binary search. This is quite // inefficient and should be improved. - const uint32_t max_size = 65536; + const uint32_t max_size = 65536 - 0x5000; const uint32_t max_volume = (max_size - (16 << 10)) / 4; printf("Determining Tiling\n"); printf(" max_volume = %lu\n", max_volume); @@ -96,6 +96,11 @@ int main(uint64_t wedptr) { float *tile_x = rt_alloc(RT_ALLOC_CL_DATA, bufsz_x); float *tile_w = rt_alloc(RT_ALLOC_CL_DATA, bufsz_w); float *tile_y = rt_alloc(RT_ALLOC_CL_DATA, bufsz_y); + if (tile_x == NULL || tile_y == NULL || tile_w == NULL) + { + printf("Failed to allocate memory\n"); + return -1; + } // Perform the outer iterations. for (int32_t ko1 = 0; ko1 < wed.KO; ko1 += TKO) { @@ -123,25 +128,31 @@ int main(uint64_t wedptr) { const int32_t XM1 = m_hi-m_lo; printf("Loading x tile ki1=%lu..%lu, n=%lu..%lu (%ld), m=%lu..%lu (%ld)\n", ki1, ki1+KI1, n_lo, n_hi, XN1, m_lo, m_hi, XM1); + int merge = 0; for (int32_t ki2 = 0; ki2 < KI1; ++ki2) { for (int32_t n2 = 0; n2 < XN1; ++n2) { - plp_dma_memcpy( + plp_dma_memcpy_merge( host2local(wed.in_x + (ki1+ki2)*wed.N*wed.M*4 + (n2+n_lo)*wed.M*4 + m_lo*4), (uint32_t)tile_x + ki2*XN1*XM1*4 + n2*XM1*4, XM1*4, - PLP_DMA_EXT2LOC + PLP_DMA_EXT2LOC, + merge ); + merge = 1; } } plp_dma_barrier(); + merge = 0; for (int32_t ko2 = 0; ko2 < KO1; ++ko2) { for (int32_t ki2 = 0; ki2 < KI1; ++ki2) { - plp_dma_memcpy( + plp_dma_memcpy_merge( host2local(wed.in_w + (ko1+ko2)*wed.KI*wed.U*wed.V*4 + (ki1+ki2)*wed.U*wed.V*4), (uint32_t)tile_w + ko2*TKI*wed.U*wed.V*4 + ki2*wed.U*wed.V*4, wed.U*wed.V*4, - PLP_DMA_EXT2LOC + PLP_DMA_EXT2LOC, + merge ); + merge = 1; } } plp_dma_barrier(); @@ -171,14 +182,17 @@ int main(uint64_t wedptr) { } // Write result tile back. + int merge = 0; for (int32_t ko2 = 0; ko2 < KO1; ++ko2) { for (int32_t n2 = 0; n2 < N1; ++n2) { - plp_dma_memcpy( + plp_dma_memcpy_merge( host2local(wed.out_y + (ko1+ko2)*wed.N*wed.M*4 + (n1+n2)*wed.M*4 + m1*4), (uint32_t)tile_y + ko2*N1*M1*4 + n2*M1*4, M1*4, - PLP_DMA_LOC2EXT + PLP_DMA_LOC2EXT, + merge ); + merge = 1; } } } diff --git a/mb/sparsesolve/Makefile b/mb/sparsesolve/Makefile index 341c156..4f54651 100644 --- a/mb/sparsesolve/Makefile +++ b/mb/sparsesolve/Makefile @@ -11,7 +11,7 @@ LDFLAGS = -lm -lrt build: sparsesolve -sparsesolve: sparsesolve.o mmio.o matrix.o cg.o ir.o oprecomp.o +sparsesolve: sparsesolve.o mmio.o matrix.o oprecomp.o $(CC) $^ $(LDFLAGS) -o $@ clean: @@ -21,9 +21,20 @@ tags: *.c *.h ctags *.c *.h test: sparsesolve - ./sparsesolve data/prepared/mb/sparsesolve/bcsstk01.mtx 1 1e-7 10000 1e-7 100 + ./sparsesolve data/prepared/mb/sparsesolve/bcsstk01.mtx 1 1e-7 10000 1e-7 52 -%.o: %.c cg.h vector.h matrix.h +plots: bcsstk01.pdf gr_30_30.pdf msc10848.pdf + +%.pdf: %.eps + epstopdf $< + +%.eps: sparsesolve.plt %.dat + gnuplot -c $^ $* > $@ + +%.dat: sparsesolve + for i in `seq 1 52`; do ./sparsesolve ../data/prepared/mb/sparsesolve/$*.mtx 10000 1e-7 8 $$i ; done | grep -v '#' > $@ + +%.o: %.c floatm.h vector.h matrix.h $(CC) $(CFLAGS) -c $< -o $@ oprecomp.o: $(VPATH)../common/oprecomp.c diff --git a/mb/sparsesolve/floatm.h b/mb/sparsesolve/floatm.h new file mode 100644 index 0000000..2ebd7c2 --- /dev/null +++ b/mb/sparsesolve/floatm.h @@ -0,0 +1,30 @@ +#ifndef FLOATM +#define FLOATM double +#endif + +#ifndef DOUBLE +#define DOUBLE double +#endif + +static inline double double_truncate(uint8_t m, double y) +{ + if (m >= 52) return y; // nothing to do + if (!isfinite(y)) return y; // NaN and Inf does not need conversion + union { double d; uint64_t i; } c; + c.d = y; + c.i &= ~(((1UL << (52 - m)) - 1UL)); + return c.d; +} + +static inline float float_truncate(uint8_t m, float y) +{ + if (m >= 23) return y; // nothing to do + if (!isfinite(y)) return y; // NaN and Inf does not need conversion + union { float f; uint32_t i; } c; + c.f = y; + c.i &= ~(((1U << (23 - m)) - 1U)); + return c.f; +} + +#define TRUNCATE2(t, m, y) t ## _truncate(m, y) +#define TRUNCATE(t, m, y) TRUNCATE2(t, m, y) diff --git a/mb/sparsesolve/matrix.c b/mb/sparsesolve/matrix.c index 5933c6c..452ea69 100644 --- a/mb/sparsesolve/matrix.c +++ b/mb/sparsesolve/matrix.c @@ -8,7 +8,7 @@ #include "mmio.h" #include // interferes with mmio.h -#include "cg.h" +#include "floatm.h" #include "matrix.h" static int compar(const void *pa, const void *pb) @@ -19,7 +19,8 @@ static int compar(const void *pa, const void *pb) if (a->i > b->i) return 1; if (a->j < b->j) return -1; if (a->j > b->j) return 1; - return 0; + fprintf(stderr, "Repeated matrix element %i %i\n", a->i, a->j); + exit(1); } void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a) @@ -34,7 +35,7 @@ void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a) fprintf(stderr, "Could not process Matrix Market banner\n"); exit(1); } - if (!mm_is_matrix(matcode) || !mm_is_sparse(matcode) || mm_is_complex(matcode) || !mm_is_symmetric(matcode)) { + if (!(mm_is_real(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))) { fprintf(stderr, "This application does not support the Market Market type: %s\n", mm_typecode_to_str(matcode)); exit(1); @@ -48,8 +49,11 @@ void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a) fprintf(stderr, "Matrix is not square\n"); exit(1); } - struct matrix_coo *coo = ALLOC(struct matrix_coo, 2 * NZ); - + struct matrix_coo *coo = malloc(sizeof(struct matrix_coo) * 2 * NZ); + if (coo == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(1); + } int k = 0; for (int l = 0; l < NZ; l++) { double real; @@ -73,37 +77,6 @@ void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **a) *n = N; *nz = k; *a = coo; } -double coo_norm_inf(int n, int nz, struct matrix_coo *coo) -{ - double *amax = CALLOC(double, n); - for (int i = 0; i < nz; i++) { - int row = coo[i].i; - int val = coo[i].a; - amax[row] += abs(val); - } - double norm = 0.0; - for (int i = 0; i < n; i++) { - if (amax[i] > norm) norm = amax[i]; - } - FREE(amax); - return norm; -} - -double coo_max_nz(int n, int nz, struct matrix_coo *coo) -{ - double *m = CALLOC(double, n); - for (int i = 0; i < nz; i++) { - int row = coo[i].i; - m[row]++; - } - int r = 0.0; - for (int i = 0; i < n; i++) { - if (m[i] > r) r = m[i]; - } - FREE(m); - return r; -} - // CSR matrix struct matrix_csr { struct matrix super; int *i; int *j; DOUBLE *A; }; @@ -118,10 +91,10 @@ void csr_dmult(struct matrix_csr *mat, DOUBLE *x, DOUBLE *y) } } -void csr_smult(struct matrix_csr *mat, FLOAT *x, FLOAT *y) +void csr_smult(struct matrix_csr *mat, FLOATM *x, FLOATM *y) { for (int k = 0; k < mat->super.n; k++) { - FLOAT2 t = 0.0; + FLOATM t = 0.0; for (int l = mat->i[k]; l < mat->i[k + 1]; l++) t += mat->A[l] * x[mat->j[l]]; y[k] = t; @@ -130,9 +103,9 @@ void csr_smult(struct matrix_csr *mat, FLOAT *x, FLOAT *y) struct matrix *csr_create(int n, int nz, struct matrix_coo *coo) { - int *i = ALLOC(int, n + 1); - int *j = ALLOC(int, nz); - DOUBLE *A = ALLOC(DOUBLE, nz); + int *i = malloc(sizeof(int) * (n + 1)); + int *j = malloc(sizeof(int) * nz); + DOUBLE *A = malloc(sizeof(DOUBLE) * nz); i[0] = 0; int l = 0; @@ -145,13 +118,13 @@ struct matrix *csr_create(int n, int nz, struct matrix_coo *coo) i[k + 1] = l; } - struct matrix_csr *mat = ALLOC(struct matrix_csr, 1); + struct matrix_csr *mat = malloc(sizeof(struct matrix_csr)); mat->super.n = n; mat->i = i; mat->j = j; mat->A = A; mat->super.dmult = (void (*)(struct matrix *, DOUBLE *, DOUBLE *))csr_dmult; - mat->super.smult = (void (*)(struct matrix *, FLOAT *, FLOAT *))csr_smult; + mat->super.smult = (void (*)(struct matrix *, FLOATM *, FLOATM *))csr_smult; return (struct matrix *)mat; } @@ -169,10 +142,10 @@ void dense_dmult(struct matrix_dense *mat, DOUBLE *x, DOUBLE *y) } } -void dense_smult(uint8_t m, struct matrix_dense *mat, FLOAT *x, FLOAT *y) +void dense_smult(struct matrix_dense *mat, FLOATM *x, FLOATM *y) { for (int i = 0; i < mat->super.n; i++) { - FLOAT2 t = 0.0; + FLOATM t = 0.0; for (int j = 0; j < mat->super.n; j++) t += mat->A[i * mat->super.n + j] * x[j]; y[i] = t; @@ -181,42 +154,43 @@ void dense_smult(uint8_t m, struct matrix_dense *mat, FLOAT *x, FLOAT *y) struct matrix *dense_create(int n, int nz, struct matrix_coo *coo) { - DOUBLE *A = CALLOC(DOUBLE, n * n); + DOUBLE *A = calloc(n * n, sizeof(DOUBLE)); // row major format for (int k = 0; k < nz; k++) A[coo[k].i * n + coo[k].j] = coo[k].a; - struct matrix_dense *mat = ALLOC(struct matrix_dense, 1); + struct matrix_dense *mat = malloc(sizeof(struct matrix_dense)); mat->super.n = n; mat->super.dmult = (void (*)(struct matrix *, DOUBLE *, DOUBLE *))dense_dmult; - mat->super.smult = (void (*)(struct matrix *, FLOAT *, FLOAT *))dense_smult; + mat->super.smult = (void (*)(struct matrix *, FLOATM *, FLOATM *))dense_smult; mat->A = A; return (struct matrix *)mat; } // Jacobi preconditioner -struct precond_jacobi { struct matrix super; FLOAT *d; }; +struct precond_jacobi { struct matrix super; int bits; FLOATM *d; }; -void jacobi_smult(struct precond_jacobi *pre, FLOAT *x, FLOAT *y) +void jacobi_smult(struct precond_jacobi *pre, FLOATM *x, FLOATM *y) { for (int k = 0; k < pre->super.n; k++) { - y[k] = x[k] / pre->d[k]; + y[k] = TRUNCATE(FLOATM, pre->bits, x[k] / pre->d[k]); } } -struct matrix *jacobi_create(int n, int nz, struct matrix_coo *coo) +struct matrix *jacobi_create(int bits, int n, int nz, struct matrix_coo *coo) { - FLOAT *d = ALLOC(FLOAT, n); + FLOATM *d = malloc(sizeof(FLOATM) * n); for (int k = 0; k < n; k++) d[k] = 0.0; for (int k = 0; k < nz; k++) if (coo[k].i == coo[k].j) - d[coo[k].i] = coo[k].a; + d[coo[k].i] = TRUNCATE(FLOATM, bits, coo[k].a); - struct precond_jacobi *pre = ALLOC(struct precond_jacobi, 1); + struct precond_jacobi *pre = malloc(sizeof(struct precond_jacobi)); pre->super.n = n; pre->super.dmult = NULL; - pre->super.smult = (void (*)(struct matrix *, FLOAT *, FLOAT *))jacobi_smult; + pre->super.smult = (void (*)(struct matrix *, FLOATM *, FLOATM *))jacobi_smult; + pre->bits = bits; pre->d = d; return (struct matrix *)pre; } diff --git a/mb/sparsesolve/matrix.h b/mb/sparsesolve/matrix.h index 1705e52..81a6db7 100644 --- a/mb/sparsesolve/matrix.h +++ b/mb/sparsesolve/matrix.h @@ -4,20 +4,17 @@ struct matrix_coo { int i; int j; double a; }; extern void coo_load(const char *fname, int *n, int *nz, struct matrix_coo **coo); -extern double coo_norm_inf(int n, int nz, struct matrix_coo *coo); -extern double coo_max_nz(int n, int nz, struct matrix_coo *coo); - struct matrix { int n; void (*dmult)(struct matrix *, DOUBLE *, DOUBLE *); - void (*smult)(struct matrix *, FLOAT *, FLOAT *); + void (*smult)(struct matrix *, FLOATM *, FLOATM *); }; static inline void matrix_mult(struct matrix *mat, DOUBLE *x, DOUBLE *y) { mat->dmult(mat, x, y); } -static inline void floatm_mult(struct matrix *mat, FLOAT *x, FLOAT *y) { +static inline void floatm_mult(struct matrix *mat, FLOATM *x, FLOATM *y) { mat->smult(mat, x, y); } @@ -25,4 +22,4 @@ extern struct matrix *csr_create(int n, int nz, struct matrix_coo *coo); extern struct matrix *dense_create(int n, int nz, struct matrix_coo *coo); -extern struct matrix *jacobi_create(int n, int nz, struct matrix_coo *coo); +extern struct matrix *jacobi_create(int bits, int n, int nz, struct matrix_coo *coo); diff --git a/mb/sparsesolve/sparsesolve.c b/mb/sparsesolve/sparsesolve.c index c3e5d3b..7441ee6 100644 --- a/mb/sparsesolve/sparsesolve.c +++ b/mb/sparsesolve/sparsesolve.c @@ -3,10 +3,8 @@ #include #include #include -#include -#include -#include "cg.h" +#include "floatm.h" #include "vector.h" #include "matrix.h" #include "oprecomp.h" @@ -18,10 +16,69 @@ void iterative_refinement(int n, struct matrix *A, struct matrix *M, DOUBLE *b, int out_maxiter, DOUBLE out_tol, int in_maxiter, DOUBLE in_tol, int step_check, int *out_iter, int *in_iter); +// #define USE_DENSE +// #define USE_PRECOND + +int conjugate_gradient(uint8_t bits, int n, struct matrix *A, struct matrix *M, FLOATM *b, FLOATM *x, int maxiter, FLOATM umbral) +{ + int iter = 0; + FLOATM alpha, beta, rho, tau, tol; + + FLOATM *r = ALLOC(FLOATM, n); + FLOATM *p = ALLOC(FLOATM, n); + FLOATM *z = ALLOC(FLOATM, n); + + floatm_set(bits, n, 1.0, x); + floatm_mult(A, x, r); + floatm_xpby(bits, n, b, -1.0, r); // r = b - Ax + + if (M) { + floatm_mult(M, r, p); + rho = floatm_dot(n, r, p); + tol = floatm_norm2(n, r); + } else { // small optimization + floatm_copy(bits, n, r, p); + rho = floatm_dot(n, r, r); + tol = sqrt(rho); + } + + while ((iter < maxiter) && (tol > umbral)) { + // alpha = (r,z) / (Ap,p) + floatm_mult(A, p, z); + alpha = rho / floatm_dot(n, z, p); + // x = x + alpha * p + floatm_axpy(bits, n, alpha, p, x); + // r = r - alpha * Ap + floatm_axpy(bits, n, -alpha, z, r); + // apply preconditioner + if (M) { + floatm_mult(M, r, z); + tau = floatm_dot(n, r, z); + tol = floatm_norm2(n, r); + } else { + tau = floatm_dot(n, r, r); + tol = sqrt(tau); + } + // beta = (r,z) / rho + beta = tau / rho; + rho = tau; + // p = z + beta * p + if (M) floatm_xpby(bits, n, z, beta, p); + else floatm_xpby(bits, n, r, beta, p); + iter++; + // printf ("(%d,%20.10e)\n", iter, tol); + } + + free(r); + free(p); + free(z); + return iter; +} + int main (int argc, char *argv[]) { if (argc != 7) { - fprintf(stderr, "Missing arguments: algorithm matrix_file out_its out_tol in_its in_tol step_chk\n"); + fprintf(stderr, "Missing arguments: matrix_file out_its out_tol in_its in_tol mantissa_bits\n"); return 1; } @@ -37,12 +94,10 @@ int main (int argc, char *argv[]) DOUBLE *x = ALLOC(DOUBLE, n); DOUBLE *b = ALLOC(DOUBLE, n); - DOUBLE *r = ALLOC(DOUBLE, n); - DOUBLE *s = ALLOC(DOUBLE, n); - - vector_rand(n, s); - // vector_set(n, 1.0 / sqrt(n), s); - matrix_mult(A, s, b); + DOUBLE *e = ALLOC(DOUBLE, n); + FLOATM *r = ALLOC(FLOATM, n); + FLOATM *d = ALLOC(FLOATM, n); + vector_set(n, 1.0, b); int out_maxiter = atoi(argv[2]); DOUBLE out_tol = atof(argv[3]); @@ -51,58 +106,50 @@ int main (int argc, char *argv[]) int in_maxiter = atoi(argv[4]); DOUBLE in_tol = atof(argv[5]); if (in_tol <= 0.0) in_tol = 1.0e-8; - int step_check = atoi(argv[6]); - - DOUBLE max = 0; - for (int i = 0; i < nz; i++) { - DOUBLE x1 = coo[i].a; - DOUBLE x2 = x1; - DOUBLE error = (x1 - x2) / x1; - if (error > max) max = error; - } + + uint8_t bits = atoi(argv[6]); #ifdef USE_PRECOND - struct matrix *M = jacobi_create(n, nz, coo); + struct matrix *M = jacobi_create(bits, n, nz, coo); #else struct matrix *M = NULL; #endif - DOUBLE norm = coo_norm_inf(n, nz, coo); free(coo); - printf("# algorithm: %s\n", argv[1]); - printf("# sizeof FLOAT: %lu\n", sizeof(FLOAT)); - // printf("# roundoff: %e\n", epsilon(bits)); - printf("# matrix: %s\n", argv[2]); - printf("# problem_size: %d\n", n); - printf("# nnz: %d\n", nz); - printf("# matrix_norm: %e\n", (double)norm); - printf("# matrix_error: %e\n", (double)max); - printf("# bnorm: %e\n", (double)vector_norm2(n, b)); + int out_iter, in_iter; + DOUBLE residual; - vector_rand(n, x); + oprecomp_start(); + do { - int out_iter = 0, in_iter = 0; + out_iter = 0; in_iter = 0; + vector_set(n, 0.0, x); + vector_copy(bits, n, b, r); - oprecomp_start(); do { + in_iter += conjugate_gradient(bits, n, A, M, r, d, in_maxiter, in_tol); + + vector_axpy(n, 1.0, d, x); // x = x + d + matrix_mult(A, x, e); + vector_xpby(n, b, -1.0, e); + residual = vector_norm2(n, e); + vector_copy(bits, n, e, r); // r = b - Ax - iterative_refinement(n, A, M, b, x, out_maxiter, out_tol, in_maxiter, in_tol, step_check, - &out_iter, &in_iter); + out_iter++; + printf("[%i] %e\n", in_iter, residual); + } while ((residual > out_tol) && (out_iter < out_maxiter)); } while (oprecomp_iterate()); oprecomp_stop(); - matrix_mult(A, x, r); - vector_xpby(n, b, -1.0, r); - DOUBLE residual = vector_norm2(n, r); - DOUBLE normalized_residual = residual / (vector_norm2(n, x) * norm); - + printf("# matrix: %s\n", argv[1]); + printf("# problem_size: %d\n", n); printf("# outer_maxiter: %d\n", out_maxiter); - printf("# outer_tolerance: %e\n", (double)out_tol); + printf("# outer_tolerance: %e\n", out_tol); printf("# inner_maxiter: %d\n", in_maxiter); - printf("# inner_tolerance: %e\n", (double)in_tol); + printf("# inner_tolerance: %e\n", in_tol); + printf("# precision: %d\n", bits); printf("# outer_iterations: %d\n", out_iter); printf("# inner_iterations: %d\n", in_iter); - printf("# residual: %e\n", (double)residual); - printf("# normalized_residual: %e\n", (double)normalized_residual); + printf("# residual: %e\n", residual); } diff --git a/mb/sparsesolve/vector.h b/mb/sparsesolve/vector.h index cf2e906..c0e466b 100644 --- a/mb/sparsesolve/vector.h +++ b/mb/sparsesolve/vector.h @@ -4,18 +4,6 @@ static inline void vector_set(int n, DOUBLE a, DOUBLE *x) { for (int i = 0; i < n; i++) x[i] = a; } -static inline void vector_rand(int n, DOUBLE *x) { - for (int i = 0; i < n; i++) x[i] = rand() / (double)RAND_MAX; -} - -static inline void vector_copy(int n, DOUBLE *x, DOUBLE *y) { - for (int i = 0; i < n; i++) y[i] = x[i]; -} - -static inline void vector_axpy(int n, DOUBLE a, DOUBLE *x, DOUBLE *y) { - for (int i = 0; i < n; i++) y[i] = y[i] + x[i] * a; -} - static inline void vector_xpby(int n, DOUBLE *x, DOUBLE b, DOUBLE *y) { for (int i = 0; i < n; i++) y[i] = x[i] + b * y[i]; } @@ -32,68 +20,41 @@ static inline DOUBLE vector_norm2(int n, DOUBLE *x) { // limited precision version -static inline void floatm_set(int n, FLOAT a, FLOAT *x) { - FLOAT b = a; +static inline void floatm_set(uint8_t m, int n, FLOATM a, FLOATM *x) { + FLOATM b = TRUNCATE(FLOATM, m, a); for (int i = 0; i < n; i++) x[i] = b; } -static inline void floatm_rand(int n, FLOAT *x) { - for (int i = 0; i < n; i++) x[i] = rand() / (double)RAND_MAX; -} - -static inline void floatm_copy(int n, FLOAT *x, FLOAT *y) { - for (int i = 0; i < n; i++) y[i] = x[i]; +static inline void floatm_copy(uint8_t m, int n, FLOATM *x, FLOATM *y) { + for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, x[i]); } -static inline void floatm_axpy(int n, FLOAT a, FLOAT *x, FLOAT *y) { - for (int i = 0; i < n; i++) y[i] = y[i] + x[i] * a; +static inline void floatm_axpy(uint8_t m, int n, FLOATM a, FLOATM *x, FLOATM *y) { + for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, y[i] + x[i] * a); } -static inline void floatm_xpby(int n, FLOAT *x, FLOAT b, FLOAT *y) { - for (int i = 0; i < n; i++) y[i] = x[i] + b * y[i]; +static inline void floatm_xpby(uint8_t m, int n, FLOATM *x, FLOATM b, FLOATM *y) { + for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, x[i] + b * y[i]); } -// the dot product can be computed with more precision than FLOAT +// the dot product is computed with full precision -static inline FLOAT2 floatm_dot(int n, FLOAT *x, FLOAT *y) { - FLOAT2 r = 0.0; +static inline FLOATM floatm_dot(int n, FLOATM *x, FLOATM *y) { + FLOATM r = 0.0; for (int i = 0; i < n; i++) r += y[i] * x[i]; return r; } -static inline FLOAT2 floatm_norm2(int n, FLOAT *x) { +static inline FLOATM floatm_norm2(int n, FLOATM *x) { return sqrt(floatm_dot(n, x, x)); } -static inline FLOAT2 floatm_axpy_dot(int n, FLOAT a, FLOAT *x, FLOAT *y) { - FLOAT2 r = 0.0; - for (int i = 0; i < n; i++) { - FLOAT2 t = y[i] + x[i] * a; - y[i] = t; - r += t * t; - } - return r; -} - -static inline FLOAT2 floatm_diff_norm2(int n, FLOAT *x, FLOAT *y) { - FLOAT2 r = 0.0; - for (int i = 0; i < n; i++) { - FLOAT2 t = x[i] - y[i]; - r += t * t; - } - return sqrt(r); -} - // mixed precision routines -static inline void mixed_copy(int n, DOUBLE *x, FLOAT *y) { - for (int i = 0; i < n; i++) y[i] = x[i]; +static inline void vector_copy(uint8_t m, int n, DOUBLE *x, FLOATM *y) { + for (int i = 0; i < n; i++) y[i] = TRUNCATE(FLOATM, m, x[i]); } -static inline void mixed_axpy(int n, DOUBLE a, FLOAT *x, DOUBLE *y) { +static inline void vector_axpy(int n, DOUBLE a, FLOATM *x, DOUBLE *y) { for (int i = 0; i < n; i++) y[i] += x[i] * a; } - -static inline void mixed_xpby(int n, DOUBLE *x, DOUBLE b, FLOAT *y) { - for (int i = 0; i < n; i++) y[i] = x[i] + b * y[i]; -} diff --git a/mb/svm/baseline/README.md b/mb/svm/baseline/README.md index a85ffbf..d2742a0 100644 --- a/mb/svm/baseline/README.md +++ b/mb/svm/baseline/README.md @@ -72,7 +72,6 @@ If you use ThunderSVM in your paper, please cite our work ([preprint now availab ``` ## Related websites * [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) | [SVMlight](http://svmlight.joachims.org/) | [OHD-SVM](https://github.com/OrcusCZ/OHD-SVM) | [NVIDIA Machine Learning](http://www.nvidia.com/object/machine-learning.html) - ## Acknowlegement * We acknowledge NVIDIA for their hardware donations. diff --git a/testset.cfg b/testset.cfg new file mode 100644 index 0000000..5b28bc6 --- /dev/null +++ b/testset.cfg @@ -0,0 +1,14 @@ +from plptest import * + +TestConfig = c = {} + +quick = Testset( + name = 'oprecomp', + files = [ + 'kw/samples/nop/testset.cfg', + 'kw/samples/square/testset.cfg' + ], + restrict = 'config.get_str("**/chip/name") == "oprecompkw"' +) + +c['testsets'] = [ quick ]