From 524daf4dfd87a468b6682dfac56561dcdf885651 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Thu, 1 Dec 2016 18:56:15 -0600 Subject: [PATCH 01/10] Adding ATMI as the runtime layer to launch Chapel's generated GPU kernels --- make/compiler/Makefile.hsa | 6 +- runtime/include/chpl-atmi.h | 40 ++++++++ runtime/include/chpl-gen-includes.h | 2 +- runtime/src/Makefile.share | 4 +- runtime/src/chpl-atmi-reducehost.c | 136 ++++++++++++++++++++++++++++ runtime/src/chpl-atmi.c | 116 ++++++++++++++++++++++++ util/setchplenv_hsa.bash | 4 + 7 files changed, 302 insertions(+), 6 deletions(-) create mode 100644 runtime/include/chpl-atmi.h create mode 100644 runtime/src/chpl-atmi-reducehost.c create mode 100644 runtime/src/chpl-atmi.c diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa index fa5115fa79..7c1baa28f0 100644 --- a/make/compiler/Makefile.hsa +++ b/make/compiler/Makefile.hsa @@ -17,11 +17,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu ifdef CHPL_ROCM # ROCm locations CLOC=/opt/rocm/cloc/bin/cloc.sh -LIBS+=-lhsa-runtime64 -lhsakmt -lm +LIBS+=-latmi_runtime -lm # TODO: move these in third-party directory? 
-GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib
-HSA_INCLUDES=-I/opt/rocm/hsa/include
+GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib
+HSA_INCLUDES=-I/opt/rocm/libatmi/include
 else
 # HSA locations
 CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh
diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h
new file mode 100644
index 0000000000..0f4c07feae
--- /dev/null
+++ b/runtime/include/chpl-atmi.h
@@ -0,0 +1,81 @@
+#ifndef _chpl_atmi_h_
+#define _chpl_atmi_h_
+
+/*
+ * ATMI-based replacement for chpl-hsa.h: runtime setup, host-driven
+ * reductions and launching of compiler-generated GPU kernels.
+ */
+
+/* NOTE(review): the targets of the four angle-bracket includes below were
+ * stripped from this copy of the patch.  <stddef.h>, <stdint.h> and
+ * <stdbool.h> follow from the surviving comments and the __cplusplus
+ * guard; <atmi_runtime.h> is an assumption -- confirm against upstream. */
+#include <atmi_runtime.h>
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uintXX_t */
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif /* __cplusplus */
+
+#include "chpltypes.h"
+#include "chpl-hsa-kernelparams.h"
+
+/*
+ * Kernel handles filled in by chpl_hsa_initialize().
+ *
+ * NOTE(review): these are definitions, not declarations, so every
+ * translation unit including this header defines both objects.  That only
+ * links where duplicate tentative definitions are merged (-fcommon);
+ * consider `extern` here plus one definition in chpl-atmi.c.
+ */
+atmi_kernel_t reduction_kernel;  /* the "reduce_int64_sum" kernel */
+atmi_kernel_t *gpu_kernels;      /* one handle per chpl_gpu_kernels[] entry */
+
+/* Implementation IDs used when registering and when launching kernels. */
+enum {
+    GPU_KERNEL_IMPL = 10565,
+    REDUCTION_GPU_IMPL = 42
+};
+/*
+typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
+    uint64_t in;
+    uint64_t out;
+    uint32_t count;
+} hsail_reduce_kernarg_t;
+
+typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
+    uint64_t bundle;
+} hsail_kernarg_t;
+*/
+
+/*
+ * Initialize ATMI, register the reduction and generated-kernel modules and
+ * create the kernel handles above.  Returns ATMI_STATUS_SUCCESS on success
+ * and a different value on failure.
+ */
+int chpl_hsa_initialize(void);
+
+/*
+ * Free gpu_kernels and finalize ATMI.  Defined in chpl-atmi.c; declared
+ * here so that callers see a prototype.
+ */
+int hsa_shutdown(void);
+
+/*
+ * Reduce `count` elements of `src`.  The visible int64 implementation
+ * always sums and does not consult `op`.
+ *
+ * NOTE(review): hsa_reduce_int32 is declared, but its only definition in
+ * chpl-atmi-reducehost.c is commented out.
+ */
+int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count);
+int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count);
+
+/*
+ * Synchronously launch generated kernel `kernel_idx`; `wkitem_count_x` and
+ * `wkgrp_size_x` set the launch's grid and group size in x, and
+ * `bundled_args` is passed to the kernel as its single argument.
+ */
+void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
+                        uint32_t wkitem_count_x, void *bundled_args);
+#endif //_chpl_atmi_h_
diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h
index 4527659c5d..2f414baa78 100644
--- a/runtime/include/chpl-gen-includes.h
+++ b/runtime/include/chpl-gen-includes.h
@@ -31,7 +31,7 @@
 #include "chpl-tasks.h"
 #include "chpltypes.h"
 #ifdef TARGET_HSA
-#include "chpl-hsa.h"
+#include "chpl-atmi.h"
 #endif
 
 //
diff --git a/runtime/src/Makefile.share b/runtime/src/Makefile.share
index 732e06ad1f..8d93b13d3a 100644
--- a/runtime/src/Makefile.share
+++ b/runtime/src/Makefile.share
@@ -18,9 +18,9 @@
 
 ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa)
 HSA_SRCS = \
-
chpl-hsa.c \
+        chpl-atmi.c \
         chpl-hsa-reducekernels.cl \
-        chpl-hsa-reducehost.c
+        chpl-atmi-reducehost.c
 endif
 
 COMMON_LAUNCHER_SRCS = \
diff --git a/runtime/src/chpl-atmi-reducehost.c b/runtime/src/chpl-atmi-reducehost.c
new file mode 100644
index 0000000000..3d642a7f22
--- /dev/null
+++ b/runtime/src/chpl-atmi-reducehost.c
@@ -0,0 +1,167 @@
+
+#include "chpl-atmi.h"
+#include "chplrt.h"
+#include "chplexit.h"
+#include "chpl-mem.h"
+
+/*enum ReduceOp {
+    MAX,
+    MIN,
+    SUM,
+    PROD,
+    BITAND,
+    BITOR,
+    BITXOR,
+    LOGAND,
+    LOGOR
+  };
+ */
+
+/*
+ * Estimate and schedule the required number of GPU kernels
+ *
+ * Pass k launches the reduction kernel with darray[k & 1] as input and
+ * darray[(k & 1) ^ 1] as output.  Launches are asynchronous within one
+ * task group, which is synchronized once after the last launch.  No
+ * further pass is scheduled once the grid would fit in a single
+ * work-group; the caller reduces the remaining items itself.
+ *
+ *   count       number of items available in darray[0]
+ *   darray      the two buffers the passes alternate between
+ *   iter_ct     out: number of passes launched
+ *   items_left  out: item count the caller must still reduce, found in
+ *               darray[*iter_ct & 1]
+ *
+ * NOTE(review): from the second pass on, the output buffer is darray[0].
+ * hsa_reduce_int64() passes the caller's source array there, so a
+ * reduction that needs two or more passes overwrites the caller's data.
+ */
+static inline
+void atmi_sched_reducekernels(size_t count,
+        void *darray[2], size_t *iter_ct,
+        size_t *items_left)
+{
+    size_t incount, outcount, iter, in, out;
+    uint32_t max_num_wkgrps, num_wkgroups, grid_size_x;
+    /* The kernel's count argument is registered with sizeof(uint32_t) in
+     * chpl_hsa_initialize(); pass a 32-bit copy, not the size_t itself. */
+    uint32_t kernel_count;
+
+    atmi_task_group_t task_group = {1, ATMI_TRUE};
+    ATMI_LPARM(lparm);
+    lparm->group = &task_group;
+    lparm->kernel_id = REDUCTION_GPU_IMPL;
+    lparm->synchronous = ATMI_FALSE;
+    lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
+
+    incount = count;
+    max_num_wkgrps = incount / WKGRP_SIZE;
+    num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE;
+    grid_size_x = num_wkgroups * WKGRP_SIZE;
+    outcount = num_wkgroups;
+    iter = 0;
+    while (grid_size_x > WKGRP_SIZE) {
+        in = (iter & 1);
+        out = (iter & 1) ^ 1;
+
+        kernel_count = (uint32_t)incount;
+        void *args[] = {&darray[in], &darray[out], &kernel_count};
+        lparm->gridDim[0] = grid_size_x;
+        lparm->groupDim[0] = WKGRP_SIZE;
+        atmi_task_launch(lparm, reduction_kernel, args);
+
+        iter += 1;
+        incount = outcount;
+        max_num_wkgrps = incount / WKGRP_SIZE;
+        num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE;
+        grid_size_x = num_wkgroups * WKGRP_SIZE;
+        outcount = num_wkgroups;
+    }
+
+    if (iter > 0) {
+        atmi_task_group_sync(&task_group);
+    }
+
+    (*items_left) = incount;
+    (*iter_ct) = iter;
+}
+
+/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count)
+  {
+  int32_t res;
+  size_t iter, items_left, out, i;
+  int32_t * darray[2];
+  hsa_symbol_info_t * symbol_info;
+  symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index
+  darray[0] = src;
+  if (0 != chpl_posix_memalign((void **) &darray[1], 64,
+  count * sizeof(int32_t))) {
+  chpl_exit_any(1);
+  }
+
+  hsa_sched_reducekernels(count, symbol_info, (void**)darray,
+  &iter, &items_left);
+
+  res = 0;
+  out = (iter & 1);
+  chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left);
+  for (i = 0; i < items_left; ++i) res += darray[out][i];
+
+  chpl_free (darray[1]);
+  return res;
+  }*/
+
+/*
+ * Sum `count` int64 values starting at `src` and return the total.
+ *
+ * atmi_sched_reducekernels() decides how many GPU passes to run; the
+ * items it leaves behind are summed on the host.  `op` is not consulted
+ * yet (see the FIXME at the end of this file).  The program exits if the
+ * scratch buffer cannot be allocated.
+ */
+int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count)
+{
+    int64_t res;
+    size_t iter, items_left, out, i;
+    int64_t * darray[2];
+
+    (void)op;
+    darray[0] = src;
+    if (0 != chpl_posix_memalign((void **) &darray[1], 64,
+                                 count * sizeof(int64_t))) {
+        chpl_exit_any(1);
+    }
+
+    atmi_sched_reducekernels(count, (void**)darray,
+                             &iter, &items_left);
+
+    res = 0;
+    out = (iter & 1);
+    /* items_left is a size_t; convert explicitly to match %lu. */
+    chpl_msg(2, "HSA: Using CPU to reduce %lu items\n",
+             (unsigned long)items_left);
+    for (i = 0; i < items_left; ++i) res += darray[out][i];
+
+    chpl_free (darray[1]);
+    return res;
+}
+
+//FIXME: use the op argument like this to extend this to different ops
+/*if (!strcasecmp(op, "Max"))
+  opType = MAX;
+  else if (!strcasecmp(op, "Min"))
+  opType = MIN;
+  else if (!strcasecmp(op, "Sum"))
+  opType = SUM;
+  else if (!strcasecmp(op, "Product"))
+  opType = PROD;
+  else if (!strcasecmp(op, "LogicalAnd"))
+  opType = LOGAND;
+  else if (!strcasecmp(op, "LogicalOr"))
+  opType = LOGOR;
+  else if (!strcasecmp(op, "BitwiseAnd"))
+  opType = BITAND;
+  else if (!strcasecmp(op, "BitwiseOr"))
+  opType = BITOR;
+  else if (!strcasecmp(op, "BitwiseXor"))
+  opType = BITXOR; */
diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c
new file mode 100644
index 0000000000..f396931ab1
--- /dev/null
+++ b/runtime/src/chpl-atmi.c
@@ -0,0 +1,155 @@
+
+#define _GNU_SOURCE
+/* NOTE(review): the targets of this file's angle-bracket includes were
+ * stripped from this copy of the patch.  The standard headers below are
+ * the ones the visible code needs; confirm against upstream. */
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stddef.h>
+#include "chpl-atmi.h"
+#include "chplrt.h"
+#include "chpl-mem.h"
+#include "chplcgfns.h"
+
+/* Code-object flavour: module file extension and ATMI module type. */
+#ifdef ROCM
+#define CHPL_ATMI_MODULE_EXT  "hsaco"
+#define CHPL_ATMI_MODULE_TYPE AMDGCN
+#else
+#define CHPL_ATMI_MODULE_EXT  "o"
+#define CHPL_ATMI_MODULE_TYPE BRIG
+#endif
+
+/*
+ * If `status` is not ATMI_STATUS_SUCCESS: report `msg` on stderr, finalize
+ * ATMI and return the status from the enclosing function.  `status` is
+ * evaluated once, and do/while(0) makes the expansion a single statement.
+ */
+#define OUTPUT_ATMI_STATUS(status, msg) \
+do { \
+    atmi_status_t atmi_status_ = (status); \
+    if (ATMI_STATUS_SUCCESS != atmi_status_) { \
+        fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \
+                #msg, (unsigned)atmi_status_); \
+        atmi_finalize(); \
+        return atmi_status_; \
+    } \
+} while (0)
+
+/**
+ * Initialize the ATMI/HSA runtime
+ *
+ * Registers the runtime's reduction-kernel module and the "<binary>_gpu"
+ * module named after the first word of chpl_executionCommand, then creates
+ * the int64 sum reduction kernel and one kernel per chpl_gpu_kernels[]
+ * entry.
+ *
+ * Returns ATMI_STATUS_SUCCESS on success, -1 if atmi_init() fails, and the
+ * failing ATMI status (after atmi_finalize()) for any later error.
+ */
+int chpl_hsa_initialize(void)
+{
+    char reduce_kernel_filename[1024];
+    char gen_kernel_filename[1024];
+    const char *bin_name;
+    size_t bin_name_len;
+    int cx;
+
+    atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
+    if(st != ATMI_STATUS_SUCCESS) return -1;
+
+    cx = snprintf(reduce_kernel_filename, sizeof reduce_kernel_filename,
+                  "%s/runtime/src/%s/chpl-hsa-reducekernels."
+                  CHPL_ATMI_MODULE_EXT,
+                  CHPL_HOME, CHPL_RUNTIME_OBJDIR);
+    /* Truncation is judged against the real buffer size (was a stale 256). */
+    if (cx < 0 || (size_t)cx >= sizeof reduce_kernel_filename) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename);
+    }
+
+    /* The binary name is the first space-delimited word of the command
+     * line.  Locating it in place needs no scratch copy (the old copy
+     * leaked on the error path) and lets an empty command be rejected
+     * instead of formatting a NULL token. */
+    bin_name = chpl_executionCommand + strspn(chpl_executionCommand, " ");
+    bin_name_len = strcspn(bin_name, " ");
+    if (bin_name_len == 0 || bin_name_len >= sizeof gen_kernel_filename) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
+    }
+    cx = snprintf(gen_kernel_filename, sizeof gen_kernel_filename,
+                  "%.*s_gpu." CHPL_ATMI_MODULE_EXT,
+                  (int)bin_name_len, bin_name);
+    if (cx < 0 || (size_t)cx >= sizeof gen_kernel_filename) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
+    }
+
+    /* FIXME: Create all reduction kernels, not just the int64-sum kernel */
+    const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename};
+    atmi_platform_type_t module_types[2] = {CHPL_ATMI_MODULE_TYPE,
+                                            CHPL_ATMI_MODULE_TYPE};
+    st = atmi_module_register(modules, module_types, 2);
+    OUTPUT_ATMI_STATUS(st, Registering all modules);
+
+    size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)};
+    const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]);
+    st = atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes);
+    OUTPUT_ATMI_STATUS(st, Creating the reduction kernel);
+    st = atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL);
+    OUTPUT_ATMI_STATUS(st, Adding the reduction kernel implementation);
+
+    size_t kernel_arg_sizes[] = {sizeof(uint64_t)};
+    const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]);
+    gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels);
+    for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) {
+        //FIXME: get the actual kernel name
+        const char *kernel_name = chpl_gpu_kernels[i];
+        st = atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes);
+        if (st == ATMI_STATUS_SUCCESS) {
+            st = atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL);
+        }
+        if (st != ATMI_STATUS_SUCCESS) {
+            /* Do not leave a dangling table behind on failure. */
+            chpl_free(gpu_kernels);
+            gpu_kernels = NULL;
+            OUTPUT_ATMI_STATUS(st, Creating generated GPU kernels);
+        }
+    }
+
+    return ATMI_STATUS_SUCCESS;
+}
+
+/**
+ * Release resources used by the base kernels and tear down the HSA structures
+ *
+ * Returns 0 on success and -1 if atmi_finalize() reports an error.  (The
+ * function previously reached its closing brace without returning a value.)
+ */
+int hsa_shutdown(void)
+{
+    chpl_free(gpu_kernels);
+    gpu_kernels = NULL;
+    return (ATMI_STATUS_SUCCESS == atmi_finalize()) ? 0 : -1;
+}
+
+/*
+ * Enqueue/execute a kernel
+ *
+ * Launches gpu_kernels[kernel_idx] synchronously on GPU place (0, 0) with
+ * launch parameters built by ATMI_LPARM_1D from wkitem_count_x and a group
+ * dimension of wkgrp_size_x.  The kernel receives one argument: the
+ * bundled_args pointer.  kernel_idx is not range-checked.
+ */
+void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
+                        uint32_t wkitem_count_x, void *bundled_args)
+{
+    void *args[] = {&bundled_args};
+    ATMI_LPARM_1D(lparm, wkitem_count_x);
+    lparm->groupDim[0] = wkgrp_size_x;
+    lparm->synchronous = ATMI_TRUE;
+
+    lparm->kernel_id = GPU_KERNEL_IMPL;
+    lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
+    atmi_task_launch(lparm, gpu_kernels[kernel_idx], args);
+}
+
diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash
index cfd76b1c41..3625a4657f 100644
--- a/util/setchplenv_hsa.bash
+++ b/util/setchplenv_hsa.bash
@@ -105,6 +105,10 @@ if [ "$1" == "debug" ]; then
     export CHPL_DEBUG=1
 fi
 
+echo -n "Setting CHPL_ROCM"
+export CHPL_ROCM=1
+echo " to 1"
+
 echo -n "Setting CHPL_LOCALE_MODEL"
 export CHPL_LOCALE_MODEL=hsa
 echo " to hsa"
From 37bb0ba7699bc2d6296a4532a6fbde01d7dd1530 Mon Sep 17 00:00:00 2001
From: Ashwin Aji
Date: Thu, 15 Dec 2016 11:51:07 -0600
Subject: [PATCH 02/10] ATMI as the new
heterogeneous tasking layer -- 1st pass --- make/tasks/Makefile.atmi | 19 + runtime/Makefile.help | 3 + runtime/etc/Makefile.tasks-atmi | 19 + runtime/include/tasks/atmi/tasks-atmi.h | 445 +++++++++++ runtime/src/tasks/atmi/Makefile | 39 + runtime/src/tasks/atmi/Makefile.include | 29 + runtime/src/tasks/atmi/Makefile.share | 25 + runtime/src/tasks/atmi/tasks-atmi.c | 963 ++++++++++++++++++++++++ util/printchplenv | 2 + util/setchplenv_hsa.bash | 8 +- 10 files changed, 1550 insertions(+), 2 deletions(-) create mode 100644 make/tasks/Makefile.atmi create mode 100644 runtime/etc/Makefile.tasks-atmi create mode 100644 runtime/include/tasks/atmi/tasks-atmi.h create mode 100644 runtime/src/tasks/atmi/Makefile create mode 100644 runtime/src/tasks/atmi/Makefile.include create mode 100644 runtime/src/tasks/atmi/Makefile.share create mode 100644 runtime/src/tasks/atmi/tasks-atmi.c diff --git a/make/tasks/Makefile.atmi b/make/tasks/Makefile.atmi new file mode 100644 index 0000000000..00ca53fb62 --- /dev/null +++ b/make/tasks/Makefile.atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +RUNTIME_INCLS += -I$(QTHREAD_INCLUDE_DIR) +CHPL_MAKE_THREADS=none diff --git a/runtime/Makefile.help b/runtime/Makefile.help index 00e96f04c4..a6d84c1ff5 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -168,4 +168,7 @@ include $(RUNTIME_ROOT)/make/Makefile.runtime.foot src/tasks/qthreads/Makefile.include: cd src/tasks/qthreads && $(MAKE) copy_qthread_files +src/tasks/atmi/Makefile.include: + cd src/tasks/atmi && $(MAKE) copy_qthread_files + .NOTPARALLEL: diff --git a/runtime/etc/Makefile.tasks-atmi b/runtime/etc/Makefile.tasks-atmi new file mode 100644 index 0000000000..ce328fd06b --- /dev/null +++ b/runtime/etc/Makefile.tasks-atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +GEN_CFLAGS += -I$(QTHREAD_INCLUDE_DIR) +GEN_LFLAGS += -L$(QTHREAD_LIB_DIR) -Wl,-rpath,$(QTHREAD_LIB_DIR) diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h new file mode 100644 index 0000000000..58dab0a982 --- /dev/null +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -0,0 +1,445 @@ +/************************************************************************** +* Copyright 2011 Sandia Corporation. Under the terms of Contract +* DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +* or on behalf of the U.S. Government. 
Export of this program may require a +* license from the United States Government +**************************************************************************/ + +/* + * Copyright 2004-2015 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _tasks_qthreads_h_ +#define _tasks_qthreads_h_ + +#include + +#include +#include +#include +#include + +#include "chpltypes.h" +#include "chpl-tasks-prvdata.h" + +#define CHPL_COMM_YIELD_TASK_WHILE_POLLING +void chpl_task_yield(void); + +// For mutexes +// type(s) +// threadlayer_mutex_t +// threadlayer_mutex_p +// functions +// threadlayer_mutex_init() +// threadlayer_mutex_new() +// threadlayer_mutex_lock() +// threadlayer_mutex_unlock() +// +// For thread management +// type(s) +// +// functions +// threadlayer_thread_id() +// threadlayer_thread_cancel() +// threadlayer_thread_join() +// +// For sync variables +// type(s) +// threadlayer_sync_aux_t +// functions +// threadlayer_sync_suspend() +// threadlayer_sync_awaken() +// threadlayer_sync_init() +// threadlayer_sync_destroy() +// +// For task management +// type(s) +// +// functions +// threadlayer_init() +// threadlayer_thread_create() +// threadlayer_pool_suspend() +// threadlayer_pool_awaken() +// threadlayer_get_thread_private_data() +// threadlayer_set_thread_private_data() +// +// The types are declared in the threads-*.h file for each 
specific +// threading layer, and the callback functions are declared here. The +// interfaces and requirements for these other types and callback +// functions are described elsewhere in this file. +// +// Although the above list may seem long, in practice many of the +// functions are quite simple, and with luck also easily extrapolated +// from what is done for other threading layers. For an example of an +// implementation, see "pthreads" threading. +// + +// +// Type (and default value) used to communicate task identifiers +// between C code and Chapel code in the runtime. +// +typedef unsigned int chpl_taskID_t; +#define chpl_nullTaskID QTHREAD_NULL_TASK_ID + +// +// Mutexes +// +typedef syncvar_t chpl_mutex_t; + +// +// Sync variables +// +// The threading layer's threadlayer_sync_aux_t may include any +// additional members the layer needs to support the suspend/awaken +// callbacks efficiently. The FIFO tasking code itself does not +// refer to this type or the tl_aux member at all. 
+// +typedef struct { + aligned_t lockers_in; + aligned_t lockers_out; + uint_fast32_t uncontested_locks; + int is_full; + syncvar_t signal_full; + syncvar_t signal_empty; +} chpl_sync_aux_t; + +#define chpl_sync_reset(x) qthread_syncvar_empty(&(x)->sync_aux.signal_full) + +#define chpl_read_FE(x) ({ \ + uint64_t y; \ + qthread_syncvar_readFE(&y, &(x)->sync_aux.signal_full); \ + y; }) + +#define chpl_read_FF(x) ({ \ + uint64_t y; \ + qthread_syncvar_readFF(&y, &(x)->sync_aux.signal_full); \ + y; }) + +#define chpl_read_XX(x) ((x)->sync_aux.signal_full.u.s.data) + +#define chpl_write_EF(x, y) do { \ + uint64_t z = (uint64_t)(y); \ + qthread_syncvar_writeEF(&(x)->sync_aux.signal_full, &z); \ +} while(0) + +#define chpl_write_FF(x, y) do { \ + uint64_t z, dummy; \ + z = (uint64_t)(y); \ + qthread_syncvar_readFE(&dummy, &(x)->sync_aux.signal_full); \ + qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \ +} while(0) + +#define chpl_write_XF(x, y) do { \ + uint64_t z = (uint64_t)(y); \ + qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \ +} while(0) + +#define chpl_single_reset(x) qthread_syncvar_empty(&(x)->single_aux.signal_full) + +#define chpl_single_read_FF(x) ({ \ + uint64_t y; \ + qthread_syncvar_readFF(&y, &(x)->single_aux.signal_full); \ + y; }) + +#define chpl_single_write_EF(x, y) do { \ + uint64_t z = (uint64_t)(y); \ + qthread_syncvar_writeEF(&(x)->single_aux.signal_full, &z); \ +} while(0) + +#define chpl_single_read_XX(x) ((x)->single_aux.signal_full.u.s.data) + +// Tasks + +// +// Handy services for threading layer callback functions. +// +// The FIFO tasking implementation also provides the following service +// routines that can be used by threading layer callback functions. +// + +// +// The remaining declarations are all for callback functions to be +// provided by the threading layer. +// + +// +// These are called once each, from CHPL_TASKING_INIT() and +// CHPL_TASKING_EXIT(). 
+// +void threadlayer_init(void); +void threadlayer_exit(void); + +// +// Mutexes +// +/*void qthread_mutex_init(qthread_mutex_p); + * qthread_mutex_p qthread_mutex_new(void); + * void qthread_mutex_lock(qthread_mutex_p); + * void qthread_mutex_unlock(qthread_mutex_p);*/ + +// +// Sync variables +// +// The CHPL_SYNC_WAIT_{FULL,EMPTY}_AND_LOCK() functions should call +// threadlayer_sync_suspend() when a sync variable is not in the desired +// full/empty state. The call will be made with the sync variable's +// mutex held. (Thus, threadlayer_sync_suspend() can dependably tell +// that the desired state must be the opposite of the state it initially +// sees the variable in.) It should return (with the mutex again held) +// as soon as it can once either the sync variable changes to the +// desired state, or (if the given deadline pointer is non-NULL) the +// deadline passes. It can return also early, before either of these +// things occur, with no ill effects. If a deadline is given and it +// does pass, then threadlayer_sync_suspend() must return true; +// otherwise false. +// +// The less the function can execute while waiting for the sync variable +// to change state, and the quicker it can un-suspend when the variable +// does change state, the better overall performance will be. Obviously +// the sync variable's mutex must be unlocked while the routine waits +// for the variable to change state or the deadline to pass, or livelock +// may result. +// +// The CHPL_SYNC_MARK_AND_SIGNAL_{FULL,EMPTY}() functions will call +// threadlayer_sync_awaken() every time they are called, not just when +// they change the state of the sync variable. +// +// Threadlayer_sync_{init,destroy}() are called to initialize or +// destroy, respectively, the contents of the tl_aux member of the +// chpl_sync_aux_t for the specific threading layer. 
+// +/*chpl_bool threadlayer_sync_suspend(chpl_sync_aux_t *s, + * struct timeval *deadline); + * void threadlayer_sync_awaken(chpl_sync_aux_t *s); + * void threadlayer_sync_init(chpl_sync_aux_t *s); + * void threadlayer_sync_destroy(chpl_sync_aux_t *s);*/ + +// +// Task management +// + +// +// The interface for thread creation may need to be extended eventually +// to allow for specifying such things as stack sizes and/or locations. +// +/*int threadlayer_thread_create(threadlayer_threadID_t*, void*(*)(void*), void*);*/ + +// +// Threadlayer_pool_suspend() is called when a thread finds nothing in +// the pool of unclaimed tasks, and so has no work to do. The call will +// be made with the pointed-to mutex held. It should return (with the +// mutex again held) as soon as it can once either the task pool is no +// longer empty or (if the given deadline pointer is non-NULL) the +// deadline passes. It can return also early, before either of these +// things occur, with no ill effects. If a deadline is given and it +// does pass, then threadlayer_pool_suspend() must return true; +// otherwise false. +// +// The less the function can execute while waiting for the pool to +// become nonempty, and the quicker it can un-suspend when that happens, +// the better overall performance will be. +// +// The mutex passed to threadlayer_pool_suspend() is the one that +// provides mutual exclusion for changes to the task pool. Allowing +// access to this mutex simplifies the implementation for certain +// threading layers, such as those based on pthreads condition +// variables. However, it also introduces a complication in that it +// allows a threading layer to create deadlock or livelock situations if +// it is not careful. Certainly the mutex must be unlocked while the +// routine waits for the task pool to fill or the deadline to pass, or +// livelock may result. 
+// +/*chpl_bool threadlayer_pool_suspend(chpl_mutex_t*, struct timeval*); + * void threadlayer_pool_awaken(void);*/ + +// +// Thread private data +// +// These set and get a pointer to thread private data associated with +// each thread. This is for the use of the FIFO tasking implementation +// itself. If the threading layer also needs to store some data private +// to each thread, it must make other arrangements to do so. +// +/*void threadlayer_set_thread_private_data(void*); + * void* threadlayer_get_thread_private_data(void);*/ + + +#ifndef QTHREAD_MULTINODE +extern +#ifdef __cplusplus +"C" +#endif +volatile int chpl_qthread_done_initializing; +#endif + +typedef struct { + c_string task_filename; + int task_lineno; + chpl_taskID_t id; + chpl_bool is_executeOn; + c_sublocid_t requestedSubloc; // requested sublocal for task + chpl_task_prvData_t prvdata; +} chpl_task_prvDataImpl_t; + +// Define PRV_DATA_IMPL_VAL to set up a chpl_task_prvData_t. +#define PRV_DATA_IMPL_VAL(_fn, _ln, _id, _is_execOn, _subloc, _serial) \ + { .task_filename = _fn, \ + .task_lineno = _ln, \ + .id = _id, \ + .is_executeOn = _is_execOn, \ + .requestedSubloc = _subloc, \ + .prvdata = { .serial_state = _serial } } + +typedef struct { + void *fn; + void *args; + chpl_bool countRunning; + chpl_task_prvDataImpl_t chpl_data; +} chpl_qthread_wrapper_args_t; + +// Structure of task-local storage +typedef struct chpl_qthread_tls_s { + /* Task private data: serial state, etc. */ + chpl_task_prvDataImpl_t chpl_data; + /* Reports */ + c_string lock_filename; + size_t lock_lineno; +} chpl_qthread_tls_t; + +extern pthread_t chpl_qthread_process_pthread; +extern pthread_t chpl_qthread_comm_pthread; + +extern chpl_qthread_tls_t chpl_qthread_process_tls; +extern chpl_qthread_tls_t chpl_qthread_comm_task_tls; + +#define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized +void chpl_task_stdModulesInitialized(void); + +// Wrap qthread_get_tasklocal() and assert that it is always available. 
+static inline chpl_qthread_tls_t * chpl_qthread_get_tasklocal(void) +{ + chpl_qthread_tls_t* tls; + + if (chpl_qthread_done_initializing) { + tls = (chpl_qthread_tls_t *) + qthread_get_tasklocal(sizeof(chpl_qthread_tls_t)); + if (tls == NULL) { + pthread_t me = pthread_self(); + if (pthread_equal(me, chpl_qthread_comm_pthread)) + tls = &chpl_qthread_comm_task_tls; + else if (pthread_equal(me, chpl_qthread_process_pthread)) + tls = &chpl_qthread_process_tls; + } + assert(tls); + } + else + tls = NULL; + + return tls; +} + +#ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL +#error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GET_PRVDATA_IMPL_DECL 1 +#endif +static inline chpl_task_prvData_t* chpl_task_getPrvData(void) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + if (data) { + return &data->chpl_data.prvdata; + } + assert(data); + return NULL; +} + +#ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getSubloc(void) +{ + return (c_sublocid_t) qthread_shep(); +} + +#ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL +#error "CHPL_TASK_SETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_SETSUBLOC_IMPL_DECL 1 +#endif +static inline +void chpl_task_setSubloc(c_sublocid_t subloc) +{ + qthread_shepherd_id_t curr_shep; + + assert(subloc != c_sublocid_none); + + // Only change sublocales if the caller asked for a particular one, + // which is not the current one, and we're a (movable) task. + // + // Note: It's likely that this won't work in all cases where we need + // it. In particular, we envision needing to move execution + // from sublocale to sublocale while initializing the memory + // layer, in order to get the NUMA domain affinity right for + // the subparts of the heap. 
But this will be happening well + // before tasking init and in any case would be done from the + // main thread of execution, which doesn't have a shepherd. + // The code below wouldn't work in that situation. + if ((curr_shep = qthread_shep()) != NO_SHEPHERD) { + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + if (data) { + data->chpl_data.requestedSubloc = subloc; + } + + if (subloc != c_sublocid_any && + (qthread_shepherd_id_t) subloc != curr_shep) { + qthread_migrate_to((qthread_shepherd_id_t) subloc); + } + } +} + +#ifdef CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getRequestedSubloc(void) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + if (data) { + return data->chpl_data.requestedSubloc; + } + return c_sublocid_any; +} + +#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL +#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1 +#endif +extern int chpl_qthread_supports_remote_cache; +static inline +int chpl_task_supportsRemoteCache(void) { + return chpl_qthread_supports_remote_cache; +} + +#endif // ifndef _tasks_qthreads_h_ +/* vim:set expandtab: */ diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile new file mode 100644 index 0000000000..cb1bf64265 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile @@ -0,0 +1,39 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
+# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_ROOT = ../../.. +RUNTIME_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +ifndef CHPL_MAKE_HOME +export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. +endif + +include $(RUNTIME_ROOT)/make/Makefile.runtime.head + +TASKS_OBJDIR = $(RUNTIME_OBJDIR) +include Makefile.share + +TARGETS = $(TASKS_OBJS) + +include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules + +FORCE: + +# +# standard footer +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include new file mode 100644 index 0000000000..a510dc9a60 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.include @@ -0,0 +1,29 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +TASKS_OBJDIR = $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/$(RUNTIME_OBJDIR) + +# +# point to sources under third-party, at least until they're +# contributed back to the usual local directory +# +ALL_SRCS += $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.c \ + $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.h + +include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share new file mode 100644 index 0000000000..374f9cc760 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.share @@ -0,0 +1,25 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TASKS_SRCS = + +SVN_SRCS = $(TASKS_SRCS) +SRCS = $(SVN_SRCS) + +TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o) + +RUNTIME_CFLAGS += -I$(QTHREAD_INCLUDE) diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c new file mode 100644 index 0000000000..8902c2949e --- /dev/null +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -0,0 +1,963 @@ +// +// Qthreads implementation of Chapel tasking interface +// +// Copyright 2011 Sandia Corporation. Under the terms of Contract +// DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +// or on behalf of the U.S. Government. 
Export of this program may require a +// license from the United States Government +// + +/* + * Copyright 2004-2015 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// For SVID definitions (setenv) +#define _SVID_SOURCE + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "chplrt.h" +#include "chplsys.h" +#include "tasks-qthreads.h" +//#include "tasks-atmi.h" +#include "chplcgfns.h" +#include "chpl-comm.h" +#include "chpl-locale-model.h" +#include "chpl-tasks.h" +#include "chpl-tasks-callbacks-internal.h" +#include "config.h" +#include "error.h" +#include "arg.h" +#include "signal.h" +#include "chplexit.h" +#include +#include +#include +#include +#include +#include +#include +#include + +//#include + +#if 1 +#include "qthread/qthread.h" +#include "qthread/qtimer.h" +#include "qt_feb.h" +#include "qt_syncvar.h" +#include "qt_hash.h" +#include "qt_atomics.h" +#include "qt_shepherd_innards.h" +#include "qt_envariables.h" +#include "qt_debug.h" + +#ifdef QTHREAD_MULTINODE +#include "qthread/spr.h" +#endif /* QTHREAD_MULTINODE */ + +#include +#endif +#ifdef CHAPEL_PROFILE +# define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0) + +/* Tasks */ +static aligned_t profile_task_yield = 0; +static aligned_t profile_task_addToTaskList = 0; +static aligned_t profile_task_processTaskList = 0; +static 
aligned_t profile_task_executeTasksInList = 0; +static aligned_t profile_task_freeTaskList = 0; +static aligned_t profile_task_startMovedTask = 0; +static aligned_t profile_task_getId = 0; +static aligned_t profile_task_sleep = 0; +static aligned_t profile_task_getSerial = 0; +static aligned_t profile_task_setSerial = 0; +static aligned_t profile_task_getCallStackSize = 0; +/* Sync */ +static aligned_t profile_sync_lock= 0; +static aligned_t profile_sync_unlock= 0; +static aligned_t profile_sync_waitFullAndLock= 0; +static aligned_t profile_sync_waitEmptyAndLock= 0; +static aligned_t profile_sync_markAndSignalFull= 0; +static aligned_t profile_sync_markAndSignalEmpty= 0; +static aligned_t profile_sync_isFull= 0; +static aligned_t profile_sync_initAux= 0; +static aligned_t profile_sync_destroyAux= 0; + +static void profile_print(void) +{ + /* Tasks */ + fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield); + fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList); + fprintf(stderr, "task processTaskList: %lu\n", (unsigned long)profile_task_processTaskList); + fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList); + fprintf(stderr, "task freeTaskList: %lu\n", (unsigned long)profile_task_freeTaskList); + fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask); + fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId); + fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep); + fprintf(stderr, "task getSerial: %lu\n", (unsigned long)profile_task_getSerial); + fprintf(stderr, "task setSerial: %lu\n", (unsigned long)profile_task_setSerial); + fprintf(stderr, "task getCallStackSize: %lu\n", (unsigned long)profile_task_getCallStackSize); + /* Sync */ + fprintf(stderr, "sync lock: %lu\n", (unsigned long)profile_sync_lock); + fprintf(stderr, "sync unlock: %lu\n", (unsigned long)profile_sync_unlock); + 
fprintf(stderr, "sync waitFullAndLock: %lu\n", (unsigned long)profile_sync_waitFullAndLock); + fprintf(stderr, "sync waitEmptyAndLock: %lu\n", (unsigned long)profile_sync_waitEmptyAndLock); + fprintf(stderr, "sync markAndSignalFull: %lu\n", (unsigned long)profile_sync_markAndSignalFull); + fprintf(stderr, "sync markAndSignalEmpty: %lu\n", (unsigned long)profile_sync_markAndSignalEmpty); + fprintf(stderr, "sync isFull: %lu\n", (unsigned long)profile_sync_isFull); + fprintf(stderr, "sync initAux: %lu\n", (unsigned long)profile_sync_initAux); + fprintf(stderr, "sync destroyAux: %lu\n", (unsigned long)profile_sync_destroyAux); +} +#else +# define PROFILE_INCR(counter,count) +#endif /* CHAPEL_PROFILE */ + +#ifndef QTHREAD_MULTINODE +volatile int chpl_qthread_done_initializing; +#endif + +// Make qt env sizes uniform. Same as qt, but they use the literal everywhere +#define QT_ENV_S 100 + +// aka chpl_task_list_p +struct chpl_task_list { + chpl_fn_p fun; + void *arg; + c_string filename; + int lineno; + chpl_task_list_p next; +}; + +static aligned_t next_task_id = 1; + +pthread_t chpl_qthread_process_pthread; +pthread_t chpl_qthread_comm_pthread; + +chpl_qthread_tls_t chpl_qthread_process_tls = { + PRV_DATA_IMPL_VAL("
", 0, chpl_nullTaskID, false, + c_sublocid_any_val, false), + NULL, 0 }; + +chpl_qthread_tls_t chpl_qthread_comm_task_tls = { + PRV_DATA_IMPL_VAL("", 0, chpl_nullTaskID, false, + c_sublocid_any_val, false), + NULL, 0 }; + +// +// QTHREADS_SUPPORTS_REMOTE_CACHE is set in the Chapel Qthreads +// Makefile, based on the Qthreads scheduler configuration. +// +int chpl_qthread_supports_remote_cache = QTHREADS_SUPPORTS_REMOTE_CACHE; + +// +// structs chpl_task_prvDataImpl_t, chpl_qthread_wrapper_args_t and +// chpl_qthread_tls_t have been moved to tasks-qthreads.h +// + +// +// chpl_qthread_get_tasklocal() is in tasks-qthreads.h +// + +static syncvar_t exit_ret = SYNCVAR_STATIC_EMPTY_INITIALIZER; + +static volatile chpl_bool canCountRunningTasks = false; + +void chpl_task_yield(void) +{ + PROFILE_INCR(profile_task_yield,1); + if (qthread_shep() == NO_SHEPHERD) { + sched_yield(); + } else { + qthread_yield(); + } +} + +// Sync variables +void chpl_sync_lock(chpl_sync_aux_t *s) +{ + aligned_t l; + chpl_bool uncontested_lock = true; + + PROFILE_INCR(profile_sync_lock, 1); + + // + // To prevent starvation due to never switching away from a task that is + // spinning while doing readXX() on a sync variable, yield if this sync var + // has a "lot" of uncontested locks. Note that the uncontested locks do not + // have to be consecutive. Also note that the number of uncontested locks + // is a lossy counter. Currently a "lot" is defined as ~100 uncontested + // locks, with care taken to not yield on the first uncontested lock. + // + // If real qthreads sync vars were used, it's possible this wouldn't be + // needed. 
+ // + + l = qthread_incr(&s->lockers_in, 1); + + while (l != s->lockers_out) { + uncontested_lock = false; + qthread_yield(); + } + + if (uncontested_lock) { + if ((++s->uncontested_locks & 0x5F) == 0) { + qthread_yield(); + } + } +} + +void chpl_sync_unlock(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_unlock, 1); + + qthread_incr(&s->lockers_out, 1); +} + +static inline void about_to_block(int32_t lineno, + c_string filename) +{ + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + assert(data); + + data->lock_lineno = lineno; + data->lock_filename = filename; +} + +void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s, + int32_t lineno, + c_string filename) +{ + PROFILE_INCR(profile_sync_waitFullAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full == 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_full)); + chpl_sync_lock(s); + } +} + +void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s, + int32_t lineno, + c_string filename) +{ + PROFILE_INCR(profile_sync_waitEmptyAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full != 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_empty)); + chpl_sync_lock(s); + } +} + +void chpl_sync_markAndSignalFull(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalFull, 1); + + qthread_syncvar_fill(&(s->signal_full)); + s->is_full = 1; + chpl_sync_unlock(s); +} + +void chpl_sync_markAndSignalEmpty(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalEmpty, 1); + + qthread_syncvar_fill(&(s->signal_empty)); + s->is_full = 0; + chpl_sync_unlock(s); +} + +chpl_bool chpl_sync_isFull(void *val_ptr, + chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_isFull, 1); + + return s->is_full; +} + +void chpl_sync_initAux(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_initAux, 1); + + s->lockers_in = 0; + s->lockers_out = 0; 
+ s->is_full = 0; + s->signal_empty = SYNCVAR_EMPTY_INITIALIZER; + s->signal_full = SYNCVAR_EMPTY_INITIALIZER; +} + +void chpl_sync_destroyAux(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_destroyAux, 1); +} + +static void chapel_display_thread(qt_key_t addr, + qthread_f f, + void *arg, + void *retloc, + unsigned int thread_id, + void *tls, + void *callarg) +{ + chpl_qthread_tls_t *rep = (chpl_qthread_tls_t *)tls; + + if (rep) { + if ((rep->lock_lineno > 0) && rep->lock_filename) { + fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", rep->lock_filename, rep->lock_lineno, rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0 && rep->lock_filename) { + fprintf(stderr, "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", rep->lock_filename, rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0) { + fprintf(stderr, "Waiting for dependencies (uninitialized task %s:%zu)\n", rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + } + fflush(stderr); + } +} + +static void report_locked_threads(void) +{ + qthread_feb_callback(chapel_display_thread, NULL); + qthread_syncvar_callback(chapel_display_thread, NULL); +} + +static void SIGINT_handler(int sig) +{ + signal(sig, SIG_IGN); + + if (blockreport) { + report_locked_threads(); + } + + if (taskreport) { + fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n"); + // report_all_tasks(); + } + + chpl_exit_any(1); +} + +// Tasks + +#ifndef QTHREAD_MULTINODE +static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER; +static volatile int done_finalizing = 0; + +static void *initializer(void *junk) +{ + qthread_initialize(); + MACHINE_FENCE; + chpl_qthread_done_initializing = 1; + + qthread_syncvar_readFF(NULL, &canexit); + + qthread_finalize(); + MACHINE_FENCE; + done_finalizing = 1; + return NULL; +} +#endif /* ! QTHREAD_MULTINODE */ + +// Helper function to set a qthreads env var. 
This is meant to mirror setenv
+// functionality, but qthreads has two environment variables for every setting:
+// a QT_ and a QTHREAD_ version. We often forget to think about both so this
+// wraps the overriding logic. In verbose mode it prints out if we overrode
+// values, or if we were prevented from setting values because they existed
+// (and override was 0.)
+//
+//   var      - setting name without its prefix (e.g. "STACK_SIZE")
+//   val      - value assigned to both the QT_ and the QTHREAD_ variable
+//   override - non-zero replaces values that are already set; zero leaves
+//              the environment untouched if either variable already exists
+//
+// If a prefixed name does not fit in QT_ENV_S bytes, nothing is set.
+static void chpl_qt_setenv(char* var, char* val, int32_t override) {
+    char qt_env[QT_ENV_S] = { 0 };
+    char qthread_env[QT_ENV_S] = { 0 };
+    char *qt_val;
+    char *qthread_val;
+    int qt_len;
+    int qthread_len;
+    chpl_bool eitherSet = false;
+
+    // Build both names with snprintf(), whose size argument bounds the whole
+    // destination. strncat()'s count only limits the bytes appended, so it
+    // cannot protect a buffer that already holds the prefix.
+    qt_len = snprintf(qt_env, sizeof(qt_env), "QT_%s", var);
+    qthread_len = snprintf(qthread_env, sizeof(qthread_env), "QTHREAD_%s", var);
+    if (qt_len < 0 || (size_t)qt_len >= sizeof(qt_env) ||
+        qthread_len < 0 || (size_t)qthread_len >= sizeof(qthread_env)) {
+        // A truncated name would configure some other variable; do nothing.
+        if (verbosity >= 2) {
+            printf("QTHREADS: Not setting %s because the variable name is "
+                   "too long\n", var);
+        }
+        return;
+    }
+
+    qt_val = getenv(qt_env);
+    qthread_val = getenv(qthread_env);
+    eitherSet = (qt_val != NULL || qthread_val != NULL);
+
+    if (override || !eitherSet) {
+        if (verbosity >= 2 && override && eitherSet) {
+            printf("QTHREADS: Overriding the value of %s and %s "
+                   "with %s\n", qt_env, qthread_env, val);
+        }
+        // Keep the two spellings in sync by always writing both.
+        (void) setenv(qt_env, val, 1);
+        (void) setenv(qthread_env, val, 1);
+    } else if (verbosity >= 2) {
+        // Report whichever spelling blocked us, preferring the QT_ one.
+        char* set_env = NULL;
+        char* set_val = NULL;
+        if (qt_val != NULL) {
+            set_env = qt_env;
+            set_val = qt_val;
+        } else {
+            set_env = qthread_env;
+            set_val = qthread_val;
+        }
+        printf("QTHREADS: Not setting %s to %s because %s is set to %s and "
+               "overriding was not requested\n", qt_env, val, set_env, set_val);
+    }
+}
+
+// Determine the number of workers based on environment settings. If a user set
+// HWPAR, they are saying they want to use HWPAR many workers, but let the
+// runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or
+// NUM_WORKERS_PER_SHEPHERD then they must have specific reasons for doing so.
+// Returns 0 if no Qthreads env vars related to the number of threads were set, +// what HWPAR was set to if it was set, or -1 if NUM_SHEP and/or NUM_WPS were +// set since we can't figure out before Qthreads init what this will actually +// turn into without duplicating Qthreads logic (-1 is a sentinel for don't +// adjust the values, and give them as is to Qthreads.) +static int32_t chpl_qt_getenv_num_workers() { + int32_t hwpar; + int32_t num_wps; + int32_t num_sheps; + + hwpar = qt_internal_get_env_num("HWPAR", 0, 0); + num_wps = qt_internal_get_env_num("NUM_WORKERS_PER_SHEPHERD", 0, 0); + num_sheps = qt_internal_get_env_num("NUM_SHEPHERDS", 0, 0); + + if (hwpar) { + return hwpar; + } else if (num_wps || num_sheps) { + return -1; + } + + return 0; +} + + +// Sets up and returns the amount of hardware parallelism to use, limited to +// maxThreads. Returns -1 if we did not setup parallelism because a user +// explicitly requested a specific layout from qthreads. +static int32_t setupAvailableParallelism(int32_t maxThreads) { + int32_t numThreadsPerLocale; + int32_t qtEnvThreads; + int32_t hwpar; + char newenv_workers[QT_ENV_S] = { 0 }; + + // Experience has shown that Qthreads generally performs best with + // num_workers = numCores (and thus worker_unit = core) but if the user has + // explicitly requested more threads through the chapel or Qthread env + // vars, we override the default. + numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); + qtEnvThreads = chpl_qt_getenv_num_workers(); + hwpar = 0; + + // User set chapel level env var (CHPL_RT_NUM_THREADS_PER_LOCALE) + // This is limited to the number of logical CPUs on the node. 
+ if (numThreadsPerLocale != 0) { + int32_t numPUsPerLocale; + + hwpar = numThreadsPerLocale; + + numPUsPerLocale = chpl_getNumLogicalCpus(true); + if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) { + if (verbosity >= 2) { + printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d " + "to prevent oversubscription of the system.\n", + hwpar, numPUsPerLocale); + } + + // Do not oversubscribe the system, use all available resources. + hwpar = numPUsPerLocale; + } + } + // User set qthreads level env var + // (HWPAR or (NUM_SHEPHERDS and NUM_WORKERS_PER_SHEPHERD)) + else if (qtEnvThreads != 0) { + hwpar = qtEnvThreads; + } + // User did not set chapel or qthreads vars -- our default + else { + hwpar = chpl_getNumPhysicalCpus(true); + } + + // hwpar will only be <= 0 if the user set QT_NUM_SHEPHERDS and/or + // QT_NUM_WORKERS_PER_SHEPHERD in which case we assume as "expert" user and + // don't impose any thread limits or set worker_unit. + if (hwpar > 0) { + // Limit the parallelism to the maximum imposed by the comm layer. + if (0 < maxThreads && maxThreads < hwpar) { + hwpar = maxThreads; + } + + // If there is more parallelism requested than the number of cores, set the + // worker unit to pu, otherwise core. + if (hwpar > chpl_getNumPhysicalCpus(true)) { + chpl_qt_setenv("WORKER_UNIT", "pu", 0); + } else { + chpl_qt_setenv("WORKER_UNIT", "core", 0); + } + + // Unset relevant Qthreads environment variables. 
+ qt_internal_unset_envstr("HWPAR"); + qt_internal_unset_envstr("NUM_SHEPHERDS"); + qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD"); + + snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar); + if (THREADQUEUE_POLICY_TRUE == qt_threadqueue_policy(SINGLE_WORKER)) { + chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1); + chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1); + } else { + chpl_qt_setenv("HWPAR", newenv_workers, 1); + } + } + return hwpar; +} + +static void setupCallStacks(int32_t hwpar) { + size_t callStackSize; + + // If the user compiled with no stack checks (either explicitly or + // implicitly) turn off qthread guard pages. TODO there should also be a + // chpl level env var backing this at runtime (can be the same var.) + // Also turn off guard pages if the heap page size isn't the same as + // the system page size, because when that's the case we can reliably + // make the guard pages un-referenceable. (This typically arises when + // the heap is on hugepages, as is often the case on Cray systems.) + // + // Note that we won't override an explicit setting of QT_GUARD_PAGES + // in the former case, but we do in the latter case. + if (CHPL_STACK_CHECKS == 0) { + chpl_qt_setenv("GUARD_PAGES", "false", 0); + } + else if (chpl_getHeapPageSize() != chpl_getSysPageSize()) { + chpl_qt_setenv("GUARD_PAGES", "false", 1); + } + + // Precedence (high-to-low): + // 1) Chapel environment (CHPL_RT_CALL_STACK_SIZE) + // 2) QTHREAD_STACK_SIZE + // 3) Chapel default + if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 || + (qt_internal_get_env_num("STACK_SIZE", 0, 0) == 0 && + (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) { + char newenv_stack[QT_ENV_S]; + snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize); + chpl_qt_setenv("STACK_SIZE", newenv_stack, 1); + + // Qthreads sets up memory pools expecting the item_size to be small. 
+ // Stacks are allocated in this manner too, but our default stack size + // is quite large, so we limit the max memory allocated for a pool. We + // default to a multiple of callStackSize and hwpar, with the thought + // that available memory is generally proportional to the amount of + // parallelism. For some architectures, this isn't true so we set a max + // upper bound. And if the callStackSize is small, we don't want to + // limit all qthreads pool allocations to a small value, so we have a + // lower bound as well. Note that qthread stacks are slightly larger + // than specified to store a book keeping structure and possibly guard + // pages, so we thrown an extra MB. + if (hwpar > 0) { + const size_t oneMB = 1024 * 1024; + const size_t allocSizeLowerBound = 33 * oneMB; + const size_t allocSizeUpperBound = 513 * oneMB; + size_t maxPoolAllocSize; + char newenv_alloc[QT_ENV_S]; + + maxPoolAllocSize = 2 * hwpar * callStackSize + oneMB; + if (maxPoolAllocSize < allocSizeLowerBound) { + maxPoolAllocSize = allocSizeLowerBound; + } else if (maxPoolAllocSize > allocSizeUpperBound) { + maxPoolAllocSize = allocSizeUpperBound; + } + snprintf(newenv_alloc, sizeof(newenv_alloc), "%zu", maxPoolAllocSize); + chpl_qt_setenv("MAX_POOL_ALLOC_SIZE", newenv_alloc, 0); + } + } +} + +void chpl_task_init(void) +{ + int32_t commMaxThreads; + int32_t hwpar; + pthread_t initer; + + chpl_qthread_process_pthread = pthread_self(); + chpl_qthread_process_tls.chpl_data.id = qthread_incr(&next_task_id, 1); + + commMaxThreads = chpl_comm_getMaxThreads(); + + // Setup hardware parallelism, the stack size, and stack guards + hwpar = setupAvailableParallelism(commMaxThreads); + setupCallStacks(hwpar); + + if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); } + + // Initialize qthreads + pthread_create(&initer, NULL, initializer, NULL); + while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY(); + + // Now that Qthreads is up and running, do a sanity check and make sure + // that the 
number of workers is less than any comm layer limit. This is + // mainly need for the case where a user set QT_NUM_SHEPHERDS and/or + // QT_NUM_WORKERS_PER_SHEPHERD in which case we don't impose any limits on + // the number of threads qthreads creates beforehand + assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads); + + if (blockreport || taskreport) { + if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { + perror("Could not register SIGINT handler"); + } + } +} + +void chpl_task_exit(void) +{ +#ifdef CHAPEL_PROFILE + profile_print(); +#endif /* CHAPEL_PROFILE */ + +#ifdef QTHREAD_MULTINODE +#else + if (qthread_shep() == NO_SHEPHERD) { + /* sometimes, tasking is told to shutdown even though it hasn't been + * told to start yet */ + if (chpl_qthread_done_initializing == 1) { + qthread_syncvar_fill(&canexit); + while (done_finalizing == 0) SPINLOCK_BODY(); + } + } else { + qthread_syncvar_fill(&exit_ret); + } +#endif /* QTHREAD_MULTINODE */ +} + +static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, + chpl_task_prvDataImpl_t *chpl_data) { + if (chpl_task_have_callbacks(event_kind)) { + if (chpl_data->id == chpl_nullTaskID) + chpl_data->id = qthread_incr(&next_task_id, 1); + chpl_task_do_callbacks(event_kind, + chpl_data->task_filename, + chpl_data->task_lineno, + chpl_data->id, + chpl_data->is_executeOn); + } +} + +static aligned_t chapel_wrapper(void *arg) +{ + chpl_qthread_wrapper_args_t *rarg = arg; + chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + + data->chpl_data = rarg->chpl_data; + data->lock_filename = NULL; + data->lock_lineno = 0; + + if (rarg->countRunning) { + chpl_taskRunningCntInc(0, NULL); + } + + wrap_callbacks(chpl_task_cb_event_kind_begin, &data->chpl_data); + + (*(chpl_fn_p)(rarg->fn))(rarg->args); + + wrap_callbacks(chpl_task_cb_event_kind_end, &data->chpl_data); + + if (rarg->countRunning) { + chpl_taskRunningCntDec(0, NULL); + } + + return 0; +} + +typedef struct { + chpl_fn_p fn; + void *arg; +} 
comm_task_wrapper_info_t; + +static void *comm_task_wrapper(void *arg) +{ + comm_task_wrapper_info_t *rarg = arg; + chpl_moveToLastCPU(); + (*(chpl_fn_p)(rarg->fn))(rarg->arg); + return 0; +} + +// Start the main task. +// +// Warning: this method is not called within a Qthread task context. Do +// not use methods that require task context (e.g., task-local storage). +void chpl_task_callMain(void (*chpl_main)(void)) +{ + chpl_qthread_wrapper_args_t wrapper_args = + {chpl_main, NULL, false, + PRV_DATA_IMPL_VAL("
", 0, + chpl_qthread_process_tls.chpl_data.id, false, + c_sublocid_any_val, false) }; + + qthread_debug(CHAPEL_CALLS, "[%d] begin chpl_task_callMain()\n", chpl_nodeID); + +#ifdef QTHREAD_MULTINODE + qthread_debug(CHAPEL_BEHAVIOR, "[%d] calling spr_unify\n", chpl_nodeID); + int const rc = spr_unify(); + assert(SPR_OK == rc); +#endif /* QTHREAD_MULTINODE */ + + wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data); + + qthread_fork_syncvar(chapel_wrapper, &wrapper_args, &exit_ret); + qthread_syncvar_readFF(NULL, &exit_ret); + + qthread_debug(CHAPEL_BEHAVIOR, "[%d] main task finished\n", chpl_nodeID); + qthread_debug(CHAPEL_CALLS, "[%d] end chpl_task_callMain()\n", chpl_nodeID); +} + +void chpl_task_stdModulesInitialized(void) +{ + // + // It's not safe to call the module code to count the main task as + // running until after the modules have been initialized. That's + // when this function is called, so now count the main task. + // + canCountRunningTasks = true; + chpl_taskRunningCntInc(0, NULL); +} + +int chpl_task_createCommTask(chpl_fn_p fn, + void *arg) +{ +#ifndef QTHREAD_MULTINODE + // + // The wrapper info must be static because it won't be referred to + // until the new pthread calls comm_task_wrapper(). And, it is + // safe for it to be static because we will be called at most once + // on each node. + // + static + comm_task_wrapper_info_t wrapper_info; + wrapper_info.fn = fn; + wrapper_info.arg = arg; + return pthread_create(&chpl_qthread_comm_pthread, + NULL, comm_task_wrapper, &wrapper_info); +#else + return 0; +#endif +} + +void chpl_task_addToTaskList(chpl_fn_int_t fid, + void *arg, + c_sublocid_t subloc, + chpl_task_list_p *task_list, + int32_t task_list_locale, + chpl_bool is_begin_stmt, + int lineno, + c_string filename) +{ + chpl_bool serial_state = chpl_task_getSerial(); + + assert(subloc != c_sublocid_none); + + PROFILE_INCR(profile_task_addToTaskList,1); + + if (serial_state) { + // call the function directly. 
+ (chpl_ftable[fid])(arg); + } else { + chpl_qthread_wrapper_args_t wrapper_args = + {chpl_ftable[fid], arg, false, + PRV_DATA_IMPL_VAL(filename, lineno, chpl_nullTaskID, false, + subloc, serial_state) }; + + wrap_callbacks(chpl_task_cb_event_kind_create, + &wrapper_args.chpl_data); + + if (subloc == c_sublocid_any) { + qthread_fork_copyargs(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL, + (qthread_shepherd_id_t) subloc); + } + } +} + +void chpl_task_processTaskList(chpl_task_list_p task_list) +{ + PROFILE_INCR(profile_task_processTaskList,1); +} + +void chpl_task_executeTasksInList(chpl_task_list_p task_list) +{ + PROFILE_INCR(profile_task_executeTasksInList,1); +} + +void chpl_task_freeTaskList(chpl_task_list_p task_list) +{ + PROFILE_INCR(profile_task_freeTaskList,1); +} + +void chpl_task_startMovedTask(chpl_fn_p fp, + void *arg, + c_sublocid_t subloc, + chpl_taskID_t id, + chpl_bool serial_state) +{ + assert(subloc != c_sublocid_none); + assert(id == chpl_nullTaskID); + + chpl_qthread_wrapper_args_t wrapper_args = + {fp, arg, canCountRunningTasks, + PRV_DATA_IMPL_VAL("", 0, chpl_nullTaskID, true, + subloc, serial_state) }; + + PROFILE_INCR(profile_task_startMovedTask,1); + + wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data); + + if (subloc == c_sublocid_any) { + qthread_fork_copyargs(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args, + sizeof(chpl_qthread_wrapper_args_t), NULL, + (qthread_shepherd_id_t) subloc); + } +} + +// +// chpl_task_getSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_setSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_getRequestedSubloc() is in tasks-qthreads.h +// + + +// Returns '(unsigned int)-1' if called outside of the tasking layer. 
+chpl_taskID_t chpl_task_getId(void)
+{
+    chpl_qthread_tls_t * tls = chpl_qthread_get_tasklocal();
+
+    PROFILE_INCR(profile_task_getId,1);
+
+    if (tls == NULL)
+        return (chpl_taskID_t) -1;
+
+    // IDs are assigned lazily, the first time one is asked for.
+    if (tls->chpl_data.id == chpl_nullTaskID)
+        tls->chpl_data.id = qthread_incr(&next_task_id, 1);
+
+    return tls->chpl_data.id;
+}
+
+// Suspend the caller for at least 'secs' seconds. Outside of a shepherd
+// this is a plain sleep(); on a shepherd the task yields repeatedly until
+// the elapsed time reaches 'secs'.
+void chpl_task_sleep(int secs)
+{
+    PROFILE_INCR(profile_task_sleep,1);
+
+    if (qthread_shep() == NO_SHEPHERD) {
+        sleep(secs);
+    } else {
+        qtimer_t t = qtimer_create();
+        qtimer_start(t);
+        do {
+            qthread_yield();
+            qtimer_stop(t);
+        } while (qtimer_secs(t) < secs);
+        // Release the timer; one is created on every call.
+        qtimer_destroy(t);
+    }
+}
+
+/* The get- and setSerial() methods read and write the serial_state field
+ * of the calling task's private data. */
+chpl_bool chpl_task_getSerial(void)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+
+    PROFILE_INCR(profile_task_getSerial,1);
+
+    return data->chpl_data.prvdata.serial_state;
+}
+
+void chpl_task_setSerial(chpl_bool state)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    data->chpl_data.prvdata.serial_state = state;
+
+    PROFILE_INCR(profile_task_setSerial,1);
+}
+
+uint32_t chpl_task_getMaxPar(void) {
+    //
+    // We assume here that the caller (in the LocaleModel module code)
+    // is interested in the number of workers on the whole node, and
+    // will decide itself how much parallelism to create across and
+    // within sublocales, if there are any.
+    //
+    return (uint32_t) qthread_num_workers();
+}
+
+c_sublocid_t chpl_task_getNumSublocales(void)
+{
+    // FIXME: What we really want here is the number of NUMA
+    // sublocales we are supporting. For now we use the number of
+    // shepherds as a proxy for that.
+    return (c_sublocid_t) qthread_num_shepherds();
+}
+
+size_t chpl_task_getCallStackSize(void)
+{
+    PROFILE_INCR(profile_task_getCallStackSize,1);
+
+    return qthread_readstate(STACK_SIZE);
+}
+
+// XXX: Should probably reflect all shepherds
+uint32_t chpl_task_getNumQueuedTasks(void)
+{
+    return qthread_readstate(NODE_BUSYNESS);
+}
+
+// Not supported by this layer: reports an internal error if called.
+uint32_t chpl_task_getNumRunningTasks(void)
+{
+    chpl_internal_error("chpl_task_getNumRunningTasks() called");
+    return 1;
+}
+
+int32_t chpl_task_getNumBlockedTasks(void)
+{
+    // This isn't accurate, but in the absence of better information
+    // it's the best we can do.
+    return 0;
+}
+
+// Threads
+
+uint32_t chpl_task_getNumThreads(void)
+{
+    return (uint32_t)qthread_num_workers();
+}
+
+// Ew. Talk about excessive bookkeeping.
+// Idle threads are not tracked, so this always reports 0.
+uint32_t chpl_task_getNumIdleThreads(void)
+{
+    return 0;
+}
+
+/* vim:set expandtab: */
diff --git a/util/printchplenv b/util/printchplenv index e66b1d2694..5ad79dff86 100755 --- a/util/printchplenv +++ b/util/printchplenv @@ -175,6 +175,8 @@ def print_mode(mode='list', anonymize=False): mode, filters=('runtime',)) if tasks == 'qthreads': link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args()) + if tasks == 'atmi': + link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args()) print_var(' CHPL_RE2_UNIQ_CFG_PATH', chpl_3p_re2_configs.get_uniq_cfg_path(), diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash index 3625a4657f..0dee283cc9 100644 --- a/util/setchplenv_hsa.bash +++ b/util/setchplenv_hsa.bash @@ -61,8 +61,8 @@ export CHPL_COMM=none echo " to none" echo -n "Setting CHPL_TASKS" -export CHPL_TASKS=qthreads -echo " to qthreads" +export CHPL_TASKS=atmi +echo " to atmi" echo -n "Setting CHPL_ATOMICS" export CHPL_ATOMICS=intrinsics @@ -92,6 +92,10 @@ echo -n "Disabling NUMA" export CHPL_HWLOC_CFG_OPTIONS=" --disable-libnuma" echo " done" +echo -n "Setting CHPL_HWLOC" +export CHPL_HWLOC=hwloc +echo " to hwloc" + echo -n "Disabling LLVM support" export CHPL_LLVM=none echo " done" From
891dd14a2b0db577b8b770296633a7a62bb443d5 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Wed, 15 Feb 2017 13:50:58 -0600 Subject: [PATCH 03/10] Updated ATMI tasking layer to master branch; ATMI layer should be called by setting the CHPL_TASKS to atmi instead of qthreads --- modules/internal/LocaleModelHelpSetup.chpl | 6 +- .../localeModels/hsa/LocaleModel.chpl | 2 +- runtime/include/chpl-atmi.h | 13 +- runtime/include/chpltypes.h | 1 + runtime/include/tasks/atmi/tasks-atmi.h | 314 ++-------- runtime/src/chpl-atmi.c | 5 +- runtime/src/tasks/atmi/Makefile | 5 +- runtime/src/tasks/atmi/Makefile.include | 12 +- runtime/src/tasks/atmi/Makefile.share | 6 +- runtime/src/tasks/atmi/tasks-atmi.c | 580 ++++++++++++------ 10 files changed, 457 insertions(+), 487 deletions(-) diff --git a/modules/internal/LocaleModelHelpSetup.chpl b/modules/internal/LocaleModelHelpSetup.chpl index 69c3562ec1..93ab89f26d 100644 --- a/modules/internal/LocaleModelHelpSetup.chpl +++ b/modules/internal/LocaleModelHelpSetup.chpl @@ -172,9 +172,6 @@ module LocaleModelHelpSetup { helpSetupLocaleFlat(dst, local_name); - extern proc chpl_task_getNumSublocales(): int(32); - numSublocales = chpl_task_getNumSublocales(); - extern proc chpl_task_getMaxPar(): uint(32); @@ -191,7 +188,8 @@ module LocaleModelHelpSetup { then local_name = chpl_nodeName():string + "-" + _node_id : string; else local_name = chpl_nodeName():string; - numSublocales = 2; + extern proc chpl_task_getNumSublocales(): int(32); + numSublocales = chpl_task_getNumSublocales(); const origSubloc = chpl_task_getRequestedSubloc(); diff --git a/modules/internal/localeModels/hsa/LocaleModel.chpl b/modules/internal/localeModels/hsa/LocaleModel.chpl index 8c54136b3e..627cfd0d19 100644 --- a/modules/internal/localeModels/hsa/LocaleModel.chpl +++ b/modules/internal/localeModels/hsa/LocaleModel.chpl @@ -157,7 +157,7 @@ module LocaleModel { return {0..#numSublocales}; } - proc getChildCount() return 0; + proc getChildCount() return numSublocales; iter 
getChildIndices() : int { for idx in {0..#numSublocales} do // chpl_emptyLocaleSpace do diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h index 0f4c07feae..6285760810 100644 --- a/runtime/include/chpl-atmi.h +++ b/runtime/include/chpl-atmi.h @@ -14,21 +14,12 @@ atmi_kernel_t reduction_kernel; atmi_kernel_t *gpu_kernels; +atmi_machine_t *g_machine; + enum { GPU_KERNEL_IMPL = 10565, REDUCTION_GPU_IMPL = 42 }; -/* -typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) { - uint64_t in; - uint64_t out; - uint32_t count; -} hsail_reduce_kernarg_t; - -typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) { - uint64_t bundle; -} hsail_kernarg_t; -*/ int chpl_hsa_initialize(void); diff --git a/runtime/include/chpltypes.h b/runtime/include/chpltypes.h index 0e6b4644df..439bef780b 100644 --- a/runtime/include/chpltypes.h +++ b/runtime/include/chpltypes.h @@ -189,6 +189,7 @@ typedef int32_t chpl_bool32; typedef int64_t chpl_bool64; typedef void (*chpl_fn_p)(void*); // function pointer for runtime ftable +typedef const char *chpl_fn_name; typedef int16_t chpl_fn_int_t; // int type for ftable indexing // Function table names and information, for VisualDebug use diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h index 58dab0a982..81ad131901 100644 --- a/runtime/include/tasks/atmi/tasks-atmi.h +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -6,7 +6,7 @@ **************************************************************************/ /* - * Copyright 2004-2015 Cray Inc. + * Copyright 2004-2017 Cray Inc. * Other additional copyright holders may be indicated within. 
* * The entirety of this work is licensed under the Apache License, @@ -27,68 +27,25 @@ #ifndef _tasks_qthreads_h_ #define _tasks_qthreads_h_ -#include +#include "chpl-atmi.h" +#include "chpl-tasks-prvdata.h" +#include "chpltypes.h" + +#include "qthread.h" +#include "qthread-chapel.h" #include -#include -#include #include +#include +#include -#include "chpltypes.h" -#include "chpl-tasks-prvdata.h" +#ifdef __cplusplus +extern "C" { +#endif #define CHPL_COMM_YIELD_TASK_WHILE_POLLING void chpl_task_yield(void); -// For mutexes -// type(s) -// threadlayer_mutex_t -// threadlayer_mutex_p -// functions -// threadlayer_mutex_init() -// threadlayer_mutex_new() -// threadlayer_mutex_lock() -// threadlayer_mutex_unlock() -// -// For thread management -// type(s) -// -// functions -// threadlayer_thread_id() -// threadlayer_thread_cancel() -// threadlayer_thread_join() -// -// For sync variables -// type(s) -// threadlayer_sync_aux_t -// functions -// threadlayer_sync_suspend() -// threadlayer_sync_awaken() -// threadlayer_sync_init() -// threadlayer_sync_destroy() -// -// For task management -// type(s) -// -// functions -// threadlayer_init() -// threadlayer_thread_create() -// threadlayer_pool_suspend() -// threadlayer_pool_awaken() -// threadlayer_get_thread_private_data() -// threadlayer_set_thread_private_data() -// -// The types are declared in the threads-*.h file for each specific -// threading layer, and the callback functions are declared here. The -// interfaces and requirements for these other types and callback -// functions are described elsewhere in this file. -// -// Although the above list may seem long, in practice many of the -// functions are quite simple, and with luck also easily extrapolated -// from what is done for other threading layers. For an example of an -// implementation, see "pthreads" threading. -// - // // Type (and default value) used to communicate task identifiers // between C code and Chapel code in the runtime. 
@@ -96,19 +53,9 @@ void chpl_task_yield(void); typedef unsigned int chpl_taskID_t; #define chpl_nullTaskID QTHREAD_NULL_TASK_ID -// -// Mutexes -// -typedef syncvar_t chpl_mutex_t; - // // Sync variables // -// The threading layer's threadlayer_sync_aux_t may include any -// additional members the layer needs to support the suspend/awaken -// callbacks efficiently. The FIFO tasking code itself does not -// refer to this type or the tl_aux member at all. -// typedef struct { aligned_t lockers_in; aligned_t lockers_out; @@ -118,207 +65,43 @@ typedef struct { syncvar_t signal_empty; } chpl_sync_aux_t; -#define chpl_sync_reset(x) qthread_syncvar_empty(&(x)->sync_aux.signal_full) - -#define chpl_read_FE(x) ({ \ - uint64_t y; \ - qthread_syncvar_readFE(&y, &(x)->sync_aux.signal_full); \ - y; }) - -#define chpl_read_FF(x) ({ \ - uint64_t y; \ - qthread_syncvar_readFF(&y, &(x)->sync_aux.signal_full); \ - y; }) - -#define chpl_read_XX(x) ((x)->sync_aux.signal_full.u.s.data) - -#define chpl_write_EF(x, y) do { \ - uint64_t z = (uint64_t)(y); \ - qthread_syncvar_writeEF(&(x)->sync_aux.signal_full, &z); \ -} while(0) - -#define chpl_write_FF(x, y) do { \ - uint64_t z, dummy; \ - z = (uint64_t)(y); \ - qthread_syncvar_readFE(&dummy, &(x)->sync_aux.signal_full); \ - qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \ -} while(0) - -#define chpl_write_XF(x, y) do { \ - uint64_t z = (uint64_t)(y); \ - qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \ -} while(0) - -#define chpl_single_reset(x) qthread_syncvar_empty(&(x)->single_aux.signal_full) - -#define chpl_single_read_FF(x) ({ \ - uint64_t y; \ - qthread_syncvar_readFF(&y, &(x)->single_aux.signal_full); \ - y; }) - -#define chpl_single_write_EF(x, y) do { \ - uint64_t z = (uint64_t)(y); \ - qthread_syncvar_writeEF(&(x)->single_aux.signal_full, &z); \ -} while(0) - -#define chpl_single_read_XX(x) ((x)->single_aux.signal_full.u.s.data) - -// Tasks - -// -// Handy services for threading layer callback functions. 
-// -// The FIFO tasking implementation also provides the following service -// routines that can be used by threading layer callback functions. -// - -// -// The remaining declarations are all for callback functions to be -// provided by the threading layer. -// - // -// These are called once each, from CHPL_TASKING_INIT() and -// CHPL_TASKING_EXIT(). +// Task private data // -void threadlayer_init(void); -void threadlayer_exit(void); -// -// Mutexes -// -/*void qthread_mutex_init(qthread_mutex_p); - * qthread_mutex_p qthread_mutex_new(void); - * void qthread_mutex_lock(qthread_mutex_p); - * void qthread_mutex_unlock(qthread_mutex_p);*/ - -// -// Sync variables -// -// The CHPL_SYNC_WAIT_{FULL,EMPTY}_AND_LOCK() functions should call -// threadlayer_sync_suspend() when a sync variable is not in the desired -// full/empty state. The call will be made with the sync variable's -// mutex held. (Thus, threadlayer_sync_suspend() can dependably tell -// that the desired state must be the opposite of the state it initially -// sees the variable in.) It should return (with the mutex again held) -// as soon as it can once either the sync variable changes to the -// desired state, or (if the given deadline pointer is non-NULL) the -// deadline passes. It can return also early, before either of these -// things occur, with no ill effects. If a deadline is given and it -// does pass, then threadlayer_sync_suspend() must return true; -// otherwise false. -// -// The less the function can execute while waiting for the sync variable -// to change state, and the quicker it can un-suspend when the variable -// does change state, the better overall performance will be. Obviously -// the sync variable's mutex must be unlocked while the routine waits -// for the variable to change state or the deadline to pass, or livelock -// may result. 
-// -// The CHPL_SYNC_MARK_AND_SIGNAL_{FULL,EMPTY}() functions will call -// threadlayer_sync_awaken() every time they are called, not just when -// they change the state of the sync variable. -// -// Threadlayer_sync_{init,destroy}() are called to initialize or -// destroy, respectively, the contents of the tl_aux member of the -// chpl_sync_aux_t for the specific threading layer. -// -/*chpl_bool threadlayer_sync_suspend(chpl_sync_aux_t *s, - * struct timeval *deadline); - * void threadlayer_sync_awaken(chpl_sync_aux_t *s); - * void threadlayer_sync_init(chpl_sync_aux_t *s); - * void threadlayer_sync_destroy(chpl_sync_aux_t *s);*/ - -// -// Task management -// - -// -// The interface for thread creation may need to be extended eventually -// to allow for specifying such things as stack sizes and/or locations. -// -/*int threadlayer_thread_create(threadlayer_threadID_t*, void*(*)(void*), void*);*/ - -// -// Threadlayer_pool_suspend() is called when a thread finds nothing in -// the pool of unclaimed tasks, and so has no work to do. The call will -// be made with the pointed-to mutex held. It should return (with the -// mutex again held) as soon as it can once either the task pool is no -// longer empty or (if the given deadline pointer is non-NULL) the -// deadline passes. It can return also early, before either of these -// things occur, with no ill effects. If a deadline is given and it -// does pass, then threadlayer_pool_suspend() must return true; -// otherwise false. -// -// The less the function can execute while waiting for the pool to -// become nonempty, and the quicker it can un-suspend when that happens, -// the better overall performance will be. -// -// The mutex passed to threadlayer_pool_suspend() is the one that -// provides mutual exclusion for changes to the task pool. Allowing -// access to this mutex simplifies the implementation for certain -// threading layers, such as those based on pthreads condition -// variables. 
However, it also introduces a complication in that it -// allows a threading layer to create deadlock or livelock situations if -// it is not careful. Certainly the mutex must be unlocked while the -// routine waits for the task pool to fill or the deadline to pass, or -// livelock may result. -// -/*chpl_bool threadlayer_pool_suspend(chpl_mutex_t*, struct timeval*); - * void threadlayer_pool_awaken(void);*/ - -// -// Thread private data -// -// These set and get a pointer to thread private data associated with -// each thread. This is for the use of the FIFO tasking implementation -// itself. If the threading layer also needs to store some data private -// to each thread, it must make other arrangements to do so. -// -/*void threadlayer_set_thread_private_data(void*); - * void* threadlayer_get_thread_private_data(void);*/ - - -#ifndef QTHREAD_MULTINODE extern #ifdef __cplusplus "C" #endif -volatile int chpl_qthread_done_initializing; -#endif - -typedef struct { - c_string task_filename; - int task_lineno; - chpl_taskID_t id; - chpl_bool is_executeOn; - c_sublocid_t requestedSubloc; // requested sublocal for task - chpl_task_prvData_t prvdata; -} chpl_task_prvDataImpl_t; -// Define PRV_DATA_IMPL_VAL to set up a chpl_task_prvData_t. 
-#define PRV_DATA_IMPL_VAL(_fn, _ln, _id, _is_execOn, _subloc, _serial) \ - { .task_filename = _fn, \ - .task_lineno = _ln, \ - .id = _id, \ - .is_executeOn = _is_execOn, \ - .requestedSubloc = _subloc, \ - .prvdata = { .serial_state = _serial } } +volatile int chpl_qthread_done_initializing; +// +// Task layer private area argument bundle header +// typedef struct { - void *fn; - void *args; - chpl_bool countRunning; - chpl_task_prvDataImpl_t chpl_data; -} chpl_qthread_wrapper_args_t; + chpl_bool serial_state; + chpl_bool countRunning; + chpl_bool is_executeOn; + int lineno; + int filename; + c_sublocid_t requestedSubloc; + chpl_fn_int_t requested_fid; + chpl_fn_p requested_fn; + chpl_fn_name requested_fn_name; + chpl_taskID_t id; +} chpl_task_bundle_t; // Structure of task-local storage typedef struct chpl_qthread_tls_s { - /* Task private data: serial state, etc. */ - chpl_task_prvDataImpl_t chpl_data; - /* Reports */ - c_string lock_filename; - size_t lock_lineno; + chpl_task_bundle_t *bundle; + // The below fields could move to chpl_task_bundleData_t + // That would reduce the size of the task local storage, + // but increase the size of executeOn bundles. + chpl_task_prvData_t prvdata; + /* Reports */ + int lock_filename; + int lock_lineno; } chpl_qthread_tls_t; extern pthread_t chpl_qthread_process_pthread; @@ -331,13 +114,13 @@ extern chpl_qthread_tls_t chpl_qthread_comm_task_tls; void chpl_task_stdModulesInitialized(void); // Wrap qthread_get_tasklocal() and assert that it is always available. 
-static inline chpl_qthread_tls_t * chpl_qthread_get_tasklocal(void) +static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void) { chpl_qthread_tls_t* tls; if (chpl_qthread_done_initializing) { - tls = (chpl_qthread_tls_t *) - qthread_get_tasklocal(sizeof(chpl_qthread_tls_t)); + tls = (chpl_qthread_tls_t*) + qthread_get_tasklocal(sizeof(chpl_qthread_tls_t)); if (tls == NULL) { pthread_t me = pthread_self(); if (pthread_equal(me, chpl_qthread_comm_pthread)) @@ -362,12 +145,15 @@ static inline chpl_task_prvData_t* chpl_task_getPrvData(void) { chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); if (data) { - return &data->chpl_data.prvdata; + return &data->prvdata; } assert(data); return NULL; } +// +// Sublocale support +// #ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL #error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!" #else @@ -405,7 +191,7 @@ void chpl_task_setSubloc(c_sublocid_t subloc) if ((curr_shep = qthread_shep()) != NO_SHEPHERD) { chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); if (data) { - data->chpl_data.requestedSubloc = subloc; + data->bundle->requestedSubloc = subloc; } if (subloc != c_sublocid_any && @@ -425,21 +211,27 @@ c_sublocid_t chpl_task_getRequestedSubloc(void) { chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); if (data) { - return data->chpl_data.requestedSubloc; + return data->bundle->requestedSubloc; } return c_sublocid_any; } +// +// Can we support remote caching? +// #ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL #error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!" 
#else #define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1 #endif -extern int chpl_qthread_supports_remote_cache; static inline int chpl_task_supportsRemoteCache(void) { - return chpl_qthread_supports_remote_cache; + return CHPL_QTHREAD_SUPPORTS_REMOTE_CACHE; } +#ifdef __cplusplus +} // end extern "C" +#endif + #endif // ifndef _tasks_qthreads_h_ /* vim:set expandtab: */ diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c index f396931ab1..543db24ec1 100644 --- a/runtime/src/chpl-atmi.c +++ b/runtime/src/chpl-atmi.c @@ -26,9 +26,6 @@ */ int chpl_hsa_initialize(void) { - atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL); - if(st != ATMI_STATUS_SUCCESS) return -1; - char reduce_kernel_filename[1024]; char gen_kernel_filename[1024]; int arglen = strlen(chpl_executionCommand)+1; @@ -68,7 +65,7 @@ int chpl_hsa_initialize(void) /* FIXME: Create all reduction kernels, not just the int64-sum kernel */ const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename}; atmi_platform_type_t module_types[2] = {module_type, module_type}; - st = atmi_module_register(modules, module_types, 2); + atmi_status_t st = atmi_module_register(modules, module_types, 2); OUTPUT_ATMI_STATUS(st, Registering all modules); size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)}; diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile index cb1bf64265..c78c8f6377 100644 --- a/runtime/src/tasks/atmi/Makefile +++ b/runtime/src/tasks/atmi/Makefile @@ -1,4 +1,4 @@ -# Copyright 2004-2015 Cray Inc. +# Copyright 2004-2017 Cray Inc. # Other additional copyright holders may be indicated within. # # The entirety of this work is licensed under the Apache License, @@ -22,6 +22,9 @@ ifndef CHPL_MAKE_HOME export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. 
endif +# +# standard header +# include $(RUNTIME_ROOT)/make/Makefile.runtime.head TASKS_OBJDIR = $(RUNTIME_OBJDIR) diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include index a510dc9a60..35b1225bf7 100644 --- a/runtime/src/tasks/atmi/Makefile.include +++ b/runtime/src/tasks/atmi/Makefile.include @@ -1,4 +1,4 @@ -# Copyright 2004-2015 Cray Inc. +# Copyright 2004-2017 Cray Inc. # Other additional copyright holders may be indicated within. # # The entirety of this work is licensed under the Apache License, @@ -17,13 +17,9 @@ TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) -TASKS_OBJDIR = $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/$(RUNTIME_OBJDIR) +TASKS_OBJDIR = $(RUNTIME_BUILD)/$(TASKS_SUBDIR) -# -# point to sources under third-party, at least until they're -# contributed back to the usual local directory -# -ALL_SRCS += $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.c \ - $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.h +ALL_SRCS += $(CURDIR)/$(TASKS_SUBDIR)/*.c \ + $(RUNTIME_INCLUDE_ROOT)/tasks/$(CHPL_MAKE_TASKS)/*.h include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share index 374f9cc760..74c39c92a2 100644 --- a/runtime/src/tasks/atmi/Makefile.share +++ b/runtime/src/tasks/atmi/Makefile.share @@ -1,4 +1,4 @@ -# Copyright 2004-2015 Cray Inc. +# Copyright 2004-2017 Cray Inc. # Other additional copyright holders may be indicated within. # # The entirety of this work is licensed under the Apache License, @@ -15,11 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-TASKS_SRCS = +TASKS_SRCS = tasks-$(CHPL_MAKE_TASKS).c SVN_SRCS = $(TASKS_SRCS) SRCS = $(SVN_SRCS) TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o) - -RUNTIME_CFLAGS += -I$(QTHREAD_INCLUDE) diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c index 8902c2949e..a518957493 100644 --- a/runtime/src/tasks/atmi/tasks-atmi.c +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -8,7 +8,7 @@ // /* - * Copyright 2004-2015 Cray Inc. + * Copyright 2004-2017 Cray Inc. * Other additional copyright holders may be indicated within. * * The entirety of this work is licensed under the Apache License, @@ -34,56 +34,61 @@ #endif #include "chplrt.h" -#include "chplsys.h" -#include "tasks-qthreads.h" -//#include "tasks-atmi.h" + +#include "arg.h" +#include "error.h" #include "chplcgfns.h" #include "chpl-comm.h" +#include "chplexit.h" #include "chpl-locale-model.h" +#include "chpl-mem.h" +#include "chplsys.h" +#include "chpl-linefile-support.h" #include "chpl-tasks.h" #include "chpl-tasks-callbacks-internal.h" -#include "config.h" -#include "error.h" -#include "arg.h" -#include "signal.h" -#include "chplexit.h" -#include -#include +//#include "tasks-qthreads.h" +#include "tasks-atmi.h" +#include + +#include "qthread.h" +#include "qthread/qtimer.h" +#include "qthread-chapel.h" + #include -#include #include +#include +#include +#include +#include +#include +#include #include #include -#include +#include + +#define OUTPUT_ATMI_STATUS(status, msg) \ +{ \ + if (ATMI_STATUS_SUCCESS != (status)) { \ + fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \ +#msg, status); \ + atmi_finalize(); \ + return status; \ + } \ +} -//#include -#if 1 -#include "qthread/qthread.h" -#include "qthread/qtimer.h" -#include "qt_feb.h" -#include "qt_syncvar.h" -#include "qt_hash.h" -#include "qt_atomics.h" -#include "qt_shepherd_innards.h" -#include "qt_envariables.h" -#include "qt_debug.h" - -#ifdef QTHREAD_MULTINODE -#include "qthread/spr.h" -#endif /* QTHREAD_MULTINODE 
*/ -#include -#endif +//#define SUPPORT_BLOCKREPORT +//#define SUPPORT_TASKREPORT + #ifdef CHAPEL_PROFILE # define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0) /* Tasks */ static aligned_t profile_task_yield = 0; static aligned_t profile_task_addToTaskList = 0; -static aligned_t profile_task_processTaskList = 0; static aligned_t profile_task_executeTasksInList = 0; -static aligned_t profile_task_freeTaskList = 0; +static aligned_t profile_task_taskCallFTable = 0; static aligned_t profile_task_startMovedTask = 0; static aligned_t profile_task_getId = 0; static aligned_t profile_task_sleep = 0; @@ -106,9 +111,8 @@ static void profile_print(void) /* Tasks */ fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield); fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList); - fprintf(stderr, "task processTaskList: %lu\n", (unsigned long)profile_task_processTaskList); fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList); - fprintf(stderr, "task freeTaskList: %lu\n", (unsigned long)profile_task_freeTaskList); + fprintf(stderr, "task taskCallFTable: %lu\n", (unsigned long)profile_task_taskCallFTable); fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask); fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId); fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep); @@ -130,9 +134,14 @@ static void profile_print(void) # define PROFILE_INCR(counter,count) #endif /* CHAPEL_PROFILE */ -#ifndef QTHREAD_MULTINODE +// +// Startup and shutdown control. The mutex is used just for the side +// effect of its (very portable) memory fence. 
+// volatile int chpl_qthread_done_initializing; -#endif +static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER; +static volatile int done_finalizing; +static pthread_mutex_t done_init_final_mux = PTHREAD_MUTEX_INITIALIZER; // Make qt env sizes uniform. Same as qt, but they use the literal everywhere #define QT_ENV_S 100 @@ -141,7 +150,7 @@ volatile int chpl_qthread_done_initializing; struct chpl_task_list { chpl_fn_p fun; void *arg; - c_string filename; + int32_t filename; int lineno; chpl_task_list_p next; }; @@ -151,26 +160,37 @@ static aligned_t next_task_id = 1; pthread_t chpl_qthread_process_pthread; pthread_t chpl_qthread_comm_pthread; +chpl_task_bundle_t chpl_qthread_process_bundle = { + .serial_state = false, + .countRunning = false, + .is_executeOn = false, + .lineno = 0, + .filename = CHPL_FILE_IDX_MAIN_TASK, + .requestedSubloc = c_sublocid_any_val, + .requested_fid = FID_NONE, + .requested_fn = NULL, + .id = chpl_nullTaskID }; + +chpl_task_bundle_t chpl_qthread_comm_task_bundle = { + .serial_state = false, + .countRunning = false, + .is_executeOn = false, + .lineno = 0, + .filename = CHPL_FILE_IDX_COMM_TASK, + .requestedSubloc = c_sublocid_any_val, + .requested_fid = FID_NONE, + .requested_fn = NULL, + .id = chpl_nullTaskID }; + chpl_qthread_tls_t chpl_qthread_process_tls = { - PRV_DATA_IMPL_VAL("
", 0, chpl_nullTaskID, false, - c_sublocid_any_val, false), - NULL, 0 }; + .bundle = &chpl_qthread_process_bundle, + .lock_filename = 0, + .lock_lineno = 0 }; chpl_qthread_tls_t chpl_qthread_comm_task_tls = { - PRV_DATA_IMPL_VAL("", 0, chpl_nullTaskID, false, - c_sublocid_any_val, false), - NULL, 0 }; - -// -// QTHREADS_SUPPORTS_REMOTE_CACHE is set in the Chapel Qthreads -// Makefile, based on the Qthreads scheduler configuration. -// -int chpl_qthread_supports_remote_cache = QTHREADS_SUPPORTS_REMOTE_CACHE; - -// -// structs chpl_task_prvDataImpl_t, chpl_qthread_wrapper_args_t and -// chpl_qthread_tls_t have been moved to tasks-qthreads.h -// + .bundle = &chpl_qthread_comm_task_bundle, + .lock_filename = 0, + .lock_lineno = 0 }; // // chpl_qthread_get_tasklocal() is in tasks-qthreads.h @@ -232,7 +252,7 @@ void chpl_sync_unlock(chpl_sync_aux_t *s) } static inline void about_to_block(int32_t lineno, - c_string filename) + int32_t filename) { chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); assert(data); @@ -243,7 +263,7 @@ static inline void about_to_block(int32_t lineno, void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s, int32_t lineno, - c_string filename) + int32_t filename) { PROFILE_INCR(profile_sync_waitFullAndLock, 1); @@ -258,7 +278,7 @@ void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s, void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s, int32_t lineno, - c_string filename) + int32_t filename) { PROFILE_INCR(profile_sync_waitEmptyAndLock, 1); @@ -313,6 +333,7 @@ void chpl_sync_destroyAux(chpl_sync_aux_t *s) PROFILE_INCR(profile_sync_destroyAux, 1); } +#ifdef SUPPORT_BLOCKREPORT static void chapel_display_thread(qt_key_t addr, qthread_f f, void *arg, @@ -325,11 +346,21 @@ static void chapel_display_thread(qt_key_t addr, if (rep) { if ((rep->lock_lineno > 0) && rep->lock_filename) { - fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", rep->lock_filename, rep->lock_lineno, rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + fprintf(stderr, 
"Waiting at: %s:%zu (task %s:%zu)\n", + chpl_lookupFilename(rep->lock_filename), rep->lock_lineno, + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); } else if (rep->lock_lineno == 0 && rep->lock_filename) { - fprintf(stderr, "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", rep->lock_filename, rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + fprintf(stderr, + "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", + chpl_lookupFilename(rep->lock_filename), + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); } else if (rep->lock_lineno == 0) { - fprintf(stderr, "Waiting for dependencies (uninitialized task %s:%zu)\n", rep->chpl_data.task_filename, rep->chpl_data.task_lineno); + fprintf(stderr, + "Waiting for dependencies (uninitialized task %s:%zu)\n", + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); } fflush(stderr); } @@ -340,18 +371,28 @@ static void report_locked_threads(void) qthread_feb_callback(chapel_display_thread, NULL); qthread_syncvar_callback(chapel_display_thread, NULL); } +#endif // SUPPORT_BLOCKREPORT static void SIGINT_handler(int sig) { signal(sig, SIG_IGN); if (blockreport) { +#ifdef SUPPORT_BLOCKREPORT report_locked_threads(); +#else + fprintf(stderr, + "Blockreport is currently unsupported by the qthreads " + "tasking layer.\n"); +#endif } if (taskreport) { +#ifdef SUPPORT_TASKREPORT + report_all_tasks(); +#else fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n"); - // report_all_tasks(); +#endif } chpl_exit_any(1); @@ -359,24 +400,53 @@ static void SIGINT_handler(int sig) // Tasks -#ifndef QTHREAD_MULTINODE -static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER; -static volatile int done_finalizing = 0; - static void *initializer(void *junk) { qthread_initialize(); - MACHINE_FENCE; + (void) pthread_mutex_lock(&done_init_final_mux); // implicit memory fence 
chpl_qthread_done_initializing = 1; + (void) pthread_mutex_unlock(&done_init_final_mux); qthread_syncvar_readFF(NULL, &canexit); qthread_finalize(); - MACHINE_FENCE; + (void) pthread_mutex_lock(&done_init_final_mux); // implicit memory fence done_finalizing = 1; + (void) pthread_mutex_unlock(&done_init_final_mux); + return NULL; } -#endif /* ! QTHREAD_MULTINODE */ + +// +// Qthreads environment helper functions. +// + +static char* chpl_qt_getenv_str(const char* var) { + char name[100]; + char* ev; + + snprintf(name, sizeof(name), "QT_%s", var); + if ((ev = getenv(name)) == NULL) { + snprintf(name, sizeof(name), "QTHREAD_%s", var); + ev = getenv(name); + } + + return ev; +} + +static unsigned long int chpl_qt_getenv_num(const char* var, + unsigned long int default_val) { + char* ev; + unsigned long int ret_val = default_val; + + if ((ev = chpl_qt_getenv_str(var)) != NULL) { + unsigned long int val; + if (sscanf(ev, "%lu", &val) == 1) + ret_val = val; + } + + return ret_val; +} // Helper function to set a qthreads env var. This is meant to mirror setenv // functionality, but qthreads has two environment variables for every setting: @@ -384,7 +454,8 @@ static void *initializer(void *junk) // wraps the overriding logic. In verbose mode it prints out if we overrode // values, or if we were prevented from setting values because they existed // (and override was 0.) -static void chpl_qt_setenv(char* var, char* val, int32_t override) { +static void chpl_qt_setenv(const char* var, const char* val, + int32_t override) { char qt_env[QT_ENV_S] = { 0 }; char qthread_env[QT_ENV_S] = { 0 }; char *qt_val; @@ -402,7 +473,10 @@ static void chpl_qt_setenv(char* var, char* val, int32_t override) { eitherSet = (qt_val != NULL || qthread_val != NULL); if (override || !eitherSet) { - if (verbosity >= 2 && override && eitherSet) { + if (verbosity >= 2 + && override + && eitherSet + && strcmp(val, (qt_val != NULL) ? 
qt_val : qthread_val) != 0) { printf("QTHREADS: Overriding the value of %s and %s " "with %s\n", qt_env, qthread_env, val); } @@ -418,11 +492,22 @@ static void chpl_qt_setenv(char* var, char* val, int32_t override) { set_env = qthread_env; set_val = qthread_val; } - printf("QTHREADS: Not setting %s to %s because %s is set to %s and " - "overriding was not requested\n", qt_env, val, set_env, set_val); + if (strcmp(val, set_val) != 0) + printf("QTHREADS: Not setting %s to %s because %s is set to %s and " + "overriding was not requested\n", qt_env, val, set_env, set_val); } } +static void chpl_qt_unsetenv(const char* var) { + char name[100]; + + snprintf(name, sizeof(name), "QT_%s", var); + (void) unsetenv(name); + + snprintf(name, sizeof(name), "QTHREAD_%s", var); + (void) unsetenv(name); +} + // Determine the number of workers based on environment settings. If a user set // HWPAR, they are saying they want to use HWPAR many workers, but let the // runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or @@ -432,14 +517,14 @@ static void chpl_qt_setenv(char* var, char* val, int32_t override) { // set since we can't figure out before Qthreads init what this will actually // turn into without duplicating Qthreads logic (-1 is a sentinel for don't // adjust the values, and give them as is to Qthreads.) 
-static int32_t chpl_qt_getenv_num_workers() { +static int32_t chpl_qt_getenv_num_workers(void) { int32_t hwpar; int32_t num_wps; int32_t num_sheps; - hwpar = qt_internal_get_env_num("HWPAR", 0, 0); - num_wps = qt_internal_get_env_num("NUM_WORKERS_PER_SHEPHERD", 0, 0); - num_sheps = qt_internal_get_env_num("NUM_SHEPHERDS", 0, 0); + hwpar = (int32_t) chpl_qt_getenv_num("HWPAR", 0); + num_wps = (int32_t) chpl_qt_getenv_num("NUM_WORKERS_PER_SHEPHERD", 0); + num_sheps = (int32_t) chpl_qt_getenv_num("NUM_SHEPHERDS", 0); if (hwpar) { return hwpar; @@ -477,7 +562,7 @@ static int32_t setupAvailableParallelism(int32_t maxThreads) { numPUsPerLocale = chpl_getNumLogicalCpus(true); if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) { - if (verbosity >= 2) { + if (verbosity > 0) { printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d " "to prevent oversubscription of the system.\n", hwpar, numPUsPerLocale); @@ -515,12 +600,12 @@ static int32_t setupAvailableParallelism(int32_t maxThreads) { } // Unset relevant Qthreads environment variables. 
- qt_internal_unset_envstr("HWPAR"); - qt_internal_unset_envstr("NUM_SHEPHERDS"); - qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD"); + chpl_qt_unsetenv("HWPAR"); + chpl_qt_unsetenv("NUM_SHEPHERDS"); + chpl_qt_unsetenv("NUM_WORKERS_PER_SHEPHERD"); snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar); - if (THREADQUEUE_POLICY_TRUE == qt_threadqueue_policy(SINGLE_WORKER)) { + if (CHPL_QTHREAD_SCHEDULER_ONE_WORKER_PER_SHEPHERD) { chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1); chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1); } else { @@ -555,7 +640,7 @@ static void setupCallStacks(int32_t hwpar) { // 2) QTHREAD_STACK_SIZE // 3) Chapel default if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 || - (qt_internal_get_env_num("STACK_SIZE", 0, 0) == 0 && + (chpl_qt_getenv_num("STACK_SIZE", 0) == 0 && (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) { char newenv_stack[QT_ENV_S]; snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize); @@ -591,26 +676,61 @@ static void setupCallStacks(int32_t hwpar) { } } +static void setupTasklocalStorage(void) { + unsigned long int tasklocal_size; + char newenv[QT_ENV_S]; + + // Make sure Qthreads knows how much space we need for per-task + // local storage. + tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0); + if (tasklocal_size < sizeof(chpl_qthread_tls_t)) { + snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_qthread_tls_t)); + chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1); + } +} + +static void setupWorkStealing(void) { + // In our experience the current work stealing implementation hurts + // performance, so disable it. Note that we don't override, so a user could + // try working stealing out by setting {QT,QTHREAD}_STEAL_RATIO. Also note + // that not all schedulers support work stealing, but it doesn't hurt to + // set this env var for those configs anyways. 
+ chpl_qt_setenv("STEAL_RATIO", "0", 0); +} + void chpl_task_init(void) { + atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL); + printf("ATMI task initialized\n"); + if(st != ATMI_STATUS_SUCCESS) return; + + g_machine = atmi_machine_get_info(); + int32_t commMaxThreads; int32_t hwpar; pthread_t initer; + pthread_attr_t pAttr; chpl_qthread_process_pthread = pthread_self(); - chpl_qthread_process_tls.chpl_data.id = qthread_incr(&next_task_id, 1); + chpl_qthread_process_bundle.id = qthread_incr(&next_task_id, 1); commMaxThreads = chpl_comm_getMaxThreads(); - // Setup hardware parallelism, the stack size, and stack guards + // Set up hardware parallelism, the stack size and stack guards, + // tasklocal storage, and work stealing hwpar = setupAvailableParallelism(commMaxThreads); setupCallStacks(hwpar); + setupTasklocalStorage(); + setupWorkStealing(); if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); } // Initialize qthreads - pthread_create(&initer, NULL, initializer, NULL); - while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY(); + pthread_attr_init(&pAttr); + pthread_attr_setdetachstate(&pAttr, PTHREAD_CREATE_DETACHED); + pthread_create(&initer, &pAttr, initializer, NULL); + while (chpl_qthread_done_initializing == 0) + sched_yield(); // Now that Qthreads is up and running, do a sanity check and make sure // that the number of workers is less than any comm layer limit. 
This is @@ -628,60 +748,88 @@ void chpl_task_init(void) void chpl_task_exit(void) { + atmi_finalize(); #ifdef CHAPEL_PROFILE profile_print(); #endif /* CHAPEL_PROFILE */ -#ifdef QTHREAD_MULTINODE -#else if (qthread_shep() == NO_SHEPHERD) { /* sometimes, tasking is told to shutdown even though it hasn't been * told to start yet */ if (chpl_qthread_done_initializing == 1) { qthread_syncvar_fill(&canexit); - while (done_finalizing == 0) SPINLOCK_BODY(); + while (done_finalizing == 0) + sched_yield(); } } else { qthread_syncvar_fill(&exit_ret); } -#endif /* QTHREAD_MULTINODE */ } static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, - chpl_task_prvDataImpl_t *chpl_data) { + chpl_task_bundle_t* bundle) { if (chpl_task_have_callbacks(event_kind)) { - if (chpl_data->id == chpl_nullTaskID) - chpl_data->id = qthread_incr(&next_task_id, 1); + if (bundle->id == chpl_nullTaskID) + bundle->id = qthread_incr(&next_task_id, 1); chpl_task_do_callbacks(event_kind, - chpl_data->task_filename, - chpl_data->task_lineno, - chpl_data->id, - chpl_data->is_executeOn); + bundle->requested_fid, + bundle->filename, + bundle->lineno, + bundle->id, + bundle->is_executeOn); } } + +// If we stored chpl_taskID_t in chpl_task_bundleData_t, +// this struct and the following function may not be necessary. 
+typedef void (*main_ptr_t)(void); +typedef struct { + chpl_task_bundle_t arg; + main_ptr_t chpl_main; +} main_wrapper_bundle_t; + +static aligned_t main_wrapper(void *arg) +{ + chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg; + chpl_task_bundle_t *bundle = &m_bundle->arg; + chpl_qthread_tls_t pv = {.bundle = bundle}; + + *tls = pv; + + // main call doesn't need countRunning / chpl_taskRunningCntInc + + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); + + (m_bundle->chpl_main)(); + + wrap_callbacks(chpl_task_cb_event_kind_end, bundle); + + return 0; +} + + static aligned_t chapel_wrapper(void *arg) { - chpl_qthread_wrapper_args_t *rarg = arg; - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; + chpl_qthread_tls_t pv = {.bundle = bundle}; - data->chpl_data = rarg->chpl_data; - data->lock_filename = NULL; - data->lock_lineno = 0; + *tls = pv; - if (rarg->countRunning) { - chpl_taskRunningCntInc(0, NULL); - } + if (bundle->countRunning) + chpl_taskRunningCntInc(0, 0); - wrap_callbacks(chpl_task_cb_event_kind_begin, &data->chpl_data); + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); - (*(chpl_fn_p)(rarg->fn))(rarg->args); + // launch GPU kernel here? + (bundle->requested_fn)(arg); - wrap_callbacks(chpl_task_cb_event_kind_end, &data->chpl_data); + wrap_callbacks(chpl_task_cb_event_kind_end, bundle); - if (rarg->countRunning) { - chpl_taskRunningCntDec(0, NULL); - } + if (bundle->countRunning) + chpl_taskRunningCntDec(0, 0); return 0; } @@ -705,27 +853,24 @@ static void *comm_task_wrapper(void *arg) // not use methods that require task context (e.g., task-local storage). void chpl_task_callMain(void (*chpl_main)(void)) { - chpl_qthread_wrapper_args_t wrapper_args = - {chpl_main, NULL, false, - PRV_DATA_IMPL_VAL("
", 0, - chpl_qthread_process_tls.chpl_data.id, false, - c_sublocid_any_val, false) }; - - qthread_debug(CHAPEL_CALLS, "[%d] begin chpl_task_callMain()\n", chpl_nodeID); - -#ifdef QTHREAD_MULTINODE - qthread_debug(CHAPEL_BEHAVIOR, "[%d] calling spr_unify\n", chpl_nodeID); - int const rc = spr_unify(); - assert(SPR_OK == rc); -#endif /* QTHREAD_MULTINODE */ - - wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data); - - qthread_fork_syncvar(chapel_wrapper, &wrapper_args, &exit_ret); + main_wrapper_bundle_t arg; + + arg.arg.serial_state = false; + arg.arg.countRunning = false; + arg.arg.is_executeOn = false; + arg.arg.requestedSubloc = c_sublocid_any_val; + arg.arg.requested_fid = FID_NONE; + arg.arg.requested_fn = NULL; + arg.arg.requested_fn_name = NULL; + arg.arg.lineno = 0; + arg.arg.filename = CHPL_FILE_IDX_MAIN_TASK; + arg.arg.id = chpl_qthread_process_bundle.id; + arg.chpl_main = chpl_main; + + wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg); + + qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); qthread_syncvar_readFF(NULL, &exit_ret); - - qthread_debug(CHAPEL_BEHAVIOR, "[%d] main task finished\n", chpl_nodeID); - qthread_debug(CHAPEL_CALLS, "[%d] end chpl_task_callMain()\n", chpl_nodeID); } void chpl_task_stdModulesInitialized(void) @@ -736,13 +881,12 @@ void chpl_task_stdModulesInitialized(void) // when this function is called, so now count the main task. // canCountRunningTasks = true; - chpl_taskRunningCntInc(0, NULL); + chpl_taskRunningCntInc(0, 0); } int chpl_task_createCommTask(chpl_fn_p fn, void *arg) { -#ifndef QTHREAD_MULTINODE // // The wrapper info must be static because it won't be referred to // until the new pthread calls comm_task_wrapper(). 
And, it is @@ -755,21 +899,20 @@ int chpl_task_createCommTask(chpl_fn_p fn, wrapper_info.arg = arg; return pthread_create(&chpl_qthread_comm_pthread, NULL, comm_task_wrapper, &wrapper_info); -#else - return 0; -#endif } -void chpl_task_addToTaskList(chpl_fn_int_t fid, - void *arg, - c_sublocid_t subloc, - chpl_task_list_p *task_list, - int32_t task_list_locale, - chpl_bool is_begin_stmt, - int lineno, - c_string filename) +void chpl_task_addToTaskList(chpl_fn_int_t fid, + chpl_task_bundle_t *arg, + size_t arg_size, + c_sublocid_t subloc, + void **task_list, + int32_t task_list_locale, + chpl_bool is_begin_stmt, + int lineno, + int32_t filename) { chpl_bool serial_state = chpl_task_getSerial(); + chpl_fn_p requested_fn = chpl_ftable[fid]; assert(subloc != c_sublocid_none); @@ -777,68 +920,92 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, if (serial_state) { // call the function directly. - (chpl_ftable[fid])(arg); + requested_fn(arg); } else { - chpl_qthread_wrapper_args_t wrapper_args = - {chpl_ftable[fid], arg, false, - PRV_DATA_IMPL_VAL(filename, lineno, chpl_nullTaskID, false, - subloc, serial_state) }; - - wrap_callbacks(chpl_task_cb_event_kind_create, - &wrapper_args.chpl_data); + arg->serial_state = false; + arg->countRunning = false; + arg->is_executeOn = false; + arg->requestedSubloc = subloc; + arg->requested_fid = fid; + arg->requested_fn = requested_fn; + arg->requested_fn_name = NULL; + arg->lineno = lineno; + arg->filename = filename; + arg->id = chpl_nullTaskID; + + wrap_callbacks(chpl_task_cb_event_kind_create, arg); if (subloc == c_sublocid_any) { - qthread_fork_copyargs(chapel_wrapper, &wrapper_args, - sizeof(chpl_qthread_wrapper_args_t), NULL); + qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); } else { - qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args, - sizeof(chpl_qthread_wrapper_args_t), NULL, - (qthread_shepherd_id_t) subloc); + qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, + NULL, (qthread_shepherd_id_t) 
subloc); } } } -void chpl_task_processTaskList(chpl_task_list_p task_list) +void chpl_task_executeTasksInList(void **task_list) { - PROFILE_INCR(profile_task_processTaskList,1); + PROFILE_INCR(profile_task_executeTasksInList,1); } -void chpl_task_executeTasksInList(chpl_task_list_p task_list) +static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p fp, + void *arg, size_t arg_size, + c_sublocid_t subloc, chpl_bool serial_state, + int lineno, int32_t filename) { - PROFILE_INCR(profile_task_executeTasksInList,1); + chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; + + bundle->serial_state = serial_state; + bundle->countRunning = canCountRunningTasks; + bundle->is_executeOn = true; + bundle->requestedSubloc = subloc; + bundle->requested_fid = fid; + bundle->requested_fn = fp; + bundle->requested_fn_name = fname; + bundle->lineno = lineno; + bundle->filename = filename; + bundle->id = chpl_nullTaskID; + + wrap_callbacks(chpl_task_cb_event_kind_create, bundle); + + if (subloc < 0) { + qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, + NULL, (qthread_shepherd_id_t) subloc); + } } -void chpl_task_freeTaskList(chpl_task_list_p task_list) +void chpl_task_taskCallFTable(chpl_fn_int_t fid, + chpl_task_bundle_t *arg, size_t arg_size, + c_sublocid_t subloc, + int lineno, int32_t filename) { - PROFILE_INCR(profile_task_freeTaskList,1); + PROFILE_INCR(profile_task_taskCall,1); + + taskCallBody(fid, NULL, chpl_ftable[fid], arg, arg_size, subloc, false, lineno, filename); } -void chpl_task_startMovedTask(chpl_fn_p fp, - void *arg, - c_sublocid_t subloc, - chpl_taskID_t id, - chpl_bool serial_state) +void chpl_task_startMovedTask(chpl_fn_int_t fid, + chpl_fn_p fp, + chpl_task_bundle_t *arg, + size_t arg_size, + c_sublocid_t subloc, + chpl_taskID_t id, + chpl_bool serial_state) { - assert(subloc != c_sublocid_none); + // + // For now the incoming task ID is simply dropped, 
though we check + // to make sure the caller wasn't expecting us to do otherwise. If + // we someday make task IDs global we will need to be able to set + // the ID of this moved task. + // assert(id == chpl_nullTaskID); - chpl_qthread_wrapper_args_t wrapper_args = - {fp, arg, canCountRunningTasks, - PRV_DATA_IMPL_VAL("", 0, chpl_nullTaskID, true, - subloc, serial_state) }; - PROFILE_INCR(profile_task_startMovedTask,1); - wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data); - - if (subloc == c_sublocid_any) { - qthread_fork_copyargs(chapel_wrapper, &wrapper_args, - sizeof(chpl_qthread_wrapper_args_t), NULL); - } else { - qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args, - sizeof(chpl_qthread_wrapper_args_t), NULL, - (qthread_shepherd_id_t) subloc); - } + taskCallBody(fid, NULL, fp, arg, arg_size, subloc, serial_state, 0, CHPL_FILE_IDX_UNKNOWN); } // @@ -857,23 +1024,46 @@ void chpl_task_startMovedTask(chpl_fn_p fp, // Returns '(unsigned int)-1' if called outside of the tasking layer. chpl_taskID_t chpl_task_getId(void) { - chpl_qthread_tls_t * tls = chpl_qthread_get_tasklocal(); + chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + chpl_taskID_t *id_ptr = NULL; PROFILE_INCR(profile_task_getId,1); if (tls == NULL) return (chpl_taskID_t) -1; - if (tls->chpl_data.id == chpl_nullTaskID) - tls->chpl_data.id = qthread_incr(&next_task_id, 1); + id_ptr = &tls->bundle->id; + + if (*id_ptr == chpl_nullTaskID) + *id_ptr = qthread_incr(&next_task_id, 1); - return tls->chpl_data.id; + return *id_ptr; } -void chpl_task_sleep(int secs) +void chpl_task_sleep(double secs) { if (qthread_shep() == NO_SHEPHERD) { - sleep(secs); + struct timeval deadline; + struct timeval now; + + // + // Figure out when this task can proceed again, and until then, keep + // yielding. 
+ // + gettimeofday(&deadline, NULL); + deadline.tv_usec += (suseconds_t) lround((secs - trunc(secs)) * 1.0e6); + if (deadline.tv_usec > 1000000) { + deadline.tv_sec++; + deadline.tv_usec -= 1000000; + } + deadline.tv_sec += (time_t) trunc(secs); + + do { + chpl_task_yield(); + gettimeofday(&now, NULL); + } while (now.tv_sec < deadline.tv_sec + || (now.tv_sec == deadline.tv_sec + && now.tv_usec < deadline.tv_usec)); } else { qtimer_t t = qtimer_create(); qtimer_start(t); @@ -881,6 +1071,7 @@ void chpl_task_sleep(int secs) qthread_yield(); qtimer_stop(t); } while (qtimer_secs(t) < secs); + qtimer_destroy(t); } } @@ -892,18 +1083,20 @@ chpl_bool chpl_task_getSerial(void) PROFILE_INCR(profile_task_getSerial,1); - return data->chpl_data.prvdata.serial_state; + return data->bundle->serial_state; } void chpl_task_setSerial(chpl_bool state) { chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); - data->chpl_data.prvdata.serial_state = state; + data->bundle->serial_state = state; PROFILE_INCR(profile_task_setSerial,1); } uint32_t chpl_task_getMaxPar(void) { + //chpl_internal_error("Qthreads max tasks par asked\n"); + printf("Qthreads max task par asked\n"); // // We assume here that the caller (in the LocaleModel module code) // is interested in the number of workers on the whole node, and @@ -918,7 +1111,8 @@ c_sublocid_t chpl_task_getNumSublocales(void) // FIXME: What we really want here is the number of NUMA // sublocales we are supporting. For now we use the number of // shepherds as a proxy for that. 
- return (c_sublocid_t) qthread_num_shepherds(); + // return (c_sublocid_t) qthread_num_shepherds(); + return (c_sublocid_t) g_machine->device_count_by_type[ATMI_DEVTYPE_ALL]; } size_t chpl_task_getCallStackSize(void) From 468ed90499d7cc653b8ffcf3e2a656f5811aaaef Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Sat, 4 Mar 2017 02:56:54 -0600 Subject: [PATCH 04/10] Extended Chapel tasking interface to use more of ATMI --- make/compiler/Makefile.hsa | 4 +- runtime/include/chpl-atmi.h | 5 +- runtime/include/tasks/atmi/tasks-atmi.h | 67 ++++----- runtime/src/tasks/atmi/tasks-atmi.c | 181 +++++++++++++++++------- 4 files changed, 162 insertions(+), 95 deletions(-) diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa index 7c1baa28f0..0fc1c3d234 100644 --- a/make/compiler/Makefile.hsa +++ b/make/compiler/Makefile.hsa @@ -20,8 +20,8 @@ CLOC=/opt/rocm/cloc/bin/cloc.sh LIBS+=-latmi_runtime -lm # TODO: move these in third-party directory? -GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib -HSA_INCLUDES=-I/opt/rocm/libatmi/include +GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/atmi/lib +HSA_INCLUDES=-I/opt/rocm/atmi/include else # HSA locations CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h index 6285760810..82c8a54780 100644 --- a/runtime/include/chpl-atmi.h +++ b/runtime/include/chpl-atmi.h @@ -13,12 +13,15 @@ atmi_kernel_t reduction_kernel; atmi_kernel_t *gpu_kernels; +atmi_kernel_t main_kernel; +int g_num_cpu_kernels; atmi_machine_t *g_machine; enum { GPU_KERNEL_IMPL = 10565, - REDUCTION_GPU_IMPL = 42 + REDUCTION_GPU_IMPL = 42, + CPU_FUNCTION_IMPL = 43 }; int chpl_hsa_initialize(void); diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h index 81ad131901..44adc76308 100644 --- a/runtime/include/tasks/atmi/tasks-atmi.h +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -30,7 +30,7 @@ #include "chpl-atmi.h" #include 
"chpl-tasks-prvdata.h" #include "chpltypes.h" - +#include "chpl-mem.h" #include "qthread.h" #include "qthread-chapel.h" @@ -50,8 +50,8 @@ void chpl_task_yield(void); // Type (and default value) used to communicate task identifiers // between C code and Chapel code in the runtime. // -typedef unsigned int chpl_taskID_t; -#define chpl_nullTaskID QTHREAD_NULL_TASK_ID +typedef uint64_t chpl_taskID_t; +#define chpl_nullTaskID ATMI_NULL_TASK_HANDLE // // Sync variables @@ -93,7 +93,7 @@ typedef struct { } chpl_task_bundle_t; // Structure of task-local storage -typedef struct chpl_qthread_tls_s { +typedef struct chpl_atmi_tls_s { chpl_task_bundle_t *bundle; // The below fields could move to chpl_task_bundleData_t // That would reduce the size of the task local storage, @@ -102,39 +102,23 @@ typedef struct chpl_qthread_tls_s { /* Reports */ int lock_filename; int lock_lineno; -} chpl_qthread_tls_t; +} chpl_atmi_tls_t; + +extern pthread_key_t tls_cache; extern pthread_t chpl_qthread_process_pthread; extern pthread_t chpl_qthread_comm_pthread; -extern chpl_qthread_tls_t chpl_qthread_process_tls; -extern chpl_qthread_tls_t chpl_qthread_comm_task_tls; +extern chpl_atmi_tls_t chpl_qthread_process_tls; +extern chpl_atmi_tls_t chpl_qthread_comm_task_tls; #define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized void chpl_task_stdModulesInitialized(void); -// Wrap qthread_get_tasklocal() and assert that it is always available. 
-static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void) -{ - chpl_qthread_tls_t* tls; - - if (chpl_qthread_done_initializing) { - tls = (chpl_qthread_tls_t*) - qthread_get_tasklocal(sizeof(chpl_qthread_tls_t)); - if (tls == NULL) { - pthread_t me = pthread_self(); - if (pthread_equal(me, chpl_qthread_comm_pthread)) - tls = &chpl_qthread_comm_task_tls; - else if (pthread_equal(me, chpl_qthread_process_pthread)) - tls = &chpl_qthread_process_tls; - } - assert(tls); - } - else - tls = NULL; +extern pthread_t null_thread; - return tls; -} +// Wrap qthread_get_tasklocal() and assert that it is always available. +extern chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void); #ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL #error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!" @@ -143,7 +127,7 @@ static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void) #endif static inline chpl_task_prvData_t* chpl_task_getPrvData(void) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); if (data) { return &data->prvdata; } @@ -162,7 +146,11 @@ static inline chpl_task_prvData_t* chpl_task_getPrvData(void) static inline c_sublocid_t chpl_task_getSubloc(void) { - return (c_sublocid_t) qthread_shep(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) + return data->bundle->requestedSubloc; + else + return c_sublocid_any; } #ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL @@ -173,8 +161,6 @@ c_sublocid_t chpl_task_getSubloc(void) static inline void chpl_task_setSubloc(c_sublocid_t subloc) { - qthread_shepherd_id_t curr_shep; - assert(subloc != c_sublocid_none); // Only change sublocales if the caller asked for a particular one, @@ -188,16 +174,10 @@ void chpl_task_setSubloc(c_sublocid_t subloc) // before tasking init and in any case would be done from the // main thread of execution, which doesn't have a shepherd. // The code below wouldn't work in that situation. 
- if ((curr_shep = qthread_shep()) != NO_SHEPHERD) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); - if (data) { - data->bundle->requestedSubloc = subloc; - } - - if (subloc != c_sublocid_any && - (qthread_shepherd_id_t) subloc != curr_shep) { - qthread_migrate_to((qthread_shepherd_id_t) subloc); - } + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + data->bundle->requestedSubloc = subloc; + printf("Setting ATMI requested subloc to %d\n", subloc); } } @@ -209,10 +189,11 @@ void chpl_task_setSubloc(c_sublocid_t subloc) static inline c_sublocid_t chpl_task_getRequestedSubloc(void) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); if (data) { return data->bundle->requestedSubloc; } + return c_sublocid_any; } diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c index a518957493..8f926c8738 100644 --- a/runtime/src/tasks/atmi/tasks-atmi.c +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -66,6 +66,11 @@ #include #include +#define max_num_cpu_kernels 4096 +int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; +atmi_kernel_t cpu_kernels[max_num_cpu_kernels]; + + #define OUTPUT_ATMI_STATUS(status, msg) \ { \ if (ATMI_STATUS_SUCCESS != (status)) { \ @@ -76,8 +81,6 @@ } \ } - - //#define SUPPORT_BLOCKREPORT //#define SUPPORT_TASKREPORT @@ -182,20 +185,22 @@ chpl_task_bundle_t chpl_qthread_comm_task_bundle = { .requested_fn = NULL, .id = chpl_nullTaskID }; -chpl_qthread_tls_t chpl_qthread_process_tls = { +chpl_atmi_tls_t chpl_qthread_process_tls = { .bundle = &chpl_qthread_process_bundle, .lock_filename = 0, .lock_lineno = 0 }; -chpl_qthread_tls_t chpl_qthread_comm_task_tls = { +chpl_atmi_tls_t chpl_qthread_comm_task_tls = { .bundle = &chpl_qthread_comm_task_bundle, .lock_filename = 0, .lock_lineno = 0 }; // -// chpl_qthread_get_tasklocal() is in tasks-qthreads.h +// chpl_atmi_get_tasklocal() is in tasks-qthreads.h // +pthread_key_t tls_cache; + 
static syncvar_t exit_ret = SYNCVAR_STATIC_EMPTY_INITIALIZER; static volatile chpl_bool canCountRunningTasks = false; @@ -203,11 +208,11 @@ static volatile chpl_bool canCountRunningTasks = false; void chpl_task_yield(void) { PROFILE_INCR(profile_task_yield,1); - if (qthread_shep() == NO_SHEPHERD) { + /*if (qthread_shep() == NO_SHEPHERD) { sched_yield(); } else { qthread_yield(); - } + }*/ } // Sync variables @@ -251,10 +256,37 @@ void chpl_sync_unlock(chpl_sync_aux_t *s) qthread_incr(&s->lockers_out, 1); } +inline chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void) +{ + chpl_atmi_tls_t* tls = NULL; + + pthread_t me = pthread_self(); + if (pthread_equal(me, chpl_qthread_comm_pthread)) + tls = &chpl_qthread_comm_task_tls; + else if (pthread_equal(me, chpl_qthread_process_pthread)) + tls = &chpl_qthread_process_tls; + else { + atmi_task_handle_t t = get_atmi_task_handle(); + if(t != ATMI_NULL_TASK_HANDLE) { + tls = (chpl_atmi_tls_t *)pthread_getspecific(tls_cache); + if(tls == NULL) { + // FIXME: when to free? + tls = (chpl_atmi_tls_t *)chpl_mem_alloc(sizeof(chpl_atmi_tls_t), + CHPL_RT_MD_THREAD_PRV_DATA, + 0, 0); + pthread_setspecific(tls_cache, tls); + } + } + assert(tls != NULL); + } + + return tls; +} + static inline void about_to_block(int32_t lineno, int32_t filename) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); assert(data); data->lock_lineno = lineno; @@ -342,7 +374,7 @@ static void chapel_display_thread(qt_key_t addr, void *tls, void *callarg) { - chpl_qthread_tls_t *rep = (chpl_qthread_tls_t *)tls; + chpl_atmi_tls_t *rep = (chpl_atmi_tls_t *)tls; if (rep) { if ((rep->lock_lineno > 0) && rep->lock_filename) { @@ -683,8 +715,8 @@ static void setupTasklocalStorage(void) { // Make sure Qthreads knows how much space we need for per-task // local storage. 
tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0); - if (tasklocal_size < sizeof(chpl_qthread_tls_t)) { - snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_qthread_tls_t)); + if (tasklocal_size < sizeof(chpl_atmi_tls_t)) { + snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_atmi_tls_t)); chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1); } } @@ -698,13 +730,28 @@ static void setupWorkStealing(void) { chpl_qt_setenv("STEAL_RATIO", "0", 0); } +static aligned_t chapel_wrapper(void *arg); +static aligned_t main_wrapper(void *arg); +// If we stored chpl_taskID_t in chpl_task_bundleData_t, +// this struct and the following function may not be necessary. +typedef void (*main_ptr_t)(void); +typedef struct { + chpl_task_bundle_t arg; + main_ptr_t chpl_main; +} main_wrapper_bundle_t; + void chpl_task_init(void) { atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL); - printf("ATMI task initialized\n"); if(st != ATMI_STATUS_SUCCESS) return; g_machine = atmi_machine_get_info(); + + pthread_key_create(&tls_cache, NULL); + //g_num_cpu_kernels = sizeof(chpl_ftable)/sizeof(chpl_ftable[0]); + size_t main_arg_size = sizeof(main_wrapper_bundle_t); + atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size); + atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL); int32_t commMaxThreads; int32_t hwpar; @@ -748,11 +795,18 @@ void chpl_task_init(void) void chpl_task_exit(void) { + //for(int i = 0; i < g_num_cpu_kernels; i++) { + for(int i = 0; i < max_num_cpu_kernels; i++) { + if(cpu_kernels_initialized[i]) + atmi_kernel_release(cpu_kernels[i]); + } + atmi_kernel_release(main_kernel); atmi_finalize(); #ifdef CHAPEL_PROFILE profile_print(); #endif /* CHAPEL_PROFILE */ + pthread_key_delete(tls_cache); if (qthread_shep() == NO_SHEPHERD) { /* sometimes, tasking is told to shutdown even though it hasn't been * told to start yet */ @@ -781,20 +835,12 @@ static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, } -// If we stored chpl_taskID_t in 
chpl_task_bundleData_t, -// this struct and the following function may not be necessary. -typedef void (*main_ptr_t)(void); -typedef struct { - chpl_task_bundle_t arg; - main_ptr_t chpl_main; -} main_wrapper_bundle_t; - static aligned_t main_wrapper(void *arg) { - chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg; chpl_task_bundle_t *bundle = &m_bundle->arg; - chpl_qthread_tls_t pv = {.bundle = bundle}; + chpl_atmi_tls_t pv = {.bundle = bundle}; *tls = pv; @@ -809,18 +855,17 @@ static aligned_t main_wrapper(void *arg) return 0; } - static aligned_t chapel_wrapper(void *arg) { - chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; - chpl_qthread_tls_t pv = {.bundle = bundle}; + chpl_atmi_tls_t pv = {.bundle = bundle}; *tls = pv; if (bundle->countRunning) chpl_taskRunningCntInc(0, 0); - + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); // launch GPU kernel here? 
@@ -869,8 +914,18 @@ void chpl_task_callMain(void (*chpl_main)(void)) wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg); - qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); - qthread_syncvar_readFF(NULL, &exit_ret); + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + //void *arg1 = (void *)&arg; + kernargs[0] = (void *)&arg; + //atmi_task_launch(lparm, main_kernel, kernargs); + + main_wrapper(&arg); + //qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); + //qthread_syncvar_readFF(NULL, &exit_ret); } void chpl_task_stdModulesInitialized(void) @@ -911,6 +966,7 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, int lineno, int32_t filename) { + //printf("Adding %d fn to task list\n", fid); chpl_bool serial_state = chpl_task_getSerial(); chpl_fn_p requested_fn = chpl_ftable[fid]; @@ -935,12 +991,24 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, wrap_callbacks(chpl_task_cb_event_kind_create, arg); - if (subloc == c_sublocid_any) { + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc == c_sublocid_any) { qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); } else { qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, NULL, (qthread_shepherd_id_t) subloc); - } + }*/ } } @@ -954,6 +1022,7 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p c_sublocid_t subloc, chpl_bool serial_state, int lineno, int32_t filename) { + //printf("Adding %d 
fn to task call body\n", fid); chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; bundle->serial_state = serial_state; @@ -969,12 +1038,24 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p wrap_callbacks(chpl_task_cb_event_kind_create, bundle); - if (subloc < 0) { + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc < 0) { qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); } else { qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, - NULL, (qthread_shepherd_id_t) subloc); - } + NULL, (qthread_shepherd_id_t) 0); + }*/ } void chpl_task_taskCallFTable(chpl_fn_int_t fid, @@ -1024,20 +1105,13 @@ void chpl_task_startMovedTask(chpl_fn_int_t fid, // Returns '(unsigned int)-1' if called outside of the tasking layer. chpl_taskID_t chpl_task_getId(void) { - chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal(); - chpl_taskID_t *id_ptr = NULL; - PROFILE_INCR(profile_task_getId,1); - if (tls == NULL) + atmi_task_handle_t t = get_atmi_task_handle(); + if(t == ATMI_NULL_TASK_HANDLE) return (chpl_taskID_t) -1; - id_ptr = &tls->bundle->id; - - if (*id_ptr == chpl_nullTaskID) - *id_ptr = qthread_incr(&next_task_id, 1); - - return *id_ptr; + return t; } void chpl_task_sleep(double secs) @@ -1077,18 +1151,27 @@ void chpl_task_sleep(double secs) /* The get- and setSerial() methods assume the beginning of the task-local * data segment holds a chpl_bool denoting the serial state. 
*/ +#if 0 +chpl_bool get_serial(chpl_taskID_t id) { + if(g_task_map.find(id) == g_task_map.end()) + return true; + else + return g_task_map[id]; +} +#endif + chpl_bool chpl_task_getSerial(void) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); PROFILE_INCR(profile_task_getSerial,1); - + return data->bundle->serial_state; } void chpl_task_setSerial(chpl_bool state) { - chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal(); + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); data->bundle->serial_state = state; PROFILE_INCR(profile_task_setSerial,1); @@ -1096,14 +1179,14 @@ void chpl_task_setSerial(chpl_bool state) uint32_t chpl_task_getMaxPar(void) { //chpl_internal_error("Qthreads max tasks par asked\n"); - printf("Qthreads max task par asked\n"); + //printf("Qthreads max task par asked\n"); // // We assume here that the caller (in the LocaleModel module code) // is interested in the number of workers on the whole node, and // will decide itself how much parallelism to create across and // within sublocales, if there are any. // - return (uint32_t) qthread_num_workers(); + return (uint32_t) 8;//qthread_num_workers(); } c_sublocid_t chpl_task_getNumSublocales(void) @@ -1145,7 +1228,7 @@ int32_t chpl_task_getNumBlockedTasks(void) uint32_t chpl_task_getNumThreads(void) { - return (uint32_t)qthread_num_workers(); + return (uint32_t) 8;//qthread_num_workers(); } // Ew. Talk about excessive bookkeeping. 
From cc5406c05ec1dcf967986089a8cd9253674bc437 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Sat, 4 Mar 2017 03:15:49 -0600 Subject: [PATCH 05/10] minor bug fix for constant expression --- runtime/include/tasks/atmi/tasks-atmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h index 44adc76308..3f1497b61d 100644 --- a/runtime/include/tasks/atmi/tasks-atmi.h +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -51,7 +51,7 @@ void chpl_task_yield(void); // between C code and Chapel code in the runtime. // typedef uint64_t chpl_taskID_t; -#define chpl_nullTaskID ATMI_NULL_TASK_HANDLE +#define chpl_nullTaskID ATMI_TASK_HANDLE(0xFFFFFFFFFFFFFFFF) // // Sync variables From 5307da3caac2841fb94d92037826b54ffe3e2a60 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Mon, 3 Apr 2017 03:07:38 -0500 Subject: [PATCH 06/10] [Feature] cobegin uses ATMI for CPU task management. But, TARGET_HSA should be enabled to do that, but it is just conforming to the rest of our conditional compilation strategy anyway. 
--- compiler/AST/build.cpp | 20 +++++ compiler/AST/expr.cpp | 16 +++- compiler/codegen/expr.cpp | 10 ++- compiler/include/expr.h | 12 ++- compiler/passes/buildDefaultFunctions.cpp | 4 + compiler/passes/insertLineNumbers.cpp | 9 ++- compiler/passes/parallel.cpp | 28 +++++-- modules/internal/ChapelBase.chpl | 82 +++++++++++++++++++- modules/internal/LocaleModelHelpRuntime.chpl | 10 ++- runtime/include/chpl-tasks.h | 5 ++ runtime/src/tasks/atmi/tasks-atmi.c | 57 ++++++++++++++ 11 files changed, 228 insertions(+), 25 deletions(-) diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp index 6bacf2f15e..8ffa86a44d 100644 --- a/compiler/AST/build.cpp +++ b/compiler/AST/build.cpp @@ -2870,6 +2870,7 @@ buildOnStmt(Expr* expr, Expr* stmt) { } } +//#define TARGET_HSA_BEGIN BlockStmt* buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { @@ -2890,12 +2891,20 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { return body; } else { BlockStmt* block = buildChapelStmt(); +#ifdef TARGET_HSA_BEGIN block->insertAtTail(new CallExpr("_upEndCount")); +#else + block->insertAtTail(new CallExpr("_addToTaskGroup")); +#endif BlockStmt* beginBlock = new BlockStmt(); beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN)); addByrefVars(beginBlock, byref_vars); beginBlock->insertAtHead(stmt); +#ifdef TARGET_HSA_BEGIN beginBlock->insertAtTail(new CallExpr("_downEndCount")); +#else + beginBlock->insertAtTail(new CallExpr("_completeTaskGroup")); +#endif block->insertAtTail(beginBlock); return block; } @@ -2947,13 +2956,24 @@ buildCobeginStmt(CallExpr* byref_vars, BlockStmt* block) { addByrefVars(beginBlk, byref_vars ? 
byref_vars->copy() : NULL); stmt->insertBefore(beginBlk); beginBlk->insertAtHead(stmt->remove()); +#ifdef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", cobeginCount)); block->insertAtHead(new CallExpr("_upEndCount", cobeginCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", cobeginCount)); +#endif } +#ifndef TARGET_HSA + block->insertAtHead(new CallExpr("_initTaskGroup", cobeginCount)); +#endif block->insertAtHead(new CallExpr(PRIM_MOVE, cobeginCount, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gTrue))); block->insertAtHead(new DefExpr(cobeginCount)); +#ifdef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", cobeginCount)); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup", cobeginCount)); +#endif block->insertAtTail(new CallExpr("_endCountFree", cobeginCount)); block->astloc = cobeginCount->astloc; // grab the location of 'cobegin' kw diff --git a/compiler/AST/expr.cpp b/compiler/AST/expr.cpp index a651651883..feb9b73209 100644 --- a/compiler/AST/expr.cpp +++ b/compiler/AST/expr.cpp @@ -793,7 +793,8 @@ CallExpr::CallExpr(BaseAST* base, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(NULL), baseExpr(NULL), @@ -815,6 +816,7 @@ CallExpr::CallExpr(BaseAST* base, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -827,7 +829,8 @@ CallExpr::CallExpr(PrimitiveOp* prim, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(prim), baseExpr(NULL), @@ -840,6 +843,7 @@ CallExpr::CallExpr(PrimitiveOp* prim, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -851,7 +855,8 @@ CallExpr::CallExpr(PrimitiveTag prim, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + 
BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(primitives[prim]), baseExpr(NULL), @@ -864,6 +869,7 @@ CallExpr::CallExpr(PrimitiveTag prim, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -876,7 +882,8 @@ CallExpr::CallExpr(const char* name, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(NULL), baseExpr(new UnresolvedSymExpr(name)), @@ -889,6 +896,7 @@ CallExpr::CallExpr(const char* name, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; diff --git a/compiler/codegen/expr.cpp b/compiler/codegen/expr.cpp index c3f0fd583a..1fee3d395e 100644 --- a/compiler/codegen/expr.cpp +++ b/compiler/codegen/expr.cpp @@ -5350,11 +5350,12 @@ void CallExpr::codegenInvokeOnFun() { void CallExpr::codegenInvokeTaskFun(const char* name) { FnSymbol* fn = isResolved(); GenRet taskList = codegenValue(get(1)); + GenRet taskGroup = codegenValue(get(6)); GenRet taskListNode; GenRet taskBundle; GenRet bundleSize; - std::vector args(8); + std::vector args(9); // get(1) is a ref/wide ref to a task list value // get(2) is the node ID owning the task list @@ -5374,9 +5375,10 @@ void CallExpr::codegenInvokeTaskFun(const char* name) { args[2] = codegenCast("chpl_task_bundle_p", taskBundle); args[3] = bundleSize; args[4] = taskList; - args[5] = codegenValue(taskListNode); - args[6] = fn->linenum(); - args[7] = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32); + args[5] = taskGroup; + args[6] = codegenValue(taskListNode); + args[7] = fn->linenum(); + args[8] = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32); genComment(fn->cname, true); diff --git a/compiler/include/expr.h b/compiler/include/expr.h index a26acaafed..5233984569 100644 --- a/compiler/include/expr.h +++ b/compiler/include/expr.h @@ -199,28 
+199,32 @@ class CallExpr : public Expr { BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(PrimitiveOp* prim, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(PrimitiveTag prim, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(const char* name, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); ~CallExpr(); diff --git a/compiler/passes/buildDefaultFunctions.cpp b/compiler/passes/buildDefaultFunctions.cpp index 6ffa649baf..4c784fb274 100644 --- a/compiler/passes/buildDefaultFunctions.cpp +++ b/compiler/passes/buildDefaultFunctions.cpp @@ -581,7 +581,11 @@ static void build_chpl_entry_points() { // endcount (see comment above) // if (fMinimalModules == false) { +#ifdef TARGET_HSA chpl_gen_main->insertAtTail(new CallExpr("_waitEndCount")); +#else + chpl_gen_main->insertAtTail(new CallExpr("_finalizeTaskGroup")); +#endif } chpl_gen_main->insertAtTail(new CallExpr(PRIM_RETURN, main_ret)); diff --git a/compiler/passes/insertLineNumbers.cpp b/compiler/passes/insertLineNumbers.cpp index 1c4d34beae..b272c4d324 100644 --- a/compiler/passes/insertLineNumbers.cpp +++ b/compiler/passes/insertLineNumbers.cpp @@ -283,6 +283,7 @@ void insertLineNumbers() { insertNilChecks(); } + printf("Done 1\n"); // loop over all primitives that require a line number and filename // and pass them an actual line number and filename forv_Vec(CallExpr, call, gCallExprs) { @@ -291,6 +292,7 @@ void insertLineNumbers() { } } + printf("Done 2\n"); // loop over all marked functions ("insert line file info"), add // line number and filename arguments to 
these functions, and add // them to the queue @@ -304,6 +306,7 @@ void insertLineNumbers() { } } + printf("Done 3\n"); // loop over all functions in the queue and all calls to these // functions, and pass the calls an actual line number and filename forv_Vec(FnSymbol, fn, queue) { @@ -312,7 +315,9 @@ void insertLineNumbers() { } } + printf("Done 4\n"); moveLinenoInsideArgBundle(); + printf("Done 5\n"); } static void moveLinenoInsideArgBundle() @@ -325,8 +330,8 @@ static void moveLinenoInsideArgBundle() // than the expected number. Both block types below expect an // argument bundle, and the on-block expects an additional argument // that is the locale on which it should be executed. - if ((fn->numFormals() > 4 && fn->hasFlag(FLAG_ON_BLOCK)) || - (fn->numFormals() > 5 && !fn->hasFlag(FLAG_ON_BLOCK) && + if ((fn->numFormals() > 5 && fn->hasFlag(FLAG_ON_BLOCK)) || + (fn->numFormals() > 6 && !fn->hasFlag(FLAG_ON_BLOCK) && (fn->hasFlag(FLAG_BEGIN_BLOCK) || fn->hasFlag(FLAG_COBEGIN_OR_COFORALL_BLOCK)))) { diff --git a/compiler/passes/parallel.cpp b/compiler/passes/parallel.cpp index 78ef16120e..e88151bc20 100644 --- a/compiler/passes/parallel.cpp +++ b/compiler/passes/parallel.cpp @@ -99,7 +99,7 @@ static BundleArgsFnData bundleArgsFnDataInit = { true, NULL, NULL }; static void insertEndCounts(); static void passArgsToNestedFns(); static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnData &baData); -static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode); +static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup); static void findBlockRefActuals(Vec& refSet, Vec& refVec); static void findHeapVarsAndRefs(Map*>& defMap, Vec& refSet, Vec& refVec, @@ -410,6 +410,7 @@ 
bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { // first argument to a task launch function. Symbol* endCount = NULL; VarSymbol *taskList = NULL; + VarSymbol *taskGroup = NULL; VarSymbol *taskListNode = NULL; if (!fn->hasFlag(FLAG_ON)) { @@ -466,6 +467,16 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { endCount, endCount->typeInfo()->getField("taskList")))); + // Now get the taskGroup field out of the end count. + + taskGroup = newTemp(astr("_taskGroup", fn->name), dtCVoidPtr); + + fcall->insertBefore(new DefExpr(taskGroup)); + fcall->insertBefore(new CallExpr(PRIM_MOVE, taskGroup, + new CallExpr(PRIM_GET_MEMBER, + endCount, + endCount->typeInfo()->getField("taskGroup")))); + // Now get the node ID field for the end count, // which is where the task list is stored. @@ -481,7 +492,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { create_block_fn_wrapper(fn, fcall, baData); // call wrapper-function - call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode); + call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode, taskGroup); baData.firstCall = false; } @@ -492,7 +503,8 @@ static CallExpr* helpFindDownEndCount(BlockStmt* block) while (cur && (isCallExpr(cur) || isDefExpr(cur) || isBlockStmt(cur))) { if (CallExpr* call = toCallExpr(cur)) { if (call->isResolved()) - if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0) + if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0 || + strcmp(call->resolvedFunction()->name, "_completeTaskGroup") == 0) return call; } else if (BlockStmt* inner = toBlockStmt(cur)) { // Need to handle local blocks since the compiler @@ -646,6 +658,10 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD dtCVoidPtr->refType ); taskListArg->addFlag(FLAG_NO_CODEGEN); wrap_fn->insertFormalAtTail(taskListArg); + ArgSymbol *taskGroupArg = new ArgSymbol( INTENT_IN, 
"dummy_taskGroup", + dtCVoidPtr->refType ); + taskGroupArg->addFlag(FLAG_NO_CODEGEN); + wrap_fn->insertFormalAtTail(taskGroupArg); ArgSymbol *taskListNode = new ArgSymbol( INTENT_IN, "dummy_taskListNode", dtInt[INT_SIZE_DEFAULT]); taskListNode->addFlag(FLAG_NO_CODEGEN); @@ -749,20 +765,20 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, - Symbol* taskList, Symbol* taskListNode) + Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup) { // The wrapper function is called with the bundled argument list. if (fn->hasFlag(FLAG_ON)) { // For an on block, the first argument is also passed directly // to the wrapper function. // The forking function uses this to fork a task on the target locale. - fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc)); + fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc));//, new SymExpr(taskGroup))); } else { // For non-on blocks, the task list is passed directly to the function // (so that codegen can find it). // We need the taskList. INT_ASSERT(taskList); - fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc)); + fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc, new SymExpr(taskGroup))); } fcall->remove(); // rm orig. 
call diff --git a/modules/internal/ChapelBase.chpl b/modules/internal/ChapelBase.chpl index fcdb87690c..ba6210d01b 100644 --- a/modules/internal/ChapelBase.chpl +++ b/modules/internal/ChapelBase.chpl @@ -763,7 +763,8 @@ module ChapelBase { type taskType; var i: iType, taskCnt: taskType, - taskList: c_void_ptr = _defaultOf(c_void_ptr); + taskList: c_void_ptr = _defaultOf(c_void_ptr), + taskGroup: c_void_ptr = _defaultOf(c_void_ptr); } // This function is called once by the initiating task. No on @@ -786,10 +787,17 @@ module ChapelBase { } } + // Compiler looks for this variable to determine the return type of // the "get end count" primitive. type _remoteEndCountType = _endCountAlloc(false).type; + pragma "insert line file info" + extern proc chpl_taskGroupInit(): c_void_ptr; + extern proc chpl_taskGroupGet(): c_void_ptr; + extern proc chpl_taskGroupComplete(); + extern proc chpl_taskGroupFinalize(task_group: c_void_ptr); + // This function is called once by the initiating task. As above, no // on statement needed. pragma "dont disable remote value forwarding" @@ -797,6 +805,36 @@ module ChapelBase { delete e; } + // This function is called by the initiating task once for each new + // task *before* any of the tasks are started. As above, no on + // statement needed. + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true) { + e.taskGroup = chpl_taskGroupGet(); + } + + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + e.taskGroup = chpl_taskGroupGet(); + } + + // This function is called by the initiating task once for each new + // task *before* any of the tasks are started. As above, no on + // statement needed. 
+ pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _initTaskGroup(e: _EndCount, param countRunningTasks=true) { + e.taskGroup = chpl_taskGroupInit(); + } + + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _initTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + e.taskGroup = chpl_taskGroupInit(); + } + // This function is called by the initiating task once for each new // task *before* any of the tasks are started. As above, no on // statement needed. @@ -838,6 +876,26 @@ module ChapelBase { e.i.sub(1, memory_order_release); } + // This function is called once by each newly initiated task. No on + // statement is needed because the call to sub() will do a remote + // fork (on) if needed. + pragma "dont disable remote value forwarding" + proc _completeTaskGroup(e: _EndCount) { + chpl_taskGroupComplete(); + } + + // This function is called once by the initiating task. As above, no + // on statement needed. + pragma "dont disable remote value forwarding" + proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true) { + chpl_taskGroupFinalize(e.taskGroup); + } + + pragma "dont disable remote value forwarding" + proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + chpl_taskGroupFinalize(e.taskGroup); + } + // This function is called once by the initiating task. As above, no // on statement needed. 
pragma "dont disable remote value forwarding" @@ -867,7 +925,7 @@ module ChapelBase { proc _waitEndCount(e: _EndCount, param countRunningTasks=true, numTasks) { // See if we can help with any of the started tasks chpl_taskListExecute(e.taskList); - + // Wait for all tasks to finish e.i.waitFor(0, memory_order_acquire); @@ -876,6 +934,26 @@ module ChapelBase { } } + proc _addToTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _addToTaskGroup(e, countRunningTasks); + } + + proc _initTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _initTaskGroup(e, countRunningTasks); + } + + proc _completeTaskGroup() { + var e = __primitive("get end count"); + _completeTaskGroup(e); + } + + proc _finalizeTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _finalizeTaskGroup(e, countRunningTasks); + } + proc _upEndCount(param countRunningTasks=true) { var e = __primitive("get end count"); _upEndCount(e, countRunningTasks); diff --git a/modules/internal/LocaleModelHelpRuntime.chpl b/modules/internal/LocaleModelHelpRuntime.chpl index 1eca09e9a9..33550f1ae2 100644 --- a/modules/internal/LocaleModelHelpRuntime.chpl +++ b/modules/internal/LocaleModelHelpRuntime.chpl @@ -118,7 +118,7 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, args_size: size_t, subloc_id: int, ref tlist: c_void_ptr, tlist_node_id: int, - is_begin: bool); + is_begin: bool, tgroup: c_void_ptr); extern proc chpl_task_executeTasksInList(ref tlist: c_void_ptr); // @@ -131,10 +131,12 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, // function args args_size: size_t, // args size ref tlist: c_void_ptr, // task list + tgroup: c_void_ptr, // task group tlist_node_id: int // task list owner node ) { + warning("begin fn : ", fn); chpl_task_addToTaskList(fn, args, args_size, - subloc_id, tlist, tlist_node_id, true); + subloc_id, tlist, tlist_node_id, true, tgroup); } // @@ -148,10 +150,12 @@ module 
LocaleModelHelpRuntime { args: chpl_task_bundle_p, // function args args_size: size_t, // args size ref tlist: c_void_ptr, // task list + tgroup: c_void_ptr, // task group tlist_node_id: int // task list owner node ) { + warning("cobegin fn : ", fn); chpl_task_addToTaskList(fn, args, args_size, - subloc_id, tlist, tlist_node_id, false); + subloc_id, tlist, tlist_node_id, false, tgroup); } // diff --git a/runtime/include/chpl-tasks.h b/runtime/include/chpl-tasks.h index 6eac21fb8c..d9fb916627 100644 --- a/runtime/include/chpl-tasks.h +++ b/runtime/include/chpl-tasks.h @@ -158,10 +158,15 @@ void chpl_task_addToTaskList( void**, // task list c_nodeid_t, // locale (node) where task list resides chpl_bool, // is begin{} stmt? (vs. cobegin or coforall) + void *, // task group int, // line at which function begins int32_t); // name of file containing function void chpl_task_executeTasksInList(void**); +void *chpl_taskGroupInit(int lineno, int32_t filename); +void *chpl_taskGroupGet(); +void chpl_taskGroupFinalize(void *task_group); +void chpl_taskGroupComplete(); // // Call a chpl_ftable[] function in a task. 
// diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c index 8f926c8738..95b6b48080 100644 --- a/runtime/src/tasks/atmi/tasks-atmi.c +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -45,6 +45,7 @@ #include "chplsys.h" #include "chpl-linefile-support.h" #include "chpl-tasks.h" +#include "chpl-atomics.h" #include "chpl-tasks-callbacks-internal.h" //#include "tasks-qthreads.h" #include "tasks-atmi.h" @@ -69,6 +70,8 @@ #define max_num_cpu_kernels 4096 int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; atmi_kernel_t cpu_kernels[max_num_cpu_kernels]; +atmi_kernel_t dummy_kernel; +static uint_least64_t atmi_tg_id = 0; #define OUTPUT_ATMI_STATUS(status, msg) \ @@ -732,6 +735,7 @@ static void setupWorkStealing(void) { static aligned_t chapel_wrapper(void *arg); static aligned_t main_wrapper(void *arg); +static aligned_t dummy_wrapper(); // If we stored chpl_taskID_t in chpl_task_bundleData_t, // this struct and the following function may not be necessary. typedef void (*main_ptr_t)(void); @@ -752,6 +756,8 @@ void chpl_task_init(void) size_t main_arg_size = sizeof(main_wrapper_bundle_t); atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size); atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL); + atmi_kernel_create_empty(&dummy_kernel, 0, NULL); + atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL); int32_t commMaxThreads; int32_t hwpar; @@ -800,6 +806,7 @@ void chpl_task_exit(void) if(cpu_kernels_initialized[i]) atmi_kernel_release(cpu_kernels[i]); } + atmi_kernel_release(dummy_kernel); atmi_kernel_release(main_kernel); atmi_finalize(); #ifdef CHAPEL_PROFILE @@ -835,6 +842,8 @@ static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, } +static aligned_t dummy_wrapper() {} + static aligned_t main_wrapper(void *arg) { chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); @@ -956,6 +965,48 @@ int chpl_task_createCommTask(chpl_fn_p fn, NULL, 
comm_task_wrapper, &wrapper_info); } +void *chpl_taskGroupGet() { + void *ret = (void *)get_atmi_task_group(); + printf("Adding to task group: %p\n", ret); + return ret; +} + +void *chpl_taskGroupInit(int lineno, int32_t filename) { + atmi_task_group_t *tg = (atmi_task_group_t *)chpl_malloc(sizeof(atmi_task_group_t)); + // TODO: add to a list of task groups and free all of them at the very end. + int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); + printf("Next task group ID: %d\n", next_id); + tg->id = next_id; + tg->ordered = ATMI_FALSE; + return tg; +} + +void chpl_taskGroupComplete() { + // no-op in ATMI. down count in other tasking layers +} + +void chpl_taskGroupFinalize(void *tg) { + int cpu_id = 0; // which subloc? + atmi_task_group_t *cur_tg = get_atmi_task_group(); + + if(cur_tg) { + // if I am within a task, provide a chain dependency to the incoming group + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->groupable = ATMI_TRUE; + lparm->group = cur_tg; + lparm->num_required_groups = 1; + lparm->required_groups = (atmi_task_group_t **)&tg; + atmi_task_launch(lparm, dummy_kernel, NULL); + } + else { + // if I am not within a task (main task), simply sync + printf("Waiting for task group: %p\n", tg); + atmi_task_group_sync((atmi_task_group_t *)tg); + } + //chpl_free(tg); +} + void chpl_task_addToTaskList(chpl_fn_int_t fid, chpl_task_bundle_t *arg, size_t arg_size, @@ -963,6 +1014,7 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, void **task_list, int32_t task_list_locale, chpl_bool is_begin_stmt, + void *task_group, int lineno, int32_t filename) { @@ -995,6 +1047,11 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, ATMI_LPARM_CPU(lparm, cpu_id); lparm->kernel_id = CPU_FUNCTION_IMPL; //lparm->synchronous = ATMI_TRUE; + if(task_group) { + lparm->groupable = ATMI_TRUE; + lparm->group = (atmi_task_group_t *)task_group; + } + void *kernargs[1]; if(!cpu_kernels_initialized[fid]) { 
atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); From 9efaf1e0955eb2246d327b072ea2489ba3e41f0e Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Mon, 3 Apr 2017 12:38:43 -0500 Subject: [PATCH 07/10] [Bug fix] Minor fix for conditional compilation and returning 0 as the chpl-atmi null task instead of -1. Rationale to return 0 instead of chpl_nullTaskID: ATMI returns -1 as the NULL task, but we maintain 0 as the null task in chpl-atmi. Impl: chpl-atmi does not create a new task for the main wrapper. Instead it creates a dummy ATMI task in the beginning but does not launch/activate it. This helps us recognize 0 as the main/dummy task and not -1 that is expected by ATMI as the NULL task. --- compiler/AST/build.cpp | 12 ++++++------ compiler/passes/buildDefaultFunctions.cpp | 2 +- compiler/passes/insertLineNumbers.cpp | 5 ----- runtime/src/tasks/atmi/tasks-atmi.c | 11 ++++++++++- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp index 8ffa86a44d..d66d9faa25 100644 --- a/compiler/AST/build.cpp +++ b/compiler/AST/build.cpp @@ -2870,7 +2870,7 @@ buildOnStmt(Expr* expr, Expr* stmt) { } } -//#define TARGET_HSA_BEGIN +#define TARGET_HSA_BEGIN BlockStmt* buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { @@ -2891,7 +2891,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { return body; } else { BlockStmt* block = buildChapelStmt(); -#ifdef TARGET_HSA_BEGIN +#ifndef TARGET_HSA_BEGIN block->insertAtTail(new CallExpr("_upEndCount")); #else block->insertAtTail(new CallExpr("_addToTaskGroup")); @@ -2900,7 +2900,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN)); addByrefVars(beginBlock, byref_vars); beginBlock->insertAtHead(stmt); -#ifdef TARGET_HSA_BEGIN +#ifndef TARGET_HSA_BEGIN beginBlock->insertAtTail(new CallExpr("_downEndCount")); #else beginBlock->insertAtTail(new CallExpr("_completeTaskGroup")); @@ -2956,20 +2956,20 @@ 
buildCobeginStmt(CallExpr* byref_vars, BlockStmt* block) { addByrefVars(beginBlk, byref_vars ? byref_vars->copy() : NULL); stmt->insertBefore(beginBlk); beginBlk->insertAtHead(stmt->remove()); -#ifdef TARGET_HSA +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", cobeginCount)); block->insertAtHead(new CallExpr("_upEndCount", cobeginCount)); #else beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", cobeginCount)); #endif } -#ifndef TARGET_HSA +#ifdef TARGET_HSA block->insertAtHead(new CallExpr("_initTaskGroup", cobeginCount)); #endif block->insertAtHead(new CallExpr(PRIM_MOVE, cobeginCount, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gTrue))); block->insertAtHead(new DefExpr(cobeginCount)); -#ifdef TARGET_HSA +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", cobeginCount)); #else block->insertAtTail(new CallExpr("_finalizeTaskGroup", cobeginCount)); diff --git a/compiler/passes/buildDefaultFunctions.cpp b/compiler/passes/buildDefaultFunctions.cpp index 4c784fb274..d8e6cff7c3 100644 --- a/compiler/passes/buildDefaultFunctions.cpp +++ b/compiler/passes/buildDefaultFunctions.cpp @@ -581,7 +581,7 @@ static void build_chpl_entry_points() { // endcount (see comment above) // if (fMinimalModules == false) { -#ifdef TARGET_HSA +#ifndef TARGET_HSA chpl_gen_main->insertAtTail(new CallExpr("_waitEndCount")); #else chpl_gen_main->insertAtTail(new CallExpr("_finalizeTaskGroup")); diff --git a/compiler/passes/insertLineNumbers.cpp b/compiler/passes/insertLineNumbers.cpp index b272c4d324..a9e302366d 100644 --- a/compiler/passes/insertLineNumbers.cpp +++ b/compiler/passes/insertLineNumbers.cpp @@ -283,7 +283,6 @@ void insertLineNumbers() { insertNilChecks(); } - printf("Done 1\n"); // loop over all primitives that require a line number and filename // and pass them an actual line number and filename forv_Vec(CallExpr, call, gCallExprs) { @@ -292,7 +291,6 @@ void insertLineNumbers() { } } - printf("Done 2\n"); // loop over all 
marked functions ("insert line file info"), add // line number and filename arguments to these functions, and add // them to the queue @@ -306,7 +304,6 @@ void insertLineNumbers() { } } - printf("Done 3\n"); // loop over all functions in the queue and all calls to these // functions, and pass the calls an actual line number and filename forv_Vec(FnSymbol, fn, queue) { @@ -315,9 +312,7 @@ void insertLineNumbers() { } } - printf("Done 4\n"); moveLinenoInsideArgBundle(); - printf("Done 5\n"); } static void moveLinenoInsideArgBundle() diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c index 95b6b48080..daeb6be8fa 100644 --- a/runtime/src/tasks/atmi/tasks-atmi.c +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -759,6 +759,7 @@ void chpl_task_init(void) atmi_kernel_create_empty(&dummy_kernel, 0, NULL); atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL); + atmi_task_handle_t dummy_handle = atmi_task_create(dummy_kernel); int32_t commMaxThreads; int32_t hwpar; pthread_t initer; @@ -1165,8 +1166,16 @@ chpl_taskID_t chpl_task_getId(void) PROFILE_INCR(profile_task_getId,1); atmi_task_handle_t t = get_atmi_task_handle(); + // Rationale to return 0 instead of chpl_nullTaskID: ATMI returns -1 as the NULL + // task, but we maintain 0 as the null task in chpl-atmi. + // Impl: chpl-atmi does not create a new task for the main wrapper. Instead it + // creates a dummy ATMI task in the beginning but does not launch/activate + // it. This helps us recognize 0 as the main/dummy task and not -1 that is + // expected by ATMI as the NULL task. + // FIXME: this could go against the chpl design of having *every* task being + // launched, including the main wrapper task. Any problems with that? 
if(t == ATMI_NULL_TASK_HANDLE) - return (chpl_taskID_t) -1; + return (chpl_taskID_t) 0; return t; } From 49b3883cff082512359b2baf5280ff7d38901a33 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Wed, 5 Apr 2017 02:01:24 -0500 Subject: [PATCH 08/10] added support for coforall in ATMI --- compiler/AST/build.cpp | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp index d66d9faa25..91ac51a402 100644 --- a/compiler/AST/build.cpp +++ b/compiler/AST/build.cpp @@ -1609,12 +1609,21 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices, beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL)); addByrefVars(beginBlk, byref_vars); beginBlk->insertAtHead(body); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount)); +#endif BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered); block->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue))); block->insertAtHead(new DefExpr(coforallCount)); +#ifndef TARGET_HSA beginBlk->insertBefore(new CallExpr("_upEndCount", coforallCount)); block->insertAtTail(new CallExpr("_waitEndCount", coforallCount)); +#else + beginBlk->insertBefore(new CallExpr("_initTaskGroup", coforallCount)); + block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount)); +#endif block->insertAtTail(new CallExpr("_endCountFree", coforallCount)); nonVectorCoforallBlk->insertAtTail(block); } @@ -1624,15 +1633,27 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices, beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL)); addByrefVars(beginBlk, byref_vars); beginBlk->insertAtHead(body->copy()); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", 
coforallCount)); +#endif VarSymbol* numTasks = newTemp("numTasks"); vectorCoforallBlk->insertAtTail(new DefExpr(numTasks)); vectorCoforallBlk->insertAtTail(new CallExpr(PRIM_MOVE, numTasks, new CallExpr(".", tmpIter, new_CStringSymbol("size")))); +#ifndef TARGET_HSA vectorCoforallBlk->insertAtTail(new CallExpr("_upEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#else + vectorCoforallBlk->insertAtTail(new CallExpr("_initTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#endif BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered); vectorCoforallBlk->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue))); vectorCoforallBlk->insertAtHead(new DefExpr(coforallCount)); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#endif block->insertAtTail(new CallExpr("_endCountFree", coforallCount)); vectorCoforallBlk->insertAtTail(block); } @@ -2870,7 +2891,6 @@ buildOnStmt(Expr* expr, Expr* stmt) { } } -#define TARGET_HSA_BEGIN BlockStmt* buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { @@ -2891,7 +2911,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { return body; } else { BlockStmt* block = buildChapelStmt(); -#ifndef TARGET_HSA_BEGIN +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_upEndCount")); #else block->insertAtTail(new CallExpr("_addToTaskGroup")); @@ -2900,7 +2920,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN)); addByrefVars(beginBlock, byref_vars); beginBlock->insertAtHead(stmt); -#ifndef TARGET_HSA_BEGIN +#ifndef TARGET_HSA beginBlock->insertAtTail(new CallExpr("_downEndCount")); #else beginBlock->insertAtTail(new CallExpr("_completeTaskGroup")); From 
b764626a15164775179c36c56e9cb8e39890d005 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Wed, 5 Apr 2017 17:50:39 -0500 Subject: [PATCH 09/10] enabled sync statement to use ATMI --- compiler/AST/build.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp index 91ac51a402..b9412aba95 100644 --- a/compiler/AST/build.cpp +++ b/compiler/AST/build.cpp @@ -2940,7 +2940,11 @@ buildSyncStmt(Expr* stmt) { block->insertAtTail(new CallExpr(PRIM_MOVE, endCountSave, new CallExpr(PRIM_GET_END_COUNT))); block->insertAtTail(new CallExpr(PRIM_SET_END_COUNT, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gFalse))); block->insertAtTail(stmt); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount")); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup")); +#endif block->insertAtTail(new CallExpr("_endCountFree", new CallExpr(PRIM_GET_END_COUNT))); block->insertAtTail(new CallExpr(PRIM_SET_END_COUNT, endCountSave)); return block; From 5dd51ad47c91d660ae79f19b8f741262066dd187 Mon Sep 17 00:00:00 2001 From: Ashwin Aji Date: Fri, 28 Apr 2017 02:35:34 -0500 Subject: [PATCH 10/10] Recycling task groups and at the same time, fixing the max depth of nesting of coforall and cobegin blocks --- runtime/src/tasks/atmi/tasks-atmi.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c index daeb6be8fa..9194494b18 100644 --- a/runtime/src/tasks/atmi/tasks-atmi.c +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -67,6 +67,10 @@ #include #include +// FIXME: Good idea to recycle the task groups, so this will limit +// the depth of nested coforalls and cobegins. Fix to use an ATMI +// environment variable.
+#define max_num_task_groups 8 #define max_num_cpu_kernels 4096 int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; atmi_kernel_t cpu_kernels[max_num_cpu_kernels]; @@ -759,6 +763,11 @@ void chpl_task_init(void) atmi_kernel_create_empty(&dummy_kernel, 0, NULL); atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL); + // this increment needed to let the main task not be treated as an ATMI task + // because we directly launch the main task with the main thread and don't treat + // it as an ATMI task. This increment fools the rest of the runtime into thinking that + // the main task is an ATMI task, but it is in fact not an ATMI task. + int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); atmi_task_handle_t dummy_handle = atmi_task_create(dummy_kernel); int32_t commMaxThreads; int32_t hwpar; @@ -968,7 +977,6 @@ int chpl_task_createCommTask(chpl_fn_p fn, void *chpl_taskGroupGet() { void *ret = (void *)get_atmi_task_group(); - printf("Adding to task group: %p\n", ret); return ret; } @@ -976,7 +984,9 @@ void *chpl_taskGroupInit(int lineno, int32_t filename) { atmi_task_group_t *tg = (atmi_task_group_t *)chpl_malloc(sizeof(atmi_task_group_t)); // TODO: add to a list of task groups and free all of them at the very end. int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); - printf("Next task group ID: %d\n", next_id); + // loop around the task groups. + // FIXME: how will this affect the main task that is not an ATMI task? Incr by 1 after this?
+ next_id %= max_num_task_groups; tg->id = next_id; tg->ordered = ATMI_FALSE; return tg; @@ -1002,7 +1012,6 @@ void chpl_taskGroupFinalize(void *tg) { } else { // if I am not within a task (main task), simply sync - printf("Waiting for task group: %p\n", tg); atmi_task_group_sync((atmi_task_group_t *)tg); } //chpl_free(tg); @@ -1048,8 +1057,8 @@ void chpl_task_addToTaskList(chpl_fn_int_t fid, ATMI_LPARM_CPU(lparm, cpu_id); lparm->kernel_id = CPU_FUNCTION_IMPL; //lparm->synchronous = ATMI_TRUE; + lparm->groupable = ATMI_TRUE; if(task_group) { - lparm->groupable = ATMI_TRUE; lparm->group = (atmi_task_group_t *)task_group; } @@ -1080,7 +1089,6 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p c_sublocid_t subloc, chpl_bool serial_state, int lineno, int32_t filename) { - //printf("Adding %d fn to task call body\n", fid); chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; bundle->serial_state = serial_state; @@ -1252,7 +1260,8 @@ uint32_t chpl_task_getMaxPar(void) { // will decide itself how much parallelism to create across and // within sublocales, if there are any. // - return (uint32_t) 8;//qthread_num_workers(); + return qthread_num_workers(); + //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count; } c_sublocid_t chpl_task_getNumSublocales(void) @@ -1294,7 +1303,8 @@ int32_t chpl_task_getNumBlockedTasks(void) uint32_t chpl_task_getNumThreads(void) { - return (uint32_t) 8;//qthread_num_workers(); + return qthread_num_workers(); + //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count; } // Ew. Talk about excessive bookkeeping.