From 524daf4dfd87a468b6682dfac56561dcdf885651 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Thu, 1 Dec 2016 18:56:15 -0600
Subject: [PATCH 01/10] Adding ATMI as the runtime layer to launch Chapel's
 generated GPU kernels

---
 make/compiler/Makefile.hsa          |   6 +-
 runtime/include/chpl-atmi.h         |  40 ++++++++
 runtime/include/chpl-gen-includes.h |   2 +-
 runtime/src/Makefile.share          |   4 +-
 runtime/src/chpl-atmi-reducehost.c  | 136 ++++++++++++++++++++++++++++
 runtime/src/chpl-atmi.c             | 116 ++++++++++++++++++++++++
 util/setchplenv_hsa.bash            |   4 +
 7 files changed, 302 insertions(+), 6 deletions(-)
 create mode 100644 runtime/include/chpl-atmi.h
 create mode 100644 runtime/src/chpl-atmi-reducehost.c
 create mode 100644 runtime/src/chpl-atmi.c

diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa
index fa5115fa79..7c1baa28f0 100644
--- a/make/compiler/Makefile.hsa
+++ b/make/compiler/Makefile.hsa
@@ -17,11 +17,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu
 ifdef CHPL_ROCM
 # ROCm locations
 CLOC=/opt/rocm/cloc/bin/cloc.sh
-LIBS+=-lhsa-runtime64 -lhsakmt -lm
+LIBS+=-latmi_runtime -lm
 
 # TODO: move these in third-party directory?
-GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib
-HSA_INCLUDES=-I/opt/rocm/hsa/include
+GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib
+HSA_INCLUDES=-I/opt/rocm/libatmi/include
 else
 # HSA locations
 CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh
diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h
new file mode 100644
index 0000000000..0f4c07feae
--- /dev/null
+++ b/runtime/include/chpl-atmi.h
@@ -0,0 +1,40 @@
+#ifndef _chpl_atmi_h_
+#define _chpl_atmi_h_
+
+#include <atmi_runtime.h>
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uintXX_t */
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif /* __cplusplus */
+
+#include "chpltypes.h"
+#include "chpl-hsa-kernelparams.h"
+
+atmi_kernel_t reduction_kernel;
+atmi_kernel_t *gpu_kernels;
+
+enum {
+    GPU_KERNEL_IMPL = 10565,
+    REDUCTION_GPU_IMPL = 42
+};    
+/*
+typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
+    uint64_t in;
+    uint64_t out;
+    uint32_t count;
+} hsail_reduce_kernarg_t;
+
+typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
+    uint64_t bundle;
+} hsail_kernarg_t;
+*/
+
+int chpl_hsa_initialize(void);
+
+int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count);
+int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count);
+
+void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
+                        uint32_t wkitem_count_x, void *bundled_args);
+#endif //_chpl_atmi_h_
diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h
index 4527659c5d..2f414baa78 100644
--- a/runtime/include/chpl-gen-includes.h
+++ b/runtime/include/chpl-gen-includes.h
@@ -31,7 +31,7 @@
 #include "chpl-tasks.h"
 #include "chpltypes.h"
 #ifdef TARGET_HSA
-#include "chpl-hsa.h"
+#include "chpl-atmi.h"
 #endif
 
 //
diff --git a/runtime/src/Makefile.share b/runtime/src/Makefile.share
index 732e06ad1f..8d93b13d3a 100644
--- a/runtime/src/Makefile.share
+++ b/runtime/src/Makefile.share
@@ -18,9 +18,9 @@
 
 ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa)
 HSA_SRCS = \
-	chpl-hsa.c \
+	chpl-atmi.c \
 	chpl-hsa-reducekernels.cl \
-	chpl-hsa-reducehost.c
+	chpl-atmi-reducehost.c
 endif
 
 COMMON_LAUNCHER_SRCS = \
diff --git a/runtime/src/chpl-atmi-reducehost.c b/runtime/src/chpl-atmi-reducehost.c
new file mode 100644
index 0000000000..3d642a7f22
--- /dev/null
+++ b/runtime/src/chpl-atmi-reducehost.c
@@ -0,0 +1,136 @@
+
+#include "chpl-atmi.h"
+#include "chplrt.h"
+#include "chplexit.h"
+#include "chpl-mem.h"
+
+/*enum ReduceOp {
+  MAX,
+  MIN,
+  SUM,
+  PROD,
+  BITAND,
+  BITOR,
+  BITXOR,
+  LOGAND,
+  LOGOR
+  };
+  */
+
+/*
+ * Estimate and schedule the required number of GPU kernels
+ */
+    static inline
+void atmi_sched_reducekernels(size_t count, 
+        void *darray[2], size_t *iter_ct,
+        size_t *items_left)
+{
+    size_t incount, outcount, i, iter, in, out;
+    uint32_t max_num_wkgrps, num_wkgroups, grid_size_x;
+
+    const int num_args = 3;
+    atmi_task_group_t task_group = {1, ATMI_TRUE};
+    ATMI_LPARM(lparm);
+    lparm->group = &task_group;
+    lparm->kernel_id = REDUCTION_GPU_IMPL;
+    lparm->synchronous = ATMI_FALSE;
+    lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
+
+    incount = count;
+    max_num_wkgrps = incount / WKGRP_SIZE;
+    num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE  - 1) / SEQ_CHUNK_SIZE;
+    grid_size_x = num_wkgroups * WKGRP_SIZE;
+    outcount = num_wkgroups;
+    iter = 0;
+    while (grid_size_x > WKGRP_SIZE) {
+        in = (iter & 1);
+        out = (iter & 1) ^ 1;
+
+        void *args[] = {&darray[in], &darray[out], &incount};
+        lparm->gridDim[0] = grid_size_x;
+        lparm->groupDim[0] = WKGRP_SIZE;
+        atmi_task_launch(lparm, reduction_kernel, args);
+
+        iter += 1;
+        incount = outcount;
+        max_num_wkgrps = incount / WKGRP_SIZE;
+        num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE  - 1) / SEQ_CHUNK_SIZE;
+        grid_size_x = num_wkgroups * WKGRP_SIZE;
+        outcount = num_wkgroups;
+    }
+
+    if (iter > 0) {
+        atmi_task_group_sync(&task_group);
+    }
+
+    (*items_left) = incount;
+    (*iter_ct) = iter;
+}
+
+/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count)
+  {
+  int32_t res;
+  size_t iter, items_left, out, i;
+  int32_t * darray[2];
+  hsa_symbol_info_t * symbol_info;
+  symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index
+  darray[0] = src;
+  if (0 != chpl_posix_memalign((void **) &darray[1], 64,
+  count * sizeof(int32_t))) {
+  chpl_exit_any(1);
+  }
+
+  hsa_sched_reducekernels(count, symbol_info, (void**)darray,
+  &iter, &items_left);
+
+  res = 0;
+  out = (iter & 1);
+  chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left);
+  for (i = 0; i < items_left; ++i) res += darray[out][i];
+
+  chpl_free (darray[1]);
+  return res;
+  }*/
+
+int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count)
+{
+    int64_t res;
+    size_t iter, items_left, out, i;
+    int64_t * darray[2];
+    darray[0] = src;
+    if (0 != chpl_posix_memalign((void **) &darray[1], 64,
+                count * sizeof(int64_t))) {
+        chpl_exit_any(1);
+    }
+
+    atmi_sched_reducekernels(count, (void**)darray,
+            &iter, &items_left);
+
+    res = 0;
+    out = (iter & 1);
+    chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left);
+    for (i = 0; i < items_left; ++i) res += darray[out][i];
+
+    chpl_free (darray[1]);
+    return res;
+}
+
+//FIXME: use the op argument like this to extend this to different ops
+/*if (!strcasecmp(op, "Max"))
+  opType = MAX;
+  else if (!strcasecmp(op, "Min"))
+  opType = MIN;
+  else if (!strcasecmp(op, "Sum"))
+  opType = SUM;
+  else if (!strcasecmp(op, "Product"))
+  opType = PROD;
+  else if (!strcasecmp(op, "LogicalAnd"))
+  opType = LOGAND;
+  else if (!strcasecmp(op, "LogicalOr"))
+  opType = LOGOR;
+  else if (!strcasecmp(op, "BitwiseAnd"))
+  opType = BITAND;
+  else if (!strcasecmp(op, "BitwiseOr"))
+  opType = BITOR;
+  else if (!strcasecmp(op, "BitwiseXor"))
+  opType = BITXOR; */
diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c
new file mode 100644
index 0000000000..f396931ab1
--- /dev/null
+++ b/runtime/src/chpl-atmi.c
@@ -0,0 +1,116 @@
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include "chpl-atmi.h"
+#include "chplrt.h"
+#include "chpl-mem.h"
+#include "chplcgfns.h"
+
+#define OUTPUT_ATMI_STATUS(status, msg) \
+{ \
+    if (ATMI_STATUS_SUCCESS != (status)) { \
+        fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \
+#msg, status); \
+        atmi_finalize(); \
+        return status; \
+    } \
+}
+
+/**
+ * Initialize the ATMI/HSA runtime
+ */
+int chpl_hsa_initialize(void)
+{
+    atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
+    if(st != ATMI_STATUS_SUCCESS) return -1;
+
+    char reduce_kernel_filename[1024];
+    char gen_kernel_filename[1024];
+    int arglen = strlen(chpl_executionCommand)+1;
+    char* argCopy = chpl_mem_allocMany(arglen, sizeof(char),
+            CHPL_RT_MD_CFG_ARG_COPY_DATA, 0, 0);
+    char *binName;
+    int cx;
+
+    cx = snprintf(reduce_kernel_filename, 1024,
+#ifdef ROCM
+            "%s/runtime/src/%s/chpl-hsa-reducekernels.hsaco", CHPL_HOME,
+#else
+            "%s/runtime/src/%s/chpl-hsa-reducekernels.o", CHPL_HOME,
+#endif
+            CHPL_RUNTIME_OBJDIR);
+    if (cx < 0 || cx  >= 256) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename);
+    }
+    strcpy(argCopy, chpl_executionCommand);
+    binName = strtok(argCopy, " ");
+#ifdef ROCM
+    cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.hsaco", binName);
+#else
+    cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.o", binName);
+#endif
+    if (cx < 0 || cx  >= 256) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
+    }
+    chpl_mem_free(argCopy, 0, 0);
+
+#ifdef ROCM
+    atmi_platform_type_t module_type = AMDGCN;
+#else
+    atmi_platform_type_t module_type = BRIG;
+#endif
+
+    /* FIXME: Create all reduction kernels, not just the int64-sum kernel */
+    const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename};
+    atmi_platform_type_t module_types[2] = {module_type, module_type};
+    st = atmi_module_register(modules, module_types, 2);
+    OUTPUT_ATMI_STATUS(st, Registering all modules);
+
+    size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)};
+    const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]);
+    atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes);
+    atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL);
+
+    size_t kernel_arg_sizes[] = {sizeof(uint64_t)}; 
+    const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]);
+    gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels);
+    for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) {
+        //FIXME: get the actual kernel name
+        const char *kernel_name = chpl_gpu_kernels[i];
+        atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes);
+        atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL);
+    }
+
+    return ATMI_STATUS_SUCCESS;
+}
+
+/**
+ * Release resources used by the base kernels and tear down the HSA structures
+ */
+int hsa_shutdown(void)
+{
+    chpl_free(gpu_kernels);
+    atmi_finalize();
+}
+ 
+/*
+ * Enqueue/execute a kernel
+ */
+void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
+        uint32_t wkitem_count_x, void *bundled_args)
+{
+    void *args[] = {&bundled_args}; 
+    ATMI_LPARM_1D(lparm, wkitem_count_x);
+    lparm->groupDim[0] = wkgrp_size_x;
+    lparm->synchronous = ATMI_TRUE;
+
+    lparm->kernel_id = GPU_KERNEL_IMPL;
+    lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
+    atmi_task_launch(lparm, gpu_kernels[kernel_idx], args);
+}
+
diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash
index cfd76b1c41..3625a4657f 100644
--- a/util/setchplenv_hsa.bash
+++ b/util/setchplenv_hsa.bash
@@ -105,6 +105,10 @@ if [ "$1" == "debug" ]; then
     export CHPL_DEBUG=1
 fi
 
+echo -n "Setting CHPL_ROCM"
+export CHPL_ROCM=1
+echo " to 1"
+
 echo -n "Setting CHPL_LOCALE_MODEL"
 export CHPL_LOCALE_MODEL=hsa
 echo " to hsa"

From 37bb0ba7699bc2d6296a4532a6fbde01d7dd1530 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Thu, 15 Dec 2016 11:51:07 -0600
Subject: [PATCH 02/10] ATMI as the new heterogeneous tasking layer -- 1st pass

---
 make/tasks/Makefile.atmi                |  19 +
 runtime/Makefile.help                   |   3 +
 runtime/etc/Makefile.tasks-atmi         |  19 +
 runtime/include/tasks/atmi/tasks-atmi.h | 445 +++++++++++
 runtime/src/tasks/atmi/Makefile         |  39 +
 runtime/src/tasks/atmi/Makefile.include |  29 +
 runtime/src/tasks/atmi/Makefile.share   |  25 +
 runtime/src/tasks/atmi/tasks-atmi.c     | 963 ++++++++++++++++++++++++
 util/printchplenv                       |   2 +
 util/setchplenv_hsa.bash                |   8 +-
 10 files changed, 1550 insertions(+), 2 deletions(-)
 create mode 100644 make/tasks/Makefile.atmi
 create mode 100644 runtime/etc/Makefile.tasks-atmi
 create mode 100644 runtime/include/tasks/atmi/tasks-atmi.h
 create mode 100644 runtime/src/tasks/atmi/Makefile
 create mode 100644 runtime/src/tasks/atmi/Makefile.include
 create mode 100644 runtime/src/tasks/atmi/Makefile.share
 create mode 100644 runtime/src/tasks/atmi/tasks-atmi.c

diff --git a/make/tasks/Makefile.atmi b/make/tasks/Makefile.atmi
new file mode 100644
index 0000000000..00ca53fb62
--- /dev/null
+++ b/make/tasks/Makefile.atmi
@@ -0,0 +1,19 @@
+# Copyright 2004-2015 Cray Inc.
+# Other additional copyright holders may be indicated within.
+#
+# The entirety of this work is licensed under the Apache License,
+# Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+RUNTIME_INCLS += -I$(QTHREAD_INCLUDE_DIR)
+CHPL_MAKE_THREADS=none
diff --git a/runtime/Makefile.help b/runtime/Makefile.help
index 00e96f04c4..a6d84c1ff5 100644
--- a/runtime/Makefile.help
+++ b/runtime/Makefile.help
@@ -168,4 +168,7 @@ include $(RUNTIME_ROOT)/make/Makefile.runtime.foot
 src/tasks/qthreads/Makefile.include:
 	cd src/tasks/qthreads && $(MAKE) copy_qthread_files
 
+src/tasks/atmi/Makefile.include:
+	cd src/tasks/atmi && $(MAKE) copy_qthread_files
+
 .NOTPARALLEL:
diff --git a/runtime/etc/Makefile.tasks-atmi b/runtime/etc/Makefile.tasks-atmi
new file mode 100644
index 0000000000..ce328fd06b
--- /dev/null
+++ b/runtime/etc/Makefile.tasks-atmi
@@ -0,0 +1,19 @@
+# Copyright 2004-2015 Cray Inc.
+# Other additional copyright holders may be indicated within.
+# 
+# The entirety of this work is licensed under the Apache License,
+# Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.
+# 
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+GEN_CFLAGS += -I$(QTHREAD_INCLUDE_DIR)
+GEN_LFLAGS += -L$(QTHREAD_LIB_DIR) -Wl,-rpath,$(QTHREAD_LIB_DIR)
diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h
new file mode 100644
index 0000000000..58dab0a982
--- /dev/null
+++ b/runtime/include/tasks/atmi/tasks-atmi.h
@@ -0,0 +1,445 @@
+/**************************************************************************
+*  Copyright 2011 Sandia Corporation. Under the terms of Contract
+*  DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
+*  or on behalf of the U.S. Government. Export of this program may require a
+*  license from the United States Government
+**************************************************************************/
+
+/*
+ * Copyright 2004-2015 Cray Inc.
+ * Other additional copyright holders may be indicated within.
+ *
+ * The entirety of this work is licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _tasks_qthreads_h_
+#define _tasks_qthreads_h_
+
+#include <stdint.h>
+
+#include <assert.h>
+#include <qthread.h>
+#include <stdio.h>
+#include <pthread.h>
+
+#include "chpltypes.h"
+#include "chpl-tasks-prvdata.h"
+
+#define CHPL_COMM_YIELD_TASK_WHILE_POLLING
+void chpl_task_yield(void);
+
+// For mutexes
+//   type(s)
+//     threadlayer_mutex_t
+//     threadlayer_mutex_p
+//   functions
+//     threadlayer_mutex_init()
+//     threadlayer_mutex_new()
+//     threadlayer_mutex_lock()
+//     threadlayer_mutex_unlock()
+//
+// For thread management
+//   type(s)
+//     <none>
+//   functions
+//     threadlayer_thread_id()
+//     threadlayer_thread_cancel()
+//     threadlayer_thread_join()
+//
+// For sync variables
+//   type(s)
+//     threadlayer_sync_aux_t
+//   functions
+//     threadlayer_sync_suspend()
+//     threadlayer_sync_awaken()
+//     threadlayer_sync_init()
+//     threadlayer_sync_destroy()
+//
+// For task management
+//   type(s)
+//     <none>
+//   functions
+//     threadlayer_init()
+//     threadlayer_thread_create()
+//     threadlayer_pool_suspend()
+//     threadlayer_pool_awaken()
+//     threadlayer_get_thread_private_data()
+//     threadlayer_set_thread_private_data()
+//
+// The types are declared in the threads-*.h file for each specific
+// threading layer, and the callback functions are declared here.  The
+// interfaces and requirements for these other types and callback
+// functions are described elsewhere in this file.
+//
+// Although the above list may seem long, in practice many of the
+// functions are quite simple, and with luck also easily extrapolated
+// from what is done for other threading layers.  For an example of an
+// implementation, see "pthreads" threading.
+//
+
+//
+// Type (and default value) used to communicate task identifiers
+// between C code and Chapel code in the runtime.
+//
+typedef unsigned int chpl_taskID_t;
+#define chpl_nullTaskID QTHREAD_NULL_TASK_ID
+
+//
+// Mutexes
+//
+typedef syncvar_t chpl_mutex_t;
+
+//
+// Sync variables
+//
+// The threading layer's threadlayer_sync_aux_t may include any
+// additional members the layer needs to support the suspend/awaken
+// callbacks efficiently.  The FIFO tasking code itself does not
+// refer to this type or the tl_aux member at all.
+//
+typedef struct {
+    aligned_t lockers_in;
+    aligned_t lockers_out;
+    uint_fast32_t uncontested_locks;
+    int       is_full;
+    syncvar_t signal_full;
+    syncvar_t signal_empty;
+} chpl_sync_aux_t;
+
+#define chpl_sync_reset(x) qthread_syncvar_empty(&(x)->sync_aux.signal_full)
+
+#define chpl_read_FE(x) ({                                                           \
+                             uint64_t y;                                             \
+                             qthread_syncvar_readFE(&y, &(x)->sync_aux.signal_full); \
+                             y; })
+
+#define chpl_read_FF(x) ({                                                           \
+                             uint64_t y;                                             \
+                             qthread_syncvar_readFF(&y, &(x)->sync_aux.signal_full); \
+                             y; })
+
+#define chpl_read_XX(x) ((x)->sync_aux.signal_full.u.s.data)
+
+#define chpl_write_EF(x, y) do {                                 \
+        uint64_t z = (uint64_t)(y);                              \
+        qthread_syncvar_writeEF(&(x)->sync_aux.signal_full, &z); \
+} while(0)
+
+#define chpl_write_FF(x, y) do {                                    \
+        uint64_t z, dummy;                                          \
+        z = (uint64_t)(y);                                          \
+        qthread_syncvar_readFE(&dummy, &(x)->sync_aux.signal_full); \
+        qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z);     \
+} while(0)
+
+#define chpl_write_XF(x, y) do {                                \
+        uint64_t z = (uint64_t)(y);                             \
+        qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \
+} while(0)
+
+#define chpl_single_reset(x) qthread_syncvar_empty(&(x)->single_aux.signal_full)
+
+#define chpl_single_read_FF(x) ({                                                             \
+                                    uint64_t y;                                               \
+                                    qthread_syncvar_readFF(&y, &(x)->single_aux.signal_full); \
+                                    y; })
+
+#define chpl_single_write_EF(x, y) do {                            \
+        uint64_t z = (uint64_t)(y);                                \
+        qthread_syncvar_writeEF(&(x)->single_aux.signal_full, &z); \
+} while(0)
+
+#define chpl_single_read_XX(x) ((x)->single_aux.signal_full.u.s.data)
+
+// Tasks
+
+//
+// Handy services for threading layer callback functions.
+//
+// The FIFO tasking implementation also provides the following service
+// routines that can be used by threading layer callback functions.
+//
+
+//
+// The remaining declarations are all for callback functions to be
+// provided by the threading layer.
+//
+
+//
+// These are called once each, from CHPL_TASKING_INIT() and
+// CHPL_TASKING_EXIT().
+//
+void threadlayer_init(void);
+void threadlayer_exit(void);
+
+//
+// Mutexes
+//
+/*void qthread_mutex_init(qthread_mutex_p);
+ * qthread_mutex_p qthread_mutex_new(void);
+ * void qthread_mutex_lock(qthread_mutex_p);
+ * void qthread_mutex_unlock(qthread_mutex_p);*/
+
+//
+// Sync variables
+//
+// The CHPL_SYNC_WAIT_{FULL,EMPTY}_AND_LOCK() functions should call
+// threadlayer_sync_suspend() when a sync variable is not in the desired
+// full/empty state.  The call will be made with the sync variable's
+// mutex held.  (Thus, threadlayer_sync_suspend() can dependably tell
+// that the desired state must be the opposite of the state it initially
+// sees the variable in.)  It should return (with the mutex again held)
+// as soon as it can once either the sync variable changes to the
+// desired state, or (if the given deadline pointer is non-NULL) the
+// deadline passes.  It can return also early, before either of these
+// things occur, with no ill effects.  If a deadline is given and it
+// does pass, then threadlayer_sync_suspend() must return true;
+// otherwise false.
+//
+// The less the function can execute while waiting for the sync variable
+// to change state, and the quicker it can un-suspend when the variable
+// does change state, the better overall performance will be.  Obviously
+// the sync variable's mutex must be unlocked while the routine waits
+// for the variable to change state or the deadline to pass, or livelock
+// may result.
+//
+// The CHPL_SYNC_MARK_AND_SIGNAL_{FULL,EMPTY}() functions will call
+// threadlayer_sync_awaken() every time they are called, not just when
+// they change the state of the sync variable.
+//
+// Threadlayer_sync_{init,destroy}() are called to initialize or
+// destroy, respectively, the contents of the tl_aux member of the
+// chpl_sync_aux_t for the specific threading layer.
+//
+/*chpl_bool threadlayer_sync_suspend(chpl_sync_aux_t *s,
+ *                                 struct timeval *deadline);
+ * void threadlayer_sync_awaken(chpl_sync_aux_t *s);
+ * void threadlayer_sync_init(chpl_sync_aux_t *s);
+ * void threadlayer_sync_destroy(chpl_sync_aux_t *s);*/
+
+//
+// Task management
+//
+
+//
+// The interface for thread creation may need to be extended eventually
+// to allow for specifying such things as stack sizes and/or locations.
+//
+/*int threadlayer_thread_create(threadlayer_threadID_t*, void*(*)(void*), void*);*/
+
+//
+// Threadlayer_pool_suspend() is called when a thread finds nothing in
+// the pool of unclaimed tasks, and so has no work to do.  The call will
+// be made with the pointed-to mutex held.  It should return (with the
+// mutex again held) as soon as it can once either the task pool is no
+// longer empty or (if the given deadline pointer is non-NULL) the
+// deadline passes.  It can return also early, before either of these
+// things occur, with no ill effects.  If a deadline is given and it
+// does pass, then threadlayer_pool_suspend() must return true;
+// otherwise false.
+//
+// The less the function can execute while waiting for the pool to
+// become nonempty, and the quicker it can un-suspend when that happens,
+// the better overall performance will be.
+//
+// The mutex passed to threadlayer_pool_suspend() is the one that
+// provides mutual exclusion for changes to the task pool.  Allowing
+// access to this mutex simplifies the implementation for certain
+// threading layers, such as those based on pthreads condition
+// variables.  However, it also introduces a complication in that it
+// allows a threading layer to create deadlock or livelock situations if
+// it is not careful.  Certainly the mutex must be unlocked while the
+// routine waits for the task pool to fill or the deadline to pass, or
+// livelock may result.
+//
+/*chpl_bool threadlayer_pool_suspend(chpl_mutex_t*, struct timeval*);
+ * void threadlayer_pool_awaken(void);*/
+
+//
+// Thread private data
+//
+// These set and get a pointer to thread private data associated with
+// each thread.  This is for the use of the FIFO tasking implementation
+// itself.  If the threading layer also needs to store some data private
+// to each thread, it must make other arrangements to do so.
+//
+/*void  threadlayer_set_thread_private_data(void*);
+ * void* threadlayer_get_thread_private_data(void);*/
+
+
+#ifndef QTHREAD_MULTINODE
+extern
+#ifdef __cplusplus
+"C"
+#endif
+volatile int chpl_qthread_done_initializing;
+#endif
+
+typedef struct {
+    c_string task_filename;
+    int task_lineno;
+    chpl_taskID_t id;
+    chpl_bool is_executeOn;
+    c_sublocid_t requestedSubloc;  // requested sublocal for task
+    chpl_task_prvData_t prvdata;
+} chpl_task_prvDataImpl_t;
+
+// Define PRV_DATA_IMPL_VAL to set up a chpl_task_prvData_t.
+#define PRV_DATA_IMPL_VAL(_fn, _ln, _id, _is_execOn, _subloc, _serial) \
+        { .task_filename = _fn, \
+          .task_lineno = _ln, \
+          .id = _id, \
+          .is_executeOn = _is_execOn, \
+          .requestedSubloc = _subloc, \
+          .prvdata = { .serial_state = _serial } }
+
+typedef struct {
+    void                     *fn;
+    void                     *args;
+    chpl_bool                countRunning;
+    chpl_task_prvDataImpl_t  chpl_data;
+} chpl_qthread_wrapper_args_t;
+
+// Structure of task-local storage
+typedef struct chpl_qthread_tls_s {
+    /* Task private data: serial state, etc. */
+    chpl_task_prvDataImpl_t chpl_data;
+    /* Reports */
+    c_string    lock_filename;
+    size_t      lock_lineno;
+} chpl_qthread_tls_t;
+
+extern pthread_t chpl_qthread_process_pthread;
+extern pthread_t chpl_qthread_comm_pthread;
+
+extern chpl_qthread_tls_t chpl_qthread_process_tls;
+extern chpl_qthread_tls_t chpl_qthread_comm_task_tls;
+
+#define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized
+void chpl_task_stdModulesInitialized(void);
+
+// Wrap qthread_get_tasklocal() and assert that it is always available.
+static inline chpl_qthread_tls_t * chpl_qthread_get_tasklocal(void)
+{
+    chpl_qthread_tls_t* tls;
+
+    if (chpl_qthread_done_initializing) {
+        tls = (chpl_qthread_tls_t *)
+              qthread_get_tasklocal(sizeof(chpl_qthread_tls_t));
+        if (tls == NULL) {
+            pthread_t me = pthread_self();
+            if (pthread_equal(me, chpl_qthread_comm_pthread))
+                tls = &chpl_qthread_comm_task_tls;
+            else if (pthread_equal(me, chpl_qthread_process_pthread))
+                tls = &chpl_qthread_process_tls;
+        }
+        assert(tls);
+    }
+    else
+        tls = NULL;
+
+    return tls;
+}
+
+#ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL
+#error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!"
+#else
+#define CHPL_TASK_GET_PRVDATA_IMPL_DECL 1
+#endif
+static inline chpl_task_prvData_t* chpl_task_getPrvData(void)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    if (data) {
+        return &data->chpl_data.prvdata;
+    }
+    assert(data);
+    return NULL;
+}
+
+#ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL
+#error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!"
+#else
+#define CHPL_TASK_GETSUBLOC_IMPL_DECL 1
+#endif
+static inline
+c_sublocid_t chpl_task_getSubloc(void)
+{
+    return (c_sublocid_t) qthread_shep();
+}
+
+#ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL
+#error "CHPL_TASK_SETSUBLOC_IMPL_DECL is already defined!"
+#else
+#define CHPL_TASK_SETSUBLOC_IMPL_DECL 1
+#endif
+static inline
+void chpl_task_setSubloc(c_sublocid_t subloc)
+{
+    qthread_shepherd_id_t curr_shep;
+
+    assert(subloc != c_sublocid_none);
+
+    // Only change sublocales if the caller asked for a particular one,
+    // which is not the current one, and we're a (movable) task.
+    //
+    // Note: It's likely that this won't work in all cases where we need
+    //       it.  In particular, we envision needing to move execution
+    //       from sublocale to sublocale while initializing the memory
+    //       layer, in order to get the NUMA domain affinity right for
+    //       the subparts of the heap.  But this will be happening well
+    //       before tasking init and in any case would be done from the
+    //       main thread of execution, which doesn't have a shepherd.
+    //       The code below wouldn't work in that situation.
+    if ((curr_shep = qthread_shep()) != NO_SHEPHERD) {
+        chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+        if (data) {
+            data->chpl_data.requestedSubloc = subloc;
+        }
+
+        if (subloc != c_sublocid_any &&
+            (qthread_shepherd_id_t) subloc != curr_shep) {
+            qthread_migrate_to((qthread_shepherd_id_t) subloc);
+        }
+    }
+}
+
+#ifdef CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL
+#error "CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL is already defined!"
+#else
+#define CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL 1
+#endif
+static inline
+c_sublocid_t chpl_task_getRequestedSubloc(void)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    if (data) {
+        return data->chpl_data.requestedSubloc;
+    }
+    return c_sublocid_any;
+}
+
+#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL
+#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!"
+#else
+#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1
+#endif
+extern int chpl_qthread_supports_remote_cache;
+static inline
+int chpl_task_supportsRemoteCache(void) {
+  return chpl_qthread_supports_remote_cache;
+}
+
+#endif // ifndef _tasks_qthreads_h_
+/* vim:set expandtab: */
diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile
new file mode 100644
index 0000000000..cb1bf64265
--- /dev/null
+++ b/runtime/src/tasks/atmi/Makefile
@@ -0,0 +1,39 @@
+# Copyright 2004-2015 Cray Inc.
+# Other additional copyright holders may be indicated within.
+# 
+# The entirety of this work is licensed under the Apache License,
+# Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.
+# 
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+RUNTIME_ROOT = ../../..
+RUNTIME_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS)
+
+ifndef CHPL_MAKE_HOME
+export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/..
+endif
+
+include $(RUNTIME_ROOT)/make/Makefile.runtime.head
+
+TASKS_OBJDIR = $(RUNTIME_OBJDIR)
+include Makefile.share
+
+TARGETS = $(TASKS_OBJS)
+
+include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules
+
+FORCE:
+
+#
+# standard footer
+#
+include $(RUNTIME_ROOT)/make/Makefile.runtime.foot
diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include
new file mode 100644
index 0000000000..a510dc9a60
--- /dev/null
+++ b/runtime/src/tasks/atmi/Makefile.include
@@ -0,0 +1,29 @@
+# Copyright 2004-2015 Cray Inc.
+# Other additional copyright holders may be indicated within.
+# 
+# The entirety of this work is licensed under the Apache License,
+# Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.
+# 
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS)
+
+TASKS_OBJDIR = $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/$(RUNTIME_OBJDIR)
+
+#
+# point to sources under third-party, at least until they're
+# contributed back to the usual local directory
+#
+ALL_SRCS += $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.c \
+	$(QTHREAD_SUBDIR)/src/interfaces/chapel/*.h
+
+include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share
diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share
new file mode 100644
index 0000000000..374f9cc760
--- /dev/null
+++ b/runtime/src/tasks/atmi/Makefile.share
@@ -0,0 +1,25 @@
+# Copyright 2004-2015 Cray Inc.
+# Other additional copyright holders may be indicated within.
+# 
+# The entirety of this work is licensed under the Apache License,
+# Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License.
+# 
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TASKS_SRCS = 
+
+SVN_SRCS = $(TASKS_SRCS)
+SRCS = $(SVN_SRCS)
+
+TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o)
+
+RUNTIME_CFLAGS += -I$(QTHREAD_INCLUDE)
diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c
new file mode 100644
index 0000000000..8902c2949e
--- /dev/null
+++ b/runtime/src/tasks/atmi/tasks-atmi.c
@@ -0,0 +1,963 @@
+//
+// Qthreads implementation of Chapel tasking interface
+//
+// Copyright 2011 Sandia Corporation. Under the terms of Contract
+// DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
+// or on behalf of the U.S. Government. Export of this program may require a
+// license from the United States Government
+//
+
+/*
+ * Copyright 2004-2015 Cray Inc.
+ * Other additional copyright holders may be indicated within.
+ *
+ * The entirety of this work is licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// For SVID definitions (setenv)
+#define _SVID_SOURCE
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "chplrt.h"
+#include "chplsys.h"
+#include "tasks-qthreads.h"
+//#include "tasks-atmi.h"
+#include "chplcgfns.h"
+#include "chpl-comm.h"
+#include "chpl-locale-model.h"
+#include "chpl-tasks.h"
+#include "chpl-tasks-callbacks-internal.h"
+#include "config.h"
+#include "error.h"
+#include "arg.h"
+#include "signal.h"
+#include "chplexit.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <sched.h>
+
+//#include <atmi_runtime.h>
+
+#if 1
+#include "qthread/qthread.h"
+#include "qthread/qtimer.h"
+#include "qt_feb.h"
+#include "qt_syncvar.h"
+#include "qt_hash.h"
+#include "qt_atomics.h"
+#include "qt_shepherd_innards.h"
+#include "qt_envariables.h"
+#include "qt_debug.h"
+
+#ifdef QTHREAD_MULTINODE
+#include "qthread/spr.h"
+#endif /* QTHREAD_MULTINODE */
+
+#include <pthread.h>
+#endif 
+#ifdef CHAPEL_PROFILE
+# define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0)
+
+/* Tasks */
+static aligned_t profile_task_yield = 0;
+static aligned_t profile_task_addToTaskList = 0;
+static aligned_t profile_task_processTaskList = 0;
+static aligned_t profile_task_executeTasksInList = 0;
+static aligned_t profile_task_freeTaskList = 0;
+static aligned_t profile_task_startMovedTask = 0;
+static aligned_t profile_task_getId = 0;
+static aligned_t profile_task_sleep = 0;
+static aligned_t profile_task_getSerial = 0;
+static aligned_t profile_task_setSerial = 0;
+static aligned_t profile_task_getCallStackSize = 0;
+/* Sync */
+static aligned_t profile_sync_lock= 0;
+static aligned_t profile_sync_unlock= 0;
+static aligned_t profile_sync_waitFullAndLock= 0;
+static aligned_t profile_sync_waitEmptyAndLock= 0;
+static aligned_t profile_sync_markAndSignalFull= 0;
+static aligned_t profile_sync_markAndSignalEmpty= 0;
+static aligned_t profile_sync_isFull= 0;
+static aligned_t profile_sync_initAux= 0;
+static aligned_t profile_sync_destroyAux= 0;
+
+static void profile_print(void)
+{
+    /* Tasks */
+    fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield);
+    fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList);
+    fprintf(stderr, "task processTaskList: %lu\n", (unsigned long)profile_task_processTaskList);
+    fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList);
+    fprintf(stderr, "task freeTaskList: %lu\n", (unsigned long)profile_task_freeTaskList);
+    fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask);
+    fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId);
+    fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep);
+    fprintf(stderr, "task getSerial: %lu\n", (unsigned long)profile_task_getSerial);
+    fprintf(stderr, "task setSerial: %lu\n", (unsigned long)profile_task_setSerial);
+    fprintf(stderr, "task getCallStackSize: %lu\n", (unsigned long)profile_task_getCallStackSize);
+    /* Sync */
+    fprintf(stderr, "sync lock: %lu\n", (unsigned long)profile_sync_lock);
+    fprintf(stderr, "sync unlock: %lu\n", (unsigned long)profile_sync_unlock);
+    fprintf(stderr, "sync waitFullAndLock: %lu\n", (unsigned long)profile_sync_waitFullAndLock);
+    fprintf(stderr, "sync waitEmptyAndLock: %lu\n", (unsigned long)profile_sync_waitEmptyAndLock);
+    fprintf(stderr, "sync markAndSignalFull: %lu\n", (unsigned long)profile_sync_markAndSignalFull);
+    fprintf(stderr, "sync markAndSignalEmpty: %lu\n", (unsigned long)profile_sync_markAndSignalEmpty);
+    fprintf(stderr, "sync isFull: %lu\n", (unsigned long)profile_sync_isFull);
+    fprintf(stderr, "sync initAux: %lu\n", (unsigned long)profile_sync_initAux);
+    fprintf(stderr, "sync destroyAux: %lu\n", (unsigned long)profile_sync_destroyAux);
+}
+#else
+# define PROFILE_INCR(counter,count)
+#endif /* CHAPEL_PROFILE */
+
+#ifndef QTHREAD_MULTINODE
+volatile int chpl_qthread_done_initializing;
+#endif
+
+// Make qt env sizes uniform. Same as qt, but they use the literal everywhere
+#define QT_ENV_S 100
+
+// aka chpl_task_list_p
+struct chpl_task_list {
+    chpl_fn_p        fun;
+    void            *arg;
+    c_string         filename;
+    int              lineno;
+    chpl_task_list_p next;
+};
+
+static aligned_t next_task_id = 1;
+
+pthread_t chpl_qthread_process_pthread;
+pthread_t chpl_qthread_comm_pthread;
+
+chpl_qthread_tls_t chpl_qthread_process_tls = {
+    PRV_DATA_IMPL_VAL("<main task>", 0, chpl_nullTaskID, false,
+                      c_sublocid_any_val, false),
+    NULL, 0 };
+
+chpl_qthread_tls_t chpl_qthread_comm_task_tls = {
+    PRV_DATA_IMPL_VAL("<comm thread>", 0, chpl_nullTaskID, false,
+                      c_sublocid_any_val, false),
+    NULL, 0 };
+
+//
+// QTHREADS_SUPPORTS_REMOTE_CACHE is set in the Chapel Qthreads
+// Makefile, based on the Qthreads scheduler configuration.
+//
+int chpl_qthread_supports_remote_cache = QTHREADS_SUPPORTS_REMOTE_CACHE;
+
+//
+// structs chpl_task_prvDataImpl_t, chpl_qthread_wrapper_args_t and
+// chpl_qthread_tls_t have been moved to tasks-qthreads.h
+//
+
+//
+// chpl_qthread_get_tasklocal() is in tasks-qthreads.h
+//
+
+static syncvar_t exit_ret = SYNCVAR_STATIC_EMPTY_INITIALIZER;
+
+static volatile chpl_bool canCountRunningTasks = false;
+
+void chpl_task_yield(void)
+{
+    PROFILE_INCR(profile_task_yield,1);
+    if (qthread_shep() == NO_SHEPHERD) {
+        sched_yield();
+    } else {
+        qthread_yield();
+    }
+}
+
+// Sync variables
+void chpl_sync_lock(chpl_sync_aux_t *s)
+{
+    aligned_t l;
+    chpl_bool uncontested_lock = true;
+
+    PROFILE_INCR(profile_sync_lock, 1);
+
+    //
+    // To prevent starvation due to never switching away from a task that is
+    // spinning while doing readXX() on a sync variable, yield if this sync var
+    // has a "lot" of uncontested locks. Note that the uncontested locks do not
+    // have to be consecutive. Also note that the number of uncontested locks
+    // is a lossy counter. Currently a "lot" is defined as ~100 uncontested
+    // locks, with care taken to not yield on the first uncontested lock.
+    //
+    // If real qthreads sync vars were used, it's possible this wouldn't be
+    // needed.
+    //
+
+    l = qthread_incr(&s->lockers_in, 1);
+
+    while (l != s->lockers_out) {
+        uncontested_lock = false;
+        qthread_yield();
+    }
+
+    if (uncontested_lock) {
+        if ((++s->uncontested_locks & 0x5F) == 0) {
+            qthread_yield();
+        }
+    }
+}
+
+void chpl_sync_unlock(chpl_sync_aux_t *s)
+{
+    PROFILE_INCR(profile_sync_unlock, 1);
+
+    qthread_incr(&s->lockers_out, 1);
+}
+
+static inline void about_to_block(int32_t  lineno,
+                                  c_string filename)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    assert(data);
+
+    data->lock_lineno   = lineno;
+    data->lock_filename = filename;
+}
+
+void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s,
+                               int32_t          lineno,
+                               c_string         filename)
+{
+    PROFILE_INCR(profile_sync_waitFullAndLock, 1);
+
+    if (blockreport) { about_to_block(lineno, filename); }
+    chpl_sync_lock(s);
+    while (s->is_full == 0) {
+        chpl_sync_unlock(s);
+        qthread_syncvar_readFE(NULL, &(s->signal_full));
+        chpl_sync_lock(s);
+    }
+}
+
+void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s,
+                                int32_t          lineno,
+                                c_string         filename)
+{
+    PROFILE_INCR(profile_sync_waitEmptyAndLock, 1);
+
+    if (blockreport) { about_to_block(lineno, filename); }
+    chpl_sync_lock(s);
+    while (s->is_full != 0) {
+        chpl_sync_unlock(s);
+        qthread_syncvar_readFE(NULL, &(s->signal_empty));
+        chpl_sync_lock(s);
+    }
+}
+
+void chpl_sync_markAndSignalFull(chpl_sync_aux_t *s)         // and unlock
+{
+    PROFILE_INCR(profile_sync_markAndSignalFull, 1);
+
+    qthread_syncvar_fill(&(s->signal_full));
+    s->is_full = 1;
+    chpl_sync_unlock(s);
+}
+
+void chpl_sync_markAndSignalEmpty(chpl_sync_aux_t *s)         // and unlock
+{
+    PROFILE_INCR(profile_sync_markAndSignalEmpty, 1);
+
+    qthread_syncvar_fill(&(s->signal_empty));
+    s->is_full = 0;
+    chpl_sync_unlock(s);
+}
+
+chpl_bool chpl_sync_isFull(void            *val_ptr,
+                           chpl_sync_aux_t *s)
+{
+    PROFILE_INCR(profile_sync_isFull, 1);
+
+    return s->is_full;
+}
+
+void chpl_sync_initAux(chpl_sync_aux_t *s)
+{
+    PROFILE_INCR(profile_sync_initAux, 1);
+
+    s->lockers_in   = 0;
+    s->lockers_out  = 0;
+    s->is_full      = 0;
+    s->signal_empty = SYNCVAR_EMPTY_INITIALIZER;
+    s->signal_full  = SYNCVAR_EMPTY_INITIALIZER;
+}
+
+void chpl_sync_destroyAux(chpl_sync_aux_t *s)
+{
+    PROFILE_INCR(profile_sync_destroyAux, 1);
+}
+
+static void chapel_display_thread(qt_key_t     addr,
+                                  qthread_f    f,
+                                  void        *arg,
+                                  void        *retloc,
+                                  unsigned int thread_id,
+                                  void        *tls,
+                                  void        *callarg)
+{
+    chpl_qthread_tls_t *rep = (chpl_qthread_tls_t *)tls;
+
+    if (rep) {
+        if ((rep->lock_lineno > 0) && rep->lock_filename) {
+            fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", rep->lock_filename, rep->lock_lineno, rep->chpl_data.task_filename, rep->chpl_data.task_lineno);
+        } else if (rep->lock_lineno == 0 && rep->lock_filename) {
+            fprintf(stderr, "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", rep->lock_filename, rep->chpl_data.task_filename, rep->chpl_data.task_lineno);
+        } else if (rep->lock_lineno == 0) {
+            fprintf(stderr, "Waiting for dependencies (uninitialized task %s:%zu)\n", rep->chpl_data.task_filename, rep->chpl_data.task_lineno);
+        }
+        fflush(stderr);
+    }
+}
+
+static void report_locked_threads(void)
+{
+    qthread_feb_callback(chapel_display_thread, NULL);
+    qthread_syncvar_callback(chapel_display_thread, NULL);
+}
+
+static void SIGINT_handler(int sig)
+{
+    signal(sig, SIG_IGN);
+
+    if (blockreport) {
+        report_locked_threads();
+    }
+
+    if (taskreport) {
+        fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n");
+        // report_all_tasks();
+    }
+
+    chpl_exit_any(1);
+}
+
+// Tasks
+
+#ifndef QTHREAD_MULTINODE
+static syncvar_t canexit            = SYNCVAR_STATIC_EMPTY_INITIALIZER;
+static volatile int done_finalizing = 0;
+
+static void *initializer(void *junk)
+{
+    qthread_initialize();
+    MACHINE_FENCE;
+    chpl_qthread_done_initializing = 1;
+
+    qthread_syncvar_readFF(NULL, &canexit);
+
+    qthread_finalize();
+    MACHINE_FENCE;
+    done_finalizing = 1;
+    return NULL;
+}
+#endif /* ! QTHREAD_MULTINODE */
+
+// Helper function to set a qthreads env var. This is meant to mirror setenv
+// functionality, but qthreads has two environment variables for every setting:
+// a QT_ and a QTHREAD_ version. We often forget to think about both so this
+// wraps the overriding logic. In verbose mode it prints out if we overrode
+// values, or if we were prevented from setting values because they existed
+// (and override was 0.)
+static void chpl_qt_setenv(char* var, char* val, int32_t override) {
+    char      qt_env[QT_ENV_S]  = { 0 };
+    char      qthread_env[QT_ENV_S] = { 0 };
+    char      *qt_val;
+    char      *qthread_val;
+    chpl_bool eitherSet = false;
+
+    strncpy(qt_env, "QT_", sizeof(qt_env));
+    strncat(qt_env, var, sizeof(qt_env) - 1);
+
+    strncpy(qthread_env, "QTHREAD_", sizeof(qthread_env));
+    strncat(qthread_env, var, sizeof(qthread_env) - 1);
+
+    qt_val = getenv(qt_env);
+    qthread_val = getenv(qthread_env);
+    eitherSet = (qt_val != NULL || qthread_val != NULL);
+
+    if (override || !eitherSet) {
+        if (verbosity >= 2 && override && eitherSet) {
+            printf("QTHREADS: Overriding the value of %s and %s "
+                   "with %s\n", qt_env, qthread_env, val);
+        }
+        (void) setenv(qt_env, val, 1);
+        (void) setenv(qthread_env, val, 1);
+    } else if (verbosity >= 2) {
+        char* set_env = NULL;
+        char* set_val = NULL;
+        if (qt_val != NULL) {
+            set_env = qt_env;
+            set_val = qt_val;
+        } else {
+            set_env = qthread_env;
+            set_val = qthread_val;
+        }
+        printf("QTHREADS: Not setting %s to %s because %s is set to %s and "
+               "overriding was not requested\n", qt_env, val, set_env, set_val);
+    }
+}
+
+// Determine the number of workers based on environment settings. If a user set
+// HWPAR, they are saying they want to use HWPAR many workers, but let the
+// runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or
+// NUM_WORKERS_PER_SHEPHERD then they must have specific reasons for doing so.
+// Returns 0 if no Qthreads env vars related to the number of threads were set,
+// what HWPAR was set to if it was set, or -1 if NUM_SHEP and/or NUM_WPS were
+// set since we can't figure out before Qthreads init what this will actually
+// turn into without duplicating Qthreads logic (-1 is a sentinel for don't
+// adjust the values, and give them as is to Qthreads.)
+static int32_t chpl_qt_getenv_num_workers() {
+    int32_t  hwpar;
+    int32_t  num_wps;
+    int32_t  num_sheps;
+
+    hwpar = qt_internal_get_env_num("HWPAR", 0, 0);
+    num_wps = qt_internal_get_env_num("NUM_WORKERS_PER_SHEPHERD", 0, 0);
+    num_sheps = qt_internal_get_env_num("NUM_SHEPHERDS", 0, 0);
+
+    if (hwpar) {
+        return hwpar;
+    } else if (num_wps || num_sheps) {
+        return -1;
+    }
+
+    return 0;
+}
+
+
+// Sets up and returns the amount of hardware parallelism to use, limited to
+// maxThreads. Returns -1 if we did not setup parallelism because a user
+// explicitly requested a specific layout from qthreads.
+static int32_t setupAvailableParallelism(int32_t maxThreads) {
+    int32_t   numThreadsPerLocale;
+    int32_t   qtEnvThreads;
+    int32_t   hwpar;
+    char      newenv_workers[QT_ENV_S] = { 0 };
+
+    // Experience has shown that Qthreads generally performs best with
+    // num_workers = numCores (and thus worker_unit = core) but if the user has
+    // explicitly requested more threads through the chapel or Qthread env
+    // vars, we override the default.
+    numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale();
+    qtEnvThreads = chpl_qt_getenv_num_workers();
+    hwpar = 0;
+
+    // User set chapel level env var (CHPL_RT_NUM_THREADS_PER_LOCALE)
+    // This is limited to the number of logical CPUs on the node.
+    if (numThreadsPerLocale != 0) {
+        int32_t numPUsPerLocale;
+
+        hwpar = numThreadsPerLocale;
+
+        numPUsPerLocale = chpl_getNumLogicalCpus(true);
+        if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) {
+            if (verbosity >= 2) {
+                printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d "
+                       "to prevent oversubscription of the system.\n",
+                       hwpar, numPUsPerLocale);
+            }
+
+            // Do not oversubscribe the system, use all available resources.
+            hwpar = numPUsPerLocale;
+        }
+    }
+    // User set qthreads level env var
+    // (HWPAR or (NUM_SHEPHERDS and NUM_WORKERS_PER_SHEPHERD))
+    else if (qtEnvThreads != 0) {
+        hwpar = qtEnvThreads;
+    }
+    // User did not set chapel or qthreads vars -- our default
+    else {
+        hwpar = chpl_getNumPhysicalCpus(true);
+    }
+
+    // hwpar will only be <= 0 if the user set QT_NUM_SHEPHERDS and/or
+    // QT_NUM_WORKERS_PER_SHEPHERD in which case we assume as "expert" user and
+    // don't impose any thread limits or set worker_unit.
+    if (hwpar > 0) {
+        // Limit the parallelism to the maximum imposed by the comm layer.
+        if (0 < maxThreads && maxThreads < hwpar) {
+            hwpar = maxThreads;
+        }
+
+        // If there is more parallelism requested than the number of cores, set the
+        // worker unit to pu, otherwise core.
+        if (hwpar > chpl_getNumPhysicalCpus(true)) {
+          chpl_qt_setenv("WORKER_UNIT", "pu", 0);
+        } else {
+          chpl_qt_setenv("WORKER_UNIT", "core", 0);
+        }
+
+        // Unset relevant Qthreads environment variables.
+        qt_internal_unset_envstr("HWPAR");
+        qt_internal_unset_envstr("NUM_SHEPHERDS");
+        qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD");
+
+        snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar);
+        if (THREADQUEUE_POLICY_TRUE == qt_threadqueue_policy(SINGLE_WORKER)) {
+            chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1);
+            chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1);
+        } else {
+            chpl_qt_setenv("HWPAR", newenv_workers, 1);
+        }
+    }
+    return hwpar;
+}
+
+static void setupCallStacks(int32_t hwpar) {
+    size_t callStackSize;
+
+    // If the user compiled with no stack checks (either explicitly or
+    // implicitly) turn off qthread guard pages. TODO there should also be a
+    // chpl level env var backing this at runtime (can be the same var.)
+    // Also turn off guard pages if the heap page size isn't the same as
+    // the system page size, because when that's the case we can reliably
+    // make the guard pages un-referenceable.  (This typically arises when
+    // the heap is on hugepages, as is often the case on Cray systems.)
+    //
+    // Note that we won't override an explicit setting of QT_GUARD_PAGES
+    // in the former case, but we do in the latter case.
+    if (CHPL_STACK_CHECKS == 0) {
+        chpl_qt_setenv("GUARD_PAGES", "false", 0);
+    }
+    else if (chpl_getHeapPageSize() != chpl_getSysPageSize()) {
+        chpl_qt_setenv("GUARD_PAGES", "false", 1);
+    }
+
+    // Precedence (high-to-low):
+    // 1) Chapel environment (CHPL_RT_CALL_STACK_SIZE)
+    // 2) QTHREAD_STACK_SIZE
+    // 3) Chapel default
+    if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 ||
+        (qt_internal_get_env_num("STACK_SIZE", 0, 0) == 0 &&
+         (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) {
+        char newenv_stack[QT_ENV_S];
+        snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize);
+        chpl_qt_setenv("STACK_SIZE", newenv_stack, 1);
+
+        // Qthreads sets up memory pools expecting the item_size to be small.
+        // Stacks are allocated in this manner too, but our default stack size
+        // is quite large, so we limit the max memory allocated for a pool. We
+        // default to a multiple of callStackSize and hwpar, with the thought
+        // that available memory is generally proportional to the amount of
+        // parallelism. For some architectures, this isn't true so we set a max
+        // upper bound. And if the callStackSize is small, we don't want to
+        // limit all qthreads pool allocations to a small value, so we have a
+        // lower bound as well. Note that qthread stacks are slightly larger
+        // than specified to store a book keeping structure and possibly guard
+        // pages, so we thrown an extra MB.
+        if (hwpar > 0) {
+            const size_t oneMB = 1024 * 1024;
+            const size_t allocSizeLowerBound =  33 * oneMB;
+            const size_t allocSizeUpperBound = 513 * oneMB;
+            size_t maxPoolAllocSize;
+            char newenv_alloc[QT_ENV_S];
+
+            maxPoolAllocSize = 2 * hwpar * callStackSize + oneMB;
+            if (maxPoolAllocSize < allocSizeLowerBound) {
+                maxPoolAllocSize = allocSizeLowerBound;
+            } else if (maxPoolAllocSize > allocSizeUpperBound) {
+                maxPoolAllocSize = allocSizeUpperBound;
+            }
+            snprintf(newenv_alloc, sizeof(newenv_alloc), "%zu", maxPoolAllocSize);
+            chpl_qt_setenv("MAX_POOL_ALLOC_SIZE", newenv_alloc, 0);
+        }
+    }
+}
+
+void chpl_task_init(void)
+{
+    int32_t   commMaxThreads;
+    int32_t   hwpar;
+    pthread_t initer;
+
+    chpl_qthread_process_pthread = pthread_self();
+    chpl_qthread_process_tls.chpl_data.id = qthread_incr(&next_task_id, 1);
+
+    commMaxThreads = chpl_comm_getMaxThreads();
+
+    // Setup hardware parallelism, the stack size, and stack guards
+    hwpar = setupAvailableParallelism(commMaxThreads);
+    setupCallStacks(hwpar);
+
+    if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); }
+
+    // Initialize qthreads
+    pthread_create(&initer, NULL, initializer, NULL);
+    while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY();
+
+    // Now that Qthreads is up and running, do a sanity check and make sure
+    // that the number of workers is less than any comm layer limit. This is
+    // mainly need for the case where a user set QT_NUM_SHEPHERDS and/or
+    // QT_NUM_WORKERS_PER_SHEPHERD in which case we don't impose any limits on
+    // the number of threads qthreads creates beforehand
+    assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads);
+
+    if (blockreport || taskreport) {
+        if (signal(SIGINT, SIGINT_handler) == SIG_ERR) {
+            perror("Could not register SIGINT handler");
+        }
+    }
+}
+
+void chpl_task_exit(void)
+{
+#ifdef CHAPEL_PROFILE
+    profile_print();
+#endif /* CHAPEL_PROFILE */
+
+#ifdef QTHREAD_MULTINODE
+#else
+    if (qthread_shep() == NO_SHEPHERD) {
+        /* sometimes, tasking is told to shutdown even though it hasn't been
+         * told to start yet */
+        if (chpl_qthread_done_initializing == 1) {
+            qthread_syncvar_fill(&canexit);
+            while (done_finalizing == 0) SPINLOCK_BODY();
+        }
+    } else {
+        qthread_syncvar_fill(&exit_ret);
+    }
+#endif /* QTHREAD_MULTINODE */
+}
+
+static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind,
+                                  chpl_task_prvDataImpl_t *chpl_data) {
+    if (chpl_task_have_callbacks(event_kind)) {
+        if (chpl_data->id == chpl_nullTaskID)
+            chpl_data->id = qthread_incr(&next_task_id, 1);
+        chpl_task_do_callbacks(event_kind,
+                               chpl_data->task_filename,
+                               chpl_data->task_lineno,
+                               chpl_data->id,
+                               chpl_data->is_executeOn);
+    }
+}
+
+static aligned_t chapel_wrapper(void *arg)
+{
+    chpl_qthread_wrapper_args_t *rarg = arg;
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+
+    data->chpl_data = rarg->chpl_data;
+    data->lock_filename = NULL;
+    data->lock_lineno = 0;
+
+    if (rarg->countRunning) {
+        chpl_taskRunningCntInc(0, NULL);
+    }
+
+    wrap_callbacks(chpl_task_cb_event_kind_begin, &data->chpl_data);
+
+    (*(chpl_fn_p)(rarg->fn))(rarg->args);
+
+    wrap_callbacks(chpl_task_cb_event_kind_end, &data->chpl_data);
+
+    if (rarg->countRunning) {
+        chpl_taskRunningCntDec(0, NULL);
+    }
+
+    return 0;
+}
+
+typedef struct {
+    chpl_fn_p fn;
+    void *arg;
+} comm_task_wrapper_info_t;
+
+static void *comm_task_wrapper(void *arg)
+{
+    comm_task_wrapper_info_t *rarg = arg;
+    chpl_moveToLastCPU();
+    (*(chpl_fn_p)(rarg->fn))(rarg->arg);
+    return 0;
+}
+
+// Start the main task.
+//
+// Warning: this method is not called within a Qthread task context. Do
+// not use methods that require task context (e.g., task-local storage).
+void chpl_task_callMain(void (*chpl_main)(void))
+{
+    chpl_qthread_wrapper_args_t wrapper_args =
+        {chpl_main, NULL, false,
+         PRV_DATA_IMPL_VAL("<main task>", 0,
+                           chpl_qthread_process_tls.chpl_data.id, false,
+                           c_sublocid_any_val, false) };
+
+    qthread_debug(CHAPEL_CALLS, "[%d] begin chpl_task_callMain()\n", chpl_nodeID);
+
+#ifdef QTHREAD_MULTINODE
+    qthread_debug(CHAPEL_BEHAVIOR, "[%d] calling spr_unify\n", chpl_nodeID);
+    int const rc = spr_unify();
+    assert(SPR_OK == rc);
+#endif /* QTHREAD_MULTINODE */
+
+    wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data);
+
+    qthread_fork_syncvar(chapel_wrapper, &wrapper_args, &exit_ret);
+    qthread_syncvar_readFF(NULL, &exit_ret);
+
+    qthread_debug(CHAPEL_BEHAVIOR, "[%d] main task finished\n", chpl_nodeID);
+    qthread_debug(CHAPEL_CALLS, "[%d] end chpl_task_callMain()\n", chpl_nodeID);
+}
+
+void chpl_task_stdModulesInitialized(void)
+{
+    //
+    // It's not safe to call the module code to count the main task as
+    // running until after the modules have been initialized.  That's
+    // when this function is called, so now count the main task.
+    //
+    canCountRunningTasks = true;
+    chpl_taskRunningCntInc(0, NULL);
+}
+
+int chpl_task_createCommTask(chpl_fn_p fn,
+                             void     *arg)
+{
+#ifndef QTHREAD_MULTINODE
+    //
+    // The wrapper info must be static because it won't be referred to
+    // until the new pthread calls comm_task_wrapper().  And, it is
+    // safe for it to be static because we will be called at most once
+    // on each node.
+    //
+    static
+        comm_task_wrapper_info_t wrapper_info;
+    wrapper_info.fn = fn;
+    wrapper_info.arg = arg;
+    return pthread_create(&chpl_qthread_comm_pthread,
+                          NULL, comm_task_wrapper, &wrapper_info);
+#else
+    return 0;
+#endif
+}
+
+void chpl_task_addToTaskList(chpl_fn_int_t     fid,
+                             void             *arg,
+                             c_sublocid_t      subloc,
+                             chpl_task_list_p *task_list,
+                             int32_t           task_list_locale,
+                             chpl_bool         is_begin_stmt,
+                             int               lineno,
+                             c_string          filename)
+{
+    chpl_bool serial_state = chpl_task_getSerial();
+
+    assert(subloc != c_sublocid_none);
+
+    PROFILE_INCR(profile_task_addToTaskList,1);
+
+    if (serial_state) {
+        // call the function directly.
+        (chpl_ftable[fid])(arg);
+    } else {
+        chpl_qthread_wrapper_args_t wrapper_args =
+            {chpl_ftable[fid], arg, false,
+             PRV_DATA_IMPL_VAL(filename, lineno, chpl_nullTaskID, false,
+                               subloc, serial_state) };
+
+        wrap_callbacks(chpl_task_cb_event_kind_create,
+                       &wrapper_args.chpl_data);
+
+        if (subloc == c_sublocid_any) {
+            qthread_fork_copyargs(chapel_wrapper, &wrapper_args,
+                                  sizeof(chpl_qthread_wrapper_args_t), NULL);
+        } else {
+            qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args,
+                                     sizeof(chpl_qthread_wrapper_args_t), NULL,
+                                     (qthread_shepherd_id_t) subloc);
+        }
+    }
+}
+
+void chpl_task_processTaskList(chpl_task_list_p task_list)
+{
+    PROFILE_INCR(profile_task_processTaskList,1);
+}
+
+void chpl_task_executeTasksInList(chpl_task_list_p task_list)
+{
+    PROFILE_INCR(profile_task_executeTasksInList,1);
+}
+
+void chpl_task_freeTaskList(chpl_task_list_p task_list)
+{
+    PROFILE_INCR(profile_task_freeTaskList,1);
+}
+
+void chpl_task_startMovedTask(chpl_fn_p      fp,
+                              void          *arg,
+                              c_sublocid_t   subloc,
+                              chpl_taskID_t  id,
+                              chpl_bool      serial_state)
+{
+    assert(subloc != c_sublocid_none);
+    assert(id == chpl_nullTaskID);
+
+    chpl_qthread_wrapper_args_t wrapper_args =
+        {fp, arg, canCountRunningTasks,
+         PRV_DATA_IMPL_VAL("<unknown>", 0, chpl_nullTaskID, true,
+                           subloc, serial_state) };
+
+    PROFILE_INCR(profile_task_startMovedTask,1);
+
+    wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data);
+
+    if (subloc == c_sublocid_any) {
+        qthread_fork_copyargs(chapel_wrapper, &wrapper_args,
+                              sizeof(chpl_qthread_wrapper_args_t), NULL);
+    } else {
+        qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args,
+                                 sizeof(chpl_qthread_wrapper_args_t), NULL,
+                                 (qthread_shepherd_id_t) subloc);
+    }
+}
+
+//
+// chpl_task_getSubloc() is in tasks-qthreads.h
+//
+
+//
+// chpl_task_setSubloc() is in tasks-qthreads.h
+//
+
+//
+// chpl_task_getRequestedSubloc() is in tasks-qthreads.h
+//
+
+
+// Returns '(unsigned int)-1' if called outside of the tasking layer.
+chpl_taskID_t chpl_task_getId(void)
+{
+    chpl_qthread_tls_t * tls = chpl_qthread_get_tasklocal();
+
+    PROFILE_INCR(profile_task_getId,1);
+
+    if (tls == NULL)
+        return (chpl_taskID_t) -1;
+
+    if (tls->chpl_data.id == chpl_nullTaskID)
+        tls->chpl_data.id = qthread_incr(&next_task_id, 1);
+
+    return tls->chpl_data.id;
+}
+
+void chpl_task_sleep(int secs)
+{
+    if (qthread_shep() == NO_SHEPHERD) {
+        sleep(secs);
+    } else {
+        qtimer_t t = qtimer_create();
+        qtimer_start(t);
+        do {
+            qthread_yield();
+            qtimer_stop(t);
+        } while (qtimer_secs(t) < secs);
+    }
+}
+
+/* The get- and setSerial() methods assume the beginning of the task-local
+ * data segment holds a chpl_bool denoting the serial state. */
+chpl_bool chpl_task_getSerial(void)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+
+    PROFILE_INCR(profile_task_getSerial,1);
+
+    return data->chpl_data.prvdata.serial_state;
+}
+
+void chpl_task_setSerial(chpl_bool state)
+{
+    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    data->chpl_data.prvdata.serial_state = state;
+
+    PROFILE_INCR(profile_task_setSerial,1);
+}
+
+uint32_t chpl_task_getMaxPar(void) {
+    //
+    // We assume here that the caller (in the LocaleModel module code)
+    // is interested in the number of workers on the whole node, and
+    // will decide itself how much parallelism to create across and
+    // within sublocales, if there are any.
+    //
+    return (uint32_t) qthread_num_workers();
+}
+
+c_sublocid_t chpl_task_getNumSublocales(void)
+{
+    // FIXME: What we really want here is the number of NUMA
+    // sublocales we are supporting.  For now we use the number of
+    // shepherds as a proxy for that.
+    return (c_sublocid_t) qthread_num_shepherds();
+}
+
+size_t chpl_task_getCallStackSize(void)
+{
+    PROFILE_INCR(profile_task_getCallStackSize,1);
+
+    return qthread_readstate(STACK_SIZE);
+}
+
+// XXX: Should probably reflect all shepherds
+uint32_t chpl_task_getNumQueuedTasks(void)
+{
+    return qthread_readstate(NODE_BUSYNESS);
+}
+
+uint32_t chpl_task_getNumRunningTasks(void)
+{
+    chpl_internal_error("chpl_task_getNumRunningTasks() called");
+    return 1;
+}
+
+int32_t chpl_task_getNumBlockedTasks(void)
+{
+    // This isn't accurate, but in the absence of better information
+    // it's the best we can do.
+    return 0;
+}
+
+// Threads
+
+uint32_t chpl_task_getNumThreads(void)
+{
+    return (uint32_t)qthread_num_workers();
+}
+
+// Ew. Talk about excessive bookkeeping.
+uint32_t chpl_task_getNumIdleThreads(void)
+{
+    return 0;
+}
+
+/* vim:set expandtab: */
diff --git a/util/printchplenv b/util/printchplenv
index e66b1d2694..5ad79dff86 100755
--- a/util/printchplenv
+++ b/util/printchplenv
@@ -175,6 +175,8 @@ def print_mode(mode='list', anonymize=False):
                   mode, filters=('runtime',))
         if tasks == 'qthreads':
             link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args())
+        if tasks == 'atmi':
+            link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args())
 
         print_var('  CHPL_RE2_UNIQ_CFG_PATH',
                   chpl_3p_re2_configs.get_uniq_cfg_path(),
diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash
index 3625a4657f..0dee283cc9 100644
--- a/util/setchplenv_hsa.bash
+++ b/util/setchplenv_hsa.bash
@@ -61,8 +61,8 @@ export CHPL_COMM=none
 echo " to none"
 
 echo -n "Setting CHPL_TASKS"
-export CHPL_TASKS=qthreads
-echo " to qthreads"
+export CHPL_TASKS=atmi
+echo " to atmi"
 
 echo -n "Setting CHPL_ATOMICS"
 export CHPL_ATOMICS=intrinsics
@@ -92,6 +92,10 @@ echo -n "Disabling NUMA"
 export CHPL_HWLOC_CFG_OPTIONS=" --disable-libnuma"
 echo " done"
 
+echo -n "Setting CHPL_HWLOC"
+export CHPL_HWLOC=hwloc
+echo " to hwloc"
+
 echo -n "Disabling LLVM support"
 export CHPL_LLVM=none
 echo " done"

From 891dd14a2b0db577b8b770296633a7a62bb443d5 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Wed, 15 Feb 2017 13:50:58 -0600
Subject: [PATCH 03/10] Updated ATMI tasking layer to master branch; ATMI layer
 should be called by setting the CHPL_TASKS to atmi instead of qthreads

---
 modules/internal/LocaleModelHelpSetup.chpl    |   6 +-
 .../localeModels/hsa/LocaleModel.chpl         |   2 +-
 runtime/include/chpl-atmi.h                   |  13 +-
 runtime/include/chpltypes.h                   |   1 +
 runtime/include/tasks/atmi/tasks-atmi.h       | 314 ++--------
 runtime/src/chpl-atmi.c                       |   5 +-
 runtime/src/tasks/atmi/Makefile               |   5 +-
 runtime/src/tasks/atmi/Makefile.include       |  12 +-
 runtime/src/tasks/atmi/Makefile.share         |   6 +-
 runtime/src/tasks/atmi/tasks-atmi.c           | 580 ++++++++++++------
 10 files changed, 457 insertions(+), 487 deletions(-)

diff --git a/modules/internal/LocaleModelHelpSetup.chpl b/modules/internal/LocaleModelHelpSetup.chpl
index 69c3562ec1..93ab89f26d 100644
--- a/modules/internal/LocaleModelHelpSetup.chpl
+++ b/modules/internal/LocaleModelHelpSetup.chpl
@@ -172,9 +172,6 @@ module LocaleModelHelpSetup {
 
     helpSetupLocaleFlat(dst, local_name);
 
-    extern proc chpl_task_getNumSublocales(): int(32);
-    numSublocales = chpl_task_getNumSublocales();
-
     extern proc chpl_task_getMaxPar(): uint(32);
 
 
@@ -191,7 +188,8 @@ module LocaleModelHelpSetup {
     then local_name = chpl_nodeName():string + "-" + _node_id : string;
     else local_name = chpl_nodeName():string;
 
-    numSublocales = 2;
+    extern proc chpl_task_getNumSublocales(): int(32);
+    numSublocales = chpl_task_getNumSublocales();
 
     const origSubloc = chpl_task_getRequestedSubloc();
 
diff --git a/modules/internal/localeModels/hsa/LocaleModel.chpl b/modules/internal/localeModels/hsa/LocaleModel.chpl
index 8c54136b3e..627cfd0d19 100644
--- a/modules/internal/localeModels/hsa/LocaleModel.chpl
+++ b/modules/internal/localeModels/hsa/LocaleModel.chpl
@@ -157,7 +157,7 @@ module LocaleModel {
       return {0..#numSublocales};
     }
 
-    proc getChildCount() return 0;
+    proc getChildCount() return numSublocales;
 
     iter getChildIndices() : int {
       for idx in {0..#numSublocales} do // chpl_emptyLocaleSpace do
diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h
index 0f4c07feae..6285760810 100644
--- a/runtime/include/chpl-atmi.h
+++ b/runtime/include/chpl-atmi.h
@@ -14,21 +14,12 @@
 atmi_kernel_t reduction_kernel;
 atmi_kernel_t *gpu_kernels;
 
+atmi_machine_t *g_machine;
+
 enum {
     GPU_KERNEL_IMPL = 10565,
     REDUCTION_GPU_IMPL = 42
 };    
-/*
-typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
-    uint64_t in;
-    uint64_t out;
-    uint32_t count;
-} hsail_reduce_kernarg_t;
-
-typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
-    uint64_t bundle;
-} hsail_kernarg_t;
-*/
 
 int chpl_hsa_initialize(void);
 
diff --git a/runtime/include/chpltypes.h b/runtime/include/chpltypes.h
index 0e6b4644df..439bef780b 100644
--- a/runtime/include/chpltypes.h
+++ b/runtime/include/chpltypes.h
@@ -189,6 +189,7 @@ typedef int32_t chpl_bool32;
 typedef int64_t chpl_bool64;
 
 typedef void (*chpl_fn_p)(void*); // function pointer for runtime ftable
+typedef const char *chpl_fn_name;
 typedef int16_t chpl_fn_int_t;    // int type for ftable indexing
 
 // Function table names and information, for VisualDebug use
diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h
index 58dab0a982..81ad131901 100644
--- a/runtime/include/tasks/atmi/tasks-atmi.h
+++ b/runtime/include/tasks/atmi/tasks-atmi.h
@@ -6,7 +6,7 @@
 **************************************************************************/
 
 /*
- * Copyright 2004-2015 Cray Inc.
+ * Copyright 2004-2017 Cray Inc.
  * Other additional copyright holders may be indicated within.
  *
  * The entirety of this work is licensed under the Apache License,
@@ -27,68 +27,25 @@
 #ifndef _tasks_qthreads_h_
 #define _tasks_qthreads_h_
 
-#include <stdint.h>
+#include "chpl-atmi.h"
+#include "chpl-tasks-prvdata.h"
+#include "chpltypes.h"
+
+#include "qthread.h"
+#include "qthread-chapel.h"
 
 #include <assert.h>
-#include <qthread.h>
-#include <stdio.h>
 #include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
 
-#include "chpltypes.h"
-#include "chpl-tasks-prvdata.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #define CHPL_COMM_YIELD_TASK_WHILE_POLLING
 void chpl_task_yield(void);
 
-// For mutexes
-//   type(s)
-//     threadlayer_mutex_t
-//     threadlayer_mutex_p
-//   functions
-//     threadlayer_mutex_init()
-//     threadlayer_mutex_new()
-//     threadlayer_mutex_lock()
-//     threadlayer_mutex_unlock()
-//
-// For thread management
-//   type(s)
-//     <none>
-//   functions
-//     threadlayer_thread_id()
-//     threadlayer_thread_cancel()
-//     threadlayer_thread_join()
-//
-// For sync variables
-//   type(s)
-//     threadlayer_sync_aux_t
-//   functions
-//     threadlayer_sync_suspend()
-//     threadlayer_sync_awaken()
-//     threadlayer_sync_init()
-//     threadlayer_sync_destroy()
-//
-// For task management
-//   type(s)
-//     <none>
-//   functions
-//     threadlayer_init()
-//     threadlayer_thread_create()
-//     threadlayer_pool_suspend()
-//     threadlayer_pool_awaken()
-//     threadlayer_get_thread_private_data()
-//     threadlayer_set_thread_private_data()
-//
-// The types are declared in the threads-*.h file for each specific
-// threading layer, and the callback functions are declared here.  The
-// interfaces and requirements for these other types and callback
-// functions are described elsewhere in this file.
-//
-// Although the above list may seem long, in practice many of the
-// functions are quite simple, and with luck also easily extrapolated
-// from what is done for other threading layers.  For an example of an
-// implementation, see "pthreads" threading.
-//
-
 //
 // Type (and default value) used to communicate task identifiers
 // between C code and Chapel code in the runtime.
@@ -96,19 +53,9 @@ void chpl_task_yield(void);
 typedef unsigned int chpl_taskID_t;
 #define chpl_nullTaskID QTHREAD_NULL_TASK_ID
 
-//
-// Mutexes
-//
-typedef syncvar_t chpl_mutex_t;
-
 //
 // Sync variables
 //
-// The threading layer's threadlayer_sync_aux_t may include any
-// additional members the layer needs to support the suspend/awaken
-// callbacks efficiently.  The FIFO tasking code itself does not
-// refer to this type or the tl_aux member at all.
-//
 typedef struct {
     aligned_t lockers_in;
     aligned_t lockers_out;
@@ -118,207 +65,43 @@ typedef struct {
     syncvar_t signal_empty;
 } chpl_sync_aux_t;
 
-#define chpl_sync_reset(x) qthread_syncvar_empty(&(x)->sync_aux.signal_full)
-
-#define chpl_read_FE(x) ({                                                           \
-                             uint64_t y;                                             \
-                             qthread_syncvar_readFE(&y, &(x)->sync_aux.signal_full); \
-                             y; })
-
-#define chpl_read_FF(x) ({                                                           \
-                             uint64_t y;                                             \
-                             qthread_syncvar_readFF(&y, &(x)->sync_aux.signal_full); \
-                             y; })
-
-#define chpl_read_XX(x) ((x)->sync_aux.signal_full.u.s.data)
-
-#define chpl_write_EF(x, y) do {                                 \
-        uint64_t z = (uint64_t)(y);                              \
-        qthread_syncvar_writeEF(&(x)->sync_aux.signal_full, &z); \
-} while(0)
-
-#define chpl_write_FF(x, y) do {                                    \
-        uint64_t z, dummy;                                          \
-        z = (uint64_t)(y);                                          \
-        qthread_syncvar_readFE(&dummy, &(x)->sync_aux.signal_full); \
-        qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z);     \
-} while(0)
-
-#define chpl_write_XF(x, y) do {                                \
-        uint64_t z = (uint64_t)(y);                             \
-        qthread_syncvar_writeF(&(x)->sync_aux.signal_full, &z); \
-} while(0)
-
-#define chpl_single_reset(x) qthread_syncvar_empty(&(x)->single_aux.signal_full)
-
-#define chpl_single_read_FF(x) ({                                                             \
-                                    uint64_t y;                                               \
-                                    qthread_syncvar_readFF(&y, &(x)->single_aux.signal_full); \
-                                    y; })
-
-#define chpl_single_write_EF(x, y) do {                            \
-        uint64_t z = (uint64_t)(y);                                \
-        qthread_syncvar_writeEF(&(x)->single_aux.signal_full, &z); \
-} while(0)
-
-#define chpl_single_read_XX(x) ((x)->single_aux.signal_full.u.s.data)
-
-// Tasks
-
-//
-// Handy services for threading layer callback functions.
-//
-// The FIFO tasking implementation also provides the following service
-// routines that can be used by threading layer callback functions.
-//
-
-//
-// The remaining declarations are all for callback functions to be
-// provided by the threading layer.
-//
-
 //
-// These are called once each, from CHPL_TASKING_INIT() and
-// CHPL_TASKING_EXIT().
+// Task private data
 //
-void threadlayer_init(void);
-void threadlayer_exit(void);
 
-//
-// Mutexes
-//
-/*void qthread_mutex_init(qthread_mutex_p);
- * qthread_mutex_p qthread_mutex_new(void);
- * void qthread_mutex_lock(qthread_mutex_p);
- * void qthread_mutex_unlock(qthread_mutex_p);*/
-
-//
-// Sync variables
-//
-// The CHPL_SYNC_WAIT_{FULL,EMPTY}_AND_LOCK() functions should call
-// threadlayer_sync_suspend() when a sync variable is not in the desired
-// full/empty state.  The call will be made with the sync variable's
-// mutex held.  (Thus, threadlayer_sync_suspend() can dependably tell
-// that the desired state must be the opposite of the state it initially
-// sees the variable in.)  It should return (with the mutex again held)
-// as soon as it can once either the sync variable changes to the
-// desired state, or (if the given deadline pointer is non-NULL) the
-// deadline passes.  It can return also early, before either of these
-// things occur, with no ill effects.  If a deadline is given and it
-// does pass, then threadlayer_sync_suspend() must return true;
-// otherwise false.
-//
-// The less the function can execute while waiting for the sync variable
-// to change state, and the quicker it can un-suspend when the variable
-// does change state, the better overall performance will be.  Obviously
-// the sync variable's mutex must be unlocked while the routine waits
-// for the variable to change state or the deadline to pass, or livelock
-// may result.
-//
-// The CHPL_SYNC_MARK_AND_SIGNAL_{FULL,EMPTY}() functions will call
-// threadlayer_sync_awaken() every time they are called, not just when
-// they change the state of the sync variable.
-//
-// Threadlayer_sync_{init,destroy}() are called to initialize or
-// destroy, respectively, the contents of the tl_aux member of the
-// chpl_sync_aux_t for the specific threading layer.
-//
-/*chpl_bool threadlayer_sync_suspend(chpl_sync_aux_t *s,
- *                                 struct timeval *deadline);
- * void threadlayer_sync_awaken(chpl_sync_aux_t *s);
- * void threadlayer_sync_init(chpl_sync_aux_t *s);
- * void threadlayer_sync_destroy(chpl_sync_aux_t *s);*/
-
-//
-// Task management
-//
-
-//
-// The interface for thread creation may need to be extended eventually
-// to allow for specifying such things as stack sizes and/or locations.
-//
-/*int threadlayer_thread_create(threadlayer_threadID_t*, void*(*)(void*), void*);*/
-
-//
-// Threadlayer_pool_suspend() is called when a thread finds nothing in
-// the pool of unclaimed tasks, and so has no work to do.  The call will
-// be made with the pointed-to mutex held.  It should return (with the
-// mutex again held) as soon as it can once either the task pool is no
-// longer empty or (if the given deadline pointer is non-NULL) the
-// deadline passes.  It can return also early, before either of these
-// things occur, with no ill effects.  If a deadline is given and it
-// does pass, then threadlayer_pool_suspend() must return true;
-// otherwise false.
-//
-// The less the function can execute while waiting for the pool to
-// become nonempty, and the quicker it can un-suspend when that happens,
-// the better overall performance will be.
-//
-// The mutex passed to threadlayer_pool_suspend() is the one that
-// provides mutual exclusion for changes to the task pool.  Allowing
-// access to this mutex simplifies the implementation for certain
-// threading layers, such as those based on pthreads condition
-// variables.  However, it also introduces a complication in that it
-// allows a threading layer to create deadlock or livelock situations if
-// it is not careful.  Certainly the mutex must be unlocked while the
-// routine waits for the task pool to fill or the deadline to pass, or
-// livelock may result.
-//
-/*chpl_bool threadlayer_pool_suspend(chpl_mutex_t*, struct timeval*);
- * void threadlayer_pool_awaken(void);*/
-
-//
-// Thread private data
-//
-// These set and get a pointer to thread private data associated with
-// each thread.  This is for the use of the FIFO tasking implementation
-// itself.  If the threading layer also needs to store some data private
-// to each thread, it must make other arrangements to do so.
-//
-/*void  threadlayer_set_thread_private_data(void*);
- * void* threadlayer_get_thread_private_data(void);*/
-
-
-#ifndef QTHREAD_MULTINODE
 extern
 #ifdef __cplusplus
 "C"
 #endif
-volatile int chpl_qthread_done_initializing;
-#endif
-
-typedef struct {
-    c_string task_filename;
-    int task_lineno;
-    chpl_taskID_t id;
-    chpl_bool is_executeOn;
-    c_sublocid_t requestedSubloc;  // requested sublocal for task
-    chpl_task_prvData_t prvdata;
-} chpl_task_prvDataImpl_t;
 
-// Define PRV_DATA_IMPL_VAL to set up a chpl_task_prvData_t.
-#define PRV_DATA_IMPL_VAL(_fn, _ln, _id, _is_execOn, _subloc, _serial) \
-        { .task_filename = _fn, \
-          .task_lineno = _ln, \
-          .id = _id, \
-          .is_executeOn = _is_execOn, \
-          .requestedSubloc = _subloc, \
-          .prvdata = { .serial_state = _serial } }
+volatile int chpl_qthread_done_initializing;
 
+//
+// Task layer private area argument bundle header
+//
 typedef struct {
-    void                     *fn;
-    void                     *args;
-    chpl_bool                countRunning;
-    chpl_task_prvDataImpl_t  chpl_data;
-} chpl_qthread_wrapper_args_t;
+  chpl_bool serial_state;
+  chpl_bool countRunning;
+  chpl_bool is_executeOn;
+  int lineno;
+  int filename;
+  c_sublocid_t requestedSubloc;
+  chpl_fn_int_t requested_fid;
+  chpl_fn_p requested_fn;
+  chpl_fn_name requested_fn_name;
+  chpl_taskID_t id;
+} chpl_task_bundle_t;
 
 // Structure of task-local storage
 typedef struct chpl_qthread_tls_s {
-    /* Task private data: serial state, etc. */
-    chpl_task_prvDataImpl_t chpl_data;
-    /* Reports */
-    c_string    lock_filename;
-    size_t      lock_lineno;
+  chpl_task_bundle_t *bundle;
+  // The below fields could move to chpl_task_bundleData_t
+  // That would reduce the size of the task local storage,
+  // but increase the size of executeOn bundles.
+  chpl_task_prvData_t prvdata;
+  /* Reports */
+  int     lock_filename;
+  int     lock_lineno;
 } chpl_qthread_tls_t;
 
 extern pthread_t chpl_qthread_process_pthread;
@@ -331,13 +114,13 @@ extern chpl_qthread_tls_t chpl_qthread_comm_task_tls;
 void chpl_task_stdModulesInitialized(void);
 
 // Wrap qthread_get_tasklocal() and assert that it is always available.
-static inline chpl_qthread_tls_t * chpl_qthread_get_tasklocal(void)
+static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void)
 {
     chpl_qthread_tls_t* tls;
 
     if (chpl_qthread_done_initializing) {
-        tls = (chpl_qthread_tls_t *)
-              qthread_get_tasklocal(sizeof(chpl_qthread_tls_t));
+        tls = (chpl_qthread_tls_t*)
+               qthread_get_tasklocal(sizeof(chpl_qthread_tls_t));
         if (tls == NULL) {
             pthread_t me = pthread_self();
             if (pthread_equal(me, chpl_qthread_comm_pthread))
@@ -362,12 +145,15 @@ static inline chpl_task_prvData_t* chpl_task_getPrvData(void)
 {
     chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
     if (data) {
-        return &data->chpl_data.prvdata;
+        return &data->prvdata;
     }
     assert(data);
     return NULL;
 }
 
+//
+// Sublocale support
+//
 #ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL
 #error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!"
 #else
@@ -405,7 +191,7 @@ void chpl_task_setSubloc(c_sublocid_t subloc)
     if ((curr_shep = qthread_shep()) != NO_SHEPHERD) {
         chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
         if (data) {
-            data->chpl_data.requestedSubloc = subloc;
+            data->bundle->requestedSubloc = subloc;
         }
 
         if (subloc != c_sublocid_any &&
@@ -425,21 +211,27 @@ c_sublocid_t chpl_task_getRequestedSubloc(void)
 {
     chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
     if (data) {
-        return data->chpl_data.requestedSubloc;
+        return data->bundle->requestedSubloc;
     }
     return c_sublocid_any;
 }
 
+//
+// Can we support remote caching?
+//
 #ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL
 #error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!"
 #else
 #define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1
 #endif
-extern int chpl_qthread_supports_remote_cache;
 static inline
 int chpl_task_supportsRemoteCache(void) {
-  return chpl_qthread_supports_remote_cache;
+  return CHPL_QTHREAD_SUPPORTS_REMOTE_CACHE;
 }
 
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
 #endif // ifndef _tasks_qthreads_h_
 /* vim:set expandtab: */
diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c
index f396931ab1..543db24ec1 100644
--- a/runtime/src/chpl-atmi.c
+++ b/runtime/src/chpl-atmi.c
@@ -26,9 +26,6 @@
  */
 int chpl_hsa_initialize(void)
 {
-    atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
-    if(st != ATMI_STATUS_SUCCESS) return -1;
-
     char reduce_kernel_filename[1024];
     char gen_kernel_filename[1024];
     int arglen = strlen(chpl_executionCommand)+1;
@@ -68,7 +65,7 @@ int chpl_hsa_initialize(void)
     /* FIXME: Create all reduction kernels, not just the int64-sum kernel */
     const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename};
     atmi_platform_type_t module_types[2] = {module_type, module_type};
-    st = atmi_module_register(modules, module_types, 2);
+    atmi_status_t st = atmi_module_register(modules, module_types, 2);
     OUTPUT_ATMI_STATUS(st, Registering all modules);
 
     size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)};
diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile
index cb1bf64265..c78c8f6377 100644
--- a/runtime/src/tasks/atmi/Makefile
+++ b/runtime/src/tasks/atmi/Makefile
@@ -1,4 +1,4 @@
-# Copyright 2004-2015 Cray Inc.
+# Copyright 2004-2017 Cray Inc.
 # Other additional copyright holders may be indicated within.
 # 
 # The entirety of this work is licensed under the Apache License,
@@ -22,6 +22,9 @@ ifndef CHPL_MAKE_HOME
 export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/..
 endif
 
+#
+# standard header
+#
 include $(RUNTIME_ROOT)/make/Makefile.runtime.head
 
 TASKS_OBJDIR = $(RUNTIME_OBJDIR)
diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include
index a510dc9a60..35b1225bf7 100644
--- a/runtime/src/tasks/atmi/Makefile.include
+++ b/runtime/src/tasks/atmi/Makefile.include
@@ -1,4 +1,4 @@
-# Copyright 2004-2015 Cray Inc.
+# Copyright 2004-2017 Cray Inc.
 # Other additional copyright holders may be indicated within.
 # 
 # The entirety of this work is licensed under the Apache License,
@@ -17,13 +17,9 @@
 
 TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS)
 
-TASKS_OBJDIR = $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/$(RUNTIME_OBJDIR)
+TASKS_OBJDIR = $(RUNTIME_BUILD)/$(TASKS_SUBDIR)
 
-#
-# point to sources under third-party, at least until they're
-# contributed back to the usual local directory
-#
-ALL_SRCS += $(QTHREAD_SUBDIR)/src/interfaces/chapel/*.c \
-	$(QTHREAD_SUBDIR)/src/interfaces/chapel/*.h
+ALL_SRCS += $(CURDIR)/$(TASKS_SUBDIR)/*.c \
+	$(RUNTIME_INCLUDE_ROOT)/tasks/$(CHPL_MAKE_TASKS)/*.h
 
 include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share
diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share
index 374f9cc760..74c39c92a2 100644
--- a/runtime/src/tasks/atmi/Makefile.share
+++ b/runtime/src/tasks/atmi/Makefile.share
@@ -1,4 +1,4 @@
-# Copyright 2004-2015 Cray Inc.
+# Copyright 2004-2017 Cray Inc.
 # Other additional copyright holders may be indicated within.
 # 
 # The entirety of this work is licensed under the Apache License,
@@ -15,11 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-TASKS_SRCS = 
+TASKS_SRCS = tasks-$(CHPL_MAKE_TASKS).c
 
 SVN_SRCS = $(TASKS_SRCS)
 SRCS = $(SVN_SRCS)
 
 TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o)
-
-RUNTIME_CFLAGS += -I$(QTHREAD_INCLUDE)
diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c
index 8902c2949e..a518957493 100644
--- a/runtime/src/tasks/atmi/tasks-atmi.c
+++ b/runtime/src/tasks/atmi/tasks-atmi.c
@@ -8,7 +8,7 @@
 //
 
 /*
- * Copyright 2004-2015 Cray Inc.
+ * Copyright 2004-2017 Cray Inc.
  * Other additional copyright holders may be indicated within.
  *
  * The entirety of this work is licensed under the Apache License,
@@ -34,56 +34,61 @@
 #endif
 
 #include "chplrt.h"
-#include "chplsys.h"
-#include "tasks-qthreads.h"
-//#include "tasks-atmi.h"
+
+#include "arg.h"
+#include "error.h"
 #include "chplcgfns.h"
 #include "chpl-comm.h"
+#include "chplexit.h"
 #include "chpl-locale-model.h"
+#include "chpl-mem.h"
+#include "chplsys.h"
+#include "chpl-linefile-support.h"
 #include "chpl-tasks.h"
 #include "chpl-tasks-callbacks-internal.h"
-#include "config.h"
-#include "error.h"
-#include "arg.h"
-#include "signal.h"
-#include "chplexit.h"
-#include <stdio.h>
-#include <stdlib.h>
+//#include "tasks-qthreads.h"
+#include "tasks-atmi.h"
+#include <atmi_runtime.h>
+
+#include "qthread.h"
+#include "qthread/qtimer.h"
+#include "qthread-chapel.h"
+
 #include <assert.h>
-#include <inttypes.h>
 #include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <sys/time.h>
 #include <unistd.h>
-#include <sched.h>
+#include <math.h>
+
+#define OUTPUT_ATMI_STATUS(status, msg) \
+{ \
+    if (ATMI_STATUS_SUCCESS != (status)) { \
+        fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \
+#msg, status); \
+        atmi_finalize(); \
+        return status; \
+    } \
+}
 
-//#include <atmi_runtime.h>
 
-#if 1
-#include "qthread/qthread.h"
-#include "qthread/qtimer.h"
-#include "qt_feb.h"
-#include "qt_syncvar.h"
-#include "qt_hash.h"
-#include "qt_atomics.h"
-#include "qt_shepherd_innards.h"
-#include "qt_envariables.h"
-#include "qt_debug.h"
-
-#ifdef QTHREAD_MULTINODE
-#include "qthread/spr.h"
-#endif /* QTHREAD_MULTINODE */
 
-#include <pthread.h>
-#endif 
+//#define SUPPORT_BLOCKREPORT
+//#define SUPPORT_TASKREPORT
+
 #ifdef CHAPEL_PROFILE
 # define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0)
 
 /* Tasks */
 static aligned_t profile_task_yield = 0;
 static aligned_t profile_task_addToTaskList = 0;
-static aligned_t profile_task_processTaskList = 0;
 static aligned_t profile_task_executeTasksInList = 0;
-static aligned_t profile_task_freeTaskList = 0;
+static aligned_t profile_task_taskCallFTable = 0;
 static aligned_t profile_task_startMovedTask = 0;
 static aligned_t profile_task_getId = 0;
 static aligned_t profile_task_sleep = 0;
@@ -106,9 +111,8 @@ static void profile_print(void)
     /* Tasks */
     fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield);
     fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList);
-    fprintf(stderr, "task processTaskList: %lu\n", (unsigned long)profile_task_processTaskList);
     fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList);
-    fprintf(stderr, "task freeTaskList: %lu\n", (unsigned long)profile_task_freeTaskList);
+    fprintf(stderr, "task taskCallFTable: %lu\n", (unsigned long)profile_task_taskCallFTable);
     fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask);
     fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId);
     fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep);
@@ -130,9 +134,14 @@ static void profile_print(void)
 # define PROFILE_INCR(counter,count)
 #endif /* CHAPEL_PROFILE */
 
-#ifndef QTHREAD_MULTINODE
+//
+// Startup and shutdown control.  The mutex is used just for the side
+// effect of its (very portable) memory fence.
+//
 volatile int chpl_qthread_done_initializing;
-#endif
+static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER;
+static volatile int done_finalizing;
+static pthread_mutex_t done_init_final_mux = PTHREAD_MUTEX_INITIALIZER;
 
 // Make qt env sizes uniform. Same as qt, but they use the literal everywhere
 #define QT_ENV_S 100
@@ -141,7 +150,7 @@ volatile int chpl_qthread_done_initializing;
 struct chpl_task_list {
     chpl_fn_p        fun;
     void            *arg;
-    c_string         filename;
+    int32_t          filename;
     int              lineno;
     chpl_task_list_p next;
 };
@@ -151,26 +160,37 @@ static aligned_t next_task_id = 1;
 pthread_t chpl_qthread_process_pthread;
 pthread_t chpl_qthread_comm_pthread;
 
+chpl_task_bundle_t chpl_qthread_process_bundle = {
+                                   .serial_state = false,
+                                   .countRunning = false,
+                                   .is_executeOn = false,
+                                   .lineno = 0,
+                                   .filename = CHPL_FILE_IDX_MAIN_TASK,
+                                   .requestedSubloc = c_sublocid_any_val,
+                                   .requested_fid = FID_NONE,
+                                   .requested_fn = NULL,
+                                   .id = chpl_nullTaskID };
+
+chpl_task_bundle_t chpl_qthread_comm_task_bundle = {
+                                   .serial_state = false,
+                                   .countRunning = false,
+                                   .is_executeOn = false,
+                                   .lineno = 0,
+                                   .filename = CHPL_FILE_IDX_COMM_TASK,
+                                   .requestedSubloc = c_sublocid_any_val,
+                                   .requested_fid = FID_NONE,
+                                   .requested_fn = NULL,
+                                   .id = chpl_nullTaskID };
+
 chpl_qthread_tls_t chpl_qthread_process_tls = {
-    PRV_DATA_IMPL_VAL("<main task>", 0, chpl_nullTaskID, false,
-                      c_sublocid_any_val, false),
-    NULL, 0 };
+                               .bundle = &chpl_qthread_process_bundle,
+                               .lock_filename = 0,
+                               .lock_lineno = 0 };
 
 chpl_qthread_tls_t chpl_qthread_comm_task_tls = {
-    PRV_DATA_IMPL_VAL("<comm thread>", 0, chpl_nullTaskID, false,
-                      c_sublocid_any_val, false),
-    NULL, 0 };
-
-//
-// QTHREADS_SUPPORTS_REMOTE_CACHE is set in the Chapel Qthreads
-// Makefile, based on the Qthreads scheduler configuration.
-//
-int chpl_qthread_supports_remote_cache = QTHREADS_SUPPORTS_REMOTE_CACHE;
-
-//
-// structs chpl_task_prvDataImpl_t, chpl_qthread_wrapper_args_t and
-// chpl_qthread_tls_t have been moved to tasks-qthreads.h
-//
+                               .bundle = &chpl_qthread_comm_task_bundle,
+                               .lock_filename = 0,
+                               .lock_lineno = 0 };
 
 //
 // chpl_qthread_get_tasklocal() is in tasks-qthreads.h
@@ -232,7 +252,7 @@ void chpl_sync_unlock(chpl_sync_aux_t *s)
 }
 
 static inline void about_to_block(int32_t  lineno,
-                                  c_string filename)
+                                  int32_t filename)
 {
     chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
     assert(data);
@@ -243,7 +263,7 @@ static inline void about_to_block(int32_t  lineno,
 
 void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s,
                                int32_t          lineno,
-                               c_string         filename)
+                               int32_t         filename)
 {
     PROFILE_INCR(profile_sync_waitFullAndLock, 1);
 
@@ -258,7 +278,7 @@ void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s,
 
 void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s,
                                 int32_t          lineno,
-                                c_string         filename)
+                                int32_t         filename)
 {
     PROFILE_INCR(profile_sync_waitEmptyAndLock, 1);
 
@@ -313,6 +333,7 @@ void chpl_sync_destroyAux(chpl_sync_aux_t *s)
     PROFILE_INCR(profile_sync_destroyAux, 1);
 }
 
+#ifdef SUPPORT_BLOCKREPORT
 static void chapel_display_thread(qt_key_t     addr,
                                   qthread_f    f,
                                   void        *arg,
@@ -325,11 +346,21 @@ static void chapel_display_thread(qt_key_t     addr,
 
     if (rep) {
         if ((rep->lock_lineno > 0) && rep->lock_filename) {
-            fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", rep->lock_filename, rep->lock_lineno, rep->chpl_data.task_filename, rep->chpl_data.task_lineno);
+          fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n",
+                  chpl_lookupFilename(rep->lock_filename), rep->lock_lineno,
+                  chpl_lookupFilename(rep->chpl_data.task_filename),
+                  rep->chpl_data.task_lineno);
         } else if (rep->lock_lineno == 0 && rep->lock_filename) {
-            fprintf(stderr, "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", rep->lock_filename, rep->chpl_data.task_filename, rep->chpl_data.task_lineno);
+          fprintf(stderr,
+                  "Waiting for more work (line 0? file:%s) (task %s:%zu)\n",
+                  chpl_lookupFilename(rep->lock_filename),
+                  chpl_lookupFilename(rep->chpl_data.task_filename),
+                  rep->chpl_data.task_lineno);
         } else if (rep->lock_lineno == 0) {
-            fprintf(stderr, "Waiting for dependencies (uninitialized task %s:%zu)\n", rep->chpl_data.task_filename, rep->chpl_data.task_lineno);
+          fprintf(stderr,
+                  "Waiting for dependencies (uninitialized task %s:%zu)\n",
+                  chpl_lookupFilename(rep->chpl_data.task_filename),
+                  rep->chpl_data.task_lineno);
         }
         fflush(stderr);
     }
@@ -340,18 +371,28 @@ static void report_locked_threads(void)
     qthread_feb_callback(chapel_display_thread, NULL);
     qthread_syncvar_callback(chapel_display_thread, NULL);
 }
+#endif  // SUPPORT_BLOCKREPORT
 
 static void SIGINT_handler(int sig)
 {
     signal(sig, SIG_IGN);
 
     if (blockreport) {
+#ifdef SUPPORT_BLOCKREPORT
         report_locked_threads();
+#else
+        fprintf(stderr,
+                "Blockreport is currently unsupported by the qthreads "
+                "tasking layer.\n");
+#endif
     }
 
     if (taskreport) {
+#ifdef SUPPORT_TASKREPORT
+        report_all_tasks();
+#else
         fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n");
-        // report_all_tasks();
+#endif
     }
 
     chpl_exit_any(1);
@@ -359,24 +400,53 @@ static void SIGINT_handler(int sig)
 
 // Tasks
 
-#ifndef QTHREAD_MULTINODE
-static syncvar_t canexit            = SYNCVAR_STATIC_EMPTY_INITIALIZER;
-static volatile int done_finalizing = 0;
-
 static void *initializer(void *junk)
 {
     qthread_initialize();
-    MACHINE_FENCE;
+    (void) pthread_mutex_lock(&done_init_final_mux);  // implicit memory fence
     chpl_qthread_done_initializing = 1;
+    (void) pthread_mutex_unlock(&done_init_final_mux);
 
     qthread_syncvar_readFF(NULL, &canexit);
 
     qthread_finalize();
-    MACHINE_FENCE;
+    (void) pthread_mutex_lock(&done_init_final_mux);  // implicit memory fence
     done_finalizing = 1;
+    (void) pthread_mutex_unlock(&done_init_final_mux);
+
     return NULL;
 }
-#endif /* ! QTHREAD_MULTINODE */
+
+//
+// Qthreads environment helper functions.
+//
+
+static char* chpl_qt_getenv_str(const char* var) {
+    char name[100];
+    char* ev;
+
+    snprintf(name, sizeof(name), "QT_%s", var);
+    if ((ev = getenv(name)) == NULL) {
+        snprintf(name, sizeof(name), "QTHREAD_%s", var);
+        ev = getenv(name);
+    }
+
+    return ev;
+}
+
+static unsigned long int chpl_qt_getenv_num(const char* var,
+                                            unsigned long int default_val) {
+    char* ev;
+    unsigned long int ret_val = default_val;
+
+    if ((ev = chpl_qt_getenv_str(var)) != NULL) {
+        unsigned long int val;
+        if (sscanf(ev, "%lu", &val) == 1)
+            ret_val = val;
+    }
+
+    return ret_val;
+}
 
 // Helper function to set a qthreads env var. This is meant to mirror setenv
 // functionality, but qthreads has two environment variables for every setting:
@@ -384,7 +454,8 @@ static void *initializer(void *junk)
 // wraps the overriding logic. In verbose mode it prints out if we overrode
 // values, or if we were prevented from setting values because they existed
 // (and override was 0.)
-static void chpl_qt_setenv(char* var, char* val, int32_t override) {
+static void chpl_qt_setenv(const char* var, const char* val,
+                           int32_t override) {
     char      qt_env[QT_ENV_S]  = { 0 };
     char      qthread_env[QT_ENV_S] = { 0 };
     char      *qt_val;
@@ -402,7 +473,10 @@ static void chpl_qt_setenv(char* var, char* val, int32_t override) {
     eitherSet = (qt_val != NULL || qthread_val != NULL);
 
     if (override || !eitherSet) {
-        if (verbosity >= 2 && override && eitherSet) {
+        if (verbosity >= 2
+            && override
+            && eitherSet
+            && strcmp(val, (qt_val != NULL) ? qt_val : qthread_val) != 0) {
             printf("QTHREADS: Overriding the value of %s and %s "
                    "with %s\n", qt_env, qthread_env, val);
         }
@@ -418,11 +492,22 @@ static void chpl_qt_setenv(char* var, char* val, int32_t override) {
             set_env = qthread_env;
             set_val = qthread_val;
         }
-        printf("QTHREADS: Not setting %s to %s because %s is set to %s and "
-               "overriding was not requested\n", qt_env, val, set_env, set_val);
+        if (strcmp(val, set_val) != 0)
+          printf("QTHREADS: Not setting %s to %s because %s is set to %s and "
+                 "overriding was not requested\n", qt_env, val, set_env, set_val);
     }
 }
 
+static void chpl_qt_unsetenv(const char* var) {
+  char name[100];
+
+  snprintf(name, sizeof(name), "QT_%s", var);
+  (void) unsetenv(name);
+
+  snprintf(name, sizeof(name), "QTHREAD_%s", var);
+  (void) unsetenv(name);
+}
+
 // Determine the number of workers based on environment settings. If a user set
 // HWPAR, they are saying they want to use HWPAR many workers, but let the
 // runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or
@@ -432,14 +517,14 @@ static void chpl_qt_setenv(char* var, char* val, int32_t override) {
 // set since we can't figure out before Qthreads init what this will actually
 // turn into without duplicating Qthreads logic (-1 is a sentinel for don't
 // adjust the values, and give them as is to Qthreads.)
-static int32_t chpl_qt_getenv_num_workers() {
+static int32_t chpl_qt_getenv_num_workers(void) {
     int32_t  hwpar;
     int32_t  num_wps;
     int32_t  num_sheps;
 
-    hwpar = qt_internal_get_env_num("HWPAR", 0, 0);
-    num_wps = qt_internal_get_env_num("NUM_WORKERS_PER_SHEPHERD", 0, 0);
-    num_sheps = qt_internal_get_env_num("NUM_SHEPHERDS", 0, 0);
+    hwpar = (int32_t) chpl_qt_getenv_num("HWPAR", 0);
+    num_wps = (int32_t) chpl_qt_getenv_num("NUM_WORKERS_PER_SHEPHERD", 0);
+    num_sheps = (int32_t) chpl_qt_getenv_num("NUM_SHEPHERDS", 0);
 
     if (hwpar) {
         return hwpar;
@@ -477,7 +562,7 @@ static int32_t setupAvailableParallelism(int32_t maxThreads) {
 
         numPUsPerLocale = chpl_getNumLogicalCpus(true);
         if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) {
-            if (verbosity >= 2) {
+            if (verbosity > 0) {
                 printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d "
                        "to prevent oversubscription of the system.\n",
                        hwpar, numPUsPerLocale);
@@ -515,12 +600,12 @@ static int32_t setupAvailableParallelism(int32_t maxThreads) {
         }
 
         // Unset relevant Qthreads environment variables.
-        qt_internal_unset_envstr("HWPAR");
-        qt_internal_unset_envstr("NUM_SHEPHERDS");
-        qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD");
+        chpl_qt_unsetenv("HWPAR");
+        chpl_qt_unsetenv("NUM_SHEPHERDS");
+        chpl_qt_unsetenv("NUM_WORKERS_PER_SHEPHERD");
 
         snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar);
-        if (THREADQUEUE_POLICY_TRUE == qt_threadqueue_policy(SINGLE_WORKER)) {
+        if (CHPL_QTHREAD_SCHEDULER_ONE_WORKER_PER_SHEPHERD) {
             chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1);
             chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1);
         } else {
@@ -555,7 +640,7 @@ static void setupCallStacks(int32_t hwpar) {
     // 2) QTHREAD_STACK_SIZE
     // 3) Chapel default
     if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 ||
-        (qt_internal_get_env_num("STACK_SIZE", 0, 0) == 0 &&
+        (chpl_qt_getenv_num("STACK_SIZE", 0) == 0 &&
          (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) {
         char newenv_stack[QT_ENV_S];
         snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize);
@@ -591,26 +676,61 @@ static void setupCallStacks(int32_t hwpar) {
     }
 }
 
+static void setupTasklocalStorage(void) {
+    unsigned long int tasklocal_size;
+    char newenv[QT_ENV_S];
+
+    // Make sure Qthreads knows how much space we need for per-task
+    // local storage.
+    tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0);
+    if (tasklocal_size < sizeof(chpl_qthread_tls_t)) {
+        snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_qthread_tls_t));
+        chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1);
+    }
+}
+
+static void setupWorkStealing(void) {
+    // In our experience the current work stealing implementation hurts
+    // performance, so disable it. Note that we don't override, so a user could
+    // try working stealing out by setting {QT,QTHREAD}_STEAL_RATIO. Also note
+    // that not all schedulers support work stealing, but it doesn't hurt to
+    // set this env var for those configs anyways.
+    chpl_qt_setenv("STEAL_RATIO", "0", 0);
+}
+
 void chpl_task_init(void)
 {
+    atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
+    printf("ATMI task initialized\n");
+    if(st != ATMI_STATUS_SUCCESS) return;
+
+    g_machine = atmi_machine_get_info();
+
     int32_t   commMaxThreads;
     int32_t   hwpar;
     pthread_t initer;
+    pthread_attr_t pAttr;
 
     chpl_qthread_process_pthread = pthread_self();
-    chpl_qthread_process_tls.chpl_data.id = qthread_incr(&next_task_id, 1);
+    chpl_qthread_process_bundle.id = qthread_incr(&next_task_id, 1);
 
     commMaxThreads = chpl_comm_getMaxThreads();
 
-    // Setup hardware parallelism, the stack size, and stack guards
+    // Set up hardware parallelism, the stack size and stack guards,
+    // tasklocal storage, and work stealing
     hwpar = setupAvailableParallelism(commMaxThreads);
     setupCallStacks(hwpar);
+    setupTasklocalStorage();
+    setupWorkStealing();
 
     if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); }
 
     // Initialize qthreads
-    pthread_create(&initer, NULL, initializer, NULL);
-    while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY();
+    pthread_attr_init(&pAttr);
+    pthread_attr_setdetachstate(&pAttr, PTHREAD_CREATE_DETACHED);
+    pthread_create(&initer, &pAttr, initializer, NULL);
+    while (chpl_qthread_done_initializing == 0)
+        sched_yield();
 
     // Now that Qthreads is up and running, do a sanity check and make sure
     // that the number of workers is less than any comm layer limit. This is
@@ -628,60 +748,88 @@ void chpl_task_init(void)
 
 void chpl_task_exit(void)
 {
+    atmi_finalize();
 #ifdef CHAPEL_PROFILE
     profile_print();
 #endif /* CHAPEL_PROFILE */
 
-#ifdef QTHREAD_MULTINODE
-#else
     if (qthread_shep() == NO_SHEPHERD) {
         /* sometimes, tasking is told to shutdown even though it hasn't been
          * told to start yet */
         if (chpl_qthread_done_initializing == 1) {
             qthread_syncvar_fill(&canexit);
-            while (done_finalizing == 0) SPINLOCK_BODY();
+            while (done_finalizing == 0)
+                sched_yield();
         }
     } else {
         qthread_syncvar_fill(&exit_ret);
     }
-#endif /* QTHREAD_MULTINODE */
 }
 
 static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind,
-                                  chpl_task_prvDataImpl_t *chpl_data) {
+                                  chpl_task_bundle_t* bundle) {
     if (chpl_task_have_callbacks(event_kind)) {
-        if (chpl_data->id == chpl_nullTaskID)
-            chpl_data->id = qthread_incr(&next_task_id, 1);
+        if (bundle->id == chpl_nullTaskID)
+            bundle->id = qthread_incr(&next_task_id, 1);
         chpl_task_do_callbacks(event_kind,
-                               chpl_data->task_filename,
-                               chpl_data->task_lineno,
-                               chpl_data->id,
-                               chpl_data->is_executeOn);
+                               bundle->requested_fid,
+                               bundle->filename,
+                               bundle->lineno,
+                               bundle->id,
+                               bundle->is_executeOn);
     }
 }
 
+
+// If we stored chpl_taskID_t in chpl_task_bundleData_t,
+// this struct and the following function may not be necessary.
+typedef void (*main_ptr_t)(void);
+typedef struct {
+  chpl_task_bundle_t arg;
+  main_ptr_t chpl_main;
+} main_wrapper_bundle_t;
+
+static aligned_t main_wrapper(void *arg)
+{
+    chpl_qthread_tls_t         *tls = chpl_qthread_get_tasklocal();
+    main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg;
+    chpl_task_bundle_t      *bundle = &m_bundle->arg;
+    chpl_qthread_tls_t         pv = {.bundle = bundle};
+
+    *tls = pv;
+
+    // main call doesn't need countRunning / chpl_taskRunningCntInc
+
+    wrap_callbacks(chpl_task_cb_event_kind_begin, bundle);
+
+    (m_bundle->chpl_main)();
+
+    wrap_callbacks(chpl_task_cb_event_kind_end, bundle);
+
+    return 0;
+}
+
+
 static aligned_t chapel_wrapper(void *arg)
 {
-    chpl_qthread_wrapper_args_t *rarg = arg;
-    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    chpl_qthread_tls_t    *tls = chpl_qthread_get_tasklocal();
+    chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg;
+    chpl_qthread_tls_t      pv = {.bundle = bundle};
 
-    data->chpl_data = rarg->chpl_data;
-    data->lock_filename = NULL;
-    data->lock_lineno = 0;
+    *tls = pv;
 
-    if (rarg->countRunning) {
-        chpl_taskRunningCntInc(0, NULL);
-    }
+    if (bundle->countRunning)
+      chpl_taskRunningCntInc(0, 0);
 
-    wrap_callbacks(chpl_task_cb_event_kind_begin, &data->chpl_data);
+    wrap_callbacks(chpl_task_cb_event_kind_begin, bundle);
 
-    (*(chpl_fn_p)(rarg->fn))(rarg->args);
+    // launch GPU kernel here? 
+    (bundle->requested_fn)(arg);
 
-    wrap_callbacks(chpl_task_cb_event_kind_end, &data->chpl_data);
+    wrap_callbacks(chpl_task_cb_event_kind_end, bundle);
 
-    if (rarg->countRunning) {
-        chpl_taskRunningCntDec(0, NULL);
-    }
+    if (bundle->countRunning)
+      chpl_taskRunningCntDec(0, 0);
 
     return 0;
 }
@@ -705,27 +853,24 @@ static void *comm_task_wrapper(void *arg)
 // not use methods that require task context (e.g., task-local storage).
 void chpl_task_callMain(void (*chpl_main)(void))
 {
-    chpl_qthread_wrapper_args_t wrapper_args =
-        {chpl_main, NULL, false,
-         PRV_DATA_IMPL_VAL("<main task>", 0,
-                           chpl_qthread_process_tls.chpl_data.id, false,
-                           c_sublocid_any_val, false) };
-
-    qthread_debug(CHAPEL_CALLS, "[%d] begin chpl_task_callMain()\n", chpl_nodeID);
-
-#ifdef QTHREAD_MULTINODE
-    qthread_debug(CHAPEL_BEHAVIOR, "[%d] calling spr_unify\n", chpl_nodeID);
-    int const rc = spr_unify();
-    assert(SPR_OK == rc);
-#endif /* QTHREAD_MULTINODE */
-
-    wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data);
-
-    qthread_fork_syncvar(chapel_wrapper, &wrapper_args, &exit_ret);
+    main_wrapper_bundle_t arg;
+
+    arg.arg.serial_state      = false;
+    arg.arg.countRunning      = false;
+    arg.arg.is_executeOn      = false;
+    arg.arg.requestedSubloc   = c_sublocid_any_val;
+    arg.arg.requested_fid     = FID_NONE;
+    arg.arg.requested_fn      = NULL;
+    arg.arg.requested_fn_name = NULL;
+    arg.arg.lineno            = 0;
+    arg.arg.filename           = CHPL_FILE_IDX_MAIN_TASK;
+    arg.arg.id                = chpl_qthread_process_bundle.id;
+    arg.chpl_main             = chpl_main;
+
+    wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg);
+
+    qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret);
     qthread_syncvar_readFF(NULL, &exit_ret);
-
-    qthread_debug(CHAPEL_BEHAVIOR, "[%d] main task finished\n", chpl_nodeID);
-    qthread_debug(CHAPEL_CALLS, "[%d] end chpl_task_callMain()\n", chpl_nodeID);
 }
 
 void chpl_task_stdModulesInitialized(void)
@@ -736,13 +881,12 @@ void chpl_task_stdModulesInitialized(void)
     // when this function is called, so now count the main task.
     //
     canCountRunningTasks = true;
-    chpl_taskRunningCntInc(0, NULL);
+    chpl_taskRunningCntInc(0, 0);
 }
 
 int chpl_task_createCommTask(chpl_fn_p fn,
                              void     *arg)
 {
-#ifndef QTHREAD_MULTINODE
     //
     // The wrapper info must be static because it won't be referred to
     // until the new pthread calls comm_task_wrapper().  And, it is
@@ -755,21 +899,20 @@ int chpl_task_createCommTask(chpl_fn_p fn,
     wrapper_info.arg = arg;
     return pthread_create(&chpl_qthread_comm_pthread,
                           NULL, comm_task_wrapper, &wrapper_info);
-#else
-    return 0;
-#endif
 }
 
-void chpl_task_addToTaskList(chpl_fn_int_t     fid,
-                             void             *arg,
-                             c_sublocid_t      subloc,
-                             chpl_task_list_p *task_list,
-                             int32_t           task_list_locale,
-                             chpl_bool         is_begin_stmt,
-                             int               lineno,
-                             c_string          filename)
+void chpl_task_addToTaskList(chpl_fn_int_t       fid,
+                             chpl_task_bundle_t *arg,
+                             size_t              arg_size,
+                             c_sublocid_t        subloc,
+                             void              **task_list,
+                             int32_t             task_list_locale,
+                             chpl_bool           is_begin_stmt,
+                             int                 lineno,
+                             int32_t             filename)
 {
     chpl_bool serial_state = chpl_task_getSerial();
+    chpl_fn_p requested_fn = chpl_ftable[fid];
 
     assert(subloc != c_sublocid_none);
 
@@ -777,68 +920,92 @@ void chpl_task_addToTaskList(chpl_fn_int_t     fid,
 
     if (serial_state) {
         // call the function directly.
-        (chpl_ftable[fid])(arg);
+        requested_fn(arg);
     } else {
-        chpl_qthread_wrapper_args_t wrapper_args =
-            {chpl_ftable[fid], arg, false,
-             PRV_DATA_IMPL_VAL(filename, lineno, chpl_nullTaskID, false,
-                               subloc, serial_state) };
-
-        wrap_callbacks(chpl_task_cb_event_kind_create,
-                       &wrapper_args.chpl_data);
+        arg->serial_state      = false;
+        arg->countRunning      = false;
+        arg->is_executeOn      = false;
+        arg->requestedSubloc   = subloc;
+        arg->requested_fid     = fid;
+        arg->requested_fn      = requested_fn;
+        arg->requested_fn_name = NULL;
+        arg->lineno            = lineno;
+        arg->filename          = filename;
+        arg->id                = chpl_nullTaskID;
+
+        wrap_callbacks(chpl_task_cb_event_kind_create, arg);
 
         if (subloc == c_sublocid_any) {
-            qthread_fork_copyargs(chapel_wrapper, &wrapper_args,
-                                  sizeof(chpl_qthread_wrapper_args_t), NULL);
+            qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL);
         } else {
-            qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args,
-                                     sizeof(chpl_qthread_wrapper_args_t), NULL,
-                                     (qthread_shepherd_id_t) subloc);
+            qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size,
+                                     NULL, (qthread_shepherd_id_t) subloc);
         }
     }
 }
 
-void chpl_task_processTaskList(chpl_task_list_p task_list)
+void chpl_task_executeTasksInList(void **task_list)
 {
-    PROFILE_INCR(profile_task_processTaskList,1);
+    PROFILE_INCR(profile_task_executeTasksInList,1);
 }
 
-void chpl_task_executeTasksInList(chpl_task_list_p task_list)
+static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p fp,
+                                void *arg, size_t arg_size,
+                                c_sublocid_t subloc,  chpl_bool serial_state,
+                                int lineno, int32_t filename)
 {
-    PROFILE_INCR(profile_task_executeTasksInList,1);
+    chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg;
+
+    bundle->serial_state       = serial_state;
+    bundle->countRunning       = canCountRunningTasks;
+    bundle->is_executeOn       = true;
+    bundle->requestedSubloc    = subloc;
+    bundle->requested_fid      = fid;
+    bundle->requested_fn       = fp;
+    bundle->requested_fn_name  = fname;
+    bundle->lineno             = lineno;
+    bundle->filename           = filename;
+    bundle->id                 = chpl_nullTaskID;
+
+    wrap_callbacks(chpl_task_cb_event_kind_create, bundle);
+
+    if (subloc < 0) {
+        qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL);
+    } else {
+        qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size,
+                                 NULL, (qthread_shepherd_id_t) subloc);
+    }
 }
 
-void chpl_task_freeTaskList(chpl_task_list_p task_list)
+void chpl_task_taskCallFTable(chpl_fn_int_t fid,
+                              chpl_task_bundle_t *arg, size_t arg_size,
+                              c_sublocid_t subloc,
+                              int lineno, int32_t filename)
 {
-    PROFILE_INCR(profile_task_freeTaskList,1);
+    PROFILE_INCR(profile_task_taskCall,1);
+
+    taskCallBody(fid, NULL, chpl_ftable[fid], arg, arg_size, subloc, false, lineno, filename);
 }
 
-void chpl_task_startMovedTask(chpl_fn_p      fp,
-                              void          *arg,
-                              c_sublocid_t   subloc,
-                              chpl_taskID_t  id,
-                              chpl_bool      serial_state)
+void chpl_task_startMovedTask(chpl_fn_int_t       fid,
+                              chpl_fn_p           fp,
+                              chpl_task_bundle_t *arg,
+                              size_t              arg_size,
+                              c_sublocid_t        subloc,
+                              chpl_taskID_t       id,
+                              chpl_bool           serial_state)
 {
-    assert(subloc != c_sublocid_none);
+    //
+    // For now the incoming task ID is simply dropped, though we check
+    // to make sure the caller wasn't expecting us to do otherwise.  If
+    // we someday make task IDs global we will need to be able to set
+    // the ID of this moved task.
+    //
     assert(id == chpl_nullTaskID);
 
-    chpl_qthread_wrapper_args_t wrapper_args =
-        {fp, arg, canCountRunningTasks,
-         PRV_DATA_IMPL_VAL("<unknown>", 0, chpl_nullTaskID, true,
-                           subloc, serial_state) };
-
     PROFILE_INCR(profile_task_startMovedTask,1);
 
-    wrap_callbacks(chpl_task_cb_event_kind_create, &wrapper_args.chpl_data);
-
-    if (subloc == c_sublocid_any) {
-        qthread_fork_copyargs(chapel_wrapper, &wrapper_args,
-                              sizeof(chpl_qthread_wrapper_args_t), NULL);
-    } else {
-        qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args,
-                                 sizeof(chpl_qthread_wrapper_args_t), NULL,
-                                 (qthread_shepherd_id_t) subloc);
-    }
+    taskCallBody(fid, NULL, fp, arg, arg_size, subloc, serial_state, 0, CHPL_FILE_IDX_UNKNOWN);
 }
 
 //
@@ -857,23 +1024,46 @@ void chpl_task_startMovedTask(chpl_fn_p      fp,
 // Returns '(unsigned int)-1' if called outside of the tasking layer.
 chpl_taskID_t chpl_task_getId(void)
 {
-    chpl_qthread_tls_t * tls = chpl_qthread_get_tasklocal();
+    chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal();
+    chpl_taskID_t *id_ptr = NULL;
 
     PROFILE_INCR(profile_task_getId,1);
 
     if (tls == NULL)
         return (chpl_taskID_t) -1;
 
-    if (tls->chpl_data.id == chpl_nullTaskID)
-        tls->chpl_data.id = qthread_incr(&next_task_id, 1);
+    id_ptr = &tls->bundle->id;
+
+    if (*id_ptr == chpl_nullTaskID)
+        *id_ptr = qthread_incr(&next_task_id, 1);
 
-    return tls->chpl_data.id;
+    return *id_ptr;
 }
 
-void chpl_task_sleep(int secs)
+void chpl_task_sleep(double secs)
 {
     if (qthread_shep() == NO_SHEPHERD) {
-        sleep(secs);
+        struct timeval deadline;
+        struct timeval now;
+
+        //
+        // Figure out when this task can proceed again, and until then, keep
+        // yielding.
+        //
+        gettimeofday(&deadline, NULL);
+        deadline.tv_usec += (suseconds_t) lround((secs - trunc(secs)) * 1.0e6);
+        if (deadline.tv_usec > 1000000) {
+            deadline.tv_sec++;
+            deadline.tv_usec -= 1000000;
+        }
+        deadline.tv_sec += (time_t) trunc(secs);
+
+        do {
+            chpl_task_yield();
+            gettimeofday(&now, NULL);
+        } while (now.tv_sec < deadline.tv_sec
+                 || (now.tv_sec == deadline.tv_sec
+                     && now.tv_usec < deadline.tv_usec));
     } else {
         qtimer_t t = qtimer_create();
         qtimer_start(t);
@@ -881,6 +1071,7 @@ void chpl_task_sleep(int secs)
             qthread_yield();
             qtimer_stop(t);
         } while (qtimer_secs(t) < secs);
+        qtimer_destroy(t);
     }
 }
 
@@ -892,18 +1083,20 @@ chpl_bool chpl_task_getSerial(void)
 
     PROFILE_INCR(profile_task_getSerial,1);
 
-    return data->chpl_data.prvdata.serial_state;
+    return data->bundle->serial_state;
 }
 
 void chpl_task_setSerial(chpl_bool state)
 {
     chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
-    data->chpl_data.prvdata.serial_state = state;
+    data->bundle->serial_state = state;
 
     PROFILE_INCR(profile_task_setSerial,1);
 }
 
 uint32_t chpl_task_getMaxPar(void) {
+    //chpl_internal_error("Qthreads max tasks par asked\n");
+    printf("Qthreads max task par asked\n");
     //
     // We assume here that the caller (in the LocaleModel module code)
     // is interested in the number of workers on the whole node, and
@@ -918,7 +1111,8 @@ c_sublocid_t chpl_task_getNumSublocales(void)
     // FIXME: What we really want here is the number of NUMA
     // sublocales we are supporting.  For now we use the number of
     // shepherds as a proxy for that.
-    return (c_sublocid_t) qthread_num_shepherds();
+    // return (c_sublocid_t) qthread_num_shepherds();
+    return (c_sublocid_t) g_machine->device_count_by_type[ATMI_DEVTYPE_ALL];
 }
 
 size_t chpl_task_getCallStackSize(void)

From 468ed90499d7cc653b8ffcf3e2a656f5811aaaef Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Sat, 4 Mar 2017 02:56:54 -0600
Subject: [PATCH 04/10] Extended Chapel tasking interface to use more of ATMI

---
 make/compiler/Makefile.hsa              |   4 +-
 runtime/include/chpl-atmi.h             |   5 +-
 runtime/include/tasks/atmi/tasks-atmi.h |  67 ++++-----
 runtime/src/tasks/atmi/tasks-atmi.c     | 181 +++++++++++++++++-------
 4 files changed, 162 insertions(+), 95 deletions(-)

diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa
index 7c1baa28f0..0fc1c3d234 100644
--- a/make/compiler/Makefile.hsa
+++ b/make/compiler/Makefile.hsa
@@ -20,8 +20,8 @@ CLOC=/opt/rocm/cloc/bin/cloc.sh
 LIBS+=-latmi_runtime -lm
 
 # TODO: move these in third-party directory?
-GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib
-HSA_INCLUDES=-I/opt/rocm/libatmi/include
+GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/atmi/lib
+HSA_INCLUDES=-I/opt/rocm/atmi/include
 else
 # HSA locations
 CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh
diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h
index 6285760810..82c8a54780 100644
--- a/runtime/include/chpl-atmi.h
+++ b/runtime/include/chpl-atmi.h
@@ -13,12 +13,15 @@
 
 atmi_kernel_t reduction_kernel;
 atmi_kernel_t *gpu_kernels;
+atmi_kernel_t main_kernel;
+int g_num_cpu_kernels;
 
 atmi_machine_t *g_machine;
 
 enum {
     GPU_KERNEL_IMPL = 10565,
-    REDUCTION_GPU_IMPL = 42
+    REDUCTION_GPU_IMPL = 42,
+    CPU_FUNCTION_IMPL = 43
 };    
 
 int chpl_hsa_initialize(void);
diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h
index 81ad131901..44adc76308 100644
--- a/runtime/include/tasks/atmi/tasks-atmi.h
+++ b/runtime/include/tasks/atmi/tasks-atmi.h
@@ -30,7 +30,7 @@
 #include "chpl-atmi.h"
 #include "chpl-tasks-prvdata.h"
 #include "chpltypes.h"
-
+#include "chpl-mem.h"
 #include "qthread.h"
 #include "qthread-chapel.h"
 
@@ -50,8 +50,8 @@ void chpl_task_yield(void);
 // Type (and default value) used to communicate task identifiers
 // between C code and Chapel code in the runtime.
 //
-typedef unsigned int chpl_taskID_t;
-#define chpl_nullTaskID QTHREAD_NULL_TASK_ID
+typedef uint64_t chpl_taskID_t;
+#define chpl_nullTaskID ATMI_NULL_TASK_HANDLE
 
 //
 // Sync variables
@@ -93,7 +93,7 @@ typedef struct {
 } chpl_task_bundle_t;
 
 // Structure of task-local storage
-typedef struct chpl_qthread_tls_s {
+typedef struct chpl_atmi_tls_s {
   chpl_task_bundle_t *bundle;
   // The below fields could move to chpl_task_bundleData_t
   // That would reduce the size of the task local storage,
@@ -102,39 +102,23 @@ typedef struct chpl_qthread_tls_s {
   /* Reports */
   int     lock_filename;
   int     lock_lineno;
-} chpl_qthread_tls_t;
+} chpl_atmi_tls_t;
+
+extern pthread_key_t tls_cache;
 
 extern pthread_t chpl_qthread_process_pthread;
 extern pthread_t chpl_qthread_comm_pthread;
 
-extern chpl_qthread_tls_t chpl_qthread_process_tls;
-extern chpl_qthread_tls_t chpl_qthread_comm_task_tls;
+extern chpl_atmi_tls_t chpl_qthread_process_tls;
+extern chpl_atmi_tls_t chpl_qthread_comm_task_tls;
 
 #define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized
 void chpl_task_stdModulesInitialized(void);
 
-// Wrap qthread_get_tasklocal() and assert that it is always available.
-static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void)
-{
-    chpl_qthread_tls_t* tls;
-
-    if (chpl_qthread_done_initializing) {
-        tls = (chpl_qthread_tls_t*)
-               qthread_get_tasklocal(sizeof(chpl_qthread_tls_t));
-        if (tls == NULL) {
-            pthread_t me = pthread_self();
-            if (pthread_equal(me, chpl_qthread_comm_pthread))
-                tls = &chpl_qthread_comm_task_tls;
-            else if (pthread_equal(me, chpl_qthread_process_pthread))
-                tls = &chpl_qthread_process_tls;
-        }
-        assert(tls);
-    }
-    else
-        tls = NULL;
+extern pthread_t null_thread;
 
-    return tls;
-}
+// Wrap qthread_get_tasklocal() and assert that it is always available.
+extern chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void);
 
 #ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL
 #error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!"
@@ -143,7 +127,7 @@ static inline chpl_qthread_tls_t* chpl_qthread_get_tasklocal(void)
 #endif
 static inline chpl_task_prvData_t* chpl_task_getPrvData(void)
 {
-    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
     if (data) {
         return &data->prvdata;
     }
@@ -162,7 +146,11 @@ static inline chpl_task_prvData_t* chpl_task_getPrvData(void)
 static inline
 c_sublocid_t chpl_task_getSubloc(void)
 {
-    return (c_sublocid_t) qthread_shep();
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
+    if (data) 
+        return data->bundle->requestedSubloc;
+    else 
+        return c_sublocid_any;
 }
 
 #ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL
@@ -173,8 +161,6 @@ c_sublocid_t chpl_task_getSubloc(void)
 static inline
 void chpl_task_setSubloc(c_sublocid_t subloc)
 {
-    qthread_shepherd_id_t curr_shep;
-
     assert(subloc != c_sublocid_none);
 
     // Only change sublocales if the caller asked for a particular one,
@@ -188,16 +174,10 @@ void chpl_task_setSubloc(c_sublocid_t subloc)
     //       before tasking init and in any case would be done from the
     //       main thread of execution, which doesn't have a shepherd.
     //       The code below wouldn't work in that situation.
-    if ((curr_shep = qthread_shep()) != NO_SHEPHERD) {
-        chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
-        if (data) {
-            data->bundle->requestedSubloc = subloc;
-        }
-
-        if (subloc != c_sublocid_any &&
-            (qthread_shepherd_id_t) subloc != curr_shep) {
-            qthread_migrate_to((qthread_shepherd_id_t) subloc);
-        }
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
+    if (data) {
+        data->bundle->requestedSubloc = subloc;
+        printf("Setting ATMI requested subloc to %d\n", subloc);
     }
 }
 
@@ -209,10 +189,11 @@ void chpl_task_setSubloc(c_sublocid_t subloc)
 static inline
 c_sublocid_t chpl_task_getRequestedSubloc(void)
 {
-    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
     if (data) {
         return data->bundle->requestedSubloc;
     }
+    
     return c_sublocid_any;
 }
 
diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c
index a518957493..8f926c8738 100644
--- a/runtime/src/tasks/atmi/tasks-atmi.c
+++ b/runtime/src/tasks/atmi/tasks-atmi.c
@@ -66,6 +66,11 @@
 #include <unistd.h>
 #include <math.h>
 
+#define max_num_cpu_kernels 4096
+int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; 
+atmi_kernel_t cpu_kernels[max_num_cpu_kernels];
+
+
 #define OUTPUT_ATMI_STATUS(status, msg) \
 { \
     if (ATMI_STATUS_SUCCESS != (status)) { \
@@ -76,8 +81,6 @@
     } \
 }
 
-
-
 //#define SUPPORT_BLOCKREPORT
 //#define SUPPORT_TASKREPORT
 
@@ -182,20 +185,22 @@ chpl_task_bundle_t chpl_qthread_comm_task_bundle = {
                                    .requested_fn = NULL,
                                    .id = chpl_nullTaskID };
 
-chpl_qthread_tls_t chpl_qthread_process_tls = {
+chpl_atmi_tls_t chpl_qthread_process_tls = {
                                .bundle = &chpl_qthread_process_bundle,
                                .lock_filename = 0,
                                .lock_lineno = 0 };
 
-chpl_qthread_tls_t chpl_qthread_comm_task_tls = {
+chpl_atmi_tls_t chpl_qthread_comm_task_tls = {
                                .bundle = &chpl_qthread_comm_task_bundle,
                                .lock_filename = 0,
                                .lock_lineno = 0 };
 
 //
-// chpl_qthread_get_tasklocal() is in tasks-qthreads.h
+// chpl_atmi_get_tasklocal() is in tasks-qthreads.h
 //
 
+pthread_key_t tls_cache;
+
 static syncvar_t exit_ret = SYNCVAR_STATIC_EMPTY_INITIALIZER;
 
 static volatile chpl_bool canCountRunningTasks = false;
@@ -203,11 +208,11 @@ static volatile chpl_bool canCountRunningTasks = false;
 void chpl_task_yield(void)
 {
     PROFILE_INCR(profile_task_yield,1);
-    if (qthread_shep() == NO_SHEPHERD) {
+    /*if (qthread_shep() == NO_SHEPHERD) {
         sched_yield();
     } else {
         qthread_yield();
-    }
+    }*/
 }
 
 // Sync variables
@@ -251,10 +256,37 @@ void chpl_sync_unlock(chpl_sync_aux_t *s)
     qthread_incr(&s->lockers_out, 1);
 }
 
+inline chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void)
+{
+    chpl_atmi_tls_t* tls = NULL;
+
+    pthread_t me = pthread_self();
+    if (pthread_equal(me, chpl_qthread_comm_pthread))
+        tls = &chpl_qthread_comm_task_tls;
+    else if (pthread_equal(me, chpl_qthread_process_pthread))
+        tls = &chpl_qthread_process_tls;
+    else {
+        atmi_task_handle_t t = get_atmi_task_handle();
+        if(t != ATMI_NULL_TASK_HANDLE) {
+            tls = (chpl_atmi_tls_t *)pthread_getspecific(tls_cache);
+            if(tls == NULL) {
+                // FIXME: when to free?
+                tls = (chpl_atmi_tls_t *)chpl_mem_alloc(sizeof(chpl_atmi_tls_t),
+                                               CHPL_RT_MD_THREAD_PRV_DATA,
+                                               0, 0);
+                pthread_setspecific(tls_cache, tls);
+            }
+        }
+        assert(tls != NULL);
+    }
+
+    return tls;
+}
+
 static inline void about_to_block(int32_t  lineno,
                                   int32_t filename)
 {
-    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
     assert(data);
 
     data->lock_lineno   = lineno;
@@ -342,7 +374,7 @@ static void chapel_display_thread(qt_key_t     addr,
                                   void        *tls,
                                   void        *callarg)
 {
-    chpl_qthread_tls_t *rep = (chpl_qthread_tls_t *)tls;
+    chpl_atmi_tls_t *rep = (chpl_atmi_tls_t *)tls;
 
     if (rep) {
         if ((rep->lock_lineno > 0) && rep->lock_filename) {
@@ -683,8 +715,8 @@ static void setupTasklocalStorage(void) {
     // Make sure Qthreads knows how much space we need for per-task
     // local storage.
     tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0);
-    if (tasklocal_size < sizeof(chpl_qthread_tls_t)) {
-        snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_qthread_tls_t));
+    if (tasklocal_size < sizeof(chpl_atmi_tls_t)) {
+        snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_atmi_tls_t));
         chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1);
     }
 }
@@ -698,13 +730,28 @@ static void setupWorkStealing(void) {
     chpl_qt_setenv("STEAL_RATIO", "0", 0);
 }
 
+static aligned_t chapel_wrapper(void *arg);
+static aligned_t main_wrapper(void *arg);
+// If we stored chpl_taskID_t in chpl_task_bundleData_t,
+// this struct and the following function may not be necessary.
+typedef void (*main_ptr_t)(void);
+typedef struct {
+  chpl_task_bundle_t arg;
+  main_ptr_t chpl_main;
+} main_wrapper_bundle_t;
+
 void chpl_task_init(void)
 {
     atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
-    printf("ATMI task initialized\n");
     if(st != ATMI_STATUS_SUCCESS) return;
 
     g_machine = atmi_machine_get_info();
+    
+    pthread_key_create(&tls_cache, NULL);
+    //g_num_cpu_kernels = sizeof(chpl_ftable)/sizeof(chpl_ftable[0]);
+    size_t main_arg_size = sizeof(main_wrapper_bundle_t);
+    atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size);
+    atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL);
 
     int32_t   commMaxThreads;
     int32_t   hwpar;
@@ -748,11 +795,18 @@ void chpl_task_init(void)
 
 void chpl_task_exit(void)
 {
+    //for(int i = 0; i < g_num_cpu_kernels; i++) {
+    for(int i = 0; i < max_num_cpu_kernels; i++) {
+        if(cpu_kernels_initialized[i]) 
+            atmi_kernel_release(cpu_kernels[i]);
+    }
+    atmi_kernel_release(main_kernel);
     atmi_finalize();
 #ifdef CHAPEL_PROFILE
     profile_print();
 #endif /* CHAPEL_PROFILE */
 
+    pthread_key_delete(tls_cache);
     if (qthread_shep() == NO_SHEPHERD) {
         /* sometimes, tasking is told to shutdown even though it hasn't been
          * told to start yet */
@@ -781,20 +835,12 @@ static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind,
 }
 
 
-// If we stored chpl_taskID_t in chpl_task_bundleData_t,
-// this struct and the following function may not be necessary.
-typedef void (*main_ptr_t)(void);
-typedef struct {
-  chpl_task_bundle_t arg;
-  main_ptr_t chpl_main;
-} main_wrapper_bundle_t;
-
 static aligned_t main_wrapper(void *arg)
 {
-    chpl_qthread_tls_t         *tls = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t         *tls = chpl_atmi_get_tasklocal();
     main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg;
     chpl_task_bundle_t      *bundle = &m_bundle->arg;
-    chpl_qthread_tls_t         pv = {.bundle = bundle};
+    chpl_atmi_tls_t         pv = {.bundle = bundle};
 
     *tls = pv;
 
@@ -809,18 +855,17 @@ static aligned_t main_wrapper(void *arg)
     return 0;
 }
 
-
 static aligned_t chapel_wrapper(void *arg)
 {
-    chpl_qthread_tls_t    *tls = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t    *tls = chpl_atmi_get_tasklocal();
     chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg;
-    chpl_qthread_tls_t      pv = {.bundle = bundle};
+    chpl_atmi_tls_t      pv = {.bundle = bundle};
 
     *tls = pv;
 
     if (bundle->countRunning)
       chpl_taskRunningCntInc(0, 0);
-
+    
     wrap_callbacks(chpl_task_cb_event_kind_begin, bundle);
 
     // launch GPU kernel here? 
@@ -869,8 +914,18 @@ void chpl_task_callMain(void (*chpl_main)(void))
 
     wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg);
 
-    qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret);
-    qthread_syncvar_readFF(NULL, &exit_ret);
+    int cpu_id = 0;//subloc;
+    ATMI_LPARM_CPU(lparm, cpu_id);
+    lparm->kernel_id = CPU_FUNCTION_IMPL;
+    lparm->synchronous = ATMI_TRUE;
+    void *kernargs[1];
+    //void *arg1 = (void *)&arg;
+    kernargs[0] = (void *)&arg;
+    //atmi_task_launch(lparm, main_kernel, kernargs);
+
+    main_wrapper(&arg);
+    //qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret);
+    //qthread_syncvar_readFF(NULL, &exit_ret);
 }
 
 void chpl_task_stdModulesInitialized(void)
@@ -911,6 +966,7 @@ void chpl_task_addToTaskList(chpl_fn_int_t       fid,
                              int                 lineno,
                              int32_t             filename)
 {
+    //printf("Adding %d fn to task list\n", fid);
     chpl_bool serial_state = chpl_task_getSerial();
     chpl_fn_p requested_fn = chpl_ftable[fid];
 
@@ -935,12 +991,24 @@ void chpl_task_addToTaskList(chpl_fn_int_t       fid,
 
         wrap_callbacks(chpl_task_cb_event_kind_create, arg);
 
-        if (subloc == c_sublocid_any) {
+        int cpu_id = 0;//subloc;
+        ATMI_LPARM_CPU(lparm, cpu_id);
+        lparm->kernel_id = CPU_FUNCTION_IMPL;
+        //lparm->synchronous = ATMI_TRUE;
+        void *kernargs[1];
+        if(!cpu_kernels_initialized[fid]) {
+            atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size);
+            atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL);
+            cpu_kernels_initialized[fid] = 1;
+        }
+        kernargs[0] = (void *)arg;
+        atmi_task_launch(lparm, cpu_kernels[fid], kernargs);
+        /*if (subloc == c_sublocid_any) {
             qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL);
         } else {
             qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size,
                                      NULL, (qthread_shepherd_id_t) subloc);
-        }
+        }*/
     }
 }
 
@@ -954,6 +1022,7 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p
                                 c_sublocid_t subloc,  chpl_bool serial_state,
                                 int lineno, int32_t filename)
 {
+    //printf("Adding %d fn to task call body\n", fid);
     chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg;
 
     bundle->serial_state       = serial_state;
@@ -969,12 +1038,24 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p
 
     wrap_callbacks(chpl_task_cb_event_kind_create, bundle);
 
-    if (subloc < 0) {
+    int cpu_id = 0;//subloc;
+    ATMI_LPARM_CPU(lparm, cpu_id);
+    lparm->kernel_id = CPU_FUNCTION_IMPL;
+    //lparm->synchronous = ATMI_TRUE;
+    void *kernargs[1];
+    if(!cpu_kernels_initialized[fid]) {
+        atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size);
+        atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL);
+        cpu_kernels_initialized[fid] = 1;
+    }
+    kernargs[0] = (void *)arg;
+    atmi_task_launch(lparm, cpu_kernels[fid], kernargs);
+    /*if (subloc < 0) {
         qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL);
     } else {
         qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size,
-                                 NULL, (qthread_shepherd_id_t) subloc);
-    }
+                                 NULL, (qthread_shepherd_id_t) 0);
+    }*/
 }
 
 void chpl_task_taskCallFTable(chpl_fn_int_t fid,
@@ -1024,20 +1105,13 @@ void chpl_task_startMovedTask(chpl_fn_int_t       fid,
 // Returns '(unsigned int)-1' if called outside of the tasking layer.
 chpl_taskID_t chpl_task_getId(void)
 {
-    chpl_qthread_tls_t *tls = chpl_qthread_get_tasklocal();
-    chpl_taskID_t *id_ptr = NULL;
-
     PROFILE_INCR(profile_task_getId,1);
 
-    if (tls == NULL)
+    atmi_task_handle_t t = get_atmi_task_handle();
+    if(t == ATMI_NULL_TASK_HANDLE)
         return (chpl_taskID_t) -1;
 
-    id_ptr = &tls->bundle->id;
-
-    if (*id_ptr == chpl_nullTaskID)
-        *id_ptr = qthread_incr(&next_task_id, 1);
-
-    return *id_ptr;
+    return t;
 }
 
 void chpl_task_sleep(double secs)
@@ -1077,18 +1151,27 @@ void chpl_task_sleep(double secs)
 
 /* The get- and setSerial() methods assume the beginning of the task-local
  * data segment holds a chpl_bool denoting the serial state. */
+#if 0
+chpl_bool get_serial(chpl_taskID_t id) {
+    if(g_task_map.find(id) == g_task_map.end()) 
+        return true;
+    else
+        return g_task_map[id];
+}
+#endif
+
 chpl_bool chpl_task_getSerial(void)
 {
-    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
 
     PROFILE_INCR(profile_task_getSerial,1);
-
+    
     return data->bundle->serial_state;
 }
 
 void chpl_task_setSerial(chpl_bool state)
 {
-    chpl_qthread_tls_t * data = chpl_qthread_get_tasklocal();
+    chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal();
     data->bundle->serial_state = state;
 
     PROFILE_INCR(profile_task_setSerial,1);
@@ -1096,14 +1179,14 @@ void chpl_task_setSerial(chpl_bool state)
 
 uint32_t chpl_task_getMaxPar(void) {
     //chpl_internal_error("Qthreads max tasks par asked\n");
-    printf("Qthreads max task par asked\n");
+    //printf("Qthreads max task par asked\n");
     //
     // We assume here that the caller (in the LocaleModel module code)
     // is interested in the number of workers on the whole node, and
     // will decide itself how much parallelism to create across and
     // within sublocales, if there are any.
     //
-    return (uint32_t) qthread_num_workers();
+    return (uint32_t) 8;//qthread_num_workers();
 }
 
 c_sublocid_t chpl_task_getNumSublocales(void)
@@ -1145,7 +1228,7 @@ int32_t chpl_task_getNumBlockedTasks(void)
 
 uint32_t chpl_task_getNumThreads(void)
 {
-    return (uint32_t)qthread_num_workers();
+    return (uint32_t) 8;//qthread_num_workers();
 }
 
 // Ew. Talk about excessive bookkeeping.

From cc5406c05ec1dcf967986089a8cd9253674bc437 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Sat, 4 Mar 2017 03:15:49 -0600
Subject: [PATCH 05/10] minor bug fix for constant expression

---
 runtime/include/tasks/atmi/tasks-atmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h
index 44adc76308..3f1497b61d 100644
--- a/runtime/include/tasks/atmi/tasks-atmi.h
+++ b/runtime/include/tasks/atmi/tasks-atmi.h
@@ -51,7 +51,7 @@ void chpl_task_yield(void);
 // between C code and Chapel code in the runtime.
 //
 typedef uint64_t chpl_taskID_t;
-#define chpl_nullTaskID ATMI_NULL_TASK_HANDLE
+#define chpl_nullTaskID ATMI_TASK_HANDLE(0xFFFFFFFFFFFFFFFF) 
 
 //
 // Sync variables

From 5307da3caac2841fb94d92037826b54ffe3e2a60 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Mon, 3 Apr 2017 03:07:38 -0500
Subject: [PATCH 06/10] [Feature] cobegin uses ATMI for CPU task management.
 But, TARGET_HSA should be enabled to do that, but it is just conforming to
 the rest of our conditional compilation strategy anyway.

---
 compiler/AST/build.cpp                       | 20 +++++
 compiler/AST/expr.cpp                        | 16 +++-
 compiler/codegen/expr.cpp                    | 10 ++-
 compiler/include/expr.h                      | 12 ++-
 compiler/passes/buildDefaultFunctions.cpp    |  4 +
 compiler/passes/insertLineNumbers.cpp        |  9 ++-
 compiler/passes/parallel.cpp                 | 28 +++++--
 modules/internal/ChapelBase.chpl             | 82 +++++++++++++++++++-
 modules/internal/LocaleModelHelpRuntime.chpl | 10 ++-
 runtime/include/chpl-tasks.h                 |  5 ++
 runtime/src/tasks/atmi/tasks-atmi.c          | 57 ++++++++++++++
 11 files changed, 228 insertions(+), 25 deletions(-)

diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp
index 6bacf2f15e..8ffa86a44d 100644
--- a/compiler/AST/build.cpp
+++ b/compiler/AST/build.cpp
@@ -2870,6 +2870,7 @@ buildOnStmt(Expr* expr, Expr* stmt) {
   }
 }
 
+//#define TARGET_HSA_BEGIN
 
 BlockStmt*
 buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
@@ -2890,12 +2891,20 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
     return body;
   } else {
     BlockStmt* block = buildChapelStmt();
+#ifdef TARGET_HSA_BEGIN
     block->insertAtTail(new CallExpr("_upEndCount"));
+#else
+    block->insertAtTail(new CallExpr("_addToTaskGroup"));
+#endif
     BlockStmt* beginBlock = new BlockStmt();
     beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN));
     addByrefVars(beginBlock, byref_vars);
     beginBlock->insertAtHead(stmt);
+#ifdef TARGET_HSA_BEGIN
     beginBlock->insertAtTail(new CallExpr("_downEndCount"));
+#else
+    beginBlock->insertAtTail(new CallExpr("_completeTaskGroup"));
+#endif
     block->insertAtTail(beginBlock);
     return block;
   }
@@ -2947,13 +2956,24 @@ buildCobeginStmt(CallExpr* byref_vars, BlockStmt* block) {
     addByrefVars(beginBlk, byref_vars ? byref_vars->copy() : NULL);
     stmt->insertBefore(beginBlk);
     beginBlk->insertAtHead(stmt->remove());
+#ifdef TARGET_HSA
     beginBlk->insertAtTail(new CallExpr("_downEndCount", cobeginCount));
     block->insertAtHead(new CallExpr("_upEndCount", cobeginCount));
+#else
+    beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", cobeginCount));
+#endif
   }
+#ifndef TARGET_HSA
+    block->insertAtHead(new CallExpr("_initTaskGroup", cobeginCount));
+#endif
 
   block->insertAtHead(new CallExpr(PRIM_MOVE, cobeginCount, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gTrue)));
   block->insertAtHead(new DefExpr(cobeginCount));
+#ifdef TARGET_HSA
   block->insertAtTail(new CallExpr("_waitEndCount", cobeginCount));
+#else  
+  block->insertAtTail(new CallExpr("_finalizeTaskGroup", cobeginCount));
+#endif
   block->insertAtTail(new CallExpr("_endCountFree", cobeginCount));
 
   block->astloc = cobeginCount->astloc; // grab the location of 'cobegin' kw
diff --git a/compiler/AST/expr.cpp b/compiler/AST/expr.cpp
index a651651883..feb9b73209 100644
--- a/compiler/AST/expr.cpp
+++ b/compiler/AST/expr.cpp
@@ -793,7 +793,8 @@ CallExpr::CallExpr(BaseAST* base,
                    BaseAST* arg2,
                    BaseAST* arg3,
                    BaseAST* arg4,
-                   BaseAST* arg5) :
+                   BaseAST* arg5,
+                   BaseAST* arg6) :
   Expr(E_CallExpr),
   primitive(NULL),
   baseExpr(NULL),
@@ -815,6 +816,7 @@ CallExpr::CallExpr(BaseAST* base,
   callExprHelper(this, arg3);
   callExprHelper(this, arg4);
   callExprHelper(this, arg5);
+  callExprHelper(this, arg6);
 
   argList.parent = this;
 
@@ -827,7 +829,8 @@ CallExpr::CallExpr(PrimitiveOp* prim,
                    BaseAST*     arg2,
                    BaseAST*     arg3,
                    BaseAST*     arg4,
-                   BaseAST*     arg5) :
+                   BaseAST*     arg5,
+                   BaseAST*     arg6) :
   Expr(E_CallExpr),
   primitive(prim),
   baseExpr(NULL),
@@ -840,6 +843,7 @@ CallExpr::CallExpr(PrimitiveOp* prim,
   callExprHelper(this, arg3);
   callExprHelper(this, arg4);
   callExprHelper(this, arg5);
+  callExprHelper(this, arg6);
 
   argList.parent = this;
 
@@ -851,7 +855,8 @@ CallExpr::CallExpr(PrimitiveTag prim,
                    BaseAST*     arg2,
                    BaseAST*     arg3,
                    BaseAST*     arg4,
-                   BaseAST*     arg5) :
+                   BaseAST*     arg5,
+                   BaseAST*     arg6) :
   Expr(E_CallExpr),
   primitive(primitives[prim]),
   baseExpr(NULL),
@@ -864,6 +869,7 @@ CallExpr::CallExpr(PrimitiveTag prim,
   callExprHelper(this, arg3);
   callExprHelper(this, arg4);
   callExprHelper(this, arg5);
+  callExprHelper(this, arg6);
 
   argList.parent = this;
 
@@ -876,7 +882,8 @@ CallExpr::CallExpr(const char* name,
                    BaseAST*    arg2,
                    BaseAST*    arg3,
                    BaseAST*    arg4,
-                   BaseAST*    arg5) :
+                   BaseAST*    arg5,
+                   BaseAST*    arg6) :
   Expr(E_CallExpr),
   primitive(NULL),
   baseExpr(new UnresolvedSymExpr(name)),
@@ -889,6 +896,7 @@ CallExpr::CallExpr(const char* name,
   callExprHelper(this, arg3);
   callExprHelper(this, arg4);
   callExprHelper(this, arg5);
+  callExprHelper(this, arg6);
 
   argList.parent = this;
 
diff --git a/compiler/codegen/expr.cpp b/compiler/codegen/expr.cpp
index c3f0fd583a..1fee3d395e 100644
--- a/compiler/codegen/expr.cpp
+++ b/compiler/codegen/expr.cpp
@@ -5350,11 +5350,12 @@ void CallExpr::codegenInvokeOnFun() {
 void CallExpr::codegenInvokeTaskFun(const char* name) {
   FnSymbol*           fn            = isResolved();
   GenRet              taskList      = codegenValue(get(1));
+  GenRet              taskGroup     = codegenValue(get(6));
   GenRet              taskListNode;
   GenRet              taskBundle;
   GenRet              bundleSize;
 
-  std::vector<GenRet> args(8);
+  std::vector<GenRet> args(9);
 
   // get(1) is a ref/wide ref to a task list value
   // get(2) is the node ID owning the task list
@@ -5374,9 +5375,10 @@ void CallExpr::codegenInvokeTaskFun(const char* name) {
   args[2]      = codegenCast("chpl_task_bundle_p", taskBundle);
   args[3]      = bundleSize;
   args[4]      = taskList;
-  args[5]      = codegenValue(taskListNode);
-  args[6]      = fn->linenum();
-  args[7]      = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32);
+  args[5]      = taskGroup;
+  args[6]      = codegenValue(taskListNode);
+  args[7]      = fn->linenum();
+  args[8]      = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32);
 
   genComment(fn->cname, true);
 
diff --git a/compiler/include/expr.h b/compiler/include/expr.h
index a26acaafed..5233984569 100644
--- a/compiler/include/expr.h
+++ b/compiler/include/expr.h
@@ -199,28 +199,32 @@ class CallExpr : public Expr {
            BaseAST*     arg2 = NULL,
            BaseAST*     arg3 = NULL,
            BaseAST*     arg4 = NULL,
-           BaseAST*     arg5 = NULL);
+           BaseAST*     arg5 = NULL,
+           BaseAST*     arg6 = NULL);
 
   CallExpr(PrimitiveOp* prim,
            BaseAST*     arg1 = NULL,
            BaseAST*     arg2 = NULL,
            BaseAST*     arg3 = NULL,
            BaseAST*     arg4 = NULL,
-           BaseAST*     arg5 = NULL);
+           BaseAST*     arg5 = NULL,
+           BaseAST*     arg6 = NULL);
 
   CallExpr(PrimitiveTag prim,
            BaseAST*     arg1 = NULL,
            BaseAST*     arg2 = NULL,
            BaseAST*     arg3 = NULL,
            BaseAST*     arg4 = NULL,
-           BaseAST*     arg5 = NULL);
+           BaseAST*     arg5 = NULL,
+           BaseAST*     arg6 = NULL);
 
   CallExpr(const char*  name,
            BaseAST*     arg1 = NULL,
            BaseAST*     arg2 = NULL,
            BaseAST*     arg3 = NULL,
            BaseAST*     arg4 = NULL,
-           BaseAST*     arg5 = NULL);
+           BaseAST*     arg5 = NULL,
+           BaseAST*     arg6 = NULL);
 
   ~CallExpr();
 
diff --git a/compiler/passes/buildDefaultFunctions.cpp b/compiler/passes/buildDefaultFunctions.cpp
index 6ffa649baf..4c784fb274 100644
--- a/compiler/passes/buildDefaultFunctions.cpp
+++ b/compiler/passes/buildDefaultFunctions.cpp
@@ -581,7 +581,11 @@ static void build_chpl_entry_points() {
   // endcount (see comment above)
   //
   if (fMinimalModules == false) {
+#ifdef TARGET_HSA
     chpl_gen_main->insertAtTail(new CallExpr("_waitEndCount"));
+#else    
+    chpl_gen_main->insertAtTail(new CallExpr("_finalizeTaskGroup"));
+#endif
   }
 
   chpl_gen_main->insertAtTail(new CallExpr(PRIM_RETURN, main_ret));
diff --git a/compiler/passes/insertLineNumbers.cpp b/compiler/passes/insertLineNumbers.cpp
index 1c4d34beae..b272c4d324 100644
--- a/compiler/passes/insertLineNumbers.cpp
+++ b/compiler/passes/insertLineNumbers.cpp
@@ -283,6 +283,7 @@ void insertLineNumbers() {
     insertNilChecks();
   }
 
+  printf("Done 1\n");
   // loop over all primitives that require a line number and filename
   // and pass them an actual line number and filename
   forv_Vec(CallExpr, call, gCallExprs) {
@@ -291,6 +292,7 @@ void insertLineNumbers() {
     }
   }
 
+  printf("Done 2\n");
   // loop over all marked functions ("insert line file info"), add
   // line number and filename arguments to these functions, and add
   // them to the queue
@@ -304,6 +306,7 @@ void insertLineNumbers() {
     }
   }
 
+  printf("Done 3\n");
   // loop over all functions in the queue and all calls to these
   // functions, and pass the calls an actual line number and filename
   forv_Vec(FnSymbol, fn, queue) {
@@ -312,7 +315,9 @@ void insertLineNumbers() {
     }
   }
 
+  printf("Done 4\n");
   moveLinenoInsideArgBundle();
+  printf("Done 5\n");
 }
 
 static void moveLinenoInsideArgBundle()
@@ -325,8 +330,8 @@ static void moveLinenoInsideArgBundle()
     //  than the expected number.  Both block types below expect an
     //  argument bundle, and the on-block expects an additional argument
     //  that is the locale on which it should be executed.
-    if ((fn->numFormals() > 4 && fn->hasFlag(FLAG_ON_BLOCK)) ||
-        (fn->numFormals() > 5 && !fn->hasFlag(FLAG_ON_BLOCK) &&
+    if ((fn->numFormals() > 5 && fn->hasFlag(FLAG_ON_BLOCK)) ||
+        (fn->numFormals() > 6 && !fn->hasFlag(FLAG_ON_BLOCK) &&
          (fn->hasFlag(FLAG_BEGIN_BLOCK) ||
           fn->hasFlag(FLAG_COBEGIN_OR_COFORALL_BLOCK)))) {
 
diff --git a/compiler/passes/parallel.cpp b/compiler/passes/parallel.cpp
index 78ef16120e..e88151bc20 100644
--- a/compiler/passes/parallel.cpp
+++ b/compiler/passes/parallel.cpp
@@ -99,7 +99,7 @@ static BundleArgsFnData bundleArgsFnDataInit = { true, NULL, NULL };
 static void insertEndCounts();
 static void passArgsToNestedFns();
 static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnData &baData);
-static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode);
+static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup);
 static void findBlockRefActuals(Vec<Symbol*>& refSet, Vec<Symbol*>& refVec);
 static void findHeapVarsAndRefs(Map<Symbol*,Vec<SymExpr*>*>& defMap,
                                 Vec<Symbol*>& refSet, Vec<Symbol*>& refVec,
@@ -410,6 +410,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) {
   // first argument to a task launch function.
   Symbol* endCount = NULL;
   VarSymbol *taskList = NULL;
+  VarSymbol *taskGroup = NULL;
   VarSymbol *taskListNode = NULL;
 
   if (!fn->hasFlag(FLAG_ON)) {
@@ -466,6 +467,16 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) {
                                                   endCount,
                                                   endCount->typeInfo()->getField("taskList"))));
 
+    // Now get the taskGroup field out of the end count.
+
+    taskGroup = newTemp(astr("_taskGroup", fn->name), dtCVoidPtr);
+
+    fcall->insertBefore(new DefExpr(taskGroup));
+    fcall->insertBefore(new CallExpr(PRIM_MOVE, taskGroup,
+                                     new CallExpr(PRIM_GET_MEMBER,
+                                                  endCount,
+                                                  endCount->typeInfo()->getField("taskGroup"))));
+
 
     // Now get the node ID field for the end count,
     // which is where the task list is stored.
@@ -481,7 +492,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) {
     create_block_fn_wrapper(fn, fcall, baData);
 
   // call wrapper-function
-  call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode);
+  call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode, taskGroup);
   baData.firstCall = false;
 }
 
@@ -492,7 +503,8 @@ static CallExpr* helpFindDownEndCount(BlockStmt* block)
   while (cur && (isCallExpr(cur) || isDefExpr(cur) || isBlockStmt(cur))) {
     if (CallExpr* call = toCallExpr(cur)) {
       if (call->isResolved())
-        if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0)
+        if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0 ||
+            strcmp(call->resolvedFunction()->name, "_completeTaskGroup") == 0)
           return call;
     } else if (BlockStmt* inner = toBlockStmt(cur)) {
       // Need to handle local blocks since the compiler
@@ -646,6 +658,10 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD
                                             dtCVoidPtr->refType );
     taskListArg->addFlag(FLAG_NO_CODEGEN);
     wrap_fn->insertFormalAtTail(taskListArg);
+    ArgSymbol *taskGroupArg = new ArgSymbol( INTENT_IN, "dummy_taskGroup", 
+                                            dtCVoidPtr->refType );
+    taskGroupArg->addFlag(FLAG_NO_CODEGEN);
+    wrap_fn->insertFormalAtTail(taskGroupArg);
     ArgSymbol *taskListNode = new ArgSymbol( INTENT_IN, "dummy_taskListNode", 
                                              dtInt[INT_SIZE_DEFAULT]);
     taskListNode->addFlag(FLAG_NO_CODEGEN);
@@ -749,20 +765,20 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD
 
 static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol*
     args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn,
-    Symbol* taskList, Symbol* taskListNode)
+    Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup)
 {
   // The wrapper function is called with the bundled argument list.
   if (fn->hasFlag(FLAG_ON)) {
     // For an on block, the first argument is also passed directly
     // to the wrapper function.
     // The forking function uses this to fork a task on the target locale.
-    fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc));
+    fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc));//, new SymExpr(taskGroup)));
   } else {
     // For non-on blocks, the task list is passed directly to the function
     // (so that codegen can find it).
     // We need the taskList.
     INT_ASSERT(taskList);
-    fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc));
+    fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc, new SymExpr(taskGroup)));
   }
 
   fcall->remove();                     // rm orig. call
diff --git a/modules/internal/ChapelBase.chpl b/modules/internal/ChapelBase.chpl
index fcdb87690c..ba6210d01b 100644
--- a/modules/internal/ChapelBase.chpl
+++ b/modules/internal/ChapelBase.chpl
@@ -763,7 +763,8 @@ module ChapelBase {
     type taskType;
     var i: iType,
         taskCnt: taskType,
-        taskList: c_void_ptr = _defaultOf(c_void_ptr);
+        taskList: c_void_ptr = _defaultOf(c_void_ptr),
+        taskGroup: c_void_ptr = _defaultOf(c_void_ptr);
   }
 
   // This function is called once by the initiating task.  No on
@@ -786,10 +787,17 @@ module ChapelBase {
     }
   }
 
+
   // Compiler looks for this variable to determine the return type of
   // the "get end count" primitive.
   type _remoteEndCountType = _endCountAlloc(false).type;
 
+  pragma "insert line file info"
+  extern proc chpl_taskGroupInit(): c_void_ptr;
+  extern proc chpl_taskGroupGet(): c_void_ptr;
+  extern proc chpl_taskGroupComplete();
+  extern proc chpl_taskGroupFinalize(task_group: c_void_ptr);
+ 
   // This function is called once by the initiating task.  As above, no
   // on statement needed.
   pragma "dont disable remote value forwarding"
@@ -797,6 +805,36 @@ module ChapelBase {
     delete e;
   }
 
+  // This function is called by the initiating task once for each new
+  // task *before* any of the tasks are started.  As above, no on
+  // statement needed.
+  pragma "dont disable remote value forwarding"
+  pragma "no remote memory fence"
+  proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true) {
+    e.taskGroup = chpl_taskGroupGet();
+  }
+
+  pragma "dont disable remote value forwarding"
+  pragma "no remote memory fence"
+  proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) {
+    e.taskGroup = chpl_taskGroupGet();
+  }
+
+  // This function is called by the initiating task once for each new
+  // task *before* any of the tasks are started.  As above, no on
+  // statement needed.
+  pragma "dont disable remote value forwarding"
+  pragma "no remote memory fence"
+  proc _initTaskGroup(e: _EndCount, param countRunningTasks=true) {
+    e.taskGroup = chpl_taskGroupInit();
+  }
+
+  pragma "dont disable remote value forwarding"
+  pragma "no remote memory fence"
+  proc _initTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) {
+    e.taskGroup = chpl_taskGroupInit();
+  }
+
   // This function is called by the initiating task once for each new
   // task *before* any of the tasks are started.  As above, no on
   // statement needed.
@@ -838,6 +876,26 @@ module ChapelBase {
     e.i.sub(1, memory_order_release);
   }
 
+  // This function is called once by each newly initiated task.  No on
+  // statement is needed because the call to sub() will do a remote
+  // fork (on) if needed.
+  pragma "dont disable remote value forwarding"
+  proc _completeTaskGroup(e: _EndCount) {
+    chpl_taskGroupComplete();
+  }
+
+  // This function is called once by the initiating task.  As above, no
+  // on statement needed.
+  pragma "dont disable remote value forwarding"
+  proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true) {
+      chpl_taskGroupFinalize(e.taskGroup);
+  }
+
+  pragma "dont disable remote value forwarding"
+  proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) {
+    chpl_taskGroupFinalize(e.taskGroup);
+  }
+
   // This function is called once by the initiating task.  As above, no
   // on statement needed.
   pragma "dont disable remote value forwarding"
@@ -867,7 +925,7 @@ module ChapelBase {
   proc _waitEndCount(e: _EndCount, param countRunningTasks=true, numTasks) {
     // See if we can help with any of the started tasks
     chpl_taskListExecute(e.taskList);
-
+    
     // Wait for all tasks to finish
     e.i.waitFor(0, memory_order_acquire);
 
@@ -876,6 +934,26 @@ module ChapelBase {
     }
   }
 
+  proc _addToTaskGroup(param countRunningTasks=true) {
+    var e = __primitive("get end count");
+    _addToTaskGroup(e, countRunningTasks);
+  }
+
+  proc _initTaskGroup(param countRunningTasks=true) {
+    var e = __primitive("get end count");
+    _initTaskGroup(e, countRunningTasks);
+  }
+
+  proc _completeTaskGroup() {
+    var e = __primitive("get end count");
+    _completeTaskGroup(e);
+  }
+
+  proc _finalizeTaskGroup(param countRunningTasks=true) {
+    var e = __primitive("get end count");
+    _finalizeTaskGroup(e, countRunningTasks);
+  }
+
   proc _upEndCount(param countRunningTasks=true) {
     var e = __primitive("get end count");
     _upEndCount(e, countRunningTasks);
diff --git a/modules/internal/LocaleModelHelpRuntime.chpl b/modules/internal/LocaleModelHelpRuntime.chpl
index 1eca09e9a9..33550f1ae2 100644
--- a/modules/internal/LocaleModelHelpRuntime.chpl
+++ b/modules/internal/LocaleModelHelpRuntime.chpl
@@ -118,7 +118,7 @@ module LocaleModelHelpRuntime {
                                       args: chpl_task_bundle_p, args_size: size_t,
                                       subloc_id: int,
                                       ref tlist: c_void_ptr, tlist_node_id: int,
-                                      is_begin: bool);
+                                      is_begin: bool, tgroup: c_void_ptr);
   extern proc chpl_task_executeTasksInList(ref tlist: c_void_ptr);
 
   //
@@ -131,10 +131,12 @@ module LocaleModelHelpRuntime {
                              args: chpl_task_bundle_p,      // function args
                              args_size: size_t,     // args size
                              ref tlist: c_void_ptr, // task list
+                             tgroup: c_void_ptr, // task group
                              tlist_node_id: int     // task list owner node
                             ) {
+    warning("begin fn : ", fn);
     chpl_task_addToTaskList(fn, args, args_size,
-                            subloc_id, tlist, tlist_node_id, true);
+                            subloc_id, tlist, tlist_node_id, true, tgroup);
   }
 
   //
@@ -148,10 +150,12 @@ module LocaleModelHelpRuntime {
                               args: chpl_task_bundle_p,      // function args
                               args_size: size_t,     // args size
                               ref tlist: c_void_ptr, // task list
+                              tgroup: c_void_ptr, // task group
                               tlist_node_id: int     // task list owner node
                              ) {
+    warning("cobegin fn : ", fn);
     chpl_task_addToTaskList(fn, args, args_size,
-                            subloc_id, tlist, tlist_node_id, false);
+                            subloc_id, tlist, tlist_node_id, false, tgroup);
    }
 
   //
diff --git a/runtime/include/chpl-tasks.h b/runtime/include/chpl-tasks.h
index 6eac21fb8c..d9fb916627 100644
--- a/runtime/include/chpl-tasks.h
+++ b/runtime/include/chpl-tasks.h
@@ -158,10 +158,15 @@ void chpl_task_addToTaskList(
          void**,             // task list
          c_nodeid_t,         // locale (node) where task list resides
          chpl_bool,          // is begin{} stmt?  (vs. cobegin or coforall)
+         void *,             // task group
          int,                // line at which function begins
          int32_t);           // name of file containing function
 void chpl_task_executeTasksInList(void**);
 
+void *chpl_taskGroupInit(int lineno, int32_t filename);
+void *chpl_taskGroupGet();
+void chpl_taskGroupFinalize(void *task_group);
+void chpl_taskGroupComplete();
 //
 // Call a chpl_ftable[] function in a task.
 //
diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c
index 8f926c8738..95b6b48080 100644
--- a/runtime/src/tasks/atmi/tasks-atmi.c
+++ b/runtime/src/tasks/atmi/tasks-atmi.c
@@ -45,6 +45,7 @@
 #include "chplsys.h"
 #include "chpl-linefile-support.h"
 #include "chpl-tasks.h"
+#include "chpl-atomics.h"
 #include "chpl-tasks-callbacks-internal.h"
 //#include "tasks-qthreads.h"
 #include "tasks-atmi.h"
@@ -69,6 +70,8 @@
 #define max_num_cpu_kernels 4096
 int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; 
 atmi_kernel_t cpu_kernels[max_num_cpu_kernels];
+atmi_kernel_t dummy_kernel;
+static uint_least64_t atmi_tg_id = 0;
 
 
 #define OUTPUT_ATMI_STATUS(status, msg) \
@@ -732,6 +735,7 @@ static void setupWorkStealing(void) {
 
 static aligned_t chapel_wrapper(void *arg);
 static aligned_t main_wrapper(void *arg);
+static aligned_t dummy_wrapper();
 // If we stored chpl_taskID_t in chpl_task_bundleData_t,
 // this struct and the following function may not be necessary.
 typedef void (*main_ptr_t)(void);
@@ -752,6 +756,8 @@ void chpl_task_init(void)
     size_t main_arg_size = sizeof(main_wrapper_bundle_t);
     atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size);
     atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL);
+    atmi_kernel_create_empty(&dummy_kernel, 0, NULL);
+    atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL);
 
     int32_t   commMaxThreads;
     int32_t   hwpar;
@@ -800,6 +806,7 @@ void chpl_task_exit(void)
         if(cpu_kernels_initialized[i]) 
             atmi_kernel_release(cpu_kernels[i]);
     }
+    atmi_kernel_release(dummy_kernel);
     atmi_kernel_release(main_kernel);
     atmi_finalize();
 #ifdef CHAPEL_PROFILE
@@ -835,6 +842,8 @@ static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind,
 }
 
 
+static aligned_t dummy_wrapper() {} 
+
 static aligned_t main_wrapper(void *arg)
 {
     chpl_atmi_tls_t         *tls = chpl_atmi_get_tasklocal();
@@ -956,6 +965,48 @@ int chpl_task_createCommTask(chpl_fn_p fn,
                           NULL, comm_task_wrapper, &wrapper_info);
 }
 
+void *chpl_taskGroupGet() {
+    void *ret = (void *)get_atmi_task_group();
+    printf("Adding to task group: %p\n", ret);
+    return ret;
+}
+
+void *chpl_taskGroupInit(int lineno, int32_t filename) {
+    atmi_task_group_t *tg = (atmi_task_group_t *)chpl_malloc(sizeof(atmi_task_group_t));
+    // TODO: add to a list of task groups and free all of them at the very end.
+    int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed);
+    printf("Next task group ID: %d\n", next_id);
+    tg->id = next_id;
+    tg->ordered = ATMI_FALSE;
+    return tg;                                                                             
+}
+
+void chpl_taskGroupComplete() {
+    // no-op in ATMI. down count in other tasking layers
+}
+
+void chpl_taskGroupFinalize(void *tg) {
+    int cpu_id = 0; // which subloc?
+    atmi_task_group_t *cur_tg = get_atmi_task_group();
+
+    if(cur_tg) {
+        // if I am within a task, provide a chain dependency to the incoming group
+        ATMI_LPARM_CPU(lparm, cpu_id);
+        lparm->kernel_id = CPU_FUNCTION_IMPL; 
+        lparm->groupable = ATMI_TRUE;
+        lparm->group = cur_tg; 
+        lparm->num_required_groups = 1;
+        lparm->required_groups = (atmi_task_group_t **)&tg; 
+        atmi_task_launch(lparm, dummy_kernel, NULL);
+    }
+    else {
+        // if I am not within a task (main task), simply sync
+        printf("Waiting for task group: %p\n", tg);
+        atmi_task_group_sync((atmi_task_group_t *)tg);
+    }
+    //chpl_free(tg);
+}
+
 void chpl_task_addToTaskList(chpl_fn_int_t       fid,
                              chpl_task_bundle_t *arg,
                              size_t              arg_size,
@@ -963,6 +1014,7 @@ void chpl_task_addToTaskList(chpl_fn_int_t       fid,
                              void              **task_list,
                              int32_t             task_list_locale,
                              chpl_bool           is_begin_stmt,
+                             void               *task_group,
                              int                 lineno,
                              int32_t             filename)
 {
@@ -995,6 +1047,11 @@ void chpl_task_addToTaskList(chpl_fn_int_t       fid,
         ATMI_LPARM_CPU(lparm, cpu_id);
         lparm->kernel_id = CPU_FUNCTION_IMPL;
         //lparm->synchronous = ATMI_TRUE;
+        if(task_group) { 
+            lparm->groupable = ATMI_TRUE;
+            lparm->group = (atmi_task_group_t *)task_group;
+        }
+
         void *kernargs[1];
         if(!cpu_kernels_initialized[fid]) {
             atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size);

From 9efaf1e0955eb2246d327b072ea2489ba3e41f0e Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Mon, 3 Apr 2017 12:38:43 -0500
Subject: [PATCH 07/10] [Bug fix] Minor fix for conditional compilation and
 returning 0 as the chpl-atmi null task instead of -1. Rationale to return 0
 instead of chpl_nullTaskID: ATMI returns -1 as the NULL task, but we maintain
 0 as the null task in chpl-atmi. Impl: chpl-atmi does not create a new task
 for the main wrapper. Instead it creates a dummy ATMI task in the beginning
 but does not launch/activate it. This helps us recognize 0 as the main/dummy
 task and not -1 that is expected by ATMI as the NULL task.

---
 compiler/AST/build.cpp                    | 12 ++++++------
 compiler/passes/buildDefaultFunctions.cpp |  2 +-
 compiler/passes/insertLineNumbers.cpp     |  5 -----
 runtime/src/tasks/atmi/tasks-atmi.c       | 11 ++++++++++-
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp
index 8ffa86a44d..d66d9faa25 100644
--- a/compiler/AST/build.cpp
+++ b/compiler/AST/build.cpp
@@ -2870,7 +2870,7 @@ buildOnStmt(Expr* expr, Expr* stmt) {
   }
 }
 
-//#define TARGET_HSA_BEGIN
+#define TARGET_HSA_BEGIN
 
 BlockStmt*
 buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
@@ -2891,7 +2891,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
     return body;
   } else {
     BlockStmt* block = buildChapelStmt();
-#ifdef TARGET_HSA_BEGIN
+#ifndef TARGET_HSA_BEGIN
     block->insertAtTail(new CallExpr("_upEndCount"));
 #else
     block->insertAtTail(new CallExpr("_addToTaskGroup"));
@@ -2900,7 +2900,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
     beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN));
     addByrefVars(beginBlock, byref_vars);
     beginBlock->insertAtHead(stmt);
-#ifdef TARGET_HSA_BEGIN
+#ifndef TARGET_HSA_BEGIN
     beginBlock->insertAtTail(new CallExpr("_downEndCount"));
 #else
     beginBlock->insertAtTail(new CallExpr("_completeTaskGroup"));
@@ -2956,20 +2956,20 @@ buildCobeginStmt(CallExpr* byref_vars, BlockStmt* block) {
     addByrefVars(beginBlk, byref_vars ? byref_vars->copy() : NULL);
     stmt->insertBefore(beginBlk);
     beginBlk->insertAtHead(stmt->remove());
-#ifdef TARGET_HSA
+#ifndef TARGET_HSA
     beginBlk->insertAtTail(new CallExpr("_downEndCount", cobeginCount));
     block->insertAtHead(new CallExpr("_upEndCount", cobeginCount));
 #else
     beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", cobeginCount));
 #endif
   }
-#ifndef TARGET_HSA
+#ifdef TARGET_HSA
     block->insertAtHead(new CallExpr("_initTaskGroup", cobeginCount));
 #endif
 
   block->insertAtHead(new CallExpr(PRIM_MOVE, cobeginCount, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gTrue)));
   block->insertAtHead(new DefExpr(cobeginCount));
-#ifdef TARGET_HSA
+#ifndef TARGET_HSA
   block->insertAtTail(new CallExpr("_waitEndCount", cobeginCount));
 #else  
   block->insertAtTail(new CallExpr("_finalizeTaskGroup", cobeginCount));
diff --git a/compiler/passes/buildDefaultFunctions.cpp b/compiler/passes/buildDefaultFunctions.cpp
index 4c784fb274..d8e6cff7c3 100644
--- a/compiler/passes/buildDefaultFunctions.cpp
+++ b/compiler/passes/buildDefaultFunctions.cpp
@@ -581,7 +581,7 @@ static void build_chpl_entry_points() {
   // endcount (see comment above)
   //
   if (fMinimalModules == false) {
-#ifdef TARGET_HSA
+#ifndef TARGET_HSA
     chpl_gen_main->insertAtTail(new CallExpr("_waitEndCount"));
 #else    
     chpl_gen_main->insertAtTail(new CallExpr("_finalizeTaskGroup"));
diff --git a/compiler/passes/insertLineNumbers.cpp b/compiler/passes/insertLineNumbers.cpp
index b272c4d324..a9e302366d 100644
--- a/compiler/passes/insertLineNumbers.cpp
+++ b/compiler/passes/insertLineNumbers.cpp
@@ -283,7 +283,6 @@ void insertLineNumbers() {
     insertNilChecks();
   }
 
-  printf("Done 1\n");
   // loop over all primitives that require a line number and filename
   // and pass them an actual line number and filename
   forv_Vec(CallExpr, call, gCallExprs) {
@@ -292,7 +291,6 @@ void insertLineNumbers() {
     }
   }
 
-  printf("Done 2\n");
   // loop over all marked functions ("insert line file info"), add
   // line number and filename arguments to these functions, and add
   // them to the queue
@@ -306,7 +304,6 @@ void insertLineNumbers() {
     }
   }
 
-  printf("Done 3\n");
   // loop over all functions in the queue and all calls to these
   // functions, and pass the calls an actual line number and filename
   forv_Vec(FnSymbol, fn, queue) {
@@ -315,9 +312,7 @@ void insertLineNumbers() {
     }
   }
 
-  printf("Done 4\n");
   moveLinenoInsideArgBundle();
-  printf("Done 5\n");
 }
 
 static void moveLinenoInsideArgBundle()
diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c
index 95b6b48080..daeb6be8fa 100644
--- a/runtime/src/tasks/atmi/tasks-atmi.c
+++ b/runtime/src/tasks/atmi/tasks-atmi.c
@@ -759,6 +759,7 @@ void chpl_task_init(void)
     atmi_kernel_create_empty(&dummy_kernel, 0, NULL);
     atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL);
 
+    atmi_task_handle_t dummy_handle = atmi_task_create(dummy_kernel);
     int32_t   commMaxThreads;
     int32_t   hwpar;
     pthread_t initer;
@@ -1165,8 +1166,16 @@ chpl_taskID_t chpl_task_getId(void)
     PROFILE_INCR(profile_task_getId,1);
 
     atmi_task_handle_t t = get_atmi_task_handle();
+    // Rationale to return 0 instead of chpl_nullTaskID: ATMI returns -1 as the NULL 
+    // task, but we maintain 0 as the null task in chpl-atmi. 
+    // Impl: chpl-atmi does not create a new task for the main wrapper. Instead it
+    // creates a dummy ATMI task in the beginning but does not launch/activate
+    // it. This helps us recognize 0 as the main/dummy task and not -1 that is
+    // expected by ATMI as the NULL task. 
+    // FIXME: this could go against the chpl design of having *every* task being
+    // launched, including the main wrapper task. Any problems with that?
     if(t == ATMI_NULL_TASK_HANDLE)
-        return (chpl_taskID_t) -1;
+        return (chpl_taskID_t) 0;
 
     return t;
 }

From 49b3883cff082512359b2baf5280ff7d38901a33 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Wed, 5 Apr 2017 02:01:24 -0500
Subject: [PATCH 08/10] added support for coforall in ATMI

---
 compiler/AST/build.cpp | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp
index d66d9faa25..91ac51a402 100644
--- a/compiler/AST/build.cpp
+++ b/compiler/AST/build.cpp
@@ -1609,12 +1609,21 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices,
       beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL));
       addByrefVars(beginBlk, byref_vars);
       beginBlk->insertAtHead(body);
+#ifndef TARGET_HSA
       beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount));
+#else
+      beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount));
+#endif
       BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered);
       block->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue)));
       block->insertAtHead(new DefExpr(coforallCount));
+#ifndef TARGET_HSA
       beginBlk->insertBefore(new CallExpr("_upEndCount", coforallCount));
       block->insertAtTail(new CallExpr("_waitEndCount", coforallCount));
+#else
+      beginBlk->insertBefore(new CallExpr("_initTaskGroup", coforallCount));
+      block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount));
+#endif
       block->insertAtTail(new CallExpr("_endCountFree", coforallCount));
       nonVectorCoforallBlk->insertAtTail(block);
     }
@@ -1624,15 +1633,27 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices,
       beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL));
       addByrefVars(beginBlk, byref_vars);
       beginBlk->insertAtHead(body->copy());
+#ifndef TARGET_HSA
       beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount));
+#else
+      beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount));
+#endif
       VarSymbol* numTasks = newTemp("numTasks");
       vectorCoforallBlk->insertAtTail(new DefExpr(numTasks));
       vectorCoforallBlk->insertAtTail(new CallExpr(PRIM_MOVE, numTasks, new CallExpr(".", tmpIter,  new_CStringSymbol("size"))));
+#ifndef TARGET_HSA
       vectorCoforallBlk->insertAtTail(new CallExpr("_upEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks));
+#else
+      vectorCoforallBlk->insertAtTail(new CallExpr("_initTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks));
+#endif
       BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered);
       vectorCoforallBlk->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue)));
       vectorCoforallBlk->insertAtHead(new DefExpr(coforallCount));
+#ifndef TARGET_HSA
       block->insertAtTail(new CallExpr("_waitEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks));
+#else
+      block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks));
+#endif
       block->insertAtTail(new CallExpr("_endCountFree", coforallCount));
       vectorCoforallBlk->insertAtTail(block);
     }
@@ -2870,7 +2891,6 @@ buildOnStmt(Expr* expr, Expr* stmt) {
   }
 }
 
-#define TARGET_HSA_BEGIN
 
 BlockStmt*
 buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
@@ -2891,7 +2911,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
     return body;
   } else {
     BlockStmt* block = buildChapelStmt();
-#ifndef TARGET_HSA_BEGIN
+#ifndef TARGET_HSA
     block->insertAtTail(new CallExpr("_upEndCount"));
 #else
     block->insertAtTail(new CallExpr("_addToTaskGroup"));
@@ -2900,7 +2920,7 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) {
     beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN));
     addByrefVars(beginBlock, byref_vars);
     beginBlock->insertAtHead(stmt);
-#ifndef TARGET_HSA_BEGIN
+#ifndef TARGET_HSA
     beginBlock->insertAtTail(new CallExpr("_downEndCount"));
 #else
     beginBlock->insertAtTail(new CallExpr("_completeTaskGroup"));

From b764626a15164775179c36c56e9cb8e39890d005 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Wed, 5 Apr 2017 17:50:39 -0500
Subject: [PATCH 09/10] enabled sync statement to use ATMI

---
 compiler/AST/build.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp
index 91ac51a402..b9412aba95 100644
--- a/compiler/AST/build.cpp
+++ b/compiler/AST/build.cpp
@@ -2940,7 +2940,11 @@ buildSyncStmt(Expr* stmt) {
   block->insertAtTail(new CallExpr(PRIM_MOVE, endCountSave, new CallExpr(PRIM_GET_END_COUNT)));
   block->insertAtTail(new CallExpr(PRIM_SET_END_COUNT, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gFalse)));
   block->insertAtTail(stmt);
+#ifndef TARGET_HSA
   block->insertAtTail(new CallExpr("_waitEndCount"));
+#else
+  block->insertAtTail(new CallExpr("_finalizeTaskGroup"));
+#endif
   block->insertAtTail(new CallExpr("_endCountFree", new CallExpr(PRIM_GET_END_COUNT)));
   block->insertAtTail(new CallExpr(PRIM_SET_END_COUNT, endCountSave));
   return block;

From 5dd51ad47c91d660ae79f19b8f741262066dd187 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Fri, 28 Apr 2017 02:35:34 -0500
Subject: [PATCH 10/10] Recycling task groups and at the same time, fixing the
 max depth of nesting of coforall and cobegin blocks

---
 runtime/src/tasks/atmi/tasks-atmi.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c
index daeb6be8fa..9194494b18 100644
--- a/runtime/src/tasks/atmi/tasks-atmi.c
+++ b/runtime/src/tasks/atmi/tasks-atmi.c
@@ -67,6 +67,10 @@
 #include <unistd.h>
 #include <math.h>
 
+// FIXME: Good idea to recycle the task groups, so this will limit 
+// the depth of nested coforall s and cobegins. Fix to use an ATMI
+// environment variable.
+#define max_num_task_groups 8
 #define max_num_cpu_kernels 4096
 int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; 
 atmi_kernel_t cpu_kernels[max_num_cpu_kernels];
@@ -759,6 +763,11 @@ void chpl_task_init(void)
     atmi_kernel_create_empty(&dummy_kernel, 0, NULL);
     atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL);
 
+    // this increment needed to let the main task not be treated as an ATMI task
+    // because we directly launch the main task with the main thread and dont treat 
+    // it as an ATMI task. This increment fools the rest of the runtime to think that
+    // the main task is an ATMI task, but it is in fact not an ATMI task.
+    int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed);
     atmi_task_handle_t dummy_handle = atmi_task_create(dummy_kernel);
     int32_t   commMaxThreads;
     int32_t   hwpar;
@@ -968,7 +977,6 @@ int chpl_task_createCommTask(chpl_fn_p fn,
 
 void *chpl_taskGroupGet() {
     void *ret = (void *)get_atmi_task_group();
-    printf("Adding to task group: %p\n", ret);
     return ret;
 }
 
@@ -976,7 +984,9 @@ void *chpl_taskGroupInit(int lineno, int32_t filename) {
     atmi_task_group_t *tg = (atmi_task_group_t *)chpl_malloc(sizeof(atmi_task_group_t));
     // TODO: add to a list of task groups and free all of them at the very end.
     int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed);
-    printf("Next task group ID: %d\n", next_id);
+    // loop around the task groups. 
+    // FIXME: how will this affect the main task that is not an ATMI task? Incr by 1 after this?
+    next_id %= max_num_task_groups; 
     tg->id = next_id;
     tg->ordered = ATMI_FALSE;
     return tg;                                                                             
@@ -1002,7 +1012,6 @@ void chpl_taskGroupFinalize(void *tg) {
     }
     else {
         // if I am not within a task (main task), simply sync
-        printf("Waiting for task group: %p\n", tg);
         atmi_task_group_sync((atmi_task_group_t *)tg);
     }
     //chpl_free(tg);
@@ -1048,8 +1057,8 @@ void chpl_task_addToTaskList(chpl_fn_int_t       fid,
         ATMI_LPARM_CPU(lparm, cpu_id);
         lparm->kernel_id = CPU_FUNCTION_IMPL;
         //lparm->synchronous = ATMI_TRUE;
+        lparm->groupable = ATMI_TRUE;
         if(task_group) { 
-            lparm->groupable = ATMI_TRUE;
             lparm->group = (atmi_task_group_t *)task_group;
         }
 
@@ -1080,7 +1089,6 @@ static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p
                                 c_sublocid_t subloc,  chpl_bool serial_state,
                                 int lineno, int32_t filename)
 {
-    //printf("Adding %d fn to task call body\n", fid);
     chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg;
 
     bundle->serial_state       = serial_state;
@@ -1252,7 +1260,8 @@ uint32_t chpl_task_getMaxPar(void) {
     // will decide itself how much parallelism to create across and
     // within sublocales, if there are any.
     //
-    return (uint32_t) 8;//qthread_num_workers();
+    return qthread_num_workers();
+    //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count;
 }
 
 c_sublocid_t chpl_task_getNumSublocales(void)
@@ -1294,7 +1303,8 @@ int32_t chpl_task_getNumBlockedTasks(void)
 
 uint32_t chpl_task_getNumThreads(void)
 {
-    return (uint32_t) 8;//qthread_num_workers();
+    return qthread_num_workers();
+    //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count;
 }
 
 // Ew. Talk about excessive bookkeeping.