From 7980b1afff33d040b5cc04b17393e056a91205a3 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 15:59:32 +0100 Subject: [PATCH 1/8] Parallelised string factorisation Shared variables with manual locking: - hash table - count - reverse_keys - reverse_values - out_buffer - chunk_ Shared variables without locking requirement: - locks Thread-local variables: - thread_id - in_buffer_ptr (points to thread-local buffer) - out_buffer_ptr (points to thread-local buffer) Locking scheme: - For each thread a lock on the hash table (and other associated shared variables) exists. - Each thread processing a chunk begins by acquiring its own lock on the shared hash table. - The lock is released when the thread encounters an value that is new to the hash table. - Once the thread is ready to write to the hash table, it waits to acquire the locks from all threads. - After the write all locks are released. --- Uncompressed bcolz timings: ``` --- uncached unique() --- pandas (in-memory): In [10]: %timeit -r 10 c.unique() 1 loops, best of 10: 881 ms per loop bquery master over bcolz (persistent): In [12]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 2.1 s per loop ==> x2.38 slower than pandas pull request over bcolz (persistent): In [8]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 834 ms per loop ==> x1.05 FASTER than pandas ---- cache_factor --- bquery master over bcolz (persistent): In [3]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 2.51 s per loop pull request with 2 threads over bcolz (persistent): In [3]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 1.16 s per loop ==> x2.16 faster than master pull request with 1 thread over bcolz (persistent): In [3]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 1.69 s per loop ==> x1.48 faster than master (c.f. 
x1.48 from single-threaded PR #21) ==> parallel code seems to have no performance penalty on single-core machines ``` Compressed bcolz timings: ``` --- uncached unique() --- pandas (in-memory): In [10]: %timeit -r 10 c.unique() 1 loops, best of 10: 881 ms per loop bquery master over bcolz (persistent): In [12]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 3.39 s per loop ==> x3.85 slower than pandas pull request over bcolz (persistent): In [8]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 1.9 s per loop ==> x2.16 slower than pandas ---- cache_factor --- bquery master over bcolz (persistent): In [5]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 4.09 s per loop pull request with 2 threads over bcolz (persistent): In [5]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 2.48 s per loop ==> x1.65 faster than master pull request with 1 thread over bcolz (persistent): In [5]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 3.26 s per loop ==> x1.25 faster than master (c.f. 
x1.28 from single-threaded PR #21) ``` --- .gitignore | 3 + bquery/ctable_ext.pyx | 225 ++++++++++++++++++++++++++++++++++-------- bquery/khash.pxd | 4 +- requirements.txt | 2 +- setup.py | 7 +- 5 files changed, 195 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 7b12180..5c1395e 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,10 @@ docs/_build/ target/ # other +*.pyx.orig +cython_debug/ bquery/ctable_ext.c +bquery/ctable_ext.cpp bcolz bquery/benchmarks/vb_suite/benchmarks.db bquery/benchmarks/vb_suite/source \ No newline at end of file diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 1e4becb..e0de1e1 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,8 +1,14 @@ -import numpy as np import cython -from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64 -from libc.stdlib cimport malloc -from libc.string cimport strcpy +from cython.parallel import parallel, prange, threadid +from openmp cimport omp_lock_t, omp_init_lock, omp_set_lock, omp_unset_lock, omp_destroy_lock, omp_get_num_threads + +import numpy as np +from numpy cimport ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy, memset +from libcpp.vector cimport vector from khash cimport * from bcolz.carray_ext cimport carray, chunk @@ -31,94 +37,229 @@ DEF _SORTED_COUNT_DISTINCT = 4 @cython.boundscheck(False) cdef void _factorize_str_helper(Py_ssize_t iter_range, Py_ssize_t allocation_size, - ndarray in_buffer, - ndarray[npy_uint64] out_buffer, + char * in_buffer_ptr, + uint64_t * out_buffer, kh_str_t *table, Py_ssize_t * count, - dict reverse, - ): + vector[char *] & reverse_values, + unsigned int thread_id, + omp_lock_t kh_locks[], + unsigned int num_threads, + ) nogil: cdef: - Py_ssize_t i, idx + Py_ssize_t i, idx, itemsize int ret + unsigned int j 
char * element char * insert khiter_t k ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] + itemsize = allocation_size - 1 + # allocate enough memory to hold the string element, add one for the + # null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) + + omp_set_lock(&kh_locks[thread_id]) + for i in xrange(iter_range): + # strings are stored without null termination in ndarrays: need a + # buffer to append null termination to use usual string algorithms + memcpy(element, in_buffer_ptr, itemsize) + in_buffer_ptr += itemsize + + # while reading from shared hash table, set lock for this thread k = kh_get_str(table, element) if k != table.n_buckets: idx = table.vals[k] else: + omp_unset_lock(&kh_locks[thread_id]) # allocate enough memory to hold the string, add one for the # null byte that marks the end of the string. insert = malloc(allocation_size) # TODO: is strcpy really the best way to copy a string? 
strcpy(insert, element) + # acquire locks for all threads before writing to shared resources + for j in xrange(num_threads): + omp_set_lock(&kh_locks[j]) k = kh_put_str(table, insert, &ret) table.vals[k] = idx = count[0] - reverse[count[0]] = element + reverse_values.push_back(insert) count[0] += 1 + # release all locks + for j in xrange(num_threads): + omp_unset_lock(&kh_locks[j]) + omp_set_lock(&kh_locks[thread_id]) out_buffer[i] = idx + omp_unset_lock(&kh_locks[thread_id]) + free(element) + @cython.wraparound(False) @cython.boundscheck(False) def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements + Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \ + nchunks + unsigned int j, num_threads, thread_id, test + omp_lock_t * kh_locks + omp_lock_t chunk_lock, out_buffer_lock + vector[char *] reverse_values dict reverse - ndarray in_buffer + char * in_buffer_ptr ndarray[npy_uint64] out_buffer + char * out_buffer_org_ptr + uint64_t * out_buffer_ptr kh_str_t *table + uintptr_t leftover_array_ptr + count = 0 ret = 0 reverse = {} + nchunks = carray_.nchunks + # 0 = set the number of parallel threads automatically: + num_threads = 0 n = len(carray_) chunklen = carray_.chunklen + allocation_size = carray_.dtype.itemsize + 1 if labels is None: labels = carray([], dtype='int64', expectedlen=n) - # in-buffer isn't typed, because cython doesn't support string arrays (?) 
- out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype=carray_.dtype) + + leftover_array_ptr = carray_.leftover_array.__array_interface__['data'][0] + table = kh_init_str() - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_str_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) + # Allocate and initialise shared locks for hash-table writes for each child + # thread. (Shared because locks is assigned outside the parallel() block) + # num_threads=0: cython chooses no of parallel threads (= cpu cores?) + with nogil, parallel(num_threads=num_threads): + # determine the number of parallel threads that will be created + # (&var)[0]: workaround to keep num_threads shared and available after + # with block + (&num_threads)[0] = omp_get_num_threads() + kh_locks = malloc(num_threads * sizeof(omp_lock_t)) + for j in xrange(num_threads): + omp_init_lock(&kh_locks[j]) + + # initialise locks for other shared resources + omp_init_lock(&chunk_lock) + omp_init_lock(&out_buffer_lock) + + # create an out_buffer ndarray object to interface with chunk.append() + # TODO: Avoid np.empty as it unnecessarily allocates space for the content. + # Use PyArray_SimpleNewFromData instead!? + # Ref: http://blog.enthought.com/python/numpy-arrays-with-pre-allocated-memory + # Ref: https://gist.github.com/GaelVaroquaux/1249305 + out_buffer = np.empty(chunklen, dtype='uint64') + # we are going to redirect the data pointer, keep note of the original + # to be able to restore it. Otherwise python will try to free an invalid + # pointer when it garbage collects the out_buffer object + out_buffer_org_ptr = out_buffer.data + + # code in the parallel(...) 
block runs "isolated" (to some extent) in + # multiple child threads + # assignments within this block make variables thread-local, + # e.g. in_buffer_ptr. BUT this does not seem to work with 'cdef class' + # defined objects, e.g. numpy.ndarray, bcolz.chunk, etc., and dereferencing. + # The latter is a useful work-around to modifying shared variables from + # within parallel the block if locking is handled manually. + with nogil, parallel(num_threads=num_threads): + # allocate an in-buffer and an out-buffer for each thread + # note 1: cannot use ndarrays like: in_buffer = np.empty(...) + # In spite of the assignment cython does not create a + # thread local ndarray as it should! (Also see note 2 below.) + in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) + out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) + + # This is the work-sharing loop. The body is executed in multiple + # threads with each thread receiving a different value for i. + for i in prange(nchunks): + thread_id = threadid() + # chunk._getitem is not thread-safe. This may be due to it + # setting properties of chunk using self. - Or not. + # TODO: check whether _getitem can be made thread-safe to + # get rid of this probably unnecessary thread lock + omp_set_lock(&chunk_lock) + with gil: + chunk_ = carray_.chunks[i] + # decompress into in_buffer + # note: _getitem releases gil during blosc decompression + chunk_._getitem(0, chunklen, in_buffer_ptr) + omp_unset_lock(&chunk_lock) + + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + reverse_values, + thread_id, + kh_locks, + num_threads, + ) + + # note 2: labels.append(...) requires ndarray, but no thread-local + # ndarrays can be created. 
Use the shared out_buffer + # ndarray object and redirect its data pointer to the thread-local + # c array out_buffer_ptr + omp_set_lock(&out_buffer_lock) + with gil: + out_buffer.data = out_buffer_ptr + # compress out_buffer into labels + # note: append(...) can freeze with bcolz=0.8.0 + labels.append(out_buffer.astype(np.int64)) + omp_unset_lock(&out_buffer_lock) + + # Clean-up thread local variables + free(in_buffer_ptr) + free(out_buffer_ptr) + + # processing of leftover_array is not parallelised in view of an upcoming + # changes to bcolz chunk access which will allow seamlessly folding + # this into the prange block leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) if leftover_elements > 0: + out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) _factorize_str_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) + allocation_size, + leftover_array_ptr, + out_buffer_ptr, + table, + &count, + reverse_values, + 0, + kh_locks, + num_threads, + ) + out_buffer.data = out_buffer_ptr + # compress out_buffer into labels + labels.append(out_buffer[:leftover_elements].astype(np.int64)) - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) + # destroy and free all locks + for j in xrange(num_threads): + omp_destroy_lock(&kh_locks[j]) + free(kh_locks) + omp_destroy_lock(&chunk_lock) + omp_destroy_lock(&out_buffer_lock) + + # restore out_buffer data pointer to avoid segfault when python thries + # to free the object's data + out_buffer.data = out_buffer_org_ptr kh_destroy_str(table) + # construct python dict from vectors and + # free the memory allocated for the strings in the reverse_values list + for i in xrange(reverse_values.size()): + reverse[i] = reverse_values[i] + free(reverse_values[i]) + return labels, reverse @cython.wraparound(False) diff --git a/bquery/khash.pxd b/bquery/khash.pxd index 
a8fd51a..5e855c7 100644 --- a/bquery/khash.pxd +++ b/bquery/khash.pxd @@ -48,9 +48,9 @@ cdef extern from "khash_python.h": inline kh_str_t* kh_init_str() inline void kh_destroy_str(kh_str_t*) inline void kh_clear_str(kh_str_t*) - inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) + inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil inline void kh_resize_str(kh_str_t*, khint_t) - inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) + inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil inline void kh_del_str(kh_str_t*, khint_t) bint kh_exist_str(kh_str_t*, khiter_t) diff --git a/requirements.txt b/requirements.txt index 86f0f4f..c85e29c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -bcolz>=0.8.0 +bcolz>=0.8.1 diff --git a/setup.py b/setup.py index d311871..fbd0ac1 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,10 @@ def check_import(pkgname, pkgver): # Allow setting the Blosc dir if installed in the system BLOSC_DIR = os.environ.get('BLOSC_DIR', '') +# OpenMP libraries +CFLAGS.append('-fopenmp') +LFLAGS.append('-fopenmp') + # Sources & libraries inc_dirs = ['bquery'] lib_dirs = [] @@ -137,7 +141,8 @@ def check_import(pkgname, pkgver): library_dirs=lib_dirs, libraries=libs, extra_link_args=LFLAGS, - extra_compile_args=CFLAGS), + extra_compile_args=CFLAGS, + language='c++'), ], packages=['bquery', 'bquery.tests'], ) From 3ff7cde04f8a84cc3b5736990dfe0f1cc1b4b82a Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 18:25:31 +0100 Subject: [PATCH 2/8] temporary fix for out-of-order chunk results timings are similar --- bquery/ctable_ext.pyx | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index e0de1e1..0359302 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -110,6 +110,7 @@ def factorize_str(carray carray_, carray labels=None): dict reverse char * in_buffer_ptr ndarray[npy_uint64] out_buffer + ndarray[npy_uint64] tmp_labels char * 
out_buffer_org_ptr uint64_t * out_buffer_ptr kh_str_t *table @@ -161,6 +162,10 @@ def factorize_str(carray carray_, carray labels=None): # pointer when it garbage collects the out_buffer object out_buffer_org_ptr = out_buffer.data + leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) + # full size out-buffer (temporary fix for out of order chunk results) + tmp_labels = np.empty(chunklen*nchunks+leftover_elements, dtype='uint64') + # code in the parallel(...) block runs "isolated" (to some extent) in # multiple child threads # assignments within this block make variables thread-local, @@ -213,7 +218,15 @@ def factorize_str(carray carray_, carray labels=None): out_buffer.data = out_buffer_ptr # compress out_buffer into labels # note: append(...) can freeze with bcolz=0.8.0 - labels.append(out_buffer.astype(np.int64)) + #labels.append(out_buffer.astype(np.int64)) + + # chunk results arrive out of order! + # Solutions: Either keep full-size shared out_buffer in memory + # or find a way to write carray chunks out of order. + # Latter would be preferable. 
+ # For now keeping thread-local out-buffers + full-size out-buffer + # in memory + tmp_labels[i*chunklen:(i+1)*chunklen] = out_buffer.astype(np.int64) omp_unset_lock(&out_buffer_lock) # Clean-up thread local variables @@ -223,7 +236,6 @@ def factorize_str(carray carray_, carray labels=None): # processing of leftover_array is not parallelised in view of an upcoming # changes to bcolz chunk access which will allow seamlessly folding # this into the prange block - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) if leftover_elements > 0: out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) _factorize_str_helper(leftover_elements, @@ -239,7 +251,8 @@ def factorize_str(carray carray_, carray labels=None): ) out_buffer.data = out_buffer_ptr # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) + #labels.append(out_buffer[:leftover_elements].astype(np.int64)) + tmp_labels[nchunks*chunklen:nchunks*chunklen+leftover_elements+1] = out_buffer[:leftover_elements].astype(np.int64) # destroy and free all locks for j in xrange(num_threads): @@ -254,6 +267,9 @@ def factorize_str(carray carray_, carray labels=None): kh_destroy_str(table) + # compress tmp_labels into carray + labels.append(tmp_labels) + # construct python dict from vectors and # free the memory allocated for the strings in the reverse_values list for i in xrange(reverse_values.size()): From 1a6b2477da5edeee584e87a57c342721ceb0c65d Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 23:50:09 +0100 Subject: [PATCH 3/8] write labels carray chunks out-of-order exhibits performance issues with unique(), presumably somehow linked to the in-memory labels carray --- bquery/ctable.py | 13 ++++ bquery/ctable_ext.pyx | 165 ++++++++++++++++++++++++++++-------------- 2 files changed, 125 insertions(+), 53 deletions(-) diff --git a/bquery/ctable.py b/bquery/ctable.py index 804edf5..ee2df11 100644 --- a/bquery/ctable.py +++ 
b/bquery/ctable.py @@ -62,6 +62,19 @@ def cache_factor(self, col_list, refresh=False): carray_factor = \ bcolz.carray([], dtype='int64', expectedlen=self.size, rootdir=col_factor_rootdir, mode='w') + # ensure carray_factor.chunklen is multiple of column chunklen + if carray_factor.chunklen % self[col].chunklen != 0: + chunklen_multiplier = round(carray_factor.chunklen / + self[col].chunklen) + if chunklen_multiplier < 1.0: + chunklen_multiplier = 1 + chunklen_multiplier = int(round(chunklen_multiplier)) + carray_factor = \ + bcolz.carray([], dtype='int64', + chunklen=chunklen_multiplier * \ + self[col].chunklen, + rootdir=col_factor_rootdir, mode='w') + _, values = \ ctable_ext.factorize(self[col], labels=carray_factor) carray_factor.flush() diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 0359302..404a945 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -10,7 +10,7 @@ from libc.stdlib cimport malloc, calloc, free from libc.string cimport strcpy, memcpy, memset from libcpp.vector cimport vector from khash cimport * -from bcolz.carray_ext cimport carray, chunk +from bcolz.carray_ext cimport carray, chunks, chunk # ---------------------------------------------------------------------------- # GLOBAL DEFINITIONS @@ -101,16 +101,17 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \ - nchunks - unsigned int j, num_threads, thread_id, test + chunks labels_chunks + Py_ssize_t n, i, j, count, chunklen, leftover_elements, \ + allocation_size, nchunks, chunklen_multiplier, \ + tmp_chunks_counter + unsigned int num_threads, thread_id omp_lock_t * kh_locks omp_lock_t chunk_lock, out_buffer_lock vector[char *] reverse_values dict reverse char * in_buffer_ptr ndarray[npy_uint64] out_buffer - ndarray[npy_uint64] tmp_labels char * out_buffer_org_ptr uint64_t * out_buffer_ptr kh_str_t *table @@ 
-120,15 +121,34 @@ def factorize_str(carray carray_, carray labels=None): count = 0 ret = 0 reverse = {} + tmp_chunks_list = [] + tmp_chunks_counter = 0 nchunks = carray_.nchunks - # 0 = set the number of parallel threads automatically: + # num_threads = 0: set the number of parallel threads automatically: num_threads = 0 n = len(carray_) chunklen = carray_.chunklen allocation_size = carray_.dtype.itemsize + 1 + if labels is None: labels = carray([], dtype='int64', expectedlen=n) + tmp_multiplier = round(labels.chunklen / carray_.chunklen) + if tmp_multiplier < 1: + tmp_multiplier = 1 + tmp_multiplier = int(round(tmp_multiplier)) + labels = carray([], dtype='int64', + chunklen=tmp_multiplier * carray_.chunklen) + if labels.chunklen % chunklen != 0: + raise NotImplementedError('labels.chunklen must be a multiple of ' + + 'carray_.chunklen') + chunklen_multiplier = labels.chunklen / carray_.chunklen + # provide access to cython chunks api for persistent labels + # fails with in-memory carray + if labels._rootdir is not None: + labels_chunks = labels.chunks + + tmp_chunks_keys = [0] * int(carray_.size / labels.chunklen) leftover_array_ptr = carray_.leftover_array.__array_interface__['data'][0] @@ -143,6 +163,7 @@ def factorize_str(carray carray_, carray labels=None): # (&var)[0]: workaround to keep num_threads shared and available after # with block (&num_threads)[0] = omp_get_num_threads() + num_threads *= 2 kh_locks = malloc(num_threads * sizeof(omp_lock_t)) for j in xrange(num_threads): omp_init_lock(&kh_locks[j]) @@ -156,16 +177,12 @@ def factorize_str(carray carray_, carray labels=None): # Use PyArray_SimpleNewFromData instead!? # Ref: http://blog.enthought.com/python/numpy-arrays-with-pre-allocated-memory # Ref: https://gist.github.com/GaelVaroquaux/1249305 - out_buffer = np.empty(chunklen, dtype='uint64') + out_buffer = np.empty(labels.chunklen, dtype='uint64') # we are going to redirect the data pointer, keep note of the original # to be able to restore it. 
Otherwise python will try to free an invalid # pointer when it garbage collects the out_buffer object out_buffer_org_ptr = out_buffer.data - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - # full size out-buffer (temporary fix for out of order chunk results) - tmp_labels = np.empty(chunklen*nchunks+leftover_elements, dtype='uint64') - # code in the parallel(...) block runs "isolated" (to some extent) in # multiple child threads # assignments within this block make variables thread-local, @@ -178,36 +195,41 @@ def factorize_str(carray carray_, carray labels=None): # note 1: cannot use ndarrays like: in_buffer = np.empty(...) # In spite of the assignment cython does not create a # thread local ndarray as it should! (Also see note 2 below.) - in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) - out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) + in_buffer_ptr = malloc(chunklen * (allocation_size-1) * + sizeof(char)) + out_buffer_ptr = malloc(chunklen_multiplier * chunklen * + sizeof(uint64_t)) # This is the work-sharing loop. The body is executed in multiple # threads with each thread receiving a different value for i. - for i in prange(nchunks): + for i in prange(0, nchunks, chunklen_multiplier): thread_id = threadid() - # chunk._getitem is not thread-safe. This may be due to it - # setting properties of chunk using self. - Or not. 
- # TODO: check whether _getitem can be made thread-safe to - # get rid of this probably unnecessary thread lock - omp_set_lock(&chunk_lock) - with gil: - chunk_ = carray_.chunks[i] - # decompress into in_buffer - # note: _getitem releases gil during blosc decompression - chunk_._getitem(0, chunklen, in_buffer_ptr) - omp_unset_lock(&chunk_lock) - - _factorize_str_helper(chunklen, - allocation_size, - in_buffer_ptr, - out_buffer_ptr, - table, - &count, - reverse_values, - thread_id, - kh_locks, - num_threads, - ) + # pack the results from multiple carray_-chunks into one + # labels-chunk: allows out-of-order writing of labels at good speed + for j in xrange(chunklen_multiplier): + # chunk._getitem is not thread-safe. This may be due to it + # setting properties of chunk using self. - Or not. + # TODO: check whether _getitem can be made thread-safe to + # get rid of this probably unnecessary thread lock + omp_set_lock(&chunk_lock) + with gil: + chunk_ = carray_.chunks[i+j] + # decompress into in_buffer + # note: _getitem releases gil during blosc decompression + chunk_._getitem(0, chunklen, in_buffer_ptr) + omp_unset_lock(&chunk_lock) + + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_ptr, + out_buffer_ptr + j * chunklen, + table, + &count, + reverse_values, + thread_id, + kh_locks, + num_threads, + ) # note 2: labels.append(...) requires ndarray, but no thread-local # ndarrays can be created. Use the shared out_buffer @@ -217,27 +239,64 @@ def factorize_str(carray carray_, carray labels=None): with gil: out_buffer.data = out_buffer_ptr # compress out_buffer into labels - # note: append(...) can freeze with bcolz=0.8.0 - #labels.append(out_buffer.astype(np.int64)) - - # chunk results arrive out of order! - # Solutions: Either keep full-size shared out_buffer in memory - # or find a way to write carray chunks out of order. - # Latter would be preferable. 
- # For now keeping thread-local out-buffers + full-size out-buffer - # in memory - tmp_labels[i*chunklen:(i+1)*chunklen] = out_buffer.astype(np.int64) + # _save() and append() freeze at versions prior to v0.8.1 + if labels._rootdir is not None: + labels_chunks._save(int(i/chunklen_multiplier), chunk(out_buffer, labels._dtype, + labels._cparams, _memory=False)) + labels._cbytes += chunk_.cbytes + labels._nbytes += labels.chunklen * labels.itemsize + labels_chunks.nchunks += 1 + # in-memory chunks cannot be "_save()"-ed but must be appended + # to a python chunk list + else: + # TODO: Why does the following not work? (Freezes...) + # tmp_chunks_list[i] = chunk(out_buffer, labels._dtype, labels._cparams, _memory=True) + # + # Work-around until above problem is solved. + tmp_chunks_list.append(chunk(out_buffer, labels._dtype, + labels._cparams, _memory=True)) + tmp_chunks_keys[int(i/chunklen_multiplier)] = tmp_chunks_counter + (&tmp_chunks_counter)[0] += 1 + omp_unset_lock(&out_buffer_lock) # Clean-up thread local variables free(in_buffer_ptr) free(out_buffer_ptr) + # update chunk count (necessary due to use of _save() above + if labels._rootdir is None: + labels.chunks = [tmp_chunks_list[i] for i in tmp_chunks_keys] + + # process remaining chunks that could not be processed en-bloc, chunks in-order + # TODO: parallelise after refactoring + # This is really slowing us down when chunk_multiplier = 1 !!! + # e.g. 
uncompressed unique() + in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) + out_buffer_ptr = malloc(chunklen * sizeof(uint64_t)) + for i in xrange(labels.size/carray_.chunklen, nchunks): + chunk_ = carray_.chunks[i] + chunk_._getitem(0, chunklen, in_buffer_ptr) + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + reverse_values, + 0, + kh_locks, + num_threads, + ) + #out_buffer.data = out_buffer_ptr + #labels.append(out_buffer.astype(np.int64)) + + # processing of leftover_array is not parallelised in view of an upcoming # changes to bcolz chunk access which will allow seamlessly folding # this into the prange block + leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) if leftover_elements > 0: - out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) _factorize_str_helper(leftover_elements, allocation_size, leftover_array_ptr, @@ -251,8 +310,11 @@ def factorize_str(carray carray_, carray labels=None): ) out_buffer.data = out_buffer_ptr # compress out_buffer into labels - #labels.append(out_buffer[:leftover_elements].astype(np.int64)) - tmp_labels[nchunks*chunklen:nchunks*chunklen+leftover_elements+1] = out_buffer[:leftover_elements].astype(np.int64) + labels.append(out_buffer[:leftover_elements].astype(np.int64)) + + + free(in_buffer_ptr) + free(out_buffer_ptr) # destroy and free all locks for j in xrange(num_threads): @@ -267,9 +329,6 @@ def factorize_str(carray carray_, carray labels=None): kh_destroy_str(table) - # compress tmp_labels into carray - labels.append(tmp_labels) - # construct python dict from vectors and # free the memory allocated for the strings in the reverse_values list for i in xrange(reverse_values.size()): From 1158a88153fb7fcb24c16c44158cb2fad268f80b Mon Sep 17 00:00:00 2001 From: ARF Date: Sun, 8 Mar 2015 10:36:07 +0100 Subject: [PATCH 4/8] fixed in-memory labels carray broke due to performance debugging --- bquery/ctable_ext.pyx | 
9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 404a945..2acbdbc 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -266,7 +266,10 @@ def factorize_str(carray carray_, carray labels=None): # update chunk count (necessary due to use of _save() above if labels._rootdir is None: - labels.chunks = [tmp_chunks_list[i] for i in tmp_chunks_keys] + for i in xrange(tmp_chunks_counter): + labels.chunks.append(tmp_chunks_list[tmp_chunks_keys[i]]) + labels._cbytes += tmp_chunks_list[tmp_chunks_keys[i]].cbytes + labels._nbytes += labels.chunklen * labels.itemsize # process remaining chunks that could not be processed en-bloc, chunks in-order # TODO: parallelise after refactoring @@ -274,6 +277,7 @@ def factorize_str(carray carray_, carray labels=None): # e.g. uncompressed unique() in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) out_buffer_ptr = malloc(chunklen * sizeof(uint64_t)) + out_buffer.data = out_buffer_ptr for i in xrange(labels.size/carray_.chunklen, nchunks): chunk_ = carray_.chunks[i] chunk_._getitem(0, chunklen, in_buffer_ptr) @@ -289,7 +293,7 @@ def factorize_str(carray carray_, carray labels=None): num_threads, ) #out_buffer.data = out_buffer_ptr - #labels.append(out_buffer.astype(np.int64)) + labels.append(out_buffer[:chunklen].astype(np.int64)) # processing of leftover_array is not parallelised in view of an upcoming @@ -308,7 +312,6 @@ def factorize_str(carray carray_, carray labels=None): kh_locks, num_threads, ) - out_buffer.data = out_buffer_ptr # compress out_buffer into labels labels.append(out_buffer[:leftover_elements].astype(np.int64)) From 5a3b4877a48c6fece58676206f41ef16ee7f1cff Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 12:31:49 +0100 Subject: [PATCH 5/8] non-iterblocks implementation --- bquery/ctable_ext.pyx | 194 +++++++++++++++++++++++------------------- bquery/khash.h | 60 +++++-------- bquery/khash_python.h | 13 ++- 
setup.py | 64 ++++++++++++-- 4 files changed, 196 insertions(+), 135 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 2acbdbc..80f0c5f 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,14 +1,16 @@ -import cython +cimport cython from cython.parallel import parallel, prange, threadid -from openmp cimport omp_lock_t, omp_init_lock, omp_set_lock, omp_unset_lock, omp_destroy_lock, omp_get_num_threads +from openmp cimport * import numpy as np -from numpy cimport ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t +from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, + npy_int64, npy_float64, uint64_t) from libc.stdint cimport uintptr_t from libc.stdlib cimport malloc, calloc, free from libc.string cimport strcpy, memcpy, memset from libcpp.vector cimport vector + from khash cimport * from bcolz.carray_ext cimport carray, chunks, chunk @@ -32,6 +34,65 @@ SORTED_COUNT_DISTINCT = 4 DEF _SORTED_COUNT_DISTINCT = 4 # ---------------------------------------------------------------------------- + +cdef extern from *: + cdef void omp_start_critical "#pragma omp critical \n { //" () nogil + cdef void omp_end_critical "} //" () nogil + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef omp_lock_t * par_initialise_kh_locks(unsigned int * num_threads) nogil: + cdef: + int i + omp_lock_t * kh_locks + + num_threads[0] = omp_get_num_threads() + kh_locks = malloc(num_threads[0] * sizeof(omp_lock_t)) + for i in range(num_threads[0]): + omp_init_lock(&kh_locks[i]) + + return kh_locks + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef char * par_allocate_in_buffer(Py_ssize_t chunklen, + Py_ssize_t itemsize) nogil: + return malloc(chunklen * itemsize * sizeof(char)) + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef uint64_t * par_allocate_out_buffer(Py_ssize_t chunklen, + Py_ssize_t chunklen_multiplier) nogil: + return 
malloc(chunklen_multiplier * chunklen + * sizeof(uint64_t)) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef check_labels_structure(carray carray_, carray labels): + # if no labels carray is given, create one with satisfying: + # labels.chunklen % carray_.chunklen = 0 + if labels is None: + labels = carray([], dtype='int64', expectedlen=len(carray_)) + chunklen_multiplier = round(labels.chunklen / carray_.chunklen) + if chunklen_multiplier < 1: + chunklen_multiplier = 1 + chunklen_multiplier = int(round(chunklen_multiplier)) + labels = carray([], dtype='int64', + chunklen=chunklen_multiplier * carray_.chunklen) + + # if one was given, check chunklen is ok + elif labels.chunklen % carray_.chunklen != 0: + raise NotImplementedError('labels.chunklen must be a multiple of ' + + 'carray_.chunklen') + # if labels was given, with ok chunklen, determine multiplier + else: + chunklen_multiplier = labels.chunklen / carray_.chunklen + + return labels, chunklen_multiplier + + # Factorize Section @cython.wraparound(False) @cython.boundscheck(False) @@ -41,7 +102,6 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, uint64_t * out_buffer, kh_str_t *table, Py_ssize_t * count, - vector[char *] & reverse_values, unsigned int thread_id, omp_lock_t kh_locks[], unsigned int num_threads, @@ -83,10 +143,15 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, # acquire locks for all threads before writing to shared resources for j in xrange(num_threads): omp_set_lock(&kh_locks[j]) - k = kh_put_str(table, insert, &ret) - table.vals[k] = idx = count[0] - reverse_values.push_back(insert) - count[0] += 1 + # check whether another thread has already added the entry by now + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + # if not, add element + else: + k = kh_put_str(table, insert, &ret) + table.vals[k] = idx = count[0] + count[0] += 1 # release all locks for j in xrange(num_threads): omp_unset_lock(&kh_locks[j]) @@ -102,14 +167,10 
@@ def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ chunks labels_chunks - Py_ssize_t n, i, j, count, chunklen, leftover_elements, \ - allocation_size, nchunks, chunklen_multiplier, \ - tmp_chunks_counter - unsigned int num_threads, thread_id - omp_lock_t * kh_locks + Py_ssize_t i, j, k, leftover_elements, \ + chunklen_multiplier + unsigned int thread_id omp_lock_t chunk_lock, out_buffer_lock - vector[char *] reverse_values - dict reverse char * in_buffer_ptr ndarray[npy_uint64] out_buffer char * out_buffer_org_ptr @@ -117,32 +178,22 @@ def factorize_str(carray carray_, carray labels=None): kh_str_t *table uintptr_t leftover_array_ptr + Py_ssize_t count = 0 + Py_ssize_t nchunks = carray_.nchunks + Py_ssize_t tmp_chunks_counter = 0 + omp_lock_t * kh_locks = NULL + dict reverse = {} + list tmp_chunks_list = [] + # num_threads = 0: set the number of parallel threads automatically: + unsigned int num_threads = 0 - count = 0 - ret = 0 - reverse = {} - tmp_chunks_list = [] - tmp_chunks_counter = 0 - nchunks = carray_.nchunks - # num_threads = 0: set the number of parallel threads automatically: - num_threads = 0 + Py_ssize_t chunklen = carray_.chunklen + Py_ssize_t allocation_size = carray_.dtype.itemsize + 1 - n = len(carray_) - chunklen = carray_.chunklen - allocation_size = carray_.dtype.itemsize + 1 - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - tmp_multiplier = round(labels.chunklen / carray_.chunklen) - if tmp_multiplier < 1: - tmp_multiplier = 1 - tmp_multiplier = int(round(tmp_multiplier)) - labels = carray([], dtype='int64', - chunklen=tmp_multiplier * carray_.chunklen) - if labels.chunklen % chunklen != 0: - raise NotImplementedError('labels.chunklen must be a multiple of ' + - 'carray_.chunklen') - chunklen_multiplier = labels.chunklen / carray_.chunklen + + labels, chunklen_multiplier = check_labels_structure(carray_, labels) + # provide access to cython chunks api for persistent labels # fails with 
in-memory carray if labels._rootdir is not None: @@ -154,55 +205,24 @@ def factorize_str(carray carray_, carray labels=None): table = kh_init_str() - - # Allocate and initialise shared locks for hash-table writes for each child - # thread. (Shared because locks is assigned outside the parallel() block) - # num_threads=0: cython chooses no of parallel threads (= cpu cores?) - with nogil, parallel(num_threads=num_threads): - # determine the number of parallel threads that will be created - # (&var)[0]: workaround to keep num_threads shared and available after - # with block - (&num_threads)[0] = omp_get_num_threads() - num_threads *= 2 - kh_locks = malloc(num_threads * sizeof(omp_lock_t)) - for j in xrange(num_threads): - omp_init_lock(&kh_locks[j]) - # initialise locks for other shared resources omp_init_lock(&chunk_lock) omp_init_lock(&out_buffer_lock) - # create an out_buffer ndarray object to interface with chunk.append() - # TODO: Avoid np.empty as it unnecessarily allocates space for the content. - # Use PyArray_SimpleNewFromData instead!? - # Ref: http://blog.enthought.com/python/numpy-arrays-with-pre-allocated-memory - # Ref: https://gist.github.com/GaelVaroquaux/1249305 out_buffer = np.empty(labels.chunklen, dtype='uint64') - # we are going to redirect the data pointer, keep note of the original - # to be able to restore it. Otherwise python will try to free an invalid - # pointer when it garbage collects the out_buffer object out_buffer_org_ptr = out_buffer.data - # code in the parallel(...) block runs "isolated" (to some extent) in - # multiple child threads - # assignments within this block make variables thread-local, - # e.g. in_buffer_ptr. BUT this does not seem to work with 'cdef class' - # defined objects, e.g. numpy.ndarray, bcolz.chunk, etc., and dereferencing. - # The latter is a useful work-around to modifying shared variables from - # within parallel the block if locking is handled manually. 
with nogil, parallel(num_threads=num_threads): + # Initialise shared locks for hash-table writes for each child thread. + omp_start_critical() + (&kh_locks)[0] = par_initialise_kh_locks(&num_threads) + omp_end_critical() + # allocate an in-buffer and an out-buffer for each thread - # note 1: cannot use ndarrays like: in_buffer = np.empty(...) - # In spite of the assignment cython does not create a - # thread local ndarray as it should! (Also see note 2 below.) - in_buffer_ptr = malloc(chunklen * (allocation_size-1) * - sizeof(char)) - out_buffer_ptr = malloc(chunklen_multiplier * chunklen * - sizeof(uint64_t)) - - # This is the work-sharing loop. The body is executed in multiple - # threads with each thread receiving a different value for i. - for i in prange(0, nchunks, chunklen_multiplier): + in_buffer_ptr = par_allocate_in_buffer(chunklen, allocation_size-1) + out_buffer_ptr = par_allocate_out_buffer(chunklen, chunklen_multiplier) + + for i in prange(0, (nchunks/chunklen_multiplier)*chunklen_multiplier, chunklen_multiplier): thread_id = threadid() # pack the results from multiple carray_-chunks into one # labels-chunk: allows out-of-order writing of labels at good speed @@ -225,7 +245,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_ptr + j * chunklen, table, &count, - reverse_values, thread_id, kh_locks, num_threads, @@ -287,7 +306,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_ptr, table, &count, - reverse_values, 0, kh_locks, num_threads, @@ -307,7 +325,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_ptr, table, &count, - reverse_values, 0, kh_locks, num_threads, @@ -330,13 +347,14 @@ def factorize_str(carray carray_, carray labels=None): # to free the object's data out_buffer.data = out_buffer_org_ptr - kh_destroy_str(table) - - # construct python dict from vectors and - # free the memory allocated for the strings in the reverse_values list - for i in xrange(reverse_values.size()): - 
reverse[i] = reverse_values[i] - free(reverse_values[i]) + # construct python dict from vectors and free element memory + cdef unsigned long m + for m in xrange(table.n_buckets): + if not kh_exist_str(table, m): + continue + reverse[table.vals[m]] = table.keys[m] + free(table.keys[m]) + kh_destroy_str(table) return labels, reverse diff --git a/bquery/khash.h b/bquery/khash.h index c0421c4..5e55088 100644 --- a/bquery/khash.h +++ b/bquery/khash.h @@ -116,7 +116,6 @@ int main() { #ifndef __AC_KHASH_H #define __AC_KHASH_H -#endif /*! @header @@ -139,33 +138,31 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khuint64_t; -typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; #endif - +#ifndef kh_inline #ifdef _MSC_VER #define kh_inline __inline #else #define kh_inline inline +#endif +#endif /* kh_inline */ -typedef double khfloat64_t; typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) -#define __ac_isdel(flag, i) (0) -#define __ac_iseither(flag, i) __ac_isempty(flag, i) -#define __ac_set_isdel_false(flag, i) (0) -#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) -#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) -#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) -#define __ac_set_isdel_true(flag, i) (0) +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) 
(flag[i>>4]|=1ul<<((i&0xfU)<<1)) -#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) @@ -187,7 +184,7 @@ typedef khint_t khiter_t; static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct { \ + typedef struct kh_##name##_s { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ @@ -247,7 +244,7 @@ static const double __ac_HASH_UPPER = 0.77; else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (!new_flags) return -1; \ - memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (!new_keys) { kfree(new_flags); return -1; } \ @@ -268,7 +265,7 @@ static const double __ac_HASH_UPPER = 0.77; khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ - __ac_set_isempty_true(h->flags, j); \ + __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t k, i, step = 0; \ k = __hash_func(key); \ @@ -278,7 +275,7 @@ static const double __ac_HASH_UPPER = 0.77; if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ 
h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ @@ -354,7 +351,7 @@ static const double __ac_HASH_UPPER = 0.77; __KHASH_PROTOTYPES(name, khkey_t, khval_t) #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ @@ -382,7 +379,6 @@ static const double __ac_HASH_UPPER = 0.77; @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) - /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @@ -432,7 +428,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ -#define kh_init(name) kh_init_##name(void) +#define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @@ -593,9 +589,6 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -604,13 +597,9 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - typedef const char *kh_cstr_t; /*! 
@function @abstract Instantiate a hash map containing const char* keys @@ -627,15 +616,4 @@ typedef const char *kh_cstr_t; #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#define kh_exist_str(h, k) (kh_exist(h, k)) -#define kh_exist_float64(h, k) (kh_exist(h, k)) -#define kh_exist_int64(h, k) (kh_exist(h, k)) -#define kh_exist_int32(h, k) (kh_exist(h, k)) - -KHASH_MAP_INIT_STR(str, size_t) -KHASH_MAP_INIT_INT(int32, size_t) -KHASH_MAP_INIT_INT64(int64, size_t) - - #endif /* __AC_KHASH_H */ diff --git a/bquery/khash_python.h b/bquery/khash_python.h index b5b0de2..2566dff 100644 --- a/bquery/khash_python.h +++ b/bquery/khash_python.h @@ -1,7 +1,18 @@ #include +typedef double khfloat64_t; + #include "khash.h" +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(str, size_t) +KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_INT64(int64, size_t) + // kludge #define kh_float64_hash_func _Py_HashDouble @@ -46,4 +57,4 @@ KHASH_SET_INIT_PYOBJECT(pyset) #define kh_exist_pymap(h, k) (kh_exist(h, k)) #define kh_exist_pyset(h, k) (kh_exist(h, k)) -KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) \ No newline at end of file +KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) diff --git a/setup.py b/setup.py index fbd0ac1..acbb51f 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,64 @@ def check_import(pkgname, pkgver): ########### End of checks ########## +########### Project-specific build options ########### + +copt = {'cygwin': ['-fopenmp'], + 'emx': ['-fopenmp'], + 'intel': ['-openmp'], + 'intele': ['-openmp'], + 'intelem': ['-openmp'], + 'mingw32': ['-fopenmp'], + 'msvc': ['/openmp'], + } +lopt = {'cygwin': ['-fopenmp'], + 'emx': ['-fopenmp'], + 'mingw32': ['-fopenmp'], + } + +class bquery_build_ext(build_ext): + user_options = build_ext.user_options + \ + [ + 
('from-templates', None, + "rebuild project from code generation templates"), + ] + + def initialize_options(self): + self.from_templates = False + build_ext.initialize_options(self) + + def run(self): + # regenerate cython code from templates + if self.from_templates: + try: + import jinja2 + except: + exit_with_error( + "You need the python package jinja2 to rebuild the " + \ + "extension from the templates") + execfile("bquery/templates/run_templates.py") + + build_ext.run(self) + + def build_extensions(self): + # set compiler-specific build flags, e.g. for openmp + c = self.compiler.compiler_type + if copt.has_key(c): + for e in self.extensions: + e.extra_compile_args += copt[ c ] + else: + print_warning( + "Openmp flags for compiler '%s' not configured in setup.py." + + "Building without parallel processing support.") + if lopt.has_key(c): + for e in self.extensions: + e.extra_link_args += lopt[ c ] + build_ext.build_extensions(self) + + +######### End project-specific build options ######### + + # bquery version VERSION = open('VERSION').read().strip() # Create the version.py file @@ -103,10 +161,6 @@ def check_import(pkgname, pkgver): # Allow setting the Blosc dir if installed in the system BLOSC_DIR = os.environ.get('BLOSC_DIR', '') -# OpenMP libraries -CFLAGS.append('-fopenmp') -LFLAGS.append('-fopenmp') - # Sources & libraries inc_dirs = ['bquery'] lib_dirs = [] @@ -132,7 +186,7 @@ def check_import(pkgname, pkgver): url='https://github.com/visualfabriq/bquery', license='http://www.opensource.org/licenses/bsd-license.php', platforms=['any'], - cmdclass={'build_ext': build_ext}, + cmdclass={'build_ext': bquery_build_ext}, ext_modules=[ Extension("bquery.ctable_ext", include_dirs=inc_dirs, From b2453aa8f8dde8ea0711ec2773ef2ffee8412202 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 12:40:50 +0100 Subject: [PATCH 6/8] non-iterblocks with pandas khash --- bquery/khash.h | 247 ++++++++++++++++++------------------------ bquery/khash_python.h | 15 +-- 2 
files changed, 105 insertions(+), 157 deletions(-) diff --git a/bquery/khash.h b/bquery/khash.h index 5e55088..4350ff0 100644 --- a/bquery/khash.h +++ b/bquery/khash.h @@ -33,6 +33,7 @@ int main() { khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); @@ -46,23 +47,6 @@ int main() { */ /* - 2013-05-02 (0.2.8): - - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. - - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. - - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php - - 2011-12-29 (0.2.7): - - * Minor code clean up; no actual effect. - 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the @@ -123,13 +107,12 @@ int main() { Generic hash table library. 
*/ -#define AC_VERSION_KHASH_H "0.2.8" +#define AC_VERSION_KHASH_H "0.2.6" #include #include #include -/* compiler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; @@ -138,78 +121,84 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; +typedef unsigned long khuint64_t; +typedef signed long khint64_t; #else -typedef unsigned long long khint64_t; +typedef unsigned long long khuint64_t; +typedef signed long long khint64_t; #endif -#ifndef kh_inline -#ifdef _MSC_VER -#define kh_inline __inline -#else -#define kh_inline inline +typedef double khfloat64_t; + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif #endif -#endif /* kh_inline */ typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) +#define __ac_isdel(flag, i) (0) +#define __ac_iseither(flag, i) __ac_isempty(flag, i) +#define __ac_set_isdel_false(flag, i) (0) +#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) +#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) +#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) +#define __ac_set_isdel_true(flag, i) (0) + +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 
+#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif -#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) +#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) -#endif -#ifndef kmalloc -#define kmalloc(Z) malloc(Z) -#endif -#ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) -#endif -#ifndef kfree -#define kfree(P) free(P) -#endif - static const double __ac_HASH_UPPER = 0.77; -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct kh_##name##_s { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t 
*vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ @@ -222,19 +211,19 @@ static const double __ac_HASH_UPPER = 0.77; SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t k, i, last, mask, step = 0; \ + khint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ - last = i; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + (++step)) & mask; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ @@ -242,18 +231,11 @@ static const double __ac_HASH_UPPER = 0.77; if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_flags); return -1; } \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) { kfree(new_flags); return -1; } \ - h->vals = new_vals; \ - } \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } /* otherwise shrink */ \ } \ } \ @@ -265,17 +247,18 @@ static const double __ac_HASH_UPPER = 0.77; khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ + __ac_set_isempty_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t k, i, step = 0; \ + khint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the 
existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ @@ -285,38 +268,32 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - kfree(h->flags); /* free the working space */ \ + free(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ - return 0; \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t k, i, site, last, mask = h->n_buckets - 1, step = 
0; \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - last = i; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -346,16 +323,8 @@ static const double __ac_HASH_UPPER = 0.77; } \ } -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + KHASH_INIT2(name, static PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -379,15 +348,16 @@ static const double __ac_HASH_UPPER = 0.77; @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) + /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ -static kh_inline khint_t __ac_X31_hash_string(const char *s) +static PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s) { - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } /*! 
@function @@ -401,7 +371,7 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) -static kh_inline khint_t __ac_Wang_hash(khint_t key) +static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); @@ -428,7 +398,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ -#define kh_init(name) kh_init_##name() +#define kh_init(name) kh_init_##name(void) /*! @function @abstract Destroy a hash table. @@ -457,8 +427,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @param r Extra return code: -1 if the operation failed; - 0 if the key is present in the hash table; + @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] @@ -470,7 +439,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) @@ -540,34 +509,6 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) */ #define kh_n_buckets(h) ((h)->n_buckets) -/*! 
@function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/*! @function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - /* More conenient interfaces */ /*! @function @@ -589,6 +530,9 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ +#define KHASH_SET_INIT_UINT64(name) \ + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -597,9 +541,13 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ +#define KHASH_MAP_INIT_UINT64(name, khval_t) \ + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + typedef const char *kh_cstr_t; /*! 
@function @abstract Instantiate a hash map containing const char* keys @@ -616,4 +564,15 @@ typedef const char *kh_cstr_t; #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(str, size_t) +KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_INT64(int64, size_t) + + #endif /* __AC_KHASH_H */ diff --git a/bquery/khash_python.h b/bquery/khash_python.h index 2566dff..cdd94b5 100644 --- a/bquery/khash_python.h +++ b/bquery/khash_python.h @@ -1,22 +1,11 @@ #include -typedef double khfloat64_t; - #include "khash.h" -#define kh_exist_str(h, k) (kh_exist(h, k)) -#define kh_exist_float64(h, k) (kh_exist(h, k)) -#define kh_exist_int64(h, k) (kh_exist(h, k)) -#define kh_exist_int32(h, k) (kh_exist(h, k)) - -KHASH_MAP_INIT_STR(str, size_t) -KHASH_MAP_INIT_INT(int32, size_t) -KHASH_MAP_INIT_INT64(int64, size_t) - // kludge #define kh_float64_hash_func _Py_HashDouble -#define kh_float64_hash_equal kh_int64_hash_equal +#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) @@ -24,7 +13,7 @@ KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_FLOAT64(float64, size_t) -int kh_inline pyobject_cmp(PyObject* a, PyObject* b) { +int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { PyErr_Clear(); From 8f76992f766f8e887d556290226be2bac0fcd202 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 12:43:38 +0100 Subject: [PATCH 7/8] iterblocks implementation with pandas khash --- bquery/ctable.py | 21 +- bquery/ctable_ext.pyx | 2398 +++++++++++++++++------------------------ 2 files 
changed, 978 insertions(+), 1441 deletions(-) diff --git a/bquery/ctable.py b/bquery/ctable.py index ee2df11..0afa1f5 100644 --- a/bquery/ctable.py +++ b/bquery/ctable.py @@ -59,24 +59,8 @@ def cache_factor(self, col_list, refresh=False): col_factor_rootdir = col_rootdir + '.factor' col_values_rootdir = col_rootdir + '.values' - carray_factor = \ - bcolz.carray([], dtype='int64', expectedlen=self.size, - rootdir=col_factor_rootdir, mode='w') - # ensure carray_factor.chunklen is multiple of column chunklen - if carray_factor.chunklen % self[col].chunklen != 0: - chunklen_multiplier = round(carray_factor.chunklen / - self[col].chunklen) - if chunklen_multiplier < 1.0: - chunklen_multiplier = 1 - chunklen_multiplier = int(round(chunklen_multiplier)) - carray_factor = \ - bcolz.carray([], dtype='int64', - chunklen=chunklen_multiplier * \ - self[col].chunklen, - rootdir=col_factor_rootdir, mode='w') - - _, values = \ - ctable_ext.factorize(self[col], labels=carray_factor) + carray_factor, values = \ + ctable_ext.factorize(self[col], rootdir=col_factor_rootdir) carray_factor.flush() carray_values = \ @@ -207,7 +191,6 @@ def factorize_groupby_cols(self, groupby_cols): return factor_list, values_list - @staticmethod def make_group_index(self, factor_list, values_list, groupby_cols, array_length, bool_arr): # create unique groups for groupby loop diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 80f0c5f..a3b2ab9 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,1436 +1,990 @@ -cimport cython -from cython.parallel import parallel, prange, threadid +cimport cython +from cython.parallel import parallel, prange, threadid from openmp cimport * - -import numpy as np + +import numpy as np from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t) - -from libc.stdint cimport uintptr_t -from libc.stdlib cimport malloc, calloc, free -from libc.string cimport strcpy, memcpy, memset -from 
libcpp.vector cimport vector - -from khash cimport * -from bcolz.carray_ext cimport carray, chunks, chunk - -# ---------------------------------------------------------------------------- -# GLOBAL DEFINITIONS -# ---------------------------------------------------------------------------- - -SUM = 0 -DEF _SUM = 0 - -COUNT = 1 -DEF _COUNT = 1 - -COUNT_NA = 2 -DEF _COUNT_NA = 2 - -COUNT_DISTINCT = 3 -DEF _COUNT_DISTINCT = 3 - -SORTED_COUNT_DISTINCT = 4 -DEF _SORTED_COUNT_DISTINCT = 4 -# ---------------------------------------------------------------------------- - - -cdef extern from *: - cdef void omp_start_critical "#pragma omp critical \n { //" () nogil - cdef void omp_end_critical "} //" () nogil - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef omp_lock_t * par_initialise_kh_locks(unsigned int * num_threads) nogil: - cdef: - int i - omp_lock_t * kh_locks - - num_threads[0] = omp_get_num_threads() - kh_locks = malloc(num_threads[0] * sizeof(omp_lock_t)) - for i in range(num_threads[0]): - omp_init_lock(&kh_locks[i]) - - return kh_locks - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef char * par_allocate_in_buffer(Py_ssize_t chunklen, - Py_ssize_t itemsize) nogil: - return malloc(chunklen * itemsize * sizeof(char)) - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef uint64_t * par_allocate_out_buffer(Py_ssize_t chunklen, - Py_ssize_t chunklen_multiplier) nogil: - return malloc(chunklen_multiplier * chunklen - * sizeof(uint64_t)) - - + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy, memset +from libcpp.vector cimport vector + +from khash cimport * +import bcolz as bz +from bcolz.carray_ext cimport carray, chunks, chunk + +include "parallel_processing.pxi" + +# ---------------------------------------------------------------------------- +# GLOBAL DEFINITIONS +# ---------------------------------------------------------------------------- +SUM = 0 +DEF 
_SUM = 0 + +COUNT = 1 +DEF _COUNT = 1 + +COUNT_NA = 2 +DEF _COUNT_NA = 2 + +COUNT_DISTINCT = 3 +DEF _COUNT_DISTINCT = 3 + +SORTED_COUNT_DISTINCT = 4 +DEF _SORTED_COUNT_DISTINCT = 4 + + +# ---------------------------------------------------------------------------- +# FACTORIZE SECTION +# ---------------------------------------------------------------------------- @cython.wraparound(False) @cython.boundscheck(False) -cdef check_labels_structure(carray carray_, carray labels): - # if no labels carray is given, create one with satisfying: - # labels.chunklen % carray_.chunklen = 0 - if labels is None: - labels = carray([], dtype='int64', expectedlen=len(carray_)) - chunklen_multiplier = round(labels.chunklen / carray_.chunklen) - if chunklen_multiplier < 1: - chunklen_multiplier = 1 - chunklen_multiplier = int(round(chunklen_multiplier)) - labels = carray([], dtype='int64', - chunklen=chunklen_multiplier * carray_.chunklen) - - # if one was given, check chunklen is ok - elif labels.chunklen % carray_.chunklen != 0: - raise NotImplementedError('labels.chunklen must be a multiple of ' + - 'carray_.chunklen') - # if labels was given, with ok chunklen, determine multiplier - else: - chunklen_multiplier = labels.chunklen / carray_.chunklen - - return labels, chunklen_multiplier - - -# Factorize Section -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_str_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - char * in_buffer_ptr, - uint64_t * out_buffer, - kh_str_t *table, - Py_ssize_t * count, - unsigned int thread_id, - omp_lock_t kh_locks[], - unsigned int num_threads, - ) nogil: - cdef: - Py_ssize_t i, idx, itemsize - int ret - unsigned int j - char * element - char * insert - khiter_t k - - ret = 0 - itemsize = allocation_size - 1 - # allocate enough memory to hold the string element, add one for the - # null byte that marks the end of the string. - # TODO: understand why zero-filling is necessary. 
Without zero-filling - # the buffer, duplicate keys occur in the reverse dict - element = calloc(allocation_size, sizeof(char)) - - omp_set_lock(&kh_locks[thread_id]) - for i in xrange(iter_range): - # strings are stored without null termination in ndarrays: need a - # buffer to append null termination to use usual string algorithms - memcpy(element, in_buffer_ptr, itemsize) - in_buffer_ptr += itemsize - - # while reading from shared hash table, set lock for this thread - k = kh_get_str(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - omp_unset_lock(&kh_locks[thread_id]) - # allocate enough memory to hold the string, add one for the - # null byte that marks the end of the string. - insert = malloc(allocation_size) - # TODO: is strcpy really the best way to copy a string? - strcpy(insert, element) - # acquire locks for all threads before writing to shared resources - for j in xrange(num_threads): - omp_set_lock(&kh_locks[j]) +cdef void _factorize_str_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + char * in_buffer_ptr, + uint64_t * out_buffer, + kh_str_t *table, + Py_ssize_t * count, + unsigned int thread_id, + omp_lock_t kh_locks[], + unsigned int num_threads, + ) nogil: + cdef: + Py_ssize_t i, idx + unsigned int j + khiter_t k + + char * element + char * insert + + int ret = 0 + Py_ssize_t itemsize = allocation_size - 1 + + # allocate enough memory to hold the string element, add one for the + # null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. 
Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) + + # lock ensures that table is in consistend state during access + omp_set_lock(&kh_locks[thread_id]) + for i in xrange(iter_range): + # appends null character to element string + memcpy(element, in_buffer_ptr, itemsize) + in_buffer_ptr += itemsize + + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + omp_unset_lock(&kh_locks[thread_id]) + insert = malloc(allocation_size) + # TODO: is strcpy really the best way to copy a string? + strcpy(insert, element) + + # acquire locks for all threads to avoid inconsistent state + for j in xrange(num_threads): + omp_set_lock(&kh_locks[j]) # check whether another thread has already added the entry by now k = kh_get_str(table, element) if k != table.n_buckets: idx = table.vals[k] # if not, add element else: - k = kh_put_str(table, insert, &ret) - table.vals[k] = idx = count[0] - count[0] += 1 - # release all locks - for j in xrange(num_threads): - omp_unset_lock(&kh_locks[j]) - omp_set_lock(&kh_locks[thread_id]) - out_buffer[i] = idx - - omp_unset_lock(&kh_locks[thread_id]) - free(element) - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_str(carray carray_, carray labels=None): - cdef: - chunk chunk_ - chunks labels_chunks - Py_ssize_t i, j, k, leftover_elements, \ - chunklen_multiplier - unsigned int thread_id - omp_lock_t chunk_lock, out_buffer_lock - char * in_buffer_ptr - ndarray[npy_uint64] out_buffer - char * out_buffer_org_ptr - uint64_t * out_buffer_ptr - kh_str_t *table - uintptr_t leftover_array_ptr - - Py_ssize_t count = 0 - Py_ssize_t nchunks = carray_.nchunks - Py_ssize_t tmp_chunks_counter = 0 - omp_lock_t * kh_locks = NULL - dict reverse = {} - list tmp_chunks_list = [] - # num_threads = 0: set the number of parallel threads automatically: - unsigned int num_threads = 0 - - Py_ssize_t chunklen = carray_.chunklen - Py_ssize_t 
allocation_size = carray_.dtype.itemsize + 1 - - - - labels, chunklen_multiplier = check_labels_structure(carray_, labels) - - # provide access to cython chunks api for persistent labels - # fails with in-memory carray - if labels._rootdir is not None: - labels_chunks = labels.chunks - - tmp_chunks_keys = [0] * int(carray_.size / labels.chunklen) - - leftover_array_ptr = carray_.leftover_array.__array_interface__['data'][0] - - table = kh_init_str() - - # initialise locks for other shared resources - omp_init_lock(&chunk_lock) - omp_init_lock(&out_buffer_lock) - - out_buffer = np.empty(labels.chunklen, dtype='uint64') - out_buffer_org_ptr = out_buffer.data - - with nogil, parallel(num_threads=num_threads): - # Initialise shared locks for hash-table writes for each child thread. - omp_start_critical() - (&kh_locks)[0] = par_initialise_kh_locks(&num_threads) - omp_end_critical() - - # allocate an in-buffer and an out-buffer for each thread - in_buffer_ptr = par_allocate_in_buffer(chunklen, allocation_size-1) - out_buffer_ptr = par_allocate_out_buffer(chunklen, chunklen_multiplier) - - for i in prange(0, (nchunks/chunklen_multiplier)*chunklen_multiplier, chunklen_multiplier): - thread_id = threadid() - # pack the results from multiple carray_-chunks into one - # labels-chunk: allows out-of-order writing of labels at good speed - for j in xrange(chunklen_multiplier): - # chunk._getitem is not thread-safe. This may be due to it - # setting properties of chunk using self. - Or not. 
- # TODO: check whether _getitem can be made thread-safe to - # get rid of this probably unnecessary thread lock - omp_set_lock(&chunk_lock) - with gil: - chunk_ = carray_.chunks[i+j] - # decompress into in_buffer - # note: _getitem releases gil during blosc decompression - chunk_._getitem(0, chunklen, in_buffer_ptr) - omp_unset_lock(&chunk_lock) - - _factorize_str_helper(chunklen, - allocation_size, - in_buffer_ptr, - out_buffer_ptr + j * chunklen, - table, - &count, - thread_id, - kh_locks, - num_threads, - ) - - # note 2: labels.append(...) requires ndarray, but no thread-local - # ndarrays can be created. Use the shared out_buffer - # ndarray object and redirect its data pointer to the thread-local - # c array out_buffer_ptr - omp_set_lock(&out_buffer_lock) - with gil: - out_buffer.data = out_buffer_ptr - # compress out_buffer into labels - # _save() and append() freeze at versions prior to v0.8.1 - if labels._rootdir is not None: - labels_chunks._save(int(i/chunklen_multiplier), chunk(out_buffer, labels._dtype, - labels._cparams, _memory=False)) - labels._cbytes += chunk_.cbytes - labels._nbytes += labels.chunklen * labels.itemsize - labels_chunks.nchunks += 1 - # in-memory chunks cannot be "_save()"-ed but must be appended - # to a python chunk list - else: - # TODO: Why does the following not work? (Freezes...) - # tmp_chunks_list[i] = chunk(out_buffer, labels._dtype, labels._cparams, _memory=True) - # - # Work-around until above problem is solved. 
- tmp_chunks_list.append(chunk(out_buffer, labels._dtype, - labels._cparams, _memory=True)) - tmp_chunks_keys[int(i/chunklen_multiplier)] = tmp_chunks_counter - (&tmp_chunks_counter)[0] += 1 - - omp_unset_lock(&out_buffer_lock) - - # Clean-up thread local variables - free(in_buffer_ptr) - free(out_buffer_ptr) - - # update chunk count (necessary due to use of _save() above - if labels._rootdir is None: - for i in xrange(tmp_chunks_counter): - labels.chunks.append(tmp_chunks_list[tmp_chunks_keys[i]]) - labels._cbytes += tmp_chunks_list[tmp_chunks_keys[i]].cbytes - labels._nbytes += labels.chunklen * labels.itemsize - - # process remaining chunks that could not be processed en-bloc, chunks in-order - # TODO: parallelise after refactoring - # This is really slowing us down when chunk_multiplier = 1 !!! - # e.g. uncompressed unique() - in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) - out_buffer_ptr = malloc(chunklen * sizeof(uint64_t)) - out_buffer.data = out_buffer_ptr - for i in xrange(labels.size/carray_.chunklen, nchunks): - chunk_ = carray_.chunks[i] - chunk_._getitem(0, chunklen, in_buffer_ptr) - _factorize_str_helper(chunklen, - allocation_size, - in_buffer_ptr, - out_buffer_ptr, - table, - &count, - 0, - kh_locks, - num_threads, - ) - #out_buffer.data = out_buffer_ptr - labels.append(out_buffer[:chunklen].astype(np.int64)) - - - # processing of leftover_array is not parallelised in view of an upcoming - # changes to bcolz chunk access which will allow seamlessly folding - # this into the prange block - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_str_helper(leftover_elements, - allocation_size, - leftover_array_ptr, - out_buffer_ptr, - table, - &count, - 0, - kh_locks, - num_threads, - ) - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - - free(in_buffer_ptr) - free(out_buffer_ptr) - - # destroy and free all locks - for j 
in xrange(num_threads): - omp_destroy_lock(&kh_locks[j]) - free(kh_locks) - omp_destroy_lock(&chunk_lock) - omp_destroy_lock(&out_buffer_lock) - - # restore out_buffer data pointer to avoid segfault when python thries - # to free the object's data - out_buffer.data = out_buffer_org_ptr - + k = kh_put_str(table, insert, &ret) + table.vals[k] = idx = count[0] + count[0] += 1 + # release all locks + for j in xrange(num_threads): + omp_unset_lock(&kh_locks[j]) + # acquire our own lock again, to indicate we are reading the table + omp_set_lock(&kh_locks[thread_id]) + out_buffer[i] = idx + + omp_unset_lock(&kh_locks[thread_id]) + free(element) + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_str(carray carray_, unsigned int num_threads_requested = 0, **kwargs): + cdef: + Py_ssize_t i, blocklen, nblocks + khint_t j + carray labels + ndarray in_buffer + ndarray[npy_uint64] out_buffer + unsigned int thread_id + par_info_t par_info + + kh_str_t *table + char * out_buffer_org_ptr + char * in_buffer_ptr + uint64_t * out_buffer_ptr + + Py_ssize_t count = 0 + dict reverse = {} + + bint first_thread = True + Py_ssize_t element_allocation_size = carray_.dtype.itemsize + 1 + Py_ssize_t carray_chunklen = carray_.chunklen + + labels = create_labels_carray(carray_, **kwargs) + + out_buffer = np.empty(labels.chunklen, dtype='uint64') + out_buffer_org_ptr = out_buffer.data + + table = kh_init_str() + + block_iterator = bz.iterblocks(carray_, blen=labels.chunklen) + nblocks = (len(carray_)/labels.chunklen+0.5)+1 + + with nogil, parallel(num_threads=num_threads_requested): + # Initialise some parallel processing stuff + omp_start_critical() + if first_thread == True: + (&first_thread)[0] = False + with gil: + par_initialise(&par_info, carray_) + omp_end_critical() + + # allocate thread-local in- and out-buffers + with gil: + in_buffer_ptr = par_allocate_buffer(labels.chunklen, carray_.dtype.itemsize) + out_buffer_ptr = par_allocate_buffer(labels.chunklen, 
labels.dtype.itemsize) + + # factorise the chunks in parallel + for i in prange(0, nblocks, schedule='dynamic', chunksize=1): + thread_id = threadid() + + omp_set_lock(&par_info.chunk_lock) + with gil: + in_buffer = np.ascontiguousarray(block_iterator.next()) + blocklen = len(in_buffer) + memcpy(in_buffer_ptr, in_buffer.data, in_buffer.nbytes) + omp_unset_lock(&par_info.chunk_lock) + + _factorize_str_helper(blocklen, + element_allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + thread_id, + par_info.kh_locks, + par_info.num_threads, + ) + + # compress out_buffer into labels + omp_set_lock(&par_info.out_buffer_lock) + with gil: + out_buffer.data = out_buffer_ptr + par_save_block(i, labels, out_buffer, blocklen, nblocks) + omp_unset_lock(&par_info.out_buffer_lock) + + # Clean-up thread local variables + free(in_buffer_ptr) + free(out_buffer_ptr) + + # Clean-up some parallel processing stuff + par_terminate(par_info) + + # restore out_buffer data pointer to allow python to free the object's data + out_buffer.data = out_buffer_org_ptr + # construct python dict from vectors and free element memory - cdef unsigned long m - for m in xrange(table.n_buckets): - if not kh_exist_str(table, m): + for j in range(table.n_buckets): + if not kh_exist_str(table, j): continue - reverse[table.vals[m]] = table.keys[m] - free(table.keys[m]) + reverse[table.vals[j]] = table.keys[j] + free(table.keys[j]) kh_destroy_str(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - element = in_buffer[i] - k = kh_get_int64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = 
kh_put_int64(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int64(carray carray_, carray labels=None): - cdef: - chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements - dict reverse - ndarray[npy_int64] in_buffer - ndarray[npy_uint64] out_buffer - kh_int64_t *table - - count = 0 - ret = 0 - reverse = {} - - n = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype='int64') - table = kh_init_int64() - - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_int64_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_int64_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) - - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - kh_destroy_int64(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int32_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int32] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int32_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int32 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - element = in_buffer[i] - k = kh_get_int32(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = 
kh_put_int32(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int32(carray carray_, carray labels=None): - cdef: - chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements - dict reverse - ndarray[npy_int32] in_buffer - ndarray[npy_uint64] out_buffer - kh_int32_t *table - - count = 0 - ret = 0 - reverse = {} - - n = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype='int32') - table = kh_init_int32() - - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_int32_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_int32_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) - - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - kh_destroy_int32(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_float64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_float64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_float64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_float64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array 
for efficiency - element = in_buffer[i] - k = kh_get_float64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_float64(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_float64(carray carray_, carray labels=None): - cdef: - chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements - dict reverse - ndarray[npy_float64] in_buffer - ndarray[npy_uint64] out_buffer - kh_float64_t *table - - count = 0 - ret = 0 - reverse = {} - - n = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype='float64') - table = kh_init_float64() - - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_float64_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_float64_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) - - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - kh_destroy_float64(table) - - return labels, reverse - -cpdef factorize(carray carray_, carray labels=None): - if carray_.dtype == 'int32': - labels, reverse = factorize_int32(carray_, labels=labels) - elif carray_.dtype == 'int64': - labels, reverse = factorize_int64(carray_, labels=labels) - elif carray_.dtype == 'float64': - labels, reverse 
= factorize_float64(carray_, labels=labels) - else: - #TODO: check that the input is a string_ dtype type - labels, reverse = factorize_str(carray_, labels=labels) - return labels, reverse - -# --------------------------------------------------------------------------- -# Translate existing arrays -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): - cdef: - chunk chunk_ - Py_ssize_t i, chunklen, leftover_elements - ndarray[npy_int64] in_buffer - ndarray[npy_int64] out_buffer - - chunklen = input_.chunklen - out_buffer = np.empty(chunklen, dtype='int64') - in_buffer = np.empty(chunklen, dtype='int64') - - for i in range(input_.nchunks): - chunk_ = input_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - for i in range(chunklen): - element = in_buffer[i] - out_buffer[i] = lookup.get(element, default) - # compress out_buffer into labels - output_.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(input_.leftover, input_.atomsize) - if leftover_elements > 0: - in_buffer = input_.leftover_array - for i in range(leftover_elements): - element = in_buffer[i] - out_buffer[i] = lookup.get(element, default) - output_.append(out_buffer[:leftover_elements].astype(np.int64)) - -# --------------------------------------------------------------------------- -# Aggregation Section (old) -@cython.boundscheck(False) -@cython.wraparound(False) -def agg_sum_na(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - if v == v: # skip NA values - v_cum += v - - return v_cum - -@cython.boundscheck(False) -@cython.wraparound(False) -def agg_sum(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - v_cum += v - - return v_cum - -# --------------------------------------------------------------------------- -# Aggregation Section -@cython.boundscheck(False) -@cython.wraparound(False) -def groupsort_indexer(carray 
index, Py_ssize_t ngroups): - cdef: - Py_ssize_t i, label, n - ndarray[int64_t] counts, where, np_result - # -- - carray c_result - chunk input_chunk, index_chunk - Py_ssize_t index_chunk_nr, index_chunk_len, leftover_elements - - ndarray[int64_t] in_buffer - - index_chunk_len = index.chunklen - in_buffer = np.empty(index_chunk_len, dtype='int64') - index_chunk_nr = 0 - - # count group sizes, location 0 for NA - counts = np.zeros(ngroups + 1, dtype=np.int64) - n = len(index) - - for index_chunk_nr in range(index.nchunks): - # fill input buffer - input_chunk = index.chunks[index_chunk_nr] - input_chunk._getitem(0, index_chunk_len, in_buffer.data) - - # loop through rows - for i in range(index_chunk_len): - counts[index[i] + 1] += 1 - - leftover_elements = cython.cdiv(index.leftover, index.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = index.leftover_array - - # loop through rows - for i in range(leftover_elements): - counts[index[i] + 1] += 1 - - # mark the start of each contiguous group of like-indexed data - where = np.zeros(ngroups + 1, dtype=np.int64) - for i from 1 <= i < ngroups + 1: - where[i] = where[i - 1] + counts[i - 1] - - # this is our indexer - np_result = np.zeros(n, dtype=np.int64) - for i from 0 <= i < n: - label = index[i] + 1 - np_result[where[label]] = i - where[label] += 1 - - return np_result, counts - -cdef count_unique_float64(ndarray[float64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - float64_t val - khiter_t k - npy_uint64 count = 0 - bint seen_na = 0 - kh_float64_t *table - - table = kh_init_float64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(table, val) - if k == table.n_buckets: - k = kh_put_float64(table, val, &ret) - count += 1 - elif not seen_na: - seen_na = 1 - count += 1 - - kh_destroy_float64(table) - - return count - -cdef count_unique_int64(ndarray[int64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - 
int ret = 0 - int64_t val - khiter_t k - npy_uint64 count = 0 - kh_int64_t *table - - table = kh_init_int64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int64(table, val) - if k == table.n_buckets: - k = kh_put_int64(table, val, &ret) - count += 1 - - kh_destroy_int64(table) - - return count - -cdef count_unique_int32(ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - int32_t val - khiter_t k - npy_uint64 count = 0 - kh_int32_t *table - - table = kh_init_int32() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int32(table, val) - if k == table.n_buckets: - k = kh_put_int32(table, val, &ret) - count += 1 - - kh_destroy_int32(table) - - return count - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_float64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts, factor_total_chunks, leftover_elements - - ndarray[npy_float64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_float64] out_buffer - ndarray[npy_float64] last_values - - npy_float64 v - bint count_distinct_started = 0 - carray num_uniques - - - count = 0 - ret = 0 - reverse = {} - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_float64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype='float64') - factor_chunk_len = 
ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='float64') - - for input_chunk_nr in range(ca_input.nchunks): - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - # loop through rows - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - v = in_buffer[i] - if v == v: # skip NA values - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='float64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = ca_input.leftover_array - - # loop through rows 
- for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - v = in_buffer[i] - if v == v: # skip NA values - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='float64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int32(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts, factor_total_chunks, leftover_elements - - ndarray[npy_int32] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int32] out_buffer - ndarray[npy_int32] last_values - - npy_int32 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret 
= 0 - reverse = {} - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int32(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype='int32') - factor_chunk_len = ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int32') - - for input_chunk_nr in range(ca_input.nchunks): - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - # loop through rows - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int32') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = ca_input.leftover_array - - # loop through rows - for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int32') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts, factor_total_chunks, leftover_elements - - ndarray[npy_int64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int64] out_buffer - ndarray[npy_int64] last_values - - npy_int64 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype='int64') - factor_chunk_len = ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = 
ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int64') - - for input_chunk_nr in range(ca_input.nchunks): - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - # loop through rows - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = ca_input.leftover_array - - # loop through rows - for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, factor_total_chunks, leftover_elements - - ndarray in_buffer - ndarray[npy_int64] factor_buffer - ndarray out_buffer - - count = 0 - ret = 0 - reverse = {} - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype=ca_input.dtype) - factor_chunk_len = ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype) - - for input_chunk_nr in range(ca_input.nchunks): - - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - 
factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - out_buffer[current_index] = in_buffer[i] - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - in_buffer = ca_input.leftover_array - - for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - out_buffer[current_index] = in_buffer[i] - - # check whether a row has to be fixed - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -def aggregate_groups_by_iter_2(ct_input, - ct_agg, - npy_uint64 nr_groups, - npy_uint64 skip_key, - carray factor_carray, - groupby_cols, - output_agg_ops, - dtype_list, - agg_method=_SUM - ): - total = [] - - for col in groupby_cols: - total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key)) - - for col, agg_op in output_agg_ops: - # TODO: input vs output column - col_dtype = ct_agg[col].dtype - if col_dtype == np.float64: - total.append( - sum_float64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == np.int64: - total.append( - sum_int64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == 
np.int32: - total.append( - sum_int32(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - else: - raise NotImplementedError( - 'Column dtype ({0}) not supported for aggregation yet ' - '(only int32, int64 & float64)'.format(str(col_dtype))) - - ct_agg.append(total) - -# --------------------------------------------------------------------------- -# Temporary Section -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse): - """ - TEMPORARY WORKAROUND till numexpr support in list operations - - Update a boolean array with checks whether the values of a column (col) are in a set (value_set) - Reverse means "not in" functionality - - For the 0d array work around, see https://github.com/Blosc/bcolz/issues/61 - - :param col: - :param value_set: - :param boolarr: - :param reverse: - :return: - """ - cdef Py_ssize_t i - i = 0 - if not reverse: - for val in col.iter(): - if val not in value_set: - boolarr[i] = False - i += 1 - else: - for val in col.iter(): - if val in value_set: - boolarr[i] = False - i += 1 + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int64(carray carray_, carray labels=None): + cdef: + 
Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int64] in_buffer + ndarray[npy_uint64] out_buffer + kh_int64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) + out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int64(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int32_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int32] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int32_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int32 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int32(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int32(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int32(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int32] in_buffer + ndarray[npy_uint64] out_buffer + kh_int32_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', 
expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) + out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int32() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int32_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int32(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_float64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_float64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_float64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_float64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_float64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_float64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_float64(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_float64] in_buffer + ndarray[npy_uint64] out_buffer + kh_float64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_float64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_float64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_float64(table) + + return labels, reverse + +def factorize(carray carray_, **kwargs): + if carray_.dtype == 'int32': + labels, reverse = factorize_int32(carray_, **kwargs) + elif carray_.dtype == 'int64': + labels, reverse = factorize_int64(carray_, **kwargs) + elif carray_.dtype == 'float64': + labels, reverse = factorize_float64(carray_, **kwargs) + else: + #TODO: check that the input is a string_ dtype type + labels, reverse = factorize_str(carray_, **kwargs) + return labels, reverse + +# --------------------------------------------------------------------------- +# Translate existing arrays +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): + cdef: + Py_ssize_t chunklen, leftover_elements, len_in_buffer + ndarray[npy_int64] in_buffer + ndarray[npy_int64] out_buffer + + chunklen = input_.chunklen + out_buffer = np.empty(chunklen, dtype='int64') + + for in_buffer in bz.iterblocks(input_): + len_in_buffer = len(in_buffer) + for i in range(len_in_buffer): + element = in_buffer[i] + out_buffer[i] = lookup.get(element, default) + # compress out_buffer into labels + output_.append(out_buffer[:len_in_buffer].astype(np.int64)) + +# --------------------------------------------------------------------------- +# Aggregation Section (old) +@cython.boundscheck(False) +@cython.wraparound(False) +def agg_sum_na(iter_): + cdef: + npy_float64 v, v_cum = 0.0 + + for v in iter_: + if v == v: # skip NA values + v_cum += v + + return v_cum + +@cython.boundscheck(False) +@cython.wraparound(False) 
+def agg_sum(iter_): + cdef: + npy_float64 v, v_cum = 0.0 + + for v in iter_: + v_cum += v + + return v_cum + +# --------------------------------------------------------------------------- +# Aggregation Section +@cython.boundscheck(False) +@cython.wraparound(False) +def groupsort_indexer(carray index, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, label, n, len_in_buffer + ndarray[int64_t] counts, where, np_result + # -- + carray c_result + chunk input_chunk, index_chunk + Py_ssize_t index_chunk_nr, index_chunk_len, leftover_elements + + ndarray[int64_t] in_buffer + + index_chunk_len = index.chunklen + in_buffer = np.empty(index_chunk_len, dtype='int64') + index_chunk_nr = 0 + + # count group sizes, location 0 for NA + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + + for in_buffer in bz.iterblocks(index): + len_in_buffer = len(in_buffer) + # loop through rows + for i in range(len_in_buffer): + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + where = np.zeros(ngroups + 1, dtype=np.int64) + for i from 1 <= i < ngroups + 1: + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + np_result = np.zeros(n, dtype=np.int64) + for i from 0 <= i < n: + label = index[i] + 1 + np_result[where[label]] = i + where[label] += 1 + + return np_result, counts + +cdef count_unique_float64(ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx + int ret = 0 + float64_t val + khiter_t k + npy_uint64 count = 0 + bint seen_na = 0 + kh_float64_t *table + + table = kh_init_float64() + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(table, val) + if k == table.n_buckets: + k = kh_put_float64(table, val, &ret) + count += 1 + elif not seen_na: + seen_na = 1 + count += 1 + + kh_destroy_float64(table) + + return count + +cdef count_unique_int64(ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx + int ret = 0 + int64_t val + 
khiter_t k + npy_uint64 count = 0 + kh_int64_t *table + + table = kh_init_int64() + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_int64(table, val) + if k == table.n_buckets: + k = kh_put_int64(table, val, &ret) + count += 1 + + kh_destroy_int64(table) + + return count + +cdef count_unique_int32(ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx + int ret = 0 + int32_t val + khiter_t k + npy_uint64 count = 0 + kh_int32_t *table + + table = kh_init_int32() + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_int32(table, val) + if k == table.n_buckets: + k = kh_put_int32(table, val, &ret) + count += 1 + + kh_destroy_int32(table) + + return count + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef sum_float64(carray ca_input, carray ca_factor, + Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, j, end_counts, start_counts + + ndarray[npy_float64] in_buffer + ndarray[npy_int64] factor_buffer + ndarray[npy_float64] out_buffer + ndarray[npy_float64] last_values + + npy_float64 v + bint count_distinct_started = 0 + carray num_uniques + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + if agg_method == _COUNT_DISTINCT: + num_uniques = carray([], dtype='int64') + positions, counts = groupsort_indexer(ca_factor, nr_groups) + start_counts = 0 + end_counts = 0 + for j in range(len(counts) - 1): + start_counts = end_counts + end_counts = start_counts + counts[j + 1] + positions[start_counts:end_counts] + num_uniques.append( + count_unique_float64(ca_input[positions[start_counts:end_counts]]) + ) + + return num_uniques + + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype='float64') + + for in_buffer in 
bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + # loop through rows + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + if factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + if agg_method == _SUM: + out_buffer[current_index] += in_buffer[i] + elif agg_method == _COUNT: + out_buffer[current_index] += 1 + elif agg_method == _COUNT_NA: + + v = in_buffer[i] + if v == v: # skip NA values + out_buffer[current_index] += 1 + elif agg_method == _SORTED_COUNT_DISTINCT: + v = in_buffer[i] + if not count_distinct_started: + count_distinct_started = 1 + last_values = np.zeros(nr_groups, dtype='float64') + last_values[0] = v + out_buffer[0] = 1 + else: + if v != last_values[current_index]: + out_buffer[current_index] += 1 + + last_values[current_index] = v + else: + raise NotImplementedError('sumtype not supported') + + # check whether a row has to be removed if it was meant to be skipped + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef sum_int32(carray ca_input, carray ca_factor, + Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, j, end_counts, start_counts + + ndarray[npy_int32] in_buffer + ndarray[npy_int64] factor_buffer + ndarray[npy_int32] out_buffer + ndarray[npy_int32] last_values + + npy_int32 v + bint count_distinct_started = 0 + carray num_uniques + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + if agg_method == _COUNT_DISTINCT: + num_uniques = carray([], dtype='int64') + positions, counts = 
groupsort_indexer(ca_factor, nr_groups) + start_counts = 0 + end_counts = 0 + for j in range(len(counts) - 1): + start_counts = end_counts + end_counts = start_counts + counts[j + 1] + positions[start_counts:end_counts] + num_uniques.append( + count_unique_int32(ca_input[positions[start_counts:end_counts]]) + ) + + return num_uniques + + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype='int32') + + for in_buffer in bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + # loop through rows + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + if factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + if agg_method == _SUM: + out_buffer[current_index] += in_buffer[i] + elif agg_method == _COUNT: + out_buffer[current_index] += 1 + elif agg_method == _COUNT_NA: + + # TODO: Warning: int does not support NA values, is this what we need? 
+ out_buffer[current_index] += 1 + elif agg_method == _SORTED_COUNT_DISTINCT: + v = in_buffer[i] + if not count_distinct_started: + count_distinct_started = 1 + last_values = np.zeros(nr_groups, dtype='int32') + last_values[0] = v + out_buffer[0] = 1 + else: + if v != last_values[current_index]: + out_buffer[current_index] += 1 + + last_values[current_index] = v + else: + raise NotImplementedError('sumtype not supported') + + # check whether a row has to be removed if it was meant to be skipped + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef sum_int64(carray ca_input, carray ca_factor, + Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, j, end_counts, start_counts + + ndarray[npy_int64] in_buffer + ndarray[npy_int64] factor_buffer + ndarray[npy_int64] out_buffer + ndarray[npy_int64] last_values + + npy_int64 v + bint count_distinct_started = 0 + carray num_uniques + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + if agg_method == _COUNT_DISTINCT: + num_uniques = carray([], dtype='int64') + positions, counts = groupsort_indexer(ca_factor, nr_groups) + start_counts = 0 + end_counts = 0 + for j in range(len(counts) - 1): + start_counts = end_counts + end_counts = start_counts + counts[j + 1] + positions[start_counts:end_counts] + num_uniques.append( + count_unique_int64(ca_input[positions[start_counts:end_counts]]) + ) + + return num_uniques + + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype='int64') + + for in_buffer in bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + # loop through rows + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + 
if factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + if agg_method == _SUM: + out_buffer[current_index] += in_buffer[i] + elif agg_method == _COUNT: + out_buffer[current_index] += 1 + elif agg_method == _COUNT_NA: + + # TODO: Warning: int does not support NA values, is this what we need? + out_buffer[current_index] += 1 + elif agg_method == _SORTED_COUNT_DISTINCT: + v = in_buffer[i] + if not count_distinct_started: + count_distinct_started = 1 + last_values = np.zeros(nr_groups, dtype='int64') + last_values[0] = v + out_buffer[0] = 1 + else: + if v != last_values[current_index]: + out_buffer[current_index] += 1 + + last_values[current_index] = v + else: + raise NotImplementedError('sumtype not supported') + + # check whether a row has to be removed if it was meant to be skipped + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, factor_total_chunks + + ndarray in_buffer + ndarray[npy_int64] factor_buffer + ndarray out_buffer + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + + factor_total_chunks = ca_factor.nchunks + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype) + + for in_buffer in bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + if 
factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + out_buffer[current_index] = in_buffer[i] + + # check whether a row has to be fixed + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +def aggregate_groups_by_iter_2(ct_input, + ct_agg, + npy_uint64 nr_groups, + npy_uint64 skip_key, + carray factor_carray, + groupby_cols, + output_agg_ops, + dtype_list, + agg_method=_SUM + ): + total = [] + + for col in groupby_cols: + total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key)) + + for col, agg_op in output_agg_ops: + # TODO: input vs output column + col_dtype = ct_agg[col].dtype + if col_dtype == np.float64: + total.append( + sum_float64(ct_input[col], factor_carray, nr_groups, skip_key, + agg_method=agg_method) + ) + elif col_dtype == np.int64: + total.append( + sum_int64(ct_input[col], factor_carray, nr_groups, skip_key, + agg_method=agg_method) + ) + elif col_dtype == np.int32: + total.append( + sum_int32(ct_input[col], factor_carray, nr_groups, skip_key, + agg_method=agg_method) + ) + else: + raise NotImplementedError( + 'Column dtype ({0}) not supported for aggregation yet ' + '(only int32, int64 & float64)'.format(str(col_dtype))) + + ct_agg.append(total) + +# --------------------------------------------------------------------------- +# Temporary Section +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse): + """ + TEMPORARY WORKAROUND till numexpr support in list operations + + Update a boolean array with checks whether the values of a column (col) are in a set (value_set) + Reverse means "not in" functionality + + For the 0d array work around, see 
https://github.com/Blosc/bcolz/issues/61 + + :param col: + :param value_set: + :param boolarr: + :param reverse: + :return: + """ + cdef Py_ssize_t i + i = 0 + if not reverse: + for val in col.iter(): + if val not in value_set: + boolarr[i] = False + i += 1 + else: + for val in col.iter(): + if val in value_set: + boolarr[i] = False + i += 1 From a1092c60c52f45945565f93921758a24b7943f39 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 15:16:54 +0100 Subject: [PATCH 8/8] iterblocks implementation with pandas khash (unix line endings) --- bquery/ctable_ext.pyx | 1980 ++++++++++++++++++++--------------------- 1 file changed, 990 insertions(+), 990 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index a3b2ab9..cb3cc04 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,990 +1,990 @@ -cimport cython -from cython.parallel import parallel, prange, threadid -from openmp cimport * - -import numpy as np -from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, - npy_int64, npy_float64, uint64_t) - -from libc.stdint cimport uintptr_t -from libc.stdlib cimport malloc, calloc, free -from libc.string cimport strcpy, memcpy, memset -from libcpp.vector cimport vector - -from khash cimport * -import bcolz as bz -from bcolz.carray_ext cimport carray, chunks, chunk - -include "parallel_processing.pxi" - -# ---------------------------------------------------------------------------- -# GLOBAL DEFINITIONS -# ---------------------------------------------------------------------------- -SUM = 0 -DEF _SUM = 0 - -COUNT = 1 -DEF _COUNT = 1 - -COUNT_NA = 2 -DEF _COUNT_NA = 2 - -COUNT_DISTINCT = 3 -DEF _COUNT_DISTINCT = 3 - -SORTED_COUNT_DISTINCT = 4 -DEF _SORTED_COUNT_DISTINCT = 4 - - -# ---------------------------------------------------------------------------- -# FACTORIZE SECTION -# ---------------------------------------------------------------------------- -@cython.wraparound(False) -@cython.boundscheck(False) 
-cdef void _factorize_str_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - char * in_buffer_ptr, - uint64_t * out_buffer, - kh_str_t *table, - Py_ssize_t * count, - unsigned int thread_id, - omp_lock_t kh_locks[], - unsigned int num_threads, - ) nogil: - cdef: - Py_ssize_t i, idx - unsigned int j - khiter_t k - - char * element - char * insert - - int ret = 0 - Py_ssize_t itemsize = allocation_size - 1 - - # allocate enough memory to hold the string element, add one for the - # null byte that marks the end of the string. - # TODO: understand why zero-filling is necessary. Without zero-filling - # the buffer, duplicate keys occur in the reverse dict - element = calloc(allocation_size, sizeof(char)) - - # lock ensures that table is in consistend state during access - omp_set_lock(&kh_locks[thread_id]) - for i in xrange(iter_range): - # appends null character to element string - memcpy(element, in_buffer_ptr, itemsize) - in_buffer_ptr += itemsize - - k = kh_get_str(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - omp_unset_lock(&kh_locks[thread_id]) - insert = malloc(allocation_size) - # TODO: is strcpy really the best way to copy a string? 
- strcpy(insert, element) - - # acquire locks for all threads to avoid inconsistent state - for j in xrange(num_threads): - omp_set_lock(&kh_locks[j]) - # check whether another thread has already added the entry by now - k = kh_get_str(table, element) - if k != table.n_buckets: - idx = table.vals[k] - # if not, add element - else: - k = kh_put_str(table, insert, &ret) - table.vals[k] = idx = count[0] - count[0] += 1 - # release all locks - for j in xrange(num_threads): - omp_unset_lock(&kh_locks[j]) - # acquire our own lock again, to indicate we are reading the table - omp_set_lock(&kh_locks[thread_id]) - out_buffer[i] = idx - - omp_unset_lock(&kh_locks[thread_id]) - free(element) - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_str(carray carray_, unsigned int num_threads_requested = 0, **kwargs): - cdef: - Py_ssize_t i, blocklen, nblocks - khint_t j - carray labels - ndarray in_buffer - ndarray[npy_uint64] out_buffer - unsigned int thread_id - par_info_t par_info - - kh_str_t *table - char * out_buffer_org_ptr - char * in_buffer_ptr - uint64_t * out_buffer_ptr - - Py_ssize_t count = 0 - dict reverse = {} - - bint first_thread = True - Py_ssize_t element_allocation_size = carray_.dtype.itemsize + 1 - Py_ssize_t carray_chunklen = carray_.chunklen - - labels = create_labels_carray(carray_, **kwargs) - - out_buffer = np.empty(labels.chunklen, dtype='uint64') - out_buffer_org_ptr = out_buffer.data - - table = kh_init_str() - - block_iterator = bz.iterblocks(carray_, blen=labels.chunklen) - nblocks = (len(carray_)/labels.chunklen+0.5)+1 - - with nogil, parallel(num_threads=num_threads_requested): - # Initialise some parallel processing stuff - omp_start_critical() - if first_thread == True: - (&first_thread)[0] = False - with gil: - par_initialise(&par_info, carray_) - omp_end_critical() - - # allocate thread-local in- and out-buffers - with gil: - in_buffer_ptr = par_allocate_buffer(labels.chunklen, carray_.dtype.itemsize) - out_buffer_ptr = 
par_allocate_buffer(labels.chunklen, labels.dtype.itemsize) - - # factorise the chunks in parallel - for i in prange(0, nblocks, schedule='dynamic', chunksize=1): - thread_id = threadid() - - omp_set_lock(&par_info.chunk_lock) - with gil: - in_buffer = np.ascontiguousarray(block_iterator.next()) - blocklen = len(in_buffer) - memcpy(in_buffer_ptr, in_buffer.data, in_buffer.nbytes) - omp_unset_lock(&par_info.chunk_lock) - - _factorize_str_helper(blocklen, - element_allocation_size, - in_buffer_ptr, - out_buffer_ptr, - table, - &count, - thread_id, - par_info.kh_locks, - par_info.num_threads, - ) - - # compress out_buffer into labels - omp_set_lock(&par_info.out_buffer_lock) - with gil: - out_buffer.data = out_buffer_ptr - par_save_block(i, labels, out_buffer, blocklen, nblocks) - omp_unset_lock(&par_info.out_buffer_lock) - - # Clean-up thread local variables - free(in_buffer_ptr) - free(out_buffer_ptr) - - # Clean-up some parallel processing stuff - par_terminate(par_info) - - # restore out_buffer data pointer to allow python to free the object's data - out_buffer.data = out_buffer_org_ptr - - # construct python dict from vectors and free element memory - for j in range(table.n_buckets): - if not kh_exist_str(table, j): - continue - reverse[table.vals[j]] = table.keys[j] - free(table.keys[j]) - kh_destroy_str(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] - k = kh_get_int64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_int64(table, element, &ret) - table.vals[k] = idx 
= count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int64(carray carray_, carray labels=None): - cdef: - Py_ssize_t len_carray, count, chunklen, len_in_buffer - dict reverse - ndarray[npy_int64] in_buffer - ndarray[npy_uint64] out_buffer - kh_int64_t *table - - count = 0 - ret = 0 - reverse = {} - - len_carray = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=len_carray) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - table = kh_init_int64() - - for in_buffer in bz.iterblocks(carray_): - len_in_buffer = len(in_buffer) - _factorize_int64_helper(len_in_buffer, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer[:len_in_buffer].astype(np.int64)) - - kh_destroy_int64(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int32_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int32] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int32_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int32 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] - k = kh_get_int32(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_int32(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int32(carray carray_, carray labels=None): - cdef: - Py_ssize_t len_carray, count, chunklen, len_in_buffer - dict reverse - ndarray[npy_int32] in_buffer - 
ndarray[npy_uint64] out_buffer - kh_int32_t *table - - count = 0 - ret = 0 - reverse = {} - - len_carray = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=len_carray) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - table = kh_init_int32() - - for in_buffer in bz.iterblocks(carray_): - len_in_buffer = len(in_buffer) - _factorize_int32_helper(len_in_buffer, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer[:len_in_buffer].astype(np.int64)) - - kh_destroy_int32(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_float64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_float64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_float64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_float64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] - k = kh_get_float64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_float64(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_float64(carray carray_, carray labels=None): - cdef: - Py_ssize_t len_carray, count, chunklen, len_in_buffer - dict reverse - ndarray[npy_float64] in_buffer - ndarray[npy_uint64] out_buffer - kh_float64_t *table - - count = 0 - ret = 0 - reverse = {} - - len_carray = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=len_carray) - # in-buffer isn't typed, because cython doesn't support 
string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - table = kh_init_float64() - - for in_buffer in bz.iterblocks(carray_): - len_in_buffer = len(in_buffer) - _factorize_float64_helper(len_in_buffer, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer[:len_in_buffer].astype(np.int64)) - - kh_destroy_float64(table) - - return labels, reverse - -def factorize(carray carray_, **kwargs): - if carray_.dtype == 'int32': - labels, reverse = factorize_int32(carray_, **kwargs) - elif carray_.dtype == 'int64': - labels, reverse = factorize_int64(carray_, **kwargs) - elif carray_.dtype == 'float64': - labels, reverse = factorize_float64(carray_, **kwargs) - else: - #TODO: check that the input is a string_ dtype type - labels, reverse = factorize_str(carray_, **kwargs) - return labels, reverse - -# --------------------------------------------------------------------------- -# Translate existing arrays -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): - cdef: - Py_ssize_t chunklen, leftover_elements, len_in_buffer - ndarray[npy_int64] in_buffer - ndarray[npy_int64] out_buffer - - chunklen = input_.chunklen - out_buffer = np.empty(chunklen, dtype='int64') - - for in_buffer in bz.iterblocks(input_): - len_in_buffer = len(in_buffer) - for i in range(len_in_buffer): - element = in_buffer[i] - out_buffer[i] = lookup.get(element, default) - # compress out_buffer into labels - output_.append(out_buffer[:len_in_buffer].astype(np.int64)) - -# --------------------------------------------------------------------------- -# Aggregation Section (old) -@cython.boundscheck(False) -@cython.wraparound(False) -def agg_sum_na(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - if v == v: # skip NA values - v_cum += v - - return v_cum - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def agg_sum(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - v_cum += v - - return v_cum - -# --------------------------------------------------------------------------- -# Aggregation Section -@cython.boundscheck(False) -@cython.wraparound(False) -def groupsort_indexer(carray index, Py_ssize_t ngroups): - cdef: - Py_ssize_t i, label, n, len_in_buffer - ndarray[int64_t] counts, where, np_result - # -- - carray c_result - chunk input_chunk, index_chunk - Py_ssize_t index_chunk_nr, index_chunk_len, leftover_elements - - ndarray[int64_t] in_buffer - - index_chunk_len = index.chunklen - in_buffer = np.empty(index_chunk_len, dtype='int64') - index_chunk_nr = 0 - - # count group sizes, location 0 for NA - counts = np.zeros(ngroups + 1, dtype=np.int64) - n = len(index) - - for in_buffer in bz.iterblocks(index): - len_in_buffer = len(in_buffer) - # loop through rows - for i in range(len_in_buffer): - counts[index[i] + 1] += 1 - - # mark the start of each contiguous group of like-indexed data - where = np.zeros(ngroups + 1, dtype=np.int64) - for i from 1 <= i < ngroups + 1: - where[i] = where[i - 1] + counts[i - 1] - - # this is our indexer - np_result = np.zeros(n, dtype=np.int64) - for i from 0 <= i < n: - label = index[i] + 1 - np_result[where[label]] = i - where[label] += 1 - - return np_result, counts - -cdef count_unique_float64(ndarray[float64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - float64_t val - khiter_t k - npy_uint64 count = 0 - bint seen_na = 0 - kh_float64_t *table - - table = kh_init_float64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(table, val) - if k == table.n_buckets: - k = kh_put_float64(table, val, &ret) - count += 1 - elif not seen_na: - seen_na = 1 - count += 1 - - kh_destroy_float64(table) - - return count - -cdef count_unique_int64(ndarray[int64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int 
ret = 0 - int64_t val - khiter_t k - npy_uint64 count = 0 - kh_int64_t *table - - table = kh_init_int64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int64(table, val) - if k == table.n_buckets: - k = kh_put_int64(table, val, &ret) - count += 1 - - kh_destroy_int64(table) - - return count - -cdef count_unique_int32(ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - int32_t val - khiter_t k - npy_uint64 count = 0 - kh_int32_t *table - - table = kh_init_int32() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int32(table, val) - if k == table.n_buckets: - k = kh_put_int32(table, val, &ret) - count += 1 - - kh_destroy_int32(table) - - return count - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_float64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts - - ndarray[npy_float64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_float64] out_buffer - ndarray[npy_float64] last_values - - npy_float64 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_float64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, 
dtype='float64') - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - # loop through rows - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - - v = in_buffer[i] - if v == v: # skip NA values - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='float64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int32(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts - - ndarray[npy_int32] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int32] out_buffer - ndarray[npy_int32] last_values - - npy_int32 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') 
- positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int32(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int32') - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - # loop through rows - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int32') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts - - ndarray[npy_int64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int64] out_buffer - ndarray[npy_int64] last_values - - npy_int64 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int64') - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - # loop through rows - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - 
if factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - - # TODO: Warning: int does not support NA values, is this what we need? - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, factor_total_chunks - - ndarray in_buffer - ndarray[npy_int64] factor_buffer - ndarray out_buffer - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype) - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - if 
factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - out_buffer[current_index] = in_buffer[i] - - # check whether a row has to be fixed - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -def aggregate_groups_by_iter_2(ct_input, - ct_agg, - npy_uint64 nr_groups, - npy_uint64 skip_key, - carray factor_carray, - groupby_cols, - output_agg_ops, - dtype_list, - agg_method=_SUM - ): - total = [] - - for col in groupby_cols: - total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key)) - - for col, agg_op in output_agg_ops: - # TODO: input vs output column - col_dtype = ct_agg[col].dtype - if col_dtype == np.float64: - total.append( - sum_float64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == np.int64: - total.append( - sum_int64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == np.int32: - total.append( - sum_int32(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - else: - raise NotImplementedError( - 'Column dtype ({0}) not supported for aggregation yet ' - '(only int32, int64 & float64)'.format(str(col_dtype))) - - ct_agg.append(total) - -# --------------------------------------------------------------------------- -# Temporary Section -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse): - """ - TEMPORARY WORKAROUND till numexpr support in list operations - - Update a boolean array with checks whether the values of a column (col) are in a set (value_set) - Reverse means "not in" functionality - - For the 0d array work around, see 
https://github.com/Blosc/bcolz/issues/61 - - :param col: - :param value_set: - :param boolarr: - :param reverse: - :return: - """ - cdef Py_ssize_t i - i = 0 - if not reverse: - for val in col.iter(): - if val not in value_set: - boolarr[i] = False - i += 1 - else: - for val in col.iter(): - if val in value_set: - boolarr[i] = False - i += 1 +cimport cython +from cython.parallel import parallel, prange, threadid +from openmp cimport * + +import numpy as np +from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, + npy_int64, npy_float64, uint64_t) + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy, memset +from libcpp.vector cimport vector + +from khash cimport * +import bcolz as bz +from bcolz.carray_ext cimport carray, chunks, chunk + +include "parallel_processing.pxi" + +# ---------------------------------------------------------------------------- +# GLOBAL DEFINITIONS +# ---------------------------------------------------------------------------- +SUM = 0 +DEF _SUM = 0 + +COUNT = 1 +DEF _COUNT = 1 + +COUNT_NA = 2 +DEF _COUNT_NA = 2 + +COUNT_DISTINCT = 3 +DEF _COUNT_DISTINCT = 3 + +SORTED_COUNT_DISTINCT = 4 +DEF _SORTED_COUNT_DISTINCT = 4 + + +# ---------------------------------------------------------------------------- +# FACTORIZE SECTION +# ---------------------------------------------------------------------------- +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_str_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + char * in_buffer_ptr, + uint64_t * out_buffer, + kh_str_t *table, + Py_ssize_t * count, + unsigned int thread_id, + omp_lock_t kh_locks[], + unsigned int num_threads, + ) nogil: + cdef: + Py_ssize_t i, idx + unsigned int j + khiter_t k + + char * element + char * insert + + int ret = 0 + Py_ssize_t itemsize = allocation_size - 1 + + # allocate enough memory to hold the string element, add one for the + # 
null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) + + # lock ensures that table is in consistend state during access + omp_set_lock(&kh_locks[thread_id]) + for i in xrange(iter_range): + # appends null character to element string + memcpy(element, in_buffer_ptr, itemsize) + in_buffer_ptr += itemsize + + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + omp_unset_lock(&kh_locks[thread_id]) + insert = malloc(allocation_size) + # TODO: is strcpy really the best way to copy a string? + strcpy(insert, element) + + # acquire locks for all threads to avoid inconsistent state + for j in xrange(num_threads): + omp_set_lock(&kh_locks[j]) + # check whether another thread has already added the entry by now + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + # if not, add element + else: + k = kh_put_str(table, insert, &ret) + table.vals[k] = idx = count[0] + count[0] += 1 + # release all locks + for j in xrange(num_threads): + omp_unset_lock(&kh_locks[j]) + # acquire our own lock again, to indicate we are reading the table + omp_set_lock(&kh_locks[thread_id]) + out_buffer[i] = idx + + omp_unset_lock(&kh_locks[thread_id]) + free(element) + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_str(carray carray_, unsigned int num_threads_requested = 0, **kwargs): + cdef: + Py_ssize_t i, blocklen, nblocks + khint_t j + carray labels + ndarray in_buffer + ndarray[npy_uint64] out_buffer + unsigned int thread_id + par_info_t par_info + + kh_str_t *table + char * out_buffer_org_ptr + char * in_buffer_ptr + uint64_t * out_buffer_ptr + + Py_ssize_t count = 0 + dict reverse = {} + + bint first_thread = True + Py_ssize_t element_allocation_size = carray_.dtype.itemsize + 1 + Py_ssize_t carray_chunklen = carray_.chunklen + + labels 
= create_labels_carray(carray_, **kwargs) + + out_buffer = np.empty(labels.chunklen, dtype='uint64') + out_buffer_org_ptr = out_buffer.data + + table = kh_init_str() + + block_iterator = bz.iterblocks(carray_, blen=labels.chunklen) + nblocks = (len(carray_)/labels.chunklen+0.5)+1 + + with nogil, parallel(num_threads=num_threads_requested): + # Initialise some parallel processing stuff + omp_start_critical() + if first_thread == True: + (&first_thread)[0] = False + with gil: + par_initialise(&par_info, carray_) + omp_end_critical() + + # allocate thread-local in- and out-buffers + with gil: + in_buffer_ptr = par_allocate_buffer(labels.chunklen, carray_.dtype.itemsize) + out_buffer_ptr = par_allocate_buffer(labels.chunklen, labels.dtype.itemsize) + + # factorise the chunks in parallel + for i in prange(0, nblocks, schedule='dynamic', chunksize=1): + thread_id = threadid() + + omp_set_lock(&par_info.chunk_lock) + with gil: + in_buffer = np.ascontiguousarray(block_iterator.next()) + blocklen = len(in_buffer) + memcpy(in_buffer_ptr, in_buffer.data, in_buffer.nbytes) + omp_unset_lock(&par_info.chunk_lock) + + _factorize_str_helper(blocklen, + element_allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + thread_id, + par_info.kh_locks, + par_info.num_threads, + ) + + # compress out_buffer into labels + omp_set_lock(&par_info.out_buffer_lock) + with gil: + out_buffer.data = out_buffer_ptr + par_save_block(i, labels, out_buffer, blocklen, nblocks) + omp_unset_lock(&par_info.out_buffer_lock) + + # Clean-up thread local variables + free(in_buffer_ptr) + free(out_buffer_ptr) + + # Clean-up some parallel processing stuff + par_terminate(par_info) + + # restore out_buffer data pointer to allow python to free the object's data + out_buffer.data = out_buffer_org_ptr + + # construct python dict from vectors and free element memory + for j in range(table.n_buckets): + if not kh_exist_str(table, j): + continue + reverse[table.vals[j]] = table.keys[j] + 
free(table.keys[j]) + kh_destroy_str(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int64(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int64] in_buffer + ndarray[npy_uint64] out_buffer + kh_int64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int64(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int32_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int32] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int32_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int32 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int32(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int32(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int32(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int32] in_buffer + ndarray[npy_uint64] out_buffer + kh_int32_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int32() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int32_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int32(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_float64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_float64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_float64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_float64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_float64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_float64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_float64(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_float64] in_buffer + ndarray[npy_uint64] out_buffer + kh_float64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_float64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_float64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_float64(table) + + return labels, reverse + +def factorize(carray carray_, **kwargs): + if carray_.dtype == 'int32': + labels, reverse = factorize_int32(carray_, **kwargs) + elif carray_.dtype == 'int64': + labels, reverse = factorize_int64(carray_, **kwargs) + elif carray_.dtype == 'float64': + labels, reverse = factorize_float64(carray_, **kwargs) + else: + #TODO: check that the input is a string_ dtype type + labels, reverse = factorize_str(carray_, **kwargs) + return labels, reverse + +# --------------------------------------------------------------------------- +# Translate existing arrays +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): + cdef: + Py_ssize_t chunklen, leftover_elements, len_in_buffer + ndarray[npy_int64] in_buffer + ndarray[npy_int64] out_buffer + + chunklen = input_.chunklen + out_buffer = np.empty(chunklen, dtype='int64') + + for in_buffer in bz.iterblocks(input_): + len_in_buffer = len(in_buffer) + for i in range(len_in_buffer): + element = in_buffer[i] + out_buffer[i] = lookup.get(element, default) + # compress out_buffer into labels + output_.append(out_buffer[:len_in_buffer].astype(np.int64)) + +# --------------------------------------------------------------------------- +# Aggregation Section (old) +@cython.boundscheck(False) +@cython.wraparound(False) +def agg_sum_na(iter_): + cdef: + npy_float64 v, v_cum = 0.0 + + for v in iter_: + if v == v: # skip NA values + v_cum += v + + return v_cum + +@cython.boundscheck(False) +@cython.wraparound(False) 
def agg_sum(iter_):
    """Sum an iterable of float64 (NaN values propagate)."""
    cdef:
        npy_float64 v, v_cum = 0.0

    for v in iter_:
        v_cum += v

    return v_cum

# ---------------------------------------------------------------------------
# Aggregation Section
@cython.boundscheck(False)
@cython.wraparound(False)
def groupsort_indexer(carray index, Py_ssize_t ngroups):
    """Return (indexer, counts) that group-sort `index`.

    counts[g+1] is the size of group g (slot 0 is reserved for NA);
    indexer lists the row positions so that rows of each group are
    contiguous and in original order.
    """
    cdef:
        Py_ssize_t i, label, n, len_in_buffer
        ndarray[int64_t] counts, where, np_result
        ndarray[int64_t] in_buffer

    n = len(index)

    # count group sizes, location 0 for NA
    counts = np.zeros(ngroups + 1, dtype=np.int64)
    for in_buffer in bz.iterblocks(index):
        len_in_buffer = len(in_buffer)
        for i in range(len_in_buffer):
            # FIX: was `counts[index[i] + 1]`: `i` is block-local, so indexing
            # the whole carray re-counted its first elements for every block
            counts[in_buffer[i] + 1] += 1

    # mark the start of each contiguous group of like-indexed data
    where = np.zeros(ngroups + 1, dtype=np.int64)
    for i in range(1, ngroups + 1):
        where[i] = where[i - 1] + counts[i - 1]

    # this is our indexer
    # NOTE(review): index[i] is a per-row carray access (slow); consider
    # iterating blocks here too
    np_result = np.zeros(n, dtype=np.int64)
    for i in range(n):
        label = index[i] + 1
        np_result[where[label]] = i
        where[label] += 1

    return np_result, counts

cdef count_unique_float64(ndarray[float64_t] values):
    """Count distinct float64 values; all NaNs together count as one."""
    cdef:
        Py_ssize_t i, n = len(values)
        int ret = 0
        float64_t val
        khiter_t k
        npy_uint64 count = 0
        bint seen_na = 0
        kh_float64_t *table

    table = kh_init_float64()

    for i in range(n):
        val = values[i]

        if val == val:
            k = kh_get_float64(table, val)
            if k == table.n_buckets:
                k = kh_put_float64(table, val, &ret)
                count += 1
        elif not seen_na:
            seen_na = 1
            count += 1

    kh_destroy_float64(table)

    return count

cdef count_unique_int64(ndarray[int64_t] values):
    """Count distinct int64 values (integers have no NA representation)."""
    cdef:
        Py_ssize_t i, n = len(values)
        int ret = 0
        int64_t val
        khiter_t k
        npy_uint64 count = 0
        kh_int64_t *table

    table = kh_init_int64()

    for i in range(n):
        val = values[i]

        # val == val is always true for integers; kept for symmetry with the
        # float64 variant
        if val == val:
            k = kh_get_int64(table, val)
            if k == table.n_buckets:
                k = kh_put_int64(table, val, &ret)
                count += 1

    kh_destroy_int64(table)

    return count

cdef count_unique_int32(ndarray[int32_t] values):
    """Count distinct int32 values (integers have no NA representation)."""
    cdef:
        Py_ssize_t i, n = len(values)
        int ret = 0
        int32_t val
        khiter_t k
        npy_uint64 count = 0
        kh_int32_t *table

    table = kh_init_int32()

    for i in range(n):
        val = values[i]

        # val == val is always true for integers; kept for symmetry with the
        # float64 variant
        if val == val:
            k = kh_get_int32(table, val)
            if k == table.n_buckets:
                k = kh_put_int32(table, val, &ret)
                count += 1

    kh_destroy_int32(table)

    return count

@cython.wraparound(False)
@cython.boundscheck(False)
cdef sum_float64(carray ca_input, carray ca_factor,
                 Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM):
    """Aggregate a float64 column per group given the factor codes.

    ca_factor holds the group index of each row; skip_key marks rows to
    ignore.  Returns a per-group array (or a carray of distinct counts for
    COUNT_DISTINCT).
    """
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i, j, end_counts, start_counts

        ndarray[npy_float64] in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray[npy_float64] out_buffer
        ndarray[npy_float64] last_values

        npy_float64 v
        bint count_distinct_started = 0
        carray num_uniques

    if agg_method == _COUNT_DISTINCT:
        num_uniques = carray([], dtype='int64')
        positions, counts = groupsort_indexer(ca_factor, nr_groups)
        start_counts = 0
        end_counts = 0
        for j in range(len(counts) - 1):
            start_counts = end_counts
            end_counts = start_counts + counts[j + 1]
            num_uniques.append(
                count_unique_float64(ca_input[positions[start_counts:end_counts]])
            )

        return num_uniques

    iter_ca_factor = bz.iterblocks(ca_factor)
    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype='float64')

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        # loop through rows
        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                if agg_method == _SUM:
                    out_buffer[current_index] += in_buffer[i]
                elif agg_method == _COUNT:
                    out_buffer[current_index] += 1
                elif agg_method == _COUNT_NA:
                    v = in_buffer[i]
                    if v == v:  # skip NA values
                        out_buffer[current_index] += 1
                elif agg_method == _SORTED_COUNT_DISTINCT:
                    v = in_buffer[i]
                    if not count_distinct_started:
                        count_distinct_started = 1
                        last_values = np.zeros(nr_groups, dtype='float64')
                        last_values[0] = v
                        out_buffer[0] = 1
                    else:
                        if v != last_values[current_index]:
                            out_buffer[current_index] += 1
                    last_values[current_index] = v
                else:
                    raise NotImplementedError('sumtype not supported')

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved because callers may rely
    # on the returned length being nr_groups — confirm intent before fixing.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

@cython.wraparound(False)
@cython.boundscheck(False)
cdef sum_int32(carray ca_input, carray ca_factor,
               Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM):
    """Aggregate an int32 column per group; see sum_float64 for semantics."""
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i, j, end_counts, start_counts

        ndarray[npy_int32] in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray[npy_int32] out_buffer
        ndarray[npy_int32] last_values

        npy_int32 v
        bint count_distinct_started = 0
        carray num_uniques

    if agg_method == _COUNT_DISTINCT:
        num_uniques = carray([], dtype='int64')
        positions, counts = groupsort_indexer(ca_factor, nr_groups)
        start_counts = 0
        end_counts = 0
        for j in range(len(counts) - 1):
            start_counts = end_counts
            end_counts = start_counts + counts[j + 1]
            num_uniques.append(
                count_unique_int32(ca_input[positions[start_counts:end_counts]])
            )

        return num_uniques

    iter_ca_factor = bz.iterblocks(ca_factor)
    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype='int32')

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        # loop through rows
        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                if agg_method == _SUM:
                    out_buffer[current_index] += in_buffer[i]
                elif agg_method == _COUNT:
                    out_buffer[current_index] += 1
                elif agg_method == _COUNT_NA:
                    # TODO: Warning: int does not support NA values, is this
                    # what we need?
                    out_buffer[current_index] += 1
                elif agg_method == _SORTED_COUNT_DISTINCT:
                    v = in_buffer[i]
                    if not count_distinct_started:
                        count_distinct_started = 1
                        last_values = np.zeros(nr_groups, dtype='int32')
                        last_values[0] = v
                        out_buffer[0] = 1
                    else:
                        if v != last_values[current_index]:
                            out_buffer[current_index] += 1
                    last_values[current_index] = v
                else:
                    raise NotImplementedError('sumtype not supported')

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved because callers may rely
    # on the returned length being nr_groups — confirm intent before fixing.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

@cython.wraparound(False)
@cython.boundscheck(False)
cdef sum_int64(carray ca_input, carray ca_factor,
               Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM):
    """Aggregate an int64 column per group; see sum_float64 for semantics."""
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i, j, end_counts, start_counts

        ndarray[npy_int64] in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray[npy_int64] out_buffer
        ndarray[npy_int64] last_values

        npy_int64 v
        bint count_distinct_started = 0
        carray num_uniques

    if agg_method == _COUNT_DISTINCT:
        num_uniques = carray([], dtype='int64')
        positions, counts = groupsort_indexer(ca_factor, nr_groups)
        start_counts = 0
        end_counts = 0
        for j in range(len(counts) - 1):
            start_counts = end_counts
            end_counts = start_counts + counts[j + 1]
            num_uniques.append(
                count_unique_int64(ca_input[positions[start_counts:end_counts]])
            )

        return num_uniques

    iter_ca_factor = bz.iterblocks(ca_factor)
    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype='int64')

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        # loop through rows
        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                if agg_method == _SUM:
                    out_buffer[current_index] += in_buffer[i]
                elif agg_method == _COUNT:
                    out_buffer[current_index] += 1
                elif agg_method == _COUNT_NA:
                    # TODO: Warning: int does not support NA values, is this
                    # what we need?
                    out_buffer[current_index] += 1
                elif agg_method == _SORTED_COUNT_DISTINCT:
                    v = in_buffer[i]
                    if not count_distinct_started:
                        count_distinct_started = 1
                        last_values = np.zeros(nr_groups, dtype='int64')
                        last_values[0] = v
                        out_buffer[0] = 1
                    else:
                        if v != last_values[current_index]:
                            out_buffer[current_index] += 1
                    last_values[current_index] = v
                else:
                    raise NotImplementedError('sumtype not supported')

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved because callers may rely
    # on the returned length being nr_groups — confirm intent before fixing.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

@cython.wraparound(False)
@cython.boundscheck(False)
cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key):
    """Return one representative value per group (the last seen per group)."""
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i

        ndarray in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray out_buffer

    iter_ca_factor = bz.iterblocks(ca_factor)

    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype)

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                out_buffer[current_index] = in_buffer[i]

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved as-is — confirm intent.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

def aggregate_groups_by_iter_2(ct_input,
                               ct_agg,
                               npy_uint64 nr_groups,
                               npy_uint64 skip_key,
                               carray factor_carray,
                               groupby_cols,
                               output_agg_ops,
                               dtype_list,
                               agg_method=_SUM
                               ):
    """Aggregate ct_input into ct_agg: group-key columns via groupby_value,
    value columns via the dtype-matched sum_* kernel."""
    total = []

    for col in groupby_cols:
        total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key))

    for col, agg_op in output_agg_ops:
        # TODO: input vs output column
        col_dtype = ct_agg[col].dtype
        if col_dtype == np.float64:
            total.append(
                sum_float64(ct_input[col], factor_carray, nr_groups, skip_key,
                            agg_method=agg_method)
            )
        elif col_dtype == np.int64:
            total.append(
                sum_int64(ct_input[col], factor_carray, nr_groups, skip_key,
                          agg_method=agg_method)
            )
        elif col_dtype == np.int32:
            total.append(
                sum_int32(ct_input[col], factor_carray, nr_groups, skip_key,
                          agg_method=agg_method)
            )
        else:
            raise NotImplementedError(
                'Column dtype ({0}) not supported for aggregation yet '
                '(only int32, int64 & float64)'.format(str(col_dtype)))

    ct_agg.append(total)

# ---------------------------------------------------------------------------
# Temporary Section
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse):
    """
    TEMPORARY WORKAROUND till numexpr supports "in"-list operations.

    Update a boolean array in place: clear entries whose column value is not
    in value_set (reverse=False) or is in value_set (reverse=True, i.e.
    "not in" semantics).

    For the 0d array work around, see https://github.com/Blosc/bcolz/issues/61

    :param col: carray column to test
    :param value_set: set of accepted (or rejected, if reverse) values
    :param boolarr: boolean ndarray, modified in place
    :param reverse: True for "not in" semantics
    :return:
    """
    cdef Py_ssize_t i
    i = 0
    if not reverse:
        for val in col.iter():
            if val not in value_set:
                boolarr[i] = False
            i += 1
    else:
        for val in col.iter():
            if val in value_set:
                boolarr[i] = False
            i += 1