From 7980b1afff33d040b5cc04b17393e056a91205a3 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 15:59:32 +0100 Subject: [PATCH 1/8] Parallelised string factorisation Shared variables with manual locking: - hash table - count - reverse_keys - reverse_values - out_buffer - chunk_ Shared variables without locking requirement: - locks Thread-local variables: - thread_id - in_buffer_ptr (points to thread-local buffer) - out_buffer_ptr (points to thread-local buffer) Locking scheme: - For each thread a lock on the hash table (and other associated shared variables) exists. - Each thread processing a chunk begins by acquiring its own lock on the shared hash table. - The lock is released when the thread encounters an value that is new to the hash table. - Once the thread is ready to write to the hash table, it waits to acquire the locks from all threads. - After the write all locks are released. --- Uncompressed bcolz timings: ``` --- uncached unique() --- pandas (in-memory): In [10]: %timeit -r 10 c.unique() 1 loops, best of 10: 881 ms per loop bquery master over bcolz (persistent): In [12]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 2.1 s per loop ==> x2.38 slower than pandas pull request over bcolz (persistent): In [8]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 834 ms per loop ==> x1.05 FASTER than pandas ---- cache_factor --- bquery master over bcolz (persistent): In [3]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 2.51 s per loop pull request with 2 threads over bcolz (persistent): In [3]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 1.16 s per loop ==> x2.16 faster than master pull request with 1 thread over bcolz (persistent): In [3]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 1.69 s per loop ==> x1.48 faster than master (c.f. 
x1.48 from single-threaded PR #21) ==> parallel code seems to have no performance penalty on single-core machines ``` Compressed bcolz timings: ``` --- uncached unique() --- pandas (in-memory): In [10]: %timeit -r 10 c.unique() 1 loops, best of 10: 881 ms per loop bquery master over bcolz (persistent): In [12]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 3.39 s per loop ==> x3.85 slower than pandas pull request over bcolz (persistent): In [8]: %timeit -r 10 a.unique('mycol') 1 loops, best of 10: 1.9 s per loop ==> x2.16 slower than pandas ---- cache_factor --- bquery master over bcolz (persistent): In [5]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 4.09 s per loop pull request with 2 threads over bcolz (persistent): In [5]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 2.48 s per loop ==> x1.65 faster than master pull request with 1 thread over bcolz (persistent): In [5]: %timeit -r 10 a.cache_factor(['mycol'], refresh=True) 1 loops, best of 10: 3.26 s per loop ==> x1.25 faster than master (c.f. 
x1.28 from single-threaded PR #21) ``` --- .gitignore | 3 + bquery/ctable_ext.pyx | 225 ++++++++++++++++++++++++++++++++++-------- bquery/khash.pxd | 4 +- requirements.txt | 2 +- setup.py | 7 +- 5 files changed, 195 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 7b12180..5c1395e 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,10 @@ docs/_build/ target/ # other +*.pyx.orig +cython_debug/ bquery/ctable_ext.c +bquery/ctable_ext.cpp bcolz bquery/benchmarks/vb_suite/benchmarks.db bquery/benchmarks/vb_suite/source \ No newline at end of file diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 1e4becb..e0de1e1 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,8 +1,14 @@ -import numpy as np import cython -from numpy cimport ndarray, dtype, npy_intp, npy_int32, npy_uint64, npy_int64, npy_float64 -from libc.stdlib cimport malloc -from libc.string cimport strcpy +from cython.parallel import parallel, prange, threadid +from openmp cimport omp_lock_t, omp_init_lock, omp_set_lock, omp_unset_lock, omp_destroy_lock, omp_get_num_threads + +import numpy as np +from numpy cimport ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy, memset +from libcpp.vector cimport vector from khash cimport * from bcolz.carray_ext cimport carray, chunk @@ -31,94 +37,229 @@ DEF _SORTED_COUNT_DISTINCT = 4 @cython.boundscheck(False) cdef void _factorize_str_helper(Py_ssize_t iter_range, Py_ssize_t allocation_size, - ndarray in_buffer, - ndarray[npy_uint64] out_buffer, + char * in_buffer_ptr, + uint64_t * out_buffer, kh_str_t *table, Py_ssize_t * count, - dict reverse, - ): + vector[char *] & reverse_values, + unsigned int thread_id, + omp_lock_t kh_locks[], + unsigned int num_threads, + ) nogil: cdef: - Py_ssize_t i, idx + Py_ssize_t i, idx, itemsize int ret + unsigned int j 
char * element char * insert khiter_t k ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] + itemsize = allocation_size - 1 + # allocate enough memory to hold the string element, add one for the + # null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) + + omp_set_lock(&kh_locks[thread_id]) + for i in xrange(iter_range): + # strings are stored without null termination in ndarrays: need a + # buffer to append null termination to use usual string algorithms + memcpy(element, in_buffer_ptr, itemsize) + in_buffer_ptr += itemsize + + # while reading from shared hash table, set lock for this thread k = kh_get_str(table, element) if k != table.n_buckets: idx = table.vals[k] else: + omp_unset_lock(&kh_locks[thread_id]) # allocate enough memory to hold the string, add one for the # null byte that marks the end of the string. insert = malloc(allocation_size) # TODO: is strcpy really the best way to copy a string? 
strcpy(insert, element) + # acquire locks for all threads before writing to shared resources + for j in xrange(num_threads): + omp_set_lock(&kh_locks[j]) k = kh_put_str(table, insert, &ret) table.vals[k] = idx = count[0] - reverse[count[0]] = element + reverse_values.push_back(insert) count[0] += 1 + # release all locks + for j in xrange(num_threads): + omp_unset_lock(&kh_locks[j]) + omp_set_lock(&kh_locks[thread_id]) out_buffer[i] = idx + omp_unset_lock(&kh_locks[thread_id]) + free(element) + @cython.wraparound(False) @cython.boundscheck(False) def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements + Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \ + nchunks + unsigned int j, num_threads, thread_id, test + omp_lock_t * kh_locks + omp_lock_t chunk_lock, out_buffer_lock + vector[char *] reverse_values dict reverse - ndarray in_buffer + char * in_buffer_ptr ndarray[npy_uint64] out_buffer + char * out_buffer_org_ptr + uint64_t * out_buffer_ptr kh_str_t *table + uintptr_t leftover_array_ptr + count = 0 ret = 0 reverse = {} + nchunks = carray_.nchunks + # 0 = set the number of parallel threads automatically: + num_threads = 0 n = len(carray_) chunklen = carray_.chunklen + allocation_size = carray_.dtype.itemsize + 1 if labels is None: labels = carray([], dtype='int64', expectedlen=n) - # in-buffer isn't typed, because cython doesn't support string arrays (?) 
- out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype=carray_.dtype) + + leftover_array_ptr = carray_.leftover_array.__array_interface__['data'][0] + table = kh_init_str() - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_str_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) + # Allocate and initialise shared locks for hash-table writes for each child + # thread. (Shared because locks is assigned outside the parallel() block) + # num_threads=0: cython chooses no of parallel threads (= cpu cores?) + with nogil, parallel(num_threads=num_threads): + # determine the number of parallel threads that will be created + # (&var)[0]: workaround to keep num_threads shared and available after + # with block + (&num_threads)[0] = omp_get_num_threads() + kh_locks = malloc(num_threads * sizeof(omp_lock_t)) + for j in xrange(num_threads): + omp_init_lock(&kh_locks[j]) + + # initialise locks for other shared resources + omp_init_lock(&chunk_lock) + omp_init_lock(&out_buffer_lock) + + # create an out_buffer ndarray object to interface with chunk.append() + # TODO: Avoid np.empty as it unnecessarily allocates space for the content. + # Use PyArray_SimpleNewFromData instead!? + # Ref: http://blog.enthought.com/python/numpy-arrays-with-pre-allocated-memory + # Ref: https://gist.github.com/GaelVaroquaux/1249305 + out_buffer = np.empty(chunklen, dtype='uint64') + # we are going to redirect the data pointer, keep note of the original + # to be able to restore it. Otherwise python will try to free an invalid + # pointer when it garbage collects the out_buffer object + out_buffer_org_ptr = out_buffer.data + + # code in the parallel(...) 
block runs "isolated" (to some extent) in + # multiple child threads + # assignments within this block make variables thread-local, + # e.g. in_buffer_ptr. BUT this does not seem to work with 'cdef class' + # defined objects, e.g. numpy.ndarray, bcolz.chunk, etc., and dereferencing. + # The latter is a useful work-around to modifying shared variables from + # within parallel the block if locking is handled manually. + with nogil, parallel(num_threads=num_threads): + # allocate an in-buffer and an out-buffer for each thread + # note 1: cannot use ndarrays like: in_buffer = np.empty(...) + # In spite of the assignment cython does not create a + # thread local ndarray as it should! (Also see note 2 below.) + in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) + out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) + + # This is the work-sharing loop. The body is executed in multiple + # threads with each thread receiving a different value for i. + for i in prange(nchunks): + thread_id = threadid() + # chunk._getitem is not thread-safe. This may be due to it + # setting properties of chunk using self. - Or not. + # TODO: check whether _getitem can be made thread-safe to + # get rid of this probably unnecessary thread lock + omp_set_lock(&chunk_lock) + with gil: + chunk_ = carray_.chunks[i] + # decompress into in_buffer + # note: _getitem releases gil during blosc decompression + chunk_._getitem(0, chunklen, in_buffer_ptr) + omp_unset_lock(&chunk_lock) + + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + reverse_values, + thread_id, + kh_locks, + num_threads, + ) + + # note 2: labels.append(...) requires ndarray, but no thread-local + # ndarrays can be created. 
Use the shared out_buffer + # ndarray object and redirect its data pointer to the thread-local + # c array out_buffer_ptr + omp_set_lock(&out_buffer_lock) + with gil: + out_buffer.data = out_buffer_ptr + # compress out_buffer into labels + # note: append(...) can freeze with bcolz=0.8.0 + labels.append(out_buffer.astype(np.int64)) + omp_unset_lock(&out_buffer_lock) + + # Clean-up thread local variables + free(in_buffer_ptr) + free(out_buffer_ptr) + + # processing of leftover_array is not parallelised in view of an upcoming + # changes to bcolz chunk access which will allow seamlessly folding + # this into the prange block leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) if leftover_elements > 0: + out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) _factorize_str_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) + allocation_size, + leftover_array_ptr, + out_buffer_ptr, + table, + &count, + reverse_values, + 0, + kh_locks, + num_threads, + ) + out_buffer.data = out_buffer_ptr + # compress out_buffer into labels + labels.append(out_buffer[:leftover_elements].astype(np.int64)) - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) + # destroy and free all locks + for j in xrange(num_threads): + omp_destroy_lock(&kh_locks[j]) + free(kh_locks) + omp_destroy_lock(&chunk_lock) + omp_destroy_lock(&out_buffer_lock) + + # restore out_buffer data pointer to avoid segfault when python thries + # to free the object's data + out_buffer.data = out_buffer_org_ptr kh_destroy_str(table) + # construct python dict from vectors and + # free the memory allocated for the strings in the reverse_values list + for i in xrange(reverse_values.size()): + reverse[i] = reverse_values[i] + free(reverse_values[i]) + return labels, reverse @cython.wraparound(False) diff --git a/bquery/khash.pxd b/bquery/khash.pxd index 
a8fd51a..5e855c7 100644 --- a/bquery/khash.pxd +++ b/bquery/khash.pxd @@ -48,9 +48,9 @@ cdef extern from "khash_python.h": inline kh_str_t* kh_init_str() inline void kh_destroy_str(kh_str_t*) inline void kh_clear_str(kh_str_t*) - inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) + inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil inline void kh_resize_str(kh_str_t*, khint_t) - inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) + inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil inline void kh_del_str(kh_str_t*, khint_t) bint kh_exist_str(kh_str_t*, khiter_t) diff --git a/requirements.txt b/requirements.txt index 86f0f4f..c85e29c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -bcolz>=0.8.0 +bcolz>=0.8.1 diff --git a/setup.py b/setup.py index d311871..fbd0ac1 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,10 @@ def check_import(pkgname, pkgver): # Allow setting the Blosc dir if installed in the system BLOSC_DIR = os.environ.get('BLOSC_DIR', '') +# OpenMP libraries +CFLAGS.append('-fopenmp') +LFLAGS.append('-fopenmp') + # Sources & libraries inc_dirs = ['bquery'] lib_dirs = [] @@ -137,7 +141,8 @@ def check_import(pkgname, pkgver): library_dirs=lib_dirs, libraries=libs, extra_link_args=LFLAGS, - extra_compile_args=CFLAGS), + extra_compile_args=CFLAGS, + language='c++'), ], packages=['bquery', 'bquery.tests'], ) From 3ff7cde04f8a84cc3b5736990dfe0f1cc1b4b82a Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 18:25:31 +0100 Subject: [PATCH 2/8] temporary fix for out-of-order chunk results timings are similar --- bquery/ctable_ext.pyx | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index e0de1e1..0359302 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -110,6 +110,7 @@ def factorize_str(carray carray_, carray labels=None): dict reverse char * in_buffer_ptr ndarray[npy_uint64] out_buffer + ndarray[npy_uint64] tmp_labels char * 
out_buffer_org_ptr uint64_t * out_buffer_ptr kh_str_t *table @@ -161,6 +162,10 @@ def factorize_str(carray carray_, carray labels=None): # pointer when it garbage collects the out_buffer object out_buffer_org_ptr = out_buffer.data + leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) + # full size out-buffer (temporary fix for out of order chunk results) + tmp_labels = np.empty(chunklen*nchunks+leftover_elements, dtype='uint64') + # code in the parallel(...) block runs "isolated" (to some extent) in # multiple child threads # assignments within this block make variables thread-local, @@ -213,7 +218,15 @@ def factorize_str(carray carray_, carray labels=None): out_buffer.data = out_buffer_ptr # compress out_buffer into labels # note: append(...) can freeze with bcolz=0.8.0 - labels.append(out_buffer.astype(np.int64)) + #labels.append(out_buffer.astype(np.int64)) + + # chunk results arrive out of order! + # Solutions: Either keep full-size shared out_buffer in memory + # or find a way to write carray chunks out of order. + # Latter would be preferable. 
+ # For now keeping thread-local out-buffers + full-size out-buffer + # in memory + tmp_labels[i*chunklen:(i+1)*chunklen] = out_buffer.astype(np.int64) omp_unset_lock(&out_buffer_lock) # Clean-up thread local variables @@ -223,7 +236,6 @@ def factorize_str(carray carray_, carray labels=None): # processing of leftover_array is not parallelised in view of an upcoming # changes to bcolz chunk access which will allow seamlessly folding # this into the prange block - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) if leftover_elements > 0: out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) _factorize_str_helper(leftover_elements, @@ -239,7 +251,8 @@ def factorize_str(carray carray_, carray labels=None): ) out_buffer.data = out_buffer_ptr # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) + #labels.append(out_buffer[:leftover_elements].astype(np.int64)) + tmp_labels[nchunks*chunklen:nchunks*chunklen+leftover_elements+1] = out_buffer[:leftover_elements].astype(np.int64) # destroy and free all locks for j in xrange(num_threads): @@ -254,6 +267,9 @@ def factorize_str(carray carray_, carray labels=None): kh_destroy_str(table) + # compress tmp_labels into carray + labels.append(tmp_labels) + # construct python dict from vectors and # free the memory allocated for the strings in the reverse_values list for i in xrange(reverse_values.size()): From 1a6b2477da5edeee584e87a57c342721ceb0c65d Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 6 Mar 2015 23:50:09 +0100 Subject: [PATCH 3/8] write labels carray chunks out-of-order exhibits performance issues with unique(), presumably somehow linked to the in-memory labels carray --- bquery/ctable.py | 13 ++++ bquery/ctable_ext.pyx | 165 ++++++++++++++++++++++++++++-------------- 2 files changed, 125 insertions(+), 53 deletions(-) diff --git a/bquery/ctable.py b/bquery/ctable.py index 804edf5..ee2df11 100644 --- a/bquery/ctable.py +++ 
b/bquery/ctable.py @@ -62,6 +62,19 @@ def cache_factor(self, col_list, refresh=False): carray_factor = \ bcolz.carray([], dtype='int64', expectedlen=self.size, rootdir=col_factor_rootdir, mode='w') + # ensure carray_factor.chunklen is multiple of column chunklen + if carray_factor.chunklen % self[col].chunklen != 0: + chunklen_multiplier = round(carray_factor.chunklen / + self[col].chunklen) + if chunklen_multiplier < 1.0: + chunklen_multiplier = 1 + chunklen_multiplier = int(round(chunklen_multiplier)) + carray_factor = \ + bcolz.carray([], dtype='int64', + chunklen=chunklen_multiplier * \ + self[col].chunklen, + rootdir=col_factor_rootdir, mode='w') + _, values = \ ctable_ext.factorize(self[col], labels=carray_factor) carray_factor.flush() diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 0359302..404a945 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -10,7 +10,7 @@ from libc.stdlib cimport malloc, calloc, free from libc.string cimport strcpy, memcpy, memset from libcpp.vector cimport vector from khash cimport * -from bcolz.carray_ext cimport carray, chunk +from bcolz.carray_ext cimport carray, chunks, chunk # ---------------------------------------------------------------------------- # GLOBAL DEFINITIONS @@ -101,16 +101,17 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements, allocation_size, \ - nchunks - unsigned int j, num_threads, thread_id, test + chunks labels_chunks + Py_ssize_t n, i, j, count, chunklen, leftover_elements, \ + allocation_size, nchunks, chunklen_multiplier, \ + tmp_chunks_counter + unsigned int num_threads, thread_id omp_lock_t * kh_locks omp_lock_t chunk_lock, out_buffer_lock vector[char *] reverse_values dict reverse char * in_buffer_ptr ndarray[npy_uint64] out_buffer - ndarray[npy_uint64] tmp_labels char * out_buffer_org_ptr uint64_t * out_buffer_ptr kh_str_t *table @@ 
-120,15 +121,34 @@ def factorize_str(carray carray_, carray labels=None): count = 0 ret = 0 reverse = {} + tmp_chunks_list = [] + tmp_chunks_counter = 0 nchunks = carray_.nchunks - # 0 = set the number of parallel threads automatically: + # num_threads = 0: set the number of parallel threads automatically: num_threads = 0 n = len(carray_) chunklen = carray_.chunklen allocation_size = carray_.dtype.itemsize + 1 + if labels is None: labels = carray([], dtype='int64', expectedlen=n) + tmp_multiplier = round(labels.chunklen / carray_.chunklen) + if tmp_multiplier < 1: + tmp_multiplier = 1 + tmp_multiplier = int(round(tmp_multiplier)) + labels = carray([], dtype='int64', + chunklen=tmp_multiplier * carray_.chunklen) + if labels.chunklen % chunklen != 0: + raise NotImplementedError('labels.chunklen must be a multiple of ' + + 'carray_.chunklen') + chunklen_multiplier = labels.chunklen / carray_.chunklen + # provide access to cython chunks api for persistent labels + # fails with in-memory carray + if labels._rootdir is not None: + labels_chunks = labels.chunks + + tmp_chunks_keys = [0] * int(carray_.size / labels.chunklen) leftover_array_ptr = carray_.leftover_array.__array_interface__['data'][0] @@ -143,6 +163,7 @@ def factorize_str(carray carray_, carray labels=None): # (&var)[0]: workaround to keep num_threads shared and available after # with block (&num_threads)[0] = omp_get_num_threads() + num_threads *= 2 kh_locks = malloc(num_threads * sizeof(omp_lock_t)) for j in xrange(num_threads): omp_init_lock(&kh_locks[j]) @@ -156,16 +177,12 @@ def factorize_str(carray carray_, carray labels=None): # Use PyArray_SimpleNewFromData instead!? # Ref: http://blog.enthought.com/python/numpy-arrays-with-pre-allocated-memory # Ref: https://gist.github.com/GaelVaroquaux/1249305 - out_buffer = np.empty(chunklen, dtype='uint64') + out_buffer = np.empty(labels.chunklen, dtype='uint64') # we are going to redirect the data pointer, keep note of the original # to be able to restore it. 
Otherwise python will try to free an invalid # pointer when it garbage collects the out_buffer object out_buffer_org_ptr = out_buffer.data - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - # full size out-buffer (temporary fix for out of order chunk results) - tmp_labels = np.empty(chunklen*nchunks+leftover_elements, dtype='uint64') - # code in the parallel(...) block runs "isolated" (to some extent) in # multiple child threads # assignments within this block make variables thread-local, @@ -178,36 +195,41 @@ def factorize_str(carray carray_, carray labels=None): # note 1: cannot use ndarrays like: in_buffer = np.empty(...) # In spite of the assignment cython does not create a # thread local ndarray as it should! (Also see note 2 below.) - in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) - out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) + in_buffer_ptr = malloc(chunklen * (allocation_size-1) * + sizeof(char)) + out_buffer_ptr = malloc(chunklen_multiplier * chunklen * + sizeof(uint64_t)) # This is the work-sharing loop. The body is executed in multiple # threads with each thread receiving a different value for i. - for i in prange(nchunks): + for i in prange(0, nchunks, chunklen_multiplier): thread_id = threadid() - # chunk._getitem is not thread-safe. This may be due to it - # setting properties of chunk using self. - Or not. 
- # TODO: check whether _getitem can be made thread-safe to - # get rid of this probably unnecessary thread lock - omp_set_lock(&chunk_lock) - with gil: - chunk_ = carray_.chunks[i] - # decompress into in_buffer - # note: _getitem releases gil during blosc decompression - chunk_._getitem(0, chunklen, in_buffer_ptr) - omp_unset_lock(&chunk_lock) - - _factorize_str_helper(chunklen, - allocation_size, - in_buffer_ptr, - out_buffer_ptr, - table, - &count, - reverse_values, - thread_id, - kh_locks, - num_threads, - ) + # pack the results from multiple carray_-chunks into one + # labels-chunk: allows out-of-order writing of labels at good speed + for j in xrange(chunklen_multiplier): + # chunk._getitem is not thread-safe. This may be due to it + # setting properties of chunk using self. - Or not. + # TODO: check whether _getitem can be made thread-safe to + # get rid of this probably unnecessary thread lock + omp_set_lock(&chunk_lock) + with gil: + chunk_ = carray_.chunks[i+j] + # decompress into in_buffer + # note: _getitem releases gil during blosc decompression + chunk_._getitem(0, chunklen, in_buffer_ptr) + omp_unset_lock(&chunk_lock) + + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_ptr, + out_buffer_ptr + j * chunklen, + table, + &count, + reverse_values, + thread_id, + kh_locks, + num_threads, + ) # note 2: labels.append(...) requires ndarray, but no thread-local # ndarrays can be created. Use the shared out_buffer @@ -217,27 +239,64 @@ def factorize_str(carray carray_, carray labels=None): with gil: out_buffer.data = out_buffer_ptr # compress out_buffer into labels - # note: append(...) can freeze with bcolz=0.8.0 - #labels.append(out_buffer.astype(np.int64)) - - # chunk results arrive out of order! - # Solutions: Either keep full-size shared out_buffer in memory - # or find a way to write carray chunks out of order. - # Latter would be preferable. 
- # For now keeping thread-local out-buffers + full-size out-buffer - # in memory - tmp_labels[i*chunklen:(i+1)*chunklen] = out_buffer.astype(np.int64) + # _save() and append() freeze at versions prior to v0.8.1 + if labels._rootdir is not None: + labels_chunks._save(int(i/chunklen_multiplier), chunk(out_buffer, labels._dtype, + labels._cparams, _memory=False)) + labels._cbytes += chunk_.cbytes + labels._nbytes += labels.chunklen * labels.itemsize + labels_chunks.nchunks += 1 + # in-memory chunks cannot be "_save()"-ed but must be appended + # to a python chunk list + else: + # TODO: Why does the following not work? (Freezes...) + # tmp_chunks_list[i] = chunk(out_buffer, labels._dtype, labels._cparams, _memory=True) + # + # Work-around until above problem is solved. + tmp_chunks_list.append(chunk(out_buffer, labels._dtype, + labels._cparams, _memory=True)) + tmp_chunks_keys[int(i/chunklen_multiplier)] = tmp_chunks_counter + (&tmp_chunks_counter)[0] += 1 + omp_unset_lock(&out_buffer_lock) # Clean-up thread local variables free(in_buffer_ptr) free(out_buffer_ptr) + # update chunk count (necessary due to use of _save() above + if labels._rootdir is None: + labels.chunks = [tmp_chunks_list[i] for i in tmp_chunks_keys] + + # process remaining chunks that could not be processed en-bloc, chunks in-order + # TODO: parallelise after refactoring + # This is really slowing us down when chunk_multiplier = 1 !!! + # e.g. 
uncompressed unique() + in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) + out_buffer_ptr = malloc(chunklen * sizeof(uint64_t)) + for i in xrange(labels.size/carray_.chunklen, nchunks): + chunk_ = carray_.chunks[i] + chunk_._getitem(0, chunklen, in_buffer_ptr) + _factorize_str_helper(chunklen, + allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + reverse_values, + 0, + kh_locks, + num_threads, + ) + #out_buffer.data = out_buffer_ptr + #labels.append(out_buffer.astype(np.int64)) + + # processing of leftover_array is not parallelised in view of an upcoming # changes to bcolz chunk access which will allow seamlessly folding # this into the prange block + leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) if leftover_elements > 0: - out_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(uint64_t)) _factorize_str_helper(leftover_elements, allocation_size, leftover_array_ptr, @@ -251,8 +310,11 @@ def factorize_str(carray carray_, carray labels=None): ) out_buffer.data = out_buffer_ptr # compress out_buffer into labels - #labels.append(out_buffer[:leftover_elements].astype(np.int64)) - tmp_labels[nchunks*chunklen:nchunks*chunklen+leftover_elements+1] = out_buffer[:leftover_elements].astype(np.int64) + labels.append(out_buffer[:leftover_elements].astype(np.int64)) + + + free(in_buffer_ptr) + free(out_buffer_ptr) # destroy and free all locks for j in xrange(num_threads): @@ -267,9 +329,6 @@ def factorize_str(carray carray_, carray labels=None): kh_destroy_str(table) - # compress tmp_labels into carray - labels.append(tmp_labels) - # construct python dict from vectors and # free the memory allocated for the strings in the reverse_values list for i in xrange(reverse_values.size()): From 1158a88153fb7fcb24c16c44158cb2fad268f80b Mon Sep 17 00:00:00 2001 From: ARF Date: Sun, 8 Mar 2015 10:36:07 +0100 Subject: [PATCH 4/8] fixed in-memory labels carray broke due to performance debugging --- bquery/ctable_ext.pyx | 
9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 404a945..2acbdbc 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -266,7 +266,10 @@ def factorize_str(carray carray_, carray labels=None): # update chunk count (necessary due to use of _save() above if labels._rootdir is None: - labels.chunks = [tmp_chunks_list[i] for i in tmp_chunks_keys] + for i in xrange(tmp_chunks_counter): + labels.chunks.append(tmp_chunks_list[tmp_chunks_keys[i]]) + labels._cbytes += tmp_chunks_list[tmp_chunks_keys[i]].cbytes + labels._nbytes += labels.chunklen * labels.itemsize # process remaining chunks that could not be processed en-bloc, chunks in-order # TODO: parallelise after refactoring @@ -274,6 +277,7 @@ def factorize_str(carray carray_, carray labels=None): # e.g. uncompressed unique() in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) out_buffer_ptr = malloc(chunklen * sizeof(uint64_t)) + out_buffer.data = out_buffer_ptr for i in xrange(labels.size/carray_.chunklen, nchunks): chunk_ = carray_.chunks[i] chunk_._getitem(0, chunklen, in_buffer_ptr) @@ -289,7 +293,7 @@ def factorize_str(carray carray_, carray labels=None): num_threads, ) #out_buffer.data = out_buffer_ptr - #labels.append(out_buffer.astype(np.int64)) + labels.append(out_buffer[:chunklen].astype(np.int64)) # processing of leftover_array is not parallelised in view of an upcoming @@ -308,7 +312,6 @@ def factorize_str(carray carray_, carray labels=None): kh_locks, num_threads, ) - out_buffer.data = out_buffer_ptr # compress out_buffer into labels labels.append(out_buffer[:leftover_elements].astype(np.int64)) From 5a3b4877a48c6fece58676206f41ef16ee7f1cff Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 12:31:49 +0100 Subject: [PATCH 5/8] non-iterblocks implementation --- bquery/ctable_ext.pyx | 194 +++++++++++++++++++++++------------------- bquery/khash.h | 60 +++++-------- bquery/khash_python.h | 13 ++- 
setup.py | 64 ++++++++++++-- 4 files changed, 196 insertions(+), 135 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 2acbdbc..80f0c5f 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,14 +1,16 @@ -import cython +cimport cython from cython.parallel import parallel, prange, threadid -from openmp cimport omp_lock_t, omp_init_lock, omp_set_lock, omp_unset_lock, omp_destroy_lock, omp_get_num_threads +from openmp cimport * import numpy as np -from numpy cimport ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t +from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, + npy_int64, npy_float64, uint64_t) from libc.stdint cimport uintptr_t from libc.stdlib cimport malloc, calloc, free from libc.string cimport strcpy, memcpy, memset from libcpp.vector cimport vector + from khash cimport * from bcolz.carray_ext cimport carray, chunks, chunk @@ -32,6 +34,65 @@ SORTED_COUNT_DISTINCT = 4 DEF _SORTED_COUNT_DISTINCT = 4 # ---------------------------------------------------------------------------- + +cdef extern from *: + cdef void omp_start_critical "#pragma omp critical \n { //" () nogil + cdef void omp_end_critical "} //" () nogil + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef omp_lock_t * par_initialise_kh_locks(unsigned int * num_threads) nogil: + cdef: + int i + omp_lock_t * kh_locks + + num_threads[0] = omp_get_num_threads() + kh_locks = malloc(num_threads[0] * sizeof(omp_lock_t)) + for i in range(num_threads[0]): + omp_init_lock(&kh_locks[i]) + + return kh_locks + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef char * par_allocate_in_buffer(Py_ssize_t chunklen, + Py_ssize_t itemsize) nogil: + return malloc(chunklen * itemsize * sizeof(char)) + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef uint64_t * par_allocate_out_buffer(Py_ssize_t chunklen, + Py_ssize_t chunklen_multiplier) nogil: + return 
malloc(chunklen_multiplier * chunklen + * sizeof(uint64_t)) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef check_labels_structure(carray carray_, carray labels): + # if no labels carray is given, create one with satisfying: + # labels.chunklen % carray_.chunklen = 0 + if labels is None: + labels = carray([], dtype='int64', expectedlen=len(carray_)) + chunklen_multiplier = round(labels.chunklen / carray_.chunklen) + if chunklen_multiplier < 1: + chunklen_multiplier = 1 + chunklen_multiplier = int(round(chunklen_multiplier)) + labels = carray([], dtype='int64', + chunklen=chunklen_multiplier * carray_.chunklen) + + # if one was given, check chunklen is ok + elif labels.chunklen % carray_.chunklen != 0: + raise NotImplementedError('labels.chunklen must be a multiple of ' + + 'carray_.chunklen') + # if labels was given, with ok chunklen, determine multiplier + else: + chunklen_multiplier = labels.chunklen / carray_.chunklen + + return labels, chunklen_multiplier + + # Factorize Section @cython.wraparound(False) @cython.boundscheck(False) @@ -41,7 +102,6 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, uint64_t * out_buffer, kh_str_t *table, Py_ssize_t * count, - vector[char *] & reverse_values, unsigned int thread_id, omp_lock_t kh_locks[], unsigned int num_threads, @@ -83,10 +143,15 @@ cdef void _factorize_str_helper(Py_ssize_t iter_range, # acquire locks for all threads before writing to shared resources for j in xrange(num_threads): omp_set_lock(&kh_locks[j]) - k = kh_put_str(table, insert, &ret) - table.vals[k] = idx = count[0] - reverse_values.push_back(insert) - count[0] += 1 + # check whether another thread has already added the entry by now + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + # if not, add element + else: + k = kh_put_str(table, insert, &ret) + table.vals[k] = idx = count[0] + count[0] += 1 # release all locks for j in xrange(num_threads): omp_unset_lock(&kh_locks[j]) @@ -102,14 +167,10 
@@ def factorize_str(carray carray_, carray labels=None): cdef: chunk chunk_ chunks labels_chunks - Py_ssize_t n, i, j, count, chunklen, leftover_elements, \ - allocation_size, nchunks, chunklen_multiplier, \ - tmp_chunks_counter - unsigned int num_threads, thread_id - omp_lock_t * kh_locks + Py_ssize_t i, j, k, leftover_elements, \ + chunklen_multiplier + unsigned int thread_id omp_lock_t chunk_lock, out_buffer_lock - vector[char *] reverse_values - dict reverse char * in_buffer_ptr ndarray[npy_uint64] out_buffer char * out_buffer_org_ptr @@ -117,32 +178,22 @@ def factorize_str(carray carray_, carray labels=None): kh_str_t *table uintptr_t leftover_array_ptr + Py_ssize_t count = 0 + Py_ssize_t nchunks = carray_.nchunks + Py_ssize_t tmp_chunks_counter = 0 + omp_lock_t * kh_locks = NULL + dict reverse = {} + list tmp_chunks_list = [] + # num_threads = 0: set the number of parallel threads automatically: + unsigned int num_threads = 0 - count = 0 - ret = 0 - reverse = {} - tmp_chunks_list = [] - tmp_chunks_counter = 0 - nchunks = carray_.nchunks - # num_threads = 0: set the number of parallel threads automatically: - num_threads = 0 + Py_ssize_t chunklen = carray_.chunklen + Py_ssize_t allocation_size = carray_.dtype.itemsize + 1 - n = len(carray_) - chunklen = carray_.chunklen - allocation_size = carray_.dtype.itemsize + 1 - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - tmp_multiplier = round(labels.chunklen / carray_.chunklen) - if tmp_multiplier < 1: - tmp_multiplier = 1 - tmp_multiplier = int(round(tmp_multiplier)) - labels = carray([], dtype='int64', - chunklen=tmp_multiplier * carray_.chunklen) - if labels.chunklen % chunklen != 0: - raise NotImplementedError('labels.chunklen must be a multiple of ' + - 'carray_.chunklen') - chunklen_multiplier = labels.chunklen / carray_.chunklen + + labels, chunklen_multiplier = check_labels_structure(carray_, labels) + # provide access to cython chunks api for persistent labels # fails with 
in-memory carray if labels._rootdir is not None: @@ -154,55 +205,24 @@ def factorize_str(carray carray_, carray labels=None): table = kh_init_str() - - # Allocate and initialise shared locks for hash-table writes for each child - # thread. (Shared because locks is assigned outside the parallel() block) - # num_threads=0: cython chooses no of parallel threads (= cpu cores?) - with nogil, parallel(num_threads=num_threads): - # determine the number of parallel threads that will be created - # (&var)[0]: workaround to keep num_threads shared and available after - # with block - (&num_threads)[0] = omp_get_num_threads() - num_threads *= 2 - kh_locks = malloc(num_threads * sizeof(omp_lock_t)) - for j in xrange(num_threads): - omp_init_lock(&kh_locks[j]) - # initialise locks for other shared resources omp_init_lock(&chunk_lock) omp_init_lock(&out_buffer_lock) - # create an out_buffer ndarray object to interface with chunk.append() - # TODO: Avoid np.empty as it unnecessarily allocates space for the content. - # Use PyArray_SimpleNewFromData instead!? - # Ref: http://blog.enthought.com/python/numpy-arrays-with-pre-allocated-memory - # Ref: https://gist.github.com/GaelVaroquaux/1249305 out_buffer = np.empty(labels.chunklen, dtype='uint64') - # we are going to redirect the data pointer, keep note of the original - # to be able to restore it. Otherwise python will try to free an invalid - # pointer when it garbage collects the out_buffer object out_buffer_org_ptr = out_buffer.data - # code in the parallel(...) block runs "isolated" (to some extent) in - # multiple child threads - # assignments within this block make variables thread-local, - # e.g. in_buffer_ptr. BUT this does not seem to work with 'cdef class' - # defined objects, e.g. numpy.ndarray, bcolz.chunk, etc., and dereferencing. - # The latter is a useful work-around to modifying shared variables from - # within parallel the block if locking is handled manually. 
with nogil, parallel(num_threads=num_threads): + # Initialise shared locks for hash-table writes for each child thread. + omp_start_critical() + (&kh_locks)[0] = par_initialise_kh_locks(&num_threads) + omp_end_critical() + # allocate an in-buffer and an out-buffer for each thread - # note 1: cannot use ndarrays like: in_buffer = np.empty(...) - # In spite of the assignment cython does not create a - # thread local ndarray as it should! (Also see note 2 below.) - in_buffer_ptr = malloc(chunklen * (allocation_size-1) * - sizeof(char)) - out_buffer_ptr = malloc(chunklen_multiplier * chunklen * - sizeof(uint64_t)) - - # This is the work-sharing loop. The body is executed in multiple - # threads with each thread receiving a different value for i. - for i in prange(0, nchunks, chunklen_multiplier): + in_buffer_ptr = par_allocate_in_buffer(chunklen, allocation_size-1) + out_buffer_ptr = par_allocate_out_buffer(chunklen, chunklen_multiplier) + + for i in prange(0, (nchunks/chunklen_multiplier)*chunklen_multiplier, chunklen_multiplier): thread_id = threadid() # pack the results from multiple carray_-chunks into one # labels-chunk: allows out-of-order writing of labels at good speed @@ -225,7 +245,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_ptr + j * chunklen, table, &count, - reverse_values, thread_id, kh_locks, num_threads, @@ -287,7 +306,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_ptr, table, &count, - reverse_values, 0, kh_locks, num_threads, @@ -307,7 +325,6 @@ def factorize_str(carray carray_, carray labels=None): out_buffer_ptr, table, &count, - reverse_values, 0, kh_locks, num_threads, @@ -330,13 +347,14 @@ def factorize_str(carray carray_, carray labels=None): # to free the object's data out_buffer.data = out_buffer_org_ptr - kh_destroy_str(table) - - # construct python dict from vectors and - # free the memory allocated for the strings in the reverse_values list - for i in xrange(reverse_values.size()): - 
reverse[i] = reverse_values[i] - free(reverse_values[i]) + # construct python dict from vectors and free element memory + cdef unsigned long m + for m in xrange(table.n_buckets): + if not kh_exist_str(table, m): + continue + reverse[table.vals[m]] = table.keys[m] + free(table.keys[m]) + kh_destroy_str(table) return labels, reverse diff --git a/bquery/khash.h b/bquery/khash.h index c0421c4..5e55088 100644 --- a/bquery/khash.h +++ b/bquery/khash.h @@ -116,7 +116,6 @@ int main() { #ifndef __AC_KHASH_H #define __AC_KHASH_H -#endif /*! @header @@ -139,33 +138,31 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khuint64_t; -typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; #endif - +#ifndef kh_inline #ifdef _MSC_VER #define kh_inline __inline #else #define kh_inline inline +#endif +#endif /* kh_inline */ -typedef double khfloat64_t; typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) -#define __ac_isdel(flag, i) (0) -#define __ac_iseither(flag, i) __ac_isempty(flag, i) -#define __ac_set_isdel_false(flag, i) (0) -#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) -#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) -#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) -#define __ac_set_isdel_true(flag, i) (0) +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) 
(flag[i>>4]|=1ul<<((i&0xfU)<<1)) -#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) @@ -187,7 +184,7 @@ typedef khint_t khiter_t; static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct { \ + typedef struct kh_##name##_s { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ @@ -247,7 +244,7 @@ static const double __ac_HASH_UPPER = 0.77; else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (!new_flags) return -1; \ - memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (!new_keys) { kfree(new_flags); return -1; } \ @@ -268,7 +265,7 @@ static const double __ac_HASH_UPPER = 0.77; khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ - __ac_set_isempty_true(h->flags, j); \ + __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t k, i, step = 0; \ k = __hash_func(key); \ @@ -278,7 +275,7 @@ static const double __ac_HASH_UPPER = 0.77; if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ 
h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ @@ -354,7 +351,7 @@ static const double __ac_HASH_UPPER = 0.77; __KHASH_PROTOTYPES(name, khkey_t, khval_t) #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ @@ -382,7 +379,6 @@ static const double __ac_HASH_UPPER = 0.77; @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) - /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @@ -432,7 +428,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ -#define kh_init(name) kh_init_##name(void) +#define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @@ -593,9 +589,6 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -604,13 +597,9 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - typedef const char *kh_cstr_t; /*! 
@function @abstract Instantiate a hash map containing const char* keys @@ -627,15 +616,4 @@ typedef const char *kh_cstr_t; #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#define kh_exist_str(h, k) (kh_exist(h, k)) -#define kh_exist_float64(h, k) (kh_exist(h, k)) -#define kh_exist_int64(h, k) (kh_exist(h, k)) -#define kh_exist_int32(h, k) (kh_exist(h, k)) - -KHASH_MAP_INIT_STR(str, size_t) -KHASH_MAP_INIT_INT(int32, size_t) -KHASH_MAP_INIT_INT64(int64, size_t) - - #endif /* __AC_KHASH_H */ diff --git a/bquery/khash_python.h b/bquery/khash_python.h index b5b0de2..2566dff 100644 --- a/bquery/khash_python.h +++ b/bquery/khash_python.h @@ -1,7 +1,18 @@ #include +typedef double khfloat64_t; + #include "khash.h" +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(str, size_t) +KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_INT64(int64, size_t) + // kludge #define kh_float64_hash_func _Py_HashDouble @@ -46,4 +57,4 @@ KHASH_SET_INIT_PYOBJECT(pyset) #define kh_exist_pymap(h, k) (kh_exist(h, k)) #define kh_exist_pyset(h, k) (kh_exist(h, k)) -KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) \ No newline at end of file +KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) diff --git a/setup.py b/setup.py index fbd0ac1..acbb51f 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,64 @@ def check_import(pkgname, pkgver): ########### End of checks ########## +########### Project-specific build options ########### + +copt = {'cygwin': ['-fopenmp'], + 'emx': ['-fopenmp'], + 'intel': ['-openmp'], + 'intele': ['-openmp'], + 'intelem': ['-openmp'], + 'mingw32': ['-fopenmp'], + 'msvc': ['/openmp'], + } +lopt = {'cygwin': ['-fopenmp'], + 'emx': ['-fopenmp'], + 'mingw32': ['-fopenmp'], + } + +class bquery_build_ext(build_ext): + user_options = build_ext.user_options + \ + [ + 
('from-templates', None, + "rebuild project from code generation templates"), + ] + + def initialize_options(self): + self.from_templates = False + build_ext.initialize_options(self) + + def run(self): + # regenerate cython code from templates + if self.from_templates: + try: + import jinja2 + except: + exit_with_error( + "You need the python package jinja2 to rebuild the " + \ + "extension from the templates") + execfile("bquery/templates/run_templates.py") + + build_ext.run(self) + + def build_extensions(self): + # set compiler-specific build flags, e.g. for openmp + c = self.compiler.compiler_type + if copt.has_key(c): + for e in self.extensions: + e.extra_compile_args += copt[ c ] + else: + print_warning( + "Openmp flags for compiler '%s' not configured in setup.py." + + "Building without parallel processing support.") + if lopt.has_key(c): + for e in self.extensions: + e.extra_link_args += lopt[ c ] + build_ext.build_extensions(self) + + +######### End project-specific build options ######### + + # bquery version VERSION = open('VERSION').read().strip() # Create the version.py file @@ -103,10 +161,6 @@ def check_import(pkgname, pkgver): # Allow setting the Blosc dir if installed in the system BLOSC_DIR = os.environ.get('BLOSC_DIR', '') -# OpenMP libraries -CFLAGS.append('-fopenmp') -LFLAGS.append('-fopenmp') - # Sources & libraries inc_dirs = ['bquery'] lib_dirs = [] @@ -132,7 +186,7 @@ def check_import(pkgname, pkgver): url='https://github.com/visualfabriq/bquery', license='http://www.opensource.org/licenses/bsd-license.php', platforms=['any'], - cmdclass={'build_ext': build_ext}, + cmdclass={'build_ext': bquery_build_ext}, ext_modules=[ Extension("bquery.ctable_ext", include_dirs=inc_dirs, From b2453aa8f8dde8ea0711ec2773ef2ffee8412202 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 12:40:50 +0100 Subject: [PATCH 6/8] non-iterblocks with pandas khash --- bquery/khash.h | 247 ++++++++++++++++++------------------------ bquery/khash_python.h | 15 +-- 2 
files changed, 105 insertions(+), 157 deletions(-) diff --git a/bquery/khash.h b/bquery/khash.h index 5e55088..4350ff0 100644 --- a/bquery/khash.h +++ b/bquery/khash.h @@ -33,6 +33,7 @@ int main() { khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); @@ -46,23 +47,6 @@ int main() { */ /* - 2013-05-02 (0.2.8): - - * Use quadratic probing. When the capacity is power of 2, stepping function - i*(i+1)/2 guarantees to traverse each bucket. It is better than double - hashing on cache performance and is more robust than linear probing. - - In theory, double hashing should be more robust than quadratic probing. - However, my implementation is probably not for large hash tables, because - the second hash function is closely tied to the first hash function, - which reduce the effectiveness of double hashing. - - Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php - - 2011-12-29 (0.2.7): - - * Minor code clean up; no actual effect. - 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the @@ -123,13 +107,12 @@ int main() { Generic hash table library. 
*/ -#define AC_VERSION_KHASH_H "0.2.8" +#define AC_VERSION_KHASH_H "0.2.6" #include #include #include -/* compiler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; @@ -138,78 +121,84 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; +typedef unsigned long khuint64_t; +typedef signed long khint64_t; #else -typedef unsigned long long khint64_t; +typedef unsigned long long khuint64_t; +typedef signed long long khint64_t; #endif -#ifndef kh_inline -#ifdef _MSC_VER -#define kh_inline __inline -#else -#define kh_inline inline +typedef double khfloat64_t; + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif #endif -#endif /* kh_inline */ typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) +#define __ac_isdel(flag, i) (0) +#define __ac_iseither(flag, i) __ac_isempty(flag, i) +#define __ac_set_isdel_false(flag, i) (0) +#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) +#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) +#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) +#define __ac_set_isdel_true(flag, i) (0) + +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 
+#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif -#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) +#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif -#ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) -#endif -#ifndef kmalloc -#define kmalloc(Z) malloc(Z) -#endif -#ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) -#endif -#ifndef kfree -#define kfree(P) free(P) -#endif - static const double __ac_HASH_UPPER = 0.77; -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct kh_##name##_s { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t 
*vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ @@ -222,19 +211,19 @@ static const double __ac_HASH_UPPER = 0.77; SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t k, i, last, mask, step = 0; \ + khint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ - last = i; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + (++step)) & mask; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ @@ -242,18 +231,11 @@ static const double __ac_HASH_UPPER = 0.77; if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) { kfree(new_flags); return -1; } \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) { kfree(new_flags); return -1; } \ - h->vals = new_vals; \ - } \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } /* otherwise shrink */ \ } \ } \ @@ -265,17 +247,18 @@ static const double __ac_HASH_UPPER = 0.77; khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ + __ac_set_isempty_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t k, i, step = 0; \ + khint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ - while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the 
existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ @@ -285,38 +268,32 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - kfree(h->flags); /* free the working space */ \ + free(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ - return 0; \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t k, i, site, last, mask = h->n_buckets - 1, step = 
0; \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - last = i; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + (++step)) & mask; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -346,16 +323,8 @@ static const double __ac_HASH_UPPER = 0.77; } \ } -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + KHASH_INIT2(name, static PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -379,15 +348,16 @@ static const double __ac_HASH_UPPER = 0.77; @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) + /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ -static kh_inline khint_t __ac_X31_hash_string(const char *s) +static PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s) { - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } /*! 
@function @@ -401,7 +371,7 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) -static kh_inline khint_t __ac_Wang_hash(khint_t key) +static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); @@ -428,7 +398,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ -#define kh_init(name) kh_init_##name() +#define kh_init(name) kh_init_##name(void) /*! @function @abstract Destroy a hash table. @@ -457,8 +427,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @param r Extra return code: -1 if the operation failed; - 0 if the key is present in the hash table; + @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] @@ -470,7 +439,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) @@ -540,34 +509,6 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) */ #define kh_n_buckets(h) ((h)->n_buckets) -/*! 
@function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/*! @function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - /* More conenient interfaces */ /*! @function @@ -589,6 +530,9 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ +#define KHASH_SET_INIT_UINT64(name) \ + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -597,9 +541,13 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ +#define KHASH_MAP_INIT_UINT64(name, khval_t) \ + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + typedef const char *kh_cstr_t; /*! 
@function @abstract Instantiate a hash map containing const char* keys @@ -616,4 +564,15 @@ typedef const char *kh_cstr_t; #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(str, size_t) +KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_INT64(int64, size_t) + + #endif /* __AC_KHASH_H */ diff --git a/bquery/khash_python.h b/bquery/khash_python.h index 2566dff..cdd94b5 100644 --- a/bquery/khash_python.h +++ b/bquery/khash_python.h @@ -1,22 +1,11 @@ #include -typedef double khfloat64_t; - #include "khash.h" -#define kh_exist_str(h, k) (kh_exist(h, k)) -#define kh_exist_float64(h, k) (kh_exist(h, k)) -#define kh_exist_int64(h, k) (kh_exist(h, k)) -#define kh_exist_int32(h, k) (kh_exist(h, k)) - -KHASH_MAP_INIT_STR(str, size_t) -KHASH_MAP_INIT_INT(int32, size_t) -KHASH_MAP_INIT_INT64(int64, size_t) - // kludge #define kh_float64_hash_func _Py_HashDouble -#define kh_float64_hash_equal kh_int64_hash_equal +#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) @@ -24,7 +13,7 @@ KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_FLOAT64(float64, size_t) -int kh_inline pyobject_cmp(PyObject* a, PyObject* b) { +int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { PyErr_Clear(); From 8f76992f766f8e887d556290226be2bac0fcd202 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 12:43:38 +0100 Subject: [PATCH 7/8] iterblocks implementation with pandas khash --- bquery/ctable.py | 21 +- bquery/ctable_ext.pyx | 2398 +++++++++++++++++------------------------ 2 files 
changed, 978 insertions(+), 1441 deletions(-) diff --git a/bquery/ctable.py b/bquery/ctable.py index ee2df11..0afa1f5 100644 --- a/bquery/ctable.py +++ b/bquery/ctable.py @@ -59,24 +59,8 @@ def cache_factor(self, col_list, refresh=False): col_factor_rootdir = col_rootdir + '.factor' col_values_rootdir = col_rootdir + '.values' - carray_factor = \ - bcolz.carray([], dtype='int64', expectedlen=self.size, - rootdir=col_factor_rootdir, mode='w') - # ensure carray_factor.chunklen is multiple of column chunklen - if carray_factor.chunklen % self[col].chunklen != 0: - chunklen_multiplier = round(carray_factor.chunklen / - self[col].chunklen) - if chunklen_multiplier < 1.0: - chunklen_multiplier = 1 - chunklen_multiplier = int(round(chunklen_multiplier)) - carray_factor = \ - bcolz.carray([], dtype='int64', - chunklen=chunklen_multiplier * \ - self[col].chunklen, - rootdir=col_factor_rootdir, mode='w') - - _, values = \ - ctable_ext.factorize(self[col], labels=carray_factor) + carray_factor, values = \ + ctable_ext.factorize(self[col], rootdir=col_factor_rootdir) carray_factor.flush() carray_values = \ @@ -207,7 +191,6 @@ def factorize_groupby_cols(self, groupby_cols): return factor_list, values_list - @staticmethod def make_group_index(self, factor_list, values_list, groupby_cols, array_length, bool_arr): # create unique groups for groupby loop diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index 80f0c5f..a3b2ab9 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,1436 +1,990 @@ -cimport cython -from cython.parallel import parallel, prange, threadid +cimport cython +from cython.parallel import parallel, prange, threadid from openmp cimport * - -import numpy as np + +import numpy as np from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, npy_int64, npy_float64, uint64_t) - -from libc.stdint cimport uintptr_t -from libc.stdlib cimport malloc, calloc, free -from libc.string cimport strcpy, memcpy, memset -from 
libcpp.vector cimport vector - -from khash cimport * -from bcolz.carray_ext cimport carray, chunks, chunk - -# ---------------------------------------------------------------------------- -# GLOBAL DEFINITIONS -# ---------------------------------------------------------------------------- - -SUM = 0 -DEF _SUM = 0 - -COUNT = 1 -DEF _COUNT = 1 - -COUNT_NA = 2 -DEF _COUNT_NA = 2 - -COUNT_DISTINCT = 3 -DEF _COUNT_DISTINCT = 3 - -SORTED_COUNT_DISTINCT = 4 -DEF _SORTED_COUNT_DISTINCT = 4 -# ---------------------------------------------------------------------------- - - -cdef extern from *: - cdef void omp_start_critical "#pragma omp critical \n { //" () nogil - cdef void omp_end_critical "} //" () nogil - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef omp_lock_t * par_initialise_kh_locks(unsigned int * num_threads) nogil: - cdef: - int i - omp_lock_t * kh_locks - - num_threads[0] = omp_get_num_threads() - kh_locks = malloc(num_threads[0] * sizeof(omp_lock_t)) - for i in range(num_threads[0]): - omp_init_lock(&kh_locks[i]) - - return kh_locks - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef char * par_allocate_in_buffer(Py_ssize_t chunklen, - Py_ssize_t itemsize) nogil: - return malloc(chunklen * itemsize * sizeof(char)) - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef uint64_t * par_allocate_out_buffer(Py_ssize_t chunklen, - Py_ssize_t chunklen_multiplier) nogil: - return malloc(chunklen_multiplier * chunklen - * sizeof(uint64_t)) - - + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy, memset +from libcpp.vector cimport vector + +from khash cimport * +import bcolz as bz +from bcolz.carray_ext cimport carray, chunks, chunk + +include "parallel_processing.pxi" + +# ---------------------------------------------------------------------------- +# GLOBAL DEFINITIONS +# ---------------------------------------------------------------------------- +SUM = 0 +DEF 
_SUM = 0 + +COUNT = 1 +DEF _COUNT = 1 + +COUNT_NA = 2 +DEF _COUNT_NA = 2 + +COUNT_DISTINCT = 3 +DEF _COUNT_DISTINCT = 3 + +SORTED_COUNT_DISTINCT = 4 +DEF _SORTED_COUNT_DISTINCT = 4 + + +# ---------------------------------------------------------------------------- +# FACTORIZE SECTION +# ---------------------------------------------------------------------------- @cython.wraparound(False) @cython.boundscheck(False) -cdef check_labels_structure(carray carray_, carray labels): - # if no labels carray is given, create one with satisfying: - # labels.chunklen % carray_.chunklen = 0 - if labels is None: - labels = carray([], dtype='int64', expectedlen=len(carray_)) - chunklen_multiplier = round(labels.chunklen / carray_.chunklen) - if chunklen_multiplier < 1: - chunklen_multiplier = 1 - chunklen_multiplier = int(round(chunklen_multiplier)) - labels = carray([], dtype='int64', - chunklen=chunklen_multiplier * carray_.chunklen) - - # if one was given, check chunklen is ok - elif labels.chunklen % carray_.chunklen != 0: - raise NotImplementedError('labels.chunklen must be a multiple of ' + - 'carray_.chunklen') - # if labels was given, with ok chunklen, determine multiplier - else: - chunklen_multiplier = labels.chunklen / carray_.chunklen - - return labels, chunklen_multiplier - - -# Factorize Section -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_str_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - char * in_buffer_ptr, - uint64_t * out_buffer, - kh_str_t *table, - Py_ssize_t * count, - unsigned int thread_id, - omp_lock_t kh_locks[], - unsigned int num_threads, - ) nogil: - cdef: - Py_ssize_t i, idx, itemsize - int ret - unsigned int j - char * element - char * insert - khiter_t k - - ret = 0 - itemsize = allocation_size - 1 - # allocate enough memory to hold the string element, add one for the - # null byte that marks the end of the string. - # TODO: understand why zero-filling is necessary. 
Without zero-filling - # the buffer, duplicate keys occur in the reverse dict - element = calloc(allocation_size, sizeof(char)) - - omp_set_lock(&kh_locks[thread_id]) - for i in xrange(iter_range): - # strings are stored without null termination in ndarrays: need a - # buffer to append null termination to use usual string algorithms - memcpy(element, in_buffer_ptr, itemsize) - in_buffer_ptr += itemsize - - # while reading from shared hash table, set lock for this thread - k = kh_get_str(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - omp_unset_lock(&kh_locks[thread_id]) - # allocate enough memory to hold the string, add one for the - # null byte that marks the end of the string. - insert = malloc(allocation_size) - # TODO: is strcpy really the best way to copy a string? - strcpy(insert, element) - # acquire locks for all threads before writing to shared resources - for j in xrange(num_threads): - omp_set_lock(&kh_locks[j]) +cdef void _factorize_str_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + char * in_buffer_ptr, + uint64_t * out_buffer, + kh_str_t *table, + Py_ssize_t * count, + unsigned int thread_id, + omp_lock_t kh_locks[], + unsigned int num_threads, + ) nogil: + cdef: + Py_ssize_t i, idx + unsigned int j + khiter_t k + + char * element + char * insert + + int ret = 0 + Py_ssize_t itemsize = allocation_size - 1 + + # allocate enough memory to hold the string element, add one for the + # null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. 
Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) + + # lock ensures that table is in consistend state during access + omp_set_lock(&kh_locks[thread_id]) + for i in xrange(iter_range): + # appends null character to element string + memcpy(element, in_buffer_ptr, itemsize) + in_buffer_ptr += itemsize + + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + omp_unset_lock(&kh_locks[thread_id]) + insert = malloc(allocation_size) + # TODO: is strcpy really the best way to copy a string? + strcpy(insert, element) + + # acquire locks for all threads to avoid inconsistent state + for j in xrange(num_threads): + omp_set_lock(&kh_locks[j]) # check whether another thread has already added the entry by now k = kh_get_str(table, element) if k != table.n_buckets: idx = table.vals[k] # if not, add element else: - k = kh_put_str(table, insert, &ret) - table.vals[k] = idx = count[0] - count[0] += 1 - # release all locks - for j in xrange(num_threads): - omp_unset_lock(&kh_locks[j]) - omp_set_lock(&kh_locks[thread_id]) - out_buffer[i] = idx - - omp_unset_lock(&kh_locks[thread_id]) - free(element) - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_str(carray carray_, carray labels=None): - cdef: - chunk chunk_ - chunks labels_chunks - Py_ssize_t i, j, k, leftover_elements, \ - chunklen_multiplier - unsigned int thread_id - omp_lock_t chunk_lock, out_buffer_lock - char * in_buffer_ptr - ndarray[npy_uint64] out_buffer - char * out_buffer_org_ptr - uint64_t * out_buffer_ptr - kh_str_t *table - uintptr_t leftover_array_ptr - - Py_ssize_t count = 0 - Py_ssize_t nchunks = carray_.nchunks - Py_ssize_t tmp_chunks_counter = 0 - omp_lock_t * kh_locks = NULL - dict reverse = {} - list tmp_chunks_list = [] - # num_threads = 0: set the number of parallel threads automatically: - unsigned int num_threads = 0 - - Py_ssize_t chunklen = carray_.chunklen - Py_ssize_t 
allocation_size = carray_.dtype.itemsize + 1 - - - - labels, chunklen_multiplier = check_labels_structure(carray_, labels) - - # provide access to cython chunks api for persistent labels - # fails with in-memory carray - if labels._rootdir is not None: - labels_chunks = labels.chunks - - tmp_chunks_keys = [0] * int(carray_.size / labels.chunklen) - - leftover_array_ptr = carray_.leftover_array.__array_interface__['data'][0] - - table = kh_init_str() - - # initialise locks for other shared resources - omp_init_lock(&chunk_lock) - omp_init_lock(&out_buffer_lock) - - out_buffer = np.empty(labels.chunklen, dtype='uint64') - out_buffer_org_ptr = out_buffer.data - - with nogil, parallel(num_threads=num_threads): - # Initialise shared locks for hash-table writes for each child thread. - omp_start_critical() - (&kh_locks)[0] = par_initialise_kh_locks(&num_threads) - omp_end_critical() - - # allocate an in-buffer and an out-buffer for each thread - in_buffer_ptr = par_allocate_in_buffer(chunklen, allocation_size-1) - out_buffer_ptr = par_allocate_out_buffer(chunklen, chunklen_multiplier) - - for i in prange(0, (nchunks/chunklen_multiplier)*chunklen_multiplier, chunklen_multiplier): - thread_id = threadid() - # pack the results from multiple carray_-chunks into one - # labels-chunk: allows out-of-order writing of labels at good speed - for j in xrange(chunklen_multiplier): - # chunk._getitem is not thread-safe. This may be due to it - # setting properties of chunk using self. - Or not. 
- # TODO: check whether _getitem can be made thread-safe to - # get rid of this probably unnecessary thread lock - omp_set_lock(&chunk_lock) - with gil: - chunk_ = carray_.chunks[i+j] - # decompress into in_buffer - # note: _getitem releases gil during blosc decompression - chunk_._getitem(0, chunklen, in_buffer_ptr) - omp_unset_lock(&chunk_lock) - - _factorize_str_helper(chunklen, - allocation_size, - in_buffer_ptr, - out_buffer_ptr + j * chunklen, - table, - &count, - thread_id, - kh_locks, - num_threads, - ) - - # note 2: labels.append(...) requires ndarray, but no thread-local - # ndarrays can be created. Use the shared out_buffer - # ndarray object and redirect its data pointer to the thread-local - # c array out_buffer_ptr - omp_set_lock(&out_buffer_lock) - with gil: - out_buffer.data = out_buffer_ptr - # compress out_buffer into labels - # _save() and append() freeze at versions prior to v0.8.1 - if labels._rootdir is not None: - labels_chunks._save(int(i/chunklen_multiplier), chunk(out_buffer, labels._dtype, - labels._cparams, _memory=False)) - labels._cbytes += chunk_.cbytes - labels._nbytes += labels.chunklen * labels.itemsize - labels_chunks.nchunks += 1 - # in-memory chunks cannot be "_save()"-ed but must be appended - # to a python chunk list - else: - # TODO: Why does the following not work? (Freezes...) - # tmp_chunks_list[i] = chunk(out_buffer, labels._dtype, labels._cparams, _memory=True) - # - # Work-around until above problem is solved. 
- tmp_chunks_list.append(chunk(out_buffer, labels._dtype, - labels._cparams, _memory=True)) - tmp_chunks_keys[int(i/chunklen_multiplier)] = tmp_chunks_counter - (&tmp_chunks_counter)[0] += 1 - - omp_unset_lock(&out_buffer_lock) - - # Clean-up thread local variables - free(in_buffer_ptr) - free(out_buffer_ptr) - - # update chunk count (necessary due to use of _save() above - if labels._rootdir is None: - for i in xrange(tmp_chunks_counter): - labels.chunks.append(tmp_chunks_list[tmp_chunks_keys[i]]) - labels._cbytes += tmp_chunks_list[tmp_chunks_keys[i]].cbytes - labels._nbytes += labels.chunklen * labels.itemsize - - # process remaining chunks that could not be processed en-bloc, chunks in-order - # TODO: parallelise after refactoring - # This is really slowing us down when chunk_multiplier = 1 !!! - # e.g. uncompressed unique() - in_buffer_ptr = malloc(chunklen * (allocation_size-1) * sizeof(char)) - out_buffer_ptr = malloc(chunklen * sizeof(uint64_t)) - out_buffer.data = out_buffer_ptr - for i in xrange(labels.size/carray_.chunklen, nchunks): - chunk_ = carray_.chunks[i] - chunk_._getitem(0, chunklen, in_buffer_ptr) - _factorize_str_helper(chunklen, - allocation_size, - in_buffer_ptr, - out_buffer_ptr, - table, - &count, - 0, - kh_locks, - num_threads, - ) - #out_buffer.data = out_buffer_ptr - labels.append(out_buffer[:chunklen].astype(np.int64)) - - - # processing of leftover_array is not parallelised in view of an upcoming - # changes to bcolz chunk access which will allow seamlessly folding - # this into the prange block - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_str_helper(leftover_elements, - allocation_size, - leftover_array_ptr, - out_buffer_ptr, - table, - &count, - 0, - kh_locks, - num_threads, - ) - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - - free(in_buffer_ptr) - free(out_buffer_ptr) - - # destroy and free all locks - for j 
in xrange(num_threads): - omp_destroy_lock(&kh_locks[j]) - free(kh_locks) - omp_destroy_lock(&chunk_lock) - omp_destroy_lock(&out_buffer_lock) - - # restore out_buffer data pointer to avoid segfault when python thries - # to free the object's data - out_buffer.data = out_buffer_org_ptr - + k = kh_put_str(table, insert, &ret) + table.vals[k] = idx = count[0] + count[0] += 1 + # release all locks + for j in xrange(num_threads): + omp_unset_lock(&kh_locks[j]) + # acquire our own lock again, to indicate we are reading the table + omp_set_lock(&kh_locks[thread_id]) + out_buffer[i] = idx + + omp_unset_lock(&kh_locks[thread_id]) + free(element) + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_str(carray carray_, unsigned int num_threads_requested = 0, **kwargs): + cdef: + Py_ssize_t i, blocklen, nblocks + khint_t j + carray labels + ndarray in_buffer + ndarray[npy_uint64] out_buffer + unsigned int thread_id + par_info_t par_info + + kh_str_t *table + char * out_buffer_org_ptr + char * in_buffer_ptr + uint64_t * out_buffer_ptr + + Py_ssize_t count = 0 + dict reverse = {} + + bint first_thread = True + Py_ssize_t element_allocation_size = carray_.dtype.itemsize + 1 + Py_ssize_t carray_chunklen = carray_.chunklen + + labels = create_labels_carray(carray_, **kwargs) + + out_buffer = np.empty(labels.chunklen, dtype='uint64') + out_buffer_org_ptr = out_buffer.data + + table = kh_init_str() + + block_iterator = bz.iterblocks(carray_, blen=labels.chunklen) + nblocks = (len(carray_)/labels.chunklen+0.5)+1 + + with nogil, parallel(num_threads=num_threads_requested): + # Initialise some parallel processing stuff + omp_start_critical() + if first_thread == True: + (&first_thread)[0] = False + with gil: + par_initialise(&par_info, carray_) + omp_end_critical() + + # allocate thread-local in- and out-buffers + with gil: + in_buffer_ptr = par_allocate_buffer(labels.chunklen, carray_.dtype.itemsize) + out_buffer_ptr = par_allocate_buffer(labels.chunklen, 
labels.dtype.itemsize) + + # factorise the chunks in parallel + for i in prange(0, nblocks, schedule='dynamic', chunksize=1): + thread_id = threadid() + + omp_set_lock(&par_info.chunk_lock) + with gil: + in_buffer = np.ascontiguousarray(block_iterator.next()) + blocklen = len(in_buffer) + memcpy(in_buffer_ptr, in_buffer.data, in_buffer.nbytes) + omp_unset_lock(&par_info.chunk_lock) + + _factorize_str_helper(blocklen, + element_allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + thread_id, + par_info.kh_locks, + par_info.num_threads, + ) + + # compress out_buffer into labels + omp_set_lock(&par_info.out_buffer_lock) + with gil: + out_buffer.data = out_buffer_ptr + par_save_block(i, labels, out_buffer, blocklen, nblocks) + omp_unset_lock(&par_info.out_buffer_lock) + + # Clean-up thread local variables + free(in_buffer_ptr) + free(out_buffer_ptr) + + # Clean-up some parallel processing stuff + par_terminate(par_info) + + # restore out_buffer data pointer to allow python to free the object's data + out_buffer.data = out_buffer_org_ptr + # construct python dict from vectors and free element memory - cdef unsigned long m - for m in xrange(table.n_buckets): - if not kh_exist_str(table, m): + for j in range(table.n_buckets): + if not kh_exist_str(table, j): continue - reverse[table.vals[m]] = table.keys[m] - free(table.keys[m]) + reverse[table.vals[j]] = table.keys[j] + free(table.keys[j]) kh_destroy_str(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - element = in_buffer[i] - k = kh_get_int64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = 
kh_put_int64(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int64(carray carray_, carray labels=None): - cdef: - chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements - dict reverse - ndarray[npy_int64] in_buffer - ndarray[npy_uint64] out_buffer - kh_int64_t *table - - count = 0 - ret = 0 - reverse = {} - - n = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype='int64') - table = kh_init_int64() - - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_int64_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_int64_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) - - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - kh_destroy_int64(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int32_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int32] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int32_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int32 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - element = in_buffer[i] - k = kh_get_int32(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = 
kh_put_int32(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int32(carray carray_, carray labels=None): - cdef: - chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements - dict reverse - ndarray[npy_int32] in_buffer - ndarray[npy_uint64] out_buffer - kh_int32_t *table - - count = 0 - ret = 0 - reverse = {} - - n = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype='int32') - table = kh_init_int32() - - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_int32_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_int32_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) - - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - kh_destroy_int32(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_float64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_float64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_float64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_float64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array 
for efficiency - element = in_buffer[i] - k = kh_get_float64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_float64(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_float64(carray carray_, carray labels=None): - cdef: - chunk chunk_ - Py_ssize_t n, i, count, chunklen, leftover_elements - dict reverse - ndarray[npy_float64] in_buffer - ndarray[npy_uint64] out_buffer - kh_float64_t *table - - count = 0 - ret = 0 - reverse = {} - - n = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=n) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - in_buffer = np.empty(chunklen, dtype='float64') - table = kh_init_float64() - - for i in range(carray_.nchunks): - chunk_ = carray_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - _factorize_float64_helper(chunklen, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(carray_.leftover, carray_.atomsize) - if leftover_elements > 0: - _factorize_float64_helper(leftover_elements, - carray_.dtype.itemsize + 1, - carray_.leftover_array, - out_buffer, - table, - &count, - reverse, - ) - - # compress out_buffer into labels - labels.append(out_buffer[:leftover_elements].astype(np.int64)) - - kh_destroy_float64(table) - - return labels, reverse - -cpdef factorize(carray carray_, carray labels=None): - if carray_.dtype == 'int32': - labels, reverse = factorize_int32(carray_, labels=labels) - elif carray_.dtype == 'int64': - labels, reverse = factorize_int64(carray_, labels=labels) - elif carray_.dtype == 'float64': - labels, reverse 
= factorize_float64(carray_, labels=labels) - else: - #TODO: check that the input is a string_ dtype type - labels, reverse = factorize_str(carray_, labels=labels) - return labels, reverse - -# --------------------------------------------------------------------------- -# Translate existing arrays -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): - cdef: - chunk chunk_ - Py_ssize_t i, chunklen, leftover_elements - ndarray[npy_int64] in_buffer - ndarray[npy_int64] out_buffer - - chunklen = input_.chunklen - out_buffer = np.empty(chunklen, dtype='int64') - in_buffer = np.empty(chunklen, dtype='int64') - - for i in range(input_.nchunks): - chunk_ = input_.chunks[i] - # decompress into in_buffer - chunk_._getitem(0, chunklen, in_buffer.data) - for i in range(chunklen): - element = in_buffer[i] - out_buffer[i] = lookup.get(element, default) - # compress out_buffer into labels - output_.append(out_buffer.astype(np.int64)) - - leftover_elements = cython.cdiv(input_.leftover, input_.atomsize) - if leftover_elements > 0: - in_buffer = input_.leftover_array - for i in range(leftover_elements): - element = in_buffer[i] - out_buffer[i] = lookup.get(element, default) - output_.append(out_buffer[:leftover_elements].astype(np.int64)) - -# --------------------------------------------------------------------------- -# Aggregation Section (old) -@cython.boundscheck(False) -@cython.wraparound(False) -def agg_sum_na(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - if v == v: # skip NA values - v_cum += v - - return v_cum - -@cython.boundscheck(False) -@cython.wraparound(False) -def agg_sum(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - v_cum += v - - return v_cum - -# --------------------------------------------------------------------------- -# Aggregation Section -@cython.boundscheck(False) -@cython.wraparound(False) -def groupsort_indexer(carray 
index, Py_ssize_t ngroups): - cdef: - Py_ssize_t i, label, n - ndarray[int64_t] counts, where, np_result - # -- - carray c_result - chunk input_chunk, index_chunk - Py_ssize_t index_chunk_nr, index_chunk_len, leftover_elements - - ndarray[int64_t] in_buffer - - index_chunk_len = index.chunklen - in_buffer = np.empty(index_chunk_len, dtype='int64') - index_chunk_nr = 0 - - # count group sizes, location 0 for NA - counts = np.zeros(ngroups + 1, dtype=np.int64) - n = len(index) - - for index_chunk_nr in range(index.nchunks): - # fill input buffer - input_chunk = index.chunks[index_chunk_nr] - input_chunk._getitem(0, index_chunk_len, in_buffer.data) - - # loop through rows - for i in range(index_chunk_len): - counts[index[i] + 1] += 1 - - leftover_elements = cython.cdiv(index.leftover, index.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = index.leftover_array - - # loop through rows - for i in range(leftover_elements): - counts[index[i] + 1] += 1 - - # mark the start of each contiguous group of like-indexed data - where = np.zeros(ngroups + 1, dtype=np.int64) - for i from 1 <= i < ngroups + 1: - where[i] = where[i - 1] + counts[i - 1] - - # this is our indexer - np_result = np.zeros(n, dtype=np.int64) - for i from 0 <= i < n: - label = index[i] + 1 - np_result[where[label]] = i - where[label] += 1 - - return np_result, counts - -cdef count_unique_float64(ndarray[float64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - float64_t val - khiter_t k - npy_uint64 count = 0 - bint seen_na = 0 - kh_float64_t *table - - table = kh_init_float64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(table, val) - if k == table.n_buckets: - k = kh_put_float64(table, val, &ret) - count += 1 - elif not seen_na: - seen_na = 1 - count += 1 - - kh_destroy_float64(table) - - return count - -cdef count_unique_int64(ndarray[int64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - 
int ret = 0 - int64_t val - khiter_t k - npy_uint64 count = 0 - kh_int64_t *table - - table = kh_init_int64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int64(table, val) - if k == table.n_buckets: - k = kh_put_int64(table, val, &ret) - count += 1 - - kh_destroy_int64(table) - - return count - -cdef count_unique_int32(ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - int32_t val - khiter_t k - npy_uint64 count = 0 - kh_int32_t *table - - table = kh_init_int32() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int32(table, val) - if k == table.n_buckets: - k = kh_put_int32(table, val, &ret) - count += 1 - - kh_destroy_int32(table) - - return count - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_float64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts, factor_total_chunks, leftover_elements - - ndarray[npy_float64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_float64] out_buffer - ndarray[npy_float64] last_values - - npy_float64 v - bint count_distinct_started = 0 - carray num_uniques - - - count = 0 - ret = 0 - reverse = {} - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_float64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype='float64') - factor_chunk_len = 
ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='float64') - - for input_chunk_nr in range(ca_input.nchunks): - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - # loop through rows - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - v = in_buffer[i] - if v == v: # skip NA values - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='float64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = ca_input.leftover_array - - # loop through rows 
- for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - v = in_buffer[i] - if v == v: # skip NA values - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='float64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int32(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts, factor_total_chunks, leftover_elements - - ndarray[npy_int32] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int32] out_buffer - ndarray[npy_int32] last_values - - npy_int32 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret 
= 0 - reverse = {} - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int32(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype='int32') - factor_chunk_len = ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int32') - - for input_chunk_nr in range(ca_input.nchunks): - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - # loop through rows - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int32') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = ca_input.leftover_array - - # loop through rows - for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int32') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts, factor_total_chunks, leftover_elements - - ndarray[npy_int64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int64] out_buffer - ndarray[npy_int64] last_values - - npy_int64 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype='int64') - factor_chunk_len = ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = 
ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int64') - - for input_chunk_nr in range(ca_input.nchunks): - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - # loop through rows - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - # fill input buffer - in_buffer = ca_input.leftover_array - - # loop through rows - for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key): - cdef: - chunk input_chunk, factor_chunk - Py_ssize_t input_chunk_nr, input_chunk_len - Py_ssize_t factor_chunk_nr, factor_chunk_len, factor_chunk_row - Py_ssize_t current_index, i, factor_total_chunks, leftover_elements - - ndarray in_buffer - ndarray[npy_int64] factor_buffer - ndarray out_buffer - - count = 0 - ret = 0 - reverse = {} - - input_chunk_len = ca_input.chunklen - in_buffer = np.empty(input_chunk_len, dtype=ca_input.dtype) - factor_chunk_len = ca_factor.chunklen - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = np.empty(factor_chunk_len, dtype='int64') - if factor_total_chunks > 0: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype) - - for input_chunk_nr in range(ca_input.nchunks): - - # fill input buffer - input_chunk = ca_input.chunks[input_chunk_nr] - input_chunk._getitem(0, input_chunk_len, in_buffer.data) - - for i in range(input_chunk_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - 
factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - out_buffer[current_index] = in_buffer[i] - - leftover_elements = cython.cdiv(ca_input.leftover, ca_input.atomsize) - if leftover_elements > 0: - in_buffer = ca_input.leftover_array - - for i in range(leftover_elements): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_chunk_len: - factor_chunk_nr += 1 - if factor_chunk_nr < factor_total_chunks: - factor_chunk = ca_factor.chunks[factor_chunk_nr] - factor_chunk._getitem(0, factor_chunk_len, factor_buffer.data) - else: - factor_buffer = ca_factor.leftover_array - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - out_buffer[current_index] = in_buffer[i] - - # check whether a row has to be fixed - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -def aggregate_groups_by_iter_2(ct_input, - ct_agg, - npy_uint64 nr_groups, - npy_uint64 skip_key, - carray factor_carray, - groupby_cols, - output_agg_ops, - dtype_list, - agg_method=_SUM - ): - total = [] - - for col in groupby_cols: - total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key)) - - for col, agg_op in output_agg_ops: - # TODO: input vs output column - col_dtype = ct_agg[col].dtype - if col_dtype == np.float64: - total.append( - sum_float64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == np.int64: - total.append( - sum_int64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == 
np.int32: - total.append( - sum_int32(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - else: - raise NotImplementedError( - 'Column dtype ({0}) not supported for aggregation yet ' - '(only int32, int64 & float64)'.format(str(col_dtype))) - - ct_agg.append(total) - -# --------------------------------------------------------------------------- -# Temporary Section -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse): - """ - TEMPORARY WORKAROUND till numexpr support in list operations - - Update a boolean array with checks whether the values of a column (col) are in a set (value_set) - Reverse means "not in" functionality - - For the 0d array work around, see https://github.com/Blosc/bcolz/issues/61 - - :param col: - :param value_set: - :param boolarr: - :param reverse: - :return: - """ - cdef Py_ssize_t i - i = 0 - if not reverse: - for val in col.iter(): - if val not in value_set: - boolarr[i] = False - i += 1 - else: - for val in col.iter(): - if val in value_set: - boolarr[i] = False - i += 1 + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int64(carray carray_, carray labels=None): + cdef: + 
Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int64] in_buffer + ndarray[npy_uint64] out_buffer + kh_int64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) + out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int64(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int32_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int32] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int32_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int32 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int32(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int32(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int32(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int32] in_buffer + ndarray[npy_uint64] out_buffer + kh_int32_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', 
expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) + out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int32() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int32_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int32(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_float64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_float64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_float64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_float64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_float64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_float64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_float64(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_float64] in_buffer + ndarray[npy_uint64] out_buffer + kh_float64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_float64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_float64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_float64(table) + + return labels, reverse + +def factorize(carray carray_, **kwargs): + if carray_.dtype == 'int32': + labels, reverse = factorize_int32(carray_, **kwargs) + elif carray_.dtype == 'int64': + labels, reverse = factorize_int64(carray_, **kwargs) + elif carray_.dtype == 'float64': + labels, reverse = factorize_float64(carray_, **kwargs) + else: + #TODO: check that the input is a string_ dtype type + labels, reverse = factorize_str(carray_, **kwargs) + return labels, reverse + +# --------------------------------------------------------------------------- +# Translate existing arrays +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): + cdef: + Py_ssize_t chunklen, leftover_elements, len_in_buffer + ndarray[npy_int64] in_buffer + ndarray[npy_int64] out_buffer + + chunklen = input_.chunklen + out_buffer = np.empty(chunklen, dtype='int64') + + for in_buffer in bz.iterblocks(input_): + len_in_buffer = len(in_buffer) + for i in range(len_in_buffer): + element = in_buffer[i] + out_buffer[i] = lookup.get(element, default) + # compress out_buffer into labels + output_.append(out_buffer[:len_in_buffer].astype(np.int64)) + +# --------------------------------------------------------------------------- +# Aggregation Section (old) +@cython.boundscheck(False) +@cython.wraparound(False) +def agg_sum_na(iter_): + cdef: + npy_float64 v, v_cum = 0.0 + + for v in iter_: + if v == v: # skip NA values + v_cum += v + + return v_cum + +@cython.boundscheck(False) +@cython.wraparound(False) 
+def agg_sum(iter_): + cdef: + npy_float64 v, v_cum = 0.0 + + for v in iter_: + v_cum += v + + return v_cum + +# --------------------------------------------------------------------------- +# Aggregation Section +@cython.boundscheck(False) +@cython.wraparound(False) +def groupsort_indexer(carray index, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, label, n, len_in_buffer + ndarray[int64_t] counts, where, np_result + # -- + carray c_result + chunk input_chunk, index_chunk + Py_ssize_t index_chunk_nr, index_chunk_len, leftover_elements + + ndarray[int64_t] in_buffer + + index_chunk_len = index.chunklen + in_buffer = np.empty(index_chunk_len, dtype='int64') + index_chunk_nr = 0 + + # count group sizes, location 0 for NA + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + + for in_buffer in bz.iterblocks(index): + len_in_buffer = len(in_buffer) + # loop through rows + for i in range(len_in_buffer): + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + where = np.zeros(ngroups + 1, dtype=np.int64) + for i from 1 <= i < ngroups + 1: + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + np_result = np.zeros(n, dtype=np.int64) + for i from 0 <= i < n: + label = index[i] + 1 + np_result[where[label]] = i + where[label] += 1 + + return np_result, counts + +cdef count_unique_float64(ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx + int ret = 0 + float64_t val + khiter_t k + npy_uint64 count = 0 + bint seen_na = 0 + kh_float64_t *table + + table = kh_init_float64() + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(table, val) + if k == table.n_buckets: + k = kh_put_float64(table, val, &ret) + count += 1 + elif not seen_na: + seen_na = 1 + count += 1 + + kh_destroy_float64(table) + + return count + +cdef count_unique_int64(ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx + int ret = 0 + int64_t val + 
khiter_t k + npy_uint64 count = 0 + kh_int64_t *table + + table = kh_init_int64() + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_int64(table, val) + if k == table.n_buckets: + k = kh_put_int64(table, val, &ret) + count += 1 + + kh_destroy_int64(table) + + return count + +cdef count_unique_int32(ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx + int ret = 0 + int32_t val + khiter_t k + npy_uint64 count = 0 + kh_int32_t *table + + table = kh_init_int32() + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_int32(table, val) + if k == table.n_buckets: + k = kh_put_int32(table, val, &ret) + count += 1 + + kh_destroy_int32(table) + + return count + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef sum_float64(carray ca_input, carray ca_factor, + Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, j, end_counts, start_counts + + ndarray[npy_float64] in_buffer + ndarray[npy_int64] factor_buffer + ndarray[npy_float64] out_buffer + ndarray[npy_float64] last_values + + npy_float64 v + bint count_distinct_started = 0 + carray num_uniques + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + if agg_method == _COUNT_DISTINCT: + num_uniques = carray([], dtype='int64') + positions, counts = groupsort_indexer(ca_factor, nr_groups) + start_counts = 0 + end_counts = 0 + for j in range(len(counts) - 1): + start_counts = end_counts + end_counts = start_counts + counts[j + 1] + positions[start_counts:end_counts] + num_uniques.append( + count_unique_float64(ca_input[positions[start_counts:end_counts]]) + ) + + return num_uniques + + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype='float64') + + for in_buffer in 
bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + # loop through rows + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + if factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + if agg_method == _SUM: + out_buffer[current_index] += in_buffer[i] + elif agg_method == _COUNT: + out_buffer[current_index] += 1 + elif agg_method == _COUNT_NA: + + v = in_buffer[i] + if v == v: # skip NA values + out_buffer[current_index] += 1 + elif agg_method == _SORTED_COUNT_DISTINCT: + v = in_buffer[i] + if not count_distinct_started: + count_distinct_started = 1 + last_values = np.zeros(nr_groups, dtype='float64') + last_values[0] = v + out_buffer[0] = 1 + else: + if v != last_values[current_index]: + out_buffer[current_index] += 1 + + last_values[current_index] = v + else: + raise NotImplementedError('sumtype not supported') + + # check whether a row has to be removed if it was meant to be skipped + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef sum_int32(carray ca_input, carray ca_factor, + Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, j, end_counts, start_counts + + ndarray[npy_int32] in_buffer + ndarray[npy_int64] factor_buffer + ndarray[npy_int32] out_buffer + ndarray[npy_int32] last_values + + npy_int32 v + bint count_distinct_started = 0 + carray num_uniques + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + if agg_method == _COUNT_DISTINCT: + num_uniques = carray([], dtype='int64') + positions, counts = 
groupsort_indexer(ca_factor, nr_groups) + start_counts = 0 + end_counts = 0 + for j in range(len(counts) - 1): + start_counts = end_counts + end_counts = start_counts + counts[j + 1] + positions[start_counts:end_counts] + num_uniques.append( + count_unique_int32(ca_input[positions[start_counts:end_counts]]) + ) + + return num_uniques + + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype='int32') + + for in_buffer in bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + # loop through rows + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + if factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + if agg_method == _SUM: + out_buffer[current_index] += in_buffer[i] + elif agg_method == _COUNT: + out_buffer[current_index] += 1 + elif agg_method == _COUNT_NA: + + # TODO: Warning: int does not support NA values, is this what we need? 
+ out_buffer[current_index] += 1 + elif agg_method == _SORTED_COUNT_DISTINCT: + v = in_buffer[i] + if not count_distinct_started: + count_distinct_started = 1 + last_values = np.zeros(nr_groups, dtype='int32') + last_values[0] = v + out_buffer[0] = 1 + else: + if v != last_values[current_index]: + out_buffer[current_index] += 1 + + last_values[current_index] = v + else: + raise NotImplementedError('sumtype not supported') + + # check whether a row has to be removed if it was meant to be skipped + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef sum_int64(carray ca_input, carray ca_factor, + Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, j, end_counts, start_counts + + ndarray[npy_int64] in_buffer + ndarray[npy_int64] factor_buffer + ndarray[npy_int64] out_buffer + ndarray[npy_int64] last_values + + npy_int64 v + bint count_distinct_started = 0 + carray num_uniques + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + if agg_method == _COUNT_DISTINCT: + num_uniques = carray([], dtype='int64') + positions, counts = groupsort_indexer(ca_factor, nr_groups) + start_counts = 0 + end_counts = 0 + for j in range(len(counts) - 1): + start_counts = end_counts + end_counts = start_counts + counts[j + 1] + positions[start_counts:end_counts] + num_uniques.append( + count_unique_int64(ca_input[positions[start_counts:end_counts]]) + ) + + return num_uniques + + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype='int64') + + for in_buffer in bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + # loop through rows + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + 
if factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + if agg_method == _SUM: + out_buffer[current_index] += in_buffer[i] + elif agg_method == _COUNT: + out_buffer[current_index] += 1 + elif agg_method == _COUNT_NA: + + # TODO: Warning: int does not support NA values, is this what we need? + out_buffer[current_index] += 1 + elif agg_method == _SORTED_COUNT_DISTINCT: + v = in_buffer[i] + if not count_distinct_started: + count_distinct_started = 1 + last_values = np.zeros(nr_groups, dtype='int64') + last_values[0] = v + out_buffer[0] = 1 + else: + if v != last_values[current_index]: + out_buffer[current_index] += 1 + + last_values[current_index] = v + else: + raise NotImplementedError('sumtype not supported') + + # check whether a row has to be removed if it was meant to be skipped + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key): + cdef: + Py_ssize_t in_buffer_len, factor_buffer_len + Py_ssize_t factor_chunk_nr, factor_chunk_row + Py_ssize_t current_index, i, factor_total_chunks + + ndarray in_buffer + ndarray[npy_int64] factor_buffer + ndarray out_buffer + + count = 0 + ret = 0 + reverse = {} + iter_ca_factor = bz.iterblocks(ca_factor) + + + factor_total_chunks = ca_factor.nchunks + factor_chunk_nr = 0 + factor_buffer = iter_ca_factor.next() + factor_buffer_len = len(factor_buffer) + factor_chunk_row = 0 + out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype) + + for in_buffer in bz.iterblocks(ca_input): + in_buffer_len = len(in_buffer) + + for i in range(in_buffer_len): + + # go to next factor buffer if necessary + if 
factor_chunk_row == factor_buffer_len: + factor_chunk_nr += 1 + factor_buffer = iter_ca_factor.next() + factor_chunk_row = 0 + + # retrieve index + current_index = factor_buffer[factor_chunk_row] + factor_chunk_row += 1 + + # update value if it's not an invalid index + if current_index != skip_key: + out_buffer[current_index] = in_buffer[i] + + # check whether a row has to be fixed + if skip_key < nr_groups: + np.delete(out_buffer, skip_key) + + return out_buffer + +def aggregate_groups_by_iter_2(ct_input, + ct_agg, + npy_uint64 nr_groups, + npy_uint64 skip_key, + carray factor_carray, + groupby_cols, + output_agg_ops, + dtype_list, + agg_method=_SUM + ): + total = [] + + for col in groupby_cols: + total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key)) + + for col, agg_op in output_agg_ops: + # TODO: input vs output column + col_dtype = ct_agg[col].dtype + if col_dtype == np.float64: + total.append( + sum_float64(ct_input[col], factor_carray, nr_groups, skip_key, + agg_method=agg_method) + ) + elif col_dtype == np.int64: + total.append( + sum_int64(ct_input[col], factor_carray, nr_groups, skip_key, + agg_method=agg_method) + ) + elif col_dtype == np.int32: + total.append( + sum_int32(ct_input[col], factor_carray, nr_groups, skip_key, + agg_method=agg_method) + ) + else: + raise NotImplementedError( + 'Column dtype ({0}) not supported for aggregation yet ' + '(only int32, int64 & float64)'.format(str(col_dtype))) + + ct_agg.append(total) + +# --------------------------------------------------------------------------- +# Temporary Section +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse): + """ + TEMPORARY WORKAROUND till numexpr support in list operations + + Update a boolean array with checks whether the values of a column (col) are in a set (value_set) + Reverse means "not in" functionality + + For the 0d array work around, see 
https://github.com/Blosc/bcolz/issues/61 + + :param col: + :param value_set: + :param boolarr: + :param reverse: + :return: + """ + cdef Py_ssize_t i + i = 0 + if not reverse: + for val in col.iter(): + if val not in value_set: + boolarr[i] = False + i += 1 + else: + for val in col.iter(): + if val in value_set: + boolarr[i] = False + i += 1 From a1092c60c52f45945565f93921758a24b7943f39 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 16 Mar 2015 15:16:54 +0100 Subject: [PATCH 8/8] iterblocks implementation with pandas khash (unix line endings) --- bquery/ctable_ext.pyx | 1980 ++++++++++++++++++++--------------------- 1 file changed, 990 insertions(+), 990 deletions(-) diff --git a/bquery/ctable_ext.pyx b/bquery/ctable_ext.pyx index a3b2ab9..cb3cc04 100644 --- a/bquery/ctable_ext.pyx +++ b/bquery/ctable_ext.pyx @@ -1,990 +1,990 @@ -cimport cython -from cython.parallel import parallel, prange, threadid -from openmp cimport * - -import numpy as np -from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, - npy_int64, npy_float64, uint64_t) - -from libc.stdint cimport uintptr_t -from libc.stdlib cimport malloc, calloc, free -from libc.string cimport strcpy, memcpy, memset -from libcpp.vector cimport vector - -from khash cimport * -import bcolz as bz -from bcolz.carray_ext cimport carray, chunks, chunk - -include "parallel_processing.pxi" - -# ---------------------------------------------------------------------------- -# GLOBAL DEFINITIONS -# ---------------------------------------------------------------------------- -SUM = 0 -DEF _SUM = 0 - -COUNT = 1 -DEF _COUNT = 1 - -COUNT_NA = 2 -DEF _COUNT_NA = 2 - -COUNT_DISTINCT = 3 -DEF _COUNT_DISTINCT = 3 - -SORTED_COUNT_DISTINCT = 4 -DEF _SORTED_COUNT_DISTINCT = 4 - - -# ---------------------------------------------------------------------------- -# FACTORIZE SECTION -# ---------------------------------------------------------------------------- -@cython.wraparound(False) -@cython.boundscheck(False) 
-cdef void _factorize_str_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - char * in_buffer_ptr, - uint64_t * out_buffer, - kh_str_t *table, - Py_ssize_t * count, - unsigned int thread_id, - omp_lock_t kh_locks[], - unsigned int num_threads, - ) nogil: - cdef: - Py_ssize_t i, idx - unsigned int j - khiter_t k - - char * element - char * insert - - int ret = 0 - Py_ssize_t itemsize = allocation_size - 1 - - # allocate enough memory to hold the string element, add one for the - # null byte that marks the end of the string. - # TODO: understand why zero-filling is necessary. Without zero-filling - # the buffer, duplicate keys occur in the reverse dict - element = calloc(allocation_size, sizeof(char)) - - # lock ensures that table is in consistend state during access - omp_set_lock(&kh_locks[thread_id]) - for i in xrange(iter_range): - # appends null character to element string - memcpy(element, in_buffer_ptr, itemsize) - in_buffer_ptr += itemsize - - k = kh_get_str(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - omp_unset_lock(&kh_locks[thread_id]) - insert = malloc(allocation_size) - # TODO: is strcpy really the best way to copy a string? 
- strcpy(insert, element) - - # acquire locks for all threads to avoid inconsistent state - for j in xrange(num_threads): - omp_set_lock(&kh_locks[j]) - # check whether another thread has already added the entry by now - k = kh_get_str(table, element) - if k != table.n_buckets: - idx = table.vals[k] - # if not, add element - else: - k = kh_put_str(table, insert, &ret) - table.vals[k] = idx = count[0] - count[0] += 1 - # release all locks - for j in xrange(num_threads): - omp_unset_lock(&kh_locks[j]) - # acquire our own lock again, to indicate we are reading the table - omp_set_lock(&kh_locks[thread_id]) - out_buffer[i] = idx - - omp_unset_lock(&kh_locks[thread_id]) - free(element) - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_str(carray carray_, unsigned int num_threads_requested = 0, **kwargs): - cdef: - Py_ssize_t i, blocklen, nblocks - khint_t j - carray labels - ndarray in_buffer - ndarray[npy_uint64] out_buffer - unsigned int thread_id - par_info_t par_info - - kh_str_t *table - char * out_buffer_org_ptr - char * in_buffer_ptr - uint64_t * out_buffer_ptr - - Py_ssize_t count = 0 - dict reverse = {} - - bint first_thread = True - Py_ssize_t element_allocation_size = carray_.dtype.itemsize + 1 - Py_ssize_t carray_chunklen = carray_.chunklen - - labels = create_labels_carray(carray_, **kwargs) - - out_buffer = np.empty(labels.chunklen, dtype='uint64') - out_buffer_org_ptr = out_buffer.data - - table = kh_init_str() - - block_iterator = bz.iterblocks(carray_, blen=labels.chunklen) - nblocks = (len(carray_)/labels.chunklen+0.5)+1 - - with nogil, parallel(num_threads=num_threads_requested): - # Initialise some parallel processing stuff - omp_start_critical() - if first_thread == True: - (&first_thread)[0] = False - with gil: - par_initialise(&par_info, carray_) - omp_end_critical() - - # allocate thread-local in- and out-buffers - with gil: - in_buffer_ptr = par_allocate_buffer(labels.chunklen, carray_.dtype.itemsize) - out_buffer_ptr = 
par_allocate_buffer(labels.chunklen, labels.dtype.itemsize) - - # factorise the chunks in parallel - for i in prange(0, nblocks, schedule='dynamic', chunksize=1): - thread_id = threadid() - - omp_set_lock(&par_info.chunk_lock) - with gil: - in_buffer = np.ascontiguousarray(block_iterator.next()) - blocklen = len(in_buffer) - memcpy(in_buffer_ptr, in_buffer.data, in_buffer.nbytes) - omp_unset_lock(&par_info.chunk_lock) - - _factorize_str_helper(blocklen, - element_allocation_size, - in_buffer_ptr, - out_buffer_ptr, - table, - &count, - thread_id, - par_info.kh_locks, - par_info.num_threads, - ) - - # compress out_buffer into labels - omp_set_lock(&par_info.out_buffer_lock) - with gil: - out_buffer.data = out_buffer_ptr - par_save_block(i, labels, out_buffer, blocklen, nblocks) - omp_unset_lock(&par_info.out_buffer_lock) - - # Clean-up thread local variables - free(in_buffer_ptr) - free(out_buffer_ptr) - - # Clean-up some parallel processing stuff - par_terminate(par_info) - - # restore out_buffer data pointer to allow python to free the object's data - out_buffer.data = out_buffer_org_ptr - - # construct python dict from vectors and free element memory - for j in range(table.n_buckets): - if not kh_exist_str(table, j): - continue - reverse[table.vals[j]] = table.keys[j] - free(table.keys[j]) - kh_destroy_str(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] - k = kh_get_int64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_int64(table, element, &ret) - table.vals[k] = idx 
= count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int64(carray carray_, carray labels=None): - cdef: - Py_ssize_t len_carray, count, chunklen, len_in_buffer - dict reverse - ndarray[npy_int64] in_buffer - ndarray[npy_uint64] out_buffer - kh_int64_t *table - - count = 0 - ret = 0 - reverse = {} - - len_carray = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=len_carray) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - table = kh_init_int64() - - for in_buffer in bz.iterblocks(carray_): - len_in_buffer = len(in_buffer) - _factorize_int64_helper(len_in_buffer, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer[:len_in_buffer].astype(np.int64)) - - kh_destroy_int64(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_int32_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_int32] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_int32_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_int32 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] - k = kh_get_int32(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_int32(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_int32(carray carray_, carray labels=None): - cdef: - Py_ssize_t len_carray, count, chunklen, len_in_buffer - dict reverse - ndarray[npy_int32] in_buffer - 
ndarray[npy_uint64] out_buffer - kh_int32_t *table - - count = 0 - ret = 0 - reverse = {} - - len_carray = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=len_carray) - # in-buffer isn't typed, because cython doesn't support string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - table = kh_init_int32() - - for in_buffer in bz.iterblocks(carray_): - len_in_buffer = len(in_buffer) - _factorize_int32_helper(len_in_buffer, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer[:len_in_buffer].astype(np.int64)) - - kh_destroy_int32(table) - - return labels, reverse - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef void _factorize_float64_helper(Py_ssize_t iter_range, - Py_ssize_t allocation_size, - ndarray[npy_float64] in_buffer, - ndarray[npy_uint64] out_buffer, - kh_float64_t *table, - Py_ssize_t * count, - dict reverse, - ): - cdef: - Py_ssize_t i, idx - int ret - npy_float64 element - khiter_t k - - ret = 0 - - for i in range(iter_range): - # TODO: Consider indexing directly into the array for efficiency - element = in_buffer[i] - k = kh_get_float64(table, element) - if k != table.n_buckets: - idx = table.vals[k] - else: - k = kh_put_float64(table, element, &ret) - table.vals[k] = idx = count[0] - reverse[count[0]] = element - count[0] += 1 - out_buffer[i] = idx - -@cython.wraparound(False) -@cython.boundscheck(False) -def factorize_float64(carray carray_, carray labels=None): - cdef: - Py_ssize_t len_carray, count, chunklen, len_in_buffer - dict reverse - ndarray[npy_float64] in_buffer - ndarray[npy_uint64] out_buffer - kh_float64_t *table - - count = 0 - ret = 0 - reverse = {} - - len_carray = len(carray_) - chunklen = carray_.chunklen - if labels is None: - labels = carray([], dtype='int64', expectedlen=len_carray) - # in-buffer isn't typed, because cython doesn't support 
string arrays (?) - out_buffer = np.empty(chunklen, dtype='uint64') - table = kh_init_float64() - - for in_buffer in bz.iterblocks(carray_): - len_in_buffer = len(in_buffer) - _factorize_float64_helper(len_in_buffer, - carray_.dtype.itemsize + 1, - in_buffer, - out_buffer, - table, - &count, - reverse, - ) - # compress out_buffer into labels - labels.append(out_buffer[:len_in_buffer].astype(np.int64)) - - kh_destroy_float64(table) - - return labels, reverse - -def factorize(carray carray_, **kwargs): - if carray_.dtype == 'int32': - labels, reverse = factorize_int32(carray_, **kwargs) - elif carray_.dtype == 'int64': - labels, reverse = factorize_int64(carray_, **kwargs) - elif carray_.dtype == 'float64': - labels, reverse = factorize_float64(carray_, **kwargs) - else: - #TODO: check that the input is a string_ dtype type - labels, reverse = factorize_str(carray_, **kwargs) - return labels, reverse - -# --------------------------------------------------------------------------- -# Translate existing arrays -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): - cdef: - Py_ssize_t chunklen, leftover_elements, len_in_buffer - ndarray[npy_int64] in_buffer - ndarray[npy_int64] out_buffer - - chunklen = input_.chunklen - out_buffer = np.empty(chunklen, dtype='int64') - - for in_buffer in bz.iterblocks(input_): - len_in_buffer = len(in_buffer) - for i in range(len_in_buffer): - element = in_buffer[i] - out_buffer[i] = lookup.get(element, default) - # compress out_buffer into labels - output_.append(out_buffer[:len_in_buffer].astype(np.int64)) - -# --------------------------------------------------------------------------- -# Aggregation Section (old) -@cython.boundscheck(False) -@cython.wraparound(False) -def agg_sum_na(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - if v == v: # skip NA values - v_cum += v - - return v_cum - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def agg_sum(iter_): - cdef: - npy_float64 v, v_cum = 0.0 - - for v in iter_: - v_cum += v - - return v_cum - -# --------------------------------------------------------------------------- -# Aggregation Section -@cython.boundscheck(False) -@cython.wraparound(False) -def groupsort_indexer(carray index, Py_ssize_t ngroups): - cdef: - Py_ssize_t i, label, n, len_in_buffer - ndarray[int64_t] counts, where, np_result - # -- - carray c_result - chunk input_chunk, index_chunk - Py_ssize_t index_chunk_nr, index_chunk_len, leftover_elements - - ndarray[int64_t] in_buffer - - index_chunk_len = index.chunklen - in_buffer = np.empty(index_chunk_len, dtype='int64') - index_chunk_nr = 0 - - # count group sizes, location 0 for NA - counts = np.zeros(ngroups + 1, dtype=np.int64) - n = len(index) - - for in_buffer in bz.iterblocks(index): - len_in_buffer = len(in_buffer) - # loop through rows - for i in range(len_in_buffer): - counts[index[i] + 1] += 1 - - # mark the start of each contiguous group of like-indexed data - where = np.zeros(ngroups + 1, dtype=np.int64) - for i from 1 <= i < ngroups + 1: - where[i] = where[i - 1] + counts[i - 1] - - # this is our indexer - np_result = np.zeros(n, dtype=np.int64) - for i from 0 <= i < n: - label = index[i] + 1 - np_result[where[label]] = i - where[label] += 1 - - return np_result, counts - -cdef count_unique_float64(ndarray[float64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - float64_t val - khiter_t k - npy_uint64 count = 0 - bint seen_na = 0 - kh_float64_t *table - - table = kh_init_float64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(table, val) - if k == table.n_buckets: - k = kh_put_float64(table, val, &ret) - count += 1 - elif not seen_na: - seen_na = 1 - count += 1 - - kh_destroy_float64(table) - - return count - -cdef count_unique_int64(ndarray[int64_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int 
ret = 0 - int64_t val - khiter_t k - npy_uint64 count = 0 - kh_int64_t *table - - table = kh_init_int64() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int64(table, val) - if k == table.n_buckets: - k = kh_put_int64(table, val, &ret) - count += 1 - - kh_destroy_int64(table) - - return count - -cdef count_unique_int32(ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - Py_ssize_t idx - int ret = 0 - int32_t val - khiter_t k - npy_uint64 count = 0 - kh_int32_t *table - - table = kh_init_int32() - - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_int32(table, val) - if k == table.n_buckets: - k = kh_put_int32(table, val, &ret) - count += 1 - - kh_destroy_int32(table) - - return count - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_float64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts - - ndarray[npy_float64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_float64] out_buffer - ndarray[npy_float64] last_values - - npy_float64 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_float64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, 
dtype='float64') - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - # loop through rows - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - - v = in_buffer[i] - if v == v: # skip NA values - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='float64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int32(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts - - ndarray[npy_int32] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int32] out_buffer - ndarray[npy_int32] last_values - - npy_int32 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') 
- positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int32(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int32') - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - # loop through rows - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - if factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - - # TODO: Warning: int does not support NA values, is this what we need? 
- out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int32') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef sum_int64(carray ca_input, carray ca_factor, - Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, j, end_counts, start_counts - - ndarray[npy_int64] in_buffer - ndarray[npy_int64] factor_buffer - ndarray[npy_int64] out_buffer - ndarray[npy_int64] last_values - - npy_int64 v - bint count_distinct_started = 0 - carray num_uniques - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - if agg_method == _COUNT_DISTINCT: - num_uniques = carray([], dtype='int64') - positions, counts = groupsort_indexer(ca_factor, nr_groups) - start_counts = 0 - end_counts = 0 - for j in range(len(counts) - 1): - start_counts = end_counts - end_counts = start_counts + counts[j + 1] - positions[start_counts:end_counts] - num_uniques.append( - count_unique_int64(ca_input[positions[start_counts:end_counts]]) - ) - - return num_uniques - - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype='int64') - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - # loop through rows - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - 
if factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - if agg_method == _SUM: - out_buffer[current_index] += in_buffer[i] - elif agg_method == _COUNT: - out_buffer[current_index] += 1 - elif agg_method == _COUNT_NA: - - # TODO: Warning: int does not support NA values, is this what we need? - out_buffer[current_index] += 1 - elif agg_method == _SORTED_COUNT_DISTINCT: - v = in_buffer[i] - if not count_distinct_started: - count_distinct_started = 1 - last_values = np.zeros(nr_groups, dtype='int64') - last_values[0] = v - out_buffer[0] = 1 - else: - if v != last_values[current_index]: - out_buffer[current_index] += 1 - - last_values[current_index] = v - else: - raise NotImplementedError('sumtype not supported') - - # check whether a row has to be removed if it was meant to be skipped - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key): - cdef: - Py_ssize_t in_buffer_len, factor_buffer_len - Py_ssize_t factor_chunk_nr, factor_chunk_row - Py_ssize_t current_index, i, factor_total_chunks - - ndarray in_buffer - ndarray[npy_int64] factor_buffer - ndarray out_buffer - - count = 0 - ret = 0 - reverse = {} - iter_ca_factor = bz.iterblocks(ca_factor) - - - factor_total_chunks = ca_factor.nchunks - factor_chunk_nr = 0 - factor_buffer = iter_ca_factor.next() - factor_buffer_len = len(factor_buffer) - factor_chunk_row = 0 - out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype) - - for in_buffer in bz.iterblocks(ca_input): - in_buffer_len = len(in_buffer) - - for i in range(in_buffer_len): - - # go to next factor buffer if necessary - if 
factor_chunk_row == factor_buffer_len: - factor_chunk_nr += 1 - factor_buffer = iter_ca_factor.next() - factor_chunk_row = 0 - - # retrieve index - current_index = factor_buffer[factor_chunk_row] - factor_chunk_row += 1 - - # update value if it's not an invalid index - if current_index != skip_key: - out_buffer[current_index] = in_buffer[i] - - # check whether a row has to be fixed - if skip_key < nr_groups: - np.delete(out_buffer, skip_key) - - return out_buffer - -def aggregate_groups_by_iter_2(ct_input, - ct_agg, - npy_uint64 nr_groups, - npy_uint64 skip_key, - carray factor_carray, - groupby_cols, - output_agg_ops, - dtype_list, - agg_method=_SUM - ): - total = [] - - for col in groupby_cols: - total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key)) - - for col, agg_op in output_agg_ops: - # TODO: input vs output column - col_dtype = ct_agg[col].dtype - if col_dtype == np.float64: - total.append( - sum_float64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == np.int64: - total.append( - sum_int64(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - elif col_dtype == np.int32: - total.append( - sum_int32(ct_input[col], factor_carray, nr_groups, skip_key, - agg_method=agg_method) - ) - else: - raise NotImplementedError( - 'Column dtype ({0}) not supported for aggregation yet ' - '(only int32, int64 & float64)'.format(str(col_dtype))) - - ct_agg.append(total) - -# --------------------------------------------------------------------------- -# Temporary Section -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse): - """ - TEMPORARY WORKAROUND till numexpr support in list operations - - Update a boolean array with checks whether the values of a column (col) are in a set (value_set) - Reverse means "not in" functionality - - For the 0d array work around, see 
https://github.com/Blosc/bcolz/issues/61 - - :param col: - :param value_set: - :param boolarr: - :param reverse: - :return: - """ - cdef Py_ssize_t i - i = 0 - if not reverse: - for val in col.iter(): - if val not in value_set: - boolarr[i] = False - i += 1 - else: - for val in col.iter(): - if val in value_set: - boolarr[i] = False - i += 1 +cimport cython +from cython.parallel import parallel, prange, threadid +from openmp cimport * + +import numpy as np +from numpy cimport (ndarray, dtype, npy_intp, npy_uint8, npy_int32, npy_uint64, + npy_int64, npy_float64, uint64_t) + +from libc.stdint cimport uintptr_t +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport strcpy, memcpy, memset +from libcpp.vector cimport vector + +from khash cimport * +import bcolz as bz +from bcolz.carray_ext cimport carray, chunks, chunk + +include "parallel_processing.pxi" + +# ---------------------------------------------------------------------------- +# GLOBAL DEFINITIONS +# ---------------------------------------------------------------------------- +SUM = 0 +DEF _SUM = 0 + +COUNT = 1 +DEF _COUNT = 1 + +COUNT_NA = 2 +DEF _COUNT_NA = 2 + +COUNT_DISTINCT = 3 +DEF _COUNT_DISTINCT = 3 + +SORTED_COUNT_DISTINCT = 4 +DEF _SORTED_COUNT_DISTINCT = 4 + + +# ---------------------------------------------------------------------------- +# FACTORIZE SECTION +# ---------------------------------------------------------------------------- +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_str_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + char * in_buffer_ptr, + uint64_t * out_buffer, + kh_str_t *table, + Py_ssize_t * count, + unsigned int thread_id, + omp_lock_t kh_locks[], + unsigned int num_threads, + ) nogil: + cdef: + Py_ssize_t i, idx + unsigned int j + khiter_t k + + char * element + char * insert + + int ret = 0 + Py_ssize_t itemsize = allocation_size - 1 + + # allocate enough memory to hold the string element, add one for the + # 
null byte that marks the end of the string. + # TODO: understand why zero-filling is necessary. Without zero-filling + # the buffer, duplicate keys occur in the reverse dict + element = calloc(allocation_size, sizeof(char)) + + # lock ensures that table is in consistend state during access + omp_set_lock(&kh_locks[thread_id]) + for i in xrange(iter_range): + # appends null character to element string + memcpy(element, in_buffer_ptr, itemsize) + in_buffer_ptr += itemsize + + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + omp_unset_lock(&kh_locks[thread_id]) + insert = malloc(allocation_size) + # TODO: is strcpy really the best way to copy a string? + strcpy(insert, element) + + # acquire locks for all threads to avoid inconsistent state + for j in xrange(num_threads): + omp_set_lock(&kh_locks[j]) + # check whether another thread has already added the entry by now + k = kh_get_str(table, element) + if k != table.n_buckets: + idx = table.vals[k] + # if not, add element + else: + k = kh_put_str(table, insert, &ret) + table.vals[k] = idx = count[0] + count[0] += 1 + # release all locks + for j in xrange(num_threads): + omp_unset_lock(&kh_locks[j]) + # acquire our own lock again, to indicate we are reading the table + omp_set_lock(&kh_locks[thread_id]) + out_buffer[i] = idx + + omp_unset_lock(&kh_locks[thread_id]) + free(element) + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_str(carray carray_, unsigned int num_threads_requested = 0, **kwargs): + cdef: + Py_ssize_t i, blocklen, nblocks + khint_t j + carray labels + ndarray in_buffer + ndarray[npy_uint64] out_buffer + unsigned int thread_id + par_info_t par_info + + kh_str_t *table + char * out_buffer_org_ptr + char * in_buffer_ptr + uint64_t * out_buffer_ptr + + Py_ssize_t count = 0 + dict reverse = {} + + bint first_thread = True + Py_ssize_t element_allocation_size = carray_.dtype.itemsize + 1 + Py_ssize_t carray_chunklen = carray_.chunklen + + labels 
= create_labels_carray(carray_, **kwargs) + + out_buffer = np.empty(labels.chunklen, dtype='uint64') + out_buffer_org_ptr = out_buffer.data + + table = kh_init_str() + + block_iterator = bz.iterblocks(carray_, blen=labels.chunklen) + nblocks = (len(carray_)/labels.chunklen+0.5)+1 + + with nogil, parallel(num_threads=num_threads_requested): + # Initialise some parallel processing stuff + omp_start_critical() + if first_thread == True: + (&first_thread)[0] = False + with gil: + par_initialise(&par_info, carray_) + omp_end_critical() + + # allocate thread-local in- and out-buffers + with gil: + in_buffer_ptr = par_allocate_buffer(labels.chunklen, carray_.dtype.itemsize) + out_buffer_ptr = par_allocate_buffer(labels.chunklen, labels.dtype.itemsize) + + # factorise the chunks in parallel + for i in prange(0, nblocks, schedule='dynamic', chunksize=1): + thread_id = threadid() + + omp_set_lock(&par_info.chunk_lock) + with gil: + in_buffer = np.ascontiguousarray(block_iterator.next()) + blocklen = len(in_buffer) + memcpy(in_buffer_ptr, in_buffer.data, in_buffer.nbytes) + omp_unset_lock(&par_info.chunk_lock) + + _factorize_str_helper(blocklen, + element_allocation_size, + in_buffer_ptr, + out_buffer_ptr, + table, + &count, + thread_id, + par_info.kh_locks, + par_info.num_threads, + ) + + # compress out_buffer into labels + omp_set_lock(&par_info.out_buffer_lock) + with gil: + out_buffer.data = out_buffer_ptr + par_save_block(i, labels, out_buffer, blocklen, nblocks) + omp_unset_lock(&par_info.out_buffer_lock) + + # Clean-up thread local variables + free(in_buffer_ptr) + free(out_buffer_ptr) + + # Clean-up some parallel processing stuff + par_terminate(par_info) + + # restore out_buffer data pointer to allow python to free the object's data + out_buffer.data = out_buffer_org_ptr + + # construct python dict from vectors and free element memory + for j in range(table.n_buckets): + if not kh_exist_str(table, j): + continue + reverse[table.vals[j]] = table.keys[j] + 
free(table.keys[j]) + kh_destroy_str(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int64(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int64] in_buffer + ndarray[npy_uint64] out_buffer + kh_int64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int64(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_int32_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_int32] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_int32_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_int32 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_int32(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_int32(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_int32(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_int32] in_buffer + ndarray[npy_uint64] out_buffer + kh_int32_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_int32() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_int32_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_int32(table) + + return labels, reverse + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _factorize_float64_helper(Py_ssize_t iter_range, + Py_ssize_t allocation_size, + ndarray[npy_float64] in_buffer, + ndarray[npy_uint64] out_buffer, + kh_float64_t *table, + Py_ssize_t * count, + dict reverse, + ): + cdef: + Py_ssize_t i, idx + int ret + npy_float64 element + khiter_t k + + ret = 0 + + for i in range(iter_range): + # TODO: Consider indexing directly into the array for efficiency + element = in_buffer[i] + k = kh_get_float64(table, element) + if k != table.n_buckets: + idx = table.vals[k] + else: + k = kh_put_float64(table, element, &ret) + table.vals[k] = idx = count[0] + reverse[count[0]] = element + count[0] += 1 + out_buffer[i] = idx + +@cython.wraparound(False) +@cython.boundscheck(False) +def factorize_float64(carray carray_, carray labels=None): + cdef: + Py_ssize_t len_carray, count, chunklen, len_in_buffer + dict reverse + ndarray[npy_float64] in_buffer + ndarray[npy_uint64] out_buffer + kh_float64_t *table + + count = 0 + ret = 0 + reverse = {} + + len_carray = len(carray_) + chunklen = carray_.chunklen + if labels is None: + labels = carray([], dtype='int64', expectedlen=len_carray) + # in-buffer isn't typed, because cython doesn't support string arrays (?) 
+ out_buffer = np.empty(chunklen, dtype='uint64') + table = kh_init_float64() + + for in_buffer in bz.iterblocks(carray_): + len_in_buffer = len(in_buffer) + _factorize_float64_helper(len_in_buffer, + carray_.dtype.itemsize + 1, + in_buffer, + out_buffer, + table, + &count, + reverse, + ) + # compress out_buffer into labels + labels.append(out_buffer[:len_in_buffer].astype(np.int64)) + + kh_destroy_float64(table) + + return labels, reverse + +def factorize(carray carray_, **kwargs): + if carray_.dtype == 'int32': + labels, reverse = factorize_int32(carray_, **kwargs) + elif carray_.dtype == 'int64': + labels, reverse = factorize_int64(carray_, **kwargs) + elif carray_.dtype == 'float64': + labels, reverse = factorize_float64(carray_, **kwargs) + else: + #TODO: check that the input is a string_ dtype type + labels, reverse = factorize_str(carray_, **kwargs) + return labels, reverse + +# --------------------------------------------------------------------------- +# Translate existing arrays +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef translate_int64(carray input_, carray output_, dict lookup, npy_int64 default=-1): + cdef: + Py_ssize_t chunklen, leftover_elements, len_in_buffer + ndarray[npy_int64] in_buffer + ndarray[npy_int64] out_buffer + + chunklen = input_.chunklen + out_buffer = np.empty(chunklen, dtype='int64') + + for in_buffer in bz.iterblocks(input_): + len_in_buffer = len(in_buffer) + for i in range(len_in_buffer): + element = in_buffer[i] + out_buffer[i] = lookup.get(element, default) + # compress out_buffer into labels + output_.append(out_buffer[:len_in_buffer].astype(np.int64)) + +# --------------------------------------------------------------------------- +# Aggregation Section (old) +@cython.boundscheck(False) +@cython.wraparound(False) +def agg_sum_na(iter_): + cdef: + npy_float64 v, v_cum = 0.0 + + for v in iter_: + if v == v: # skip NA values + v_cum += v + + return v_cum + +@cython.boundscheck(False) +@cython.wraparound(False) 
def agg_sum(iter_):
    """Sum an iterable of float64 (NaN values propagate)."""
    cdef:
        npy_float64 v, v_cum = 0.0

    for v in iter_:
        v_cum += v

    return v_cum

# ---------------------------------------------------------------------------
# Aggregation Section
@cython.boundscheck(False)
@cython.wraparound(False)
def groupsort_indexer(carray index, Py_ssize_t ngroups):
    """Return (indexer, counts) that group-sort `index`.

    counts[g+1] is the size of group g (slot 0 is reserved for NA);
    indexer lists the row positions so that rows of each group are
    contiguous and in original order.
    """
    cdef:
        Py_ssize_t i, label, n, len_in_buffer
        ndarray[int64_t] counts, where, np_result
        ndarray[int64_t] in_buffer

    n = len(index)

    # count group sizes, location 0 for NA
    counts = np.zeros(ngroups + 1, dtype=np.int64)
    for in_buffer in bz.iterblocks(index):
        len_in_buffer = len(in_buffer)
        for i in range(len_in_buffer):
            # FIX: was `counts[index[i] + 1]`: `i` is block-local, so indexing
            # the whole carray re-counted its first elements for every block
            counts[in_buffer[i] + 1] += 1

    # mark the start of each contiguous group of like-indexed data
    where = np.zeros(ngroups + 1, dtype=np.int64)
    for i in range(1, ngroups + 1):
        where[i] = where[i - 1] + counts[i - 1]

    # this is our indexer
    # NOTE(review): index[i] is a per-row carray access (slow); consider
    # iterating blocks here too
    np_result = np.zeros(n, dtype=np.int64)
    for i in range(n):
        label = index[i] + 1
        np_result[where[label]] = i
        where[label] += 1

    return np_result, counts

cdef count_unique_float64(ndarray[float64_t] values):
    """Count distinct float64 values; all NaNs together count as one."""
    cdef:
        Py_ssize_t i, n = len(values)
        int ret = 0
        float64_t val
        khiter_t k
        npy_uint64 count = 0
        bint seen_na = 0
        kh_float64_t *table

    table = kh_init_float64()

    for i in range(n):
        val = values[i]

        if val == val:
            k = kh_get_float64(table, val)
            if k == table.n_buckets:
                k = kh_put_float64(table, val, &ret)
                count += 1
        elif not seen_na:
            seen_na = 1
            count += 1

    kh_destroy_float64(table)

    return count

cdef count_unique_int64(ndarray[int64_t] values):
    """Count distinct int64 values (integers have no NA representation)."""
    cdef:
        Py_ssize_t i, n = len(values)
        int ret = 0
        int64_t val
        khiter_t k
        npy_uint64 count = 0
        kh_int64_t *table

    table = kh_init_int64()

    for i in range(n):
        val = values[i]

        # val == val is always true for integers; kept for symmetry with the
        # float64 variant
        if val == val:
            k = kh_get_int64(table, val)
            if k == table.n_buckets:
                k = kh_put_int64(table, val, &ret)
                count += 1

    kh_destroy_int64(table)

    return count

cdef count_unique_int32(ndarray[int32_t] values):
    """Count distinct int32 values (integers have no NA representation)."""
    cdef:
        Py_ssize_t i, n = len(values)
        int ret = 0
        int32_t val
        khiter_t k
        npy_uint64 count = 0
        kh_int32_t *table

    table = kh_init_int32()

    for i in range(n):
        val = values[i]

        # val == val is always true for integers; kept for symmetry with the
        # float64 variant
        if val == val:
            k = kh_get_int32(table, val)
            if k == table.n_buckets:
                k = kh_put_int32(table, val, &ret)
                count += 1

    kh_destroy_int32(table)

    return count

@cython.wraparound(False)
@cython.boundscheck(False)
cdef sum_float64(carray ca_input, carray ca_factor,
                 Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM):
    """Aggregate a float64 column per group given the factor codes.

    ca_factor holds the group index of each row; skip_key marks rows to
    ignore.  Returns a per-group array (or a carray of distinct counts for
    COUNT_DISTINCT).
    """
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i, j, end_counts, start_counts

        ndarray[npy_float64] in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray[npy_float64] out_buffer
        ndarray[npy_float64] last_values

        npy_float64 v
        bint count_distinct_started = 0
        carray num_uniques

    if agg_method == _COUNT_DISTINCT:
        num_uniques = carray([], dtype='int64')
        positions, counts = groupsort_indexer(ca_factor, nr_groups)
        start_counts = 0
        end_counts = 0
        for j in range(len(counts) - 1):
            start_counts = end_counts
            end_counts = start_counts + counts[j + 1]
            num_uniques.append(
                count_unique_float64(ca_input[positions[start_counts:end_counts]])
            )

        return num_uniques

    iter_ca_factor = bz.iterblocks(ca_factor)
    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype='float64')

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        # loop through rows
        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                if agg_method == _SUM:
                    out_buffer[current_index] += in_buffer[i]
                elif agg_method == _COUNT:
                    out_buffer[current_index] += 1
                elif agg_method == _COUNT_NA:
                    v = in_buffer[i]
                    if v == v:  # skip NA values
                        out_buffer[current_index] += 1
                elif agg_method == _SORTED_COUNT_DISTINCT:
                    v = in_buffer[i]
                    if not count_distinct_started:
                        count_distinct_started = 1
                        last_values = np.zeros(nr_groups, dtype='float64')
                        last_values[0] = v
                        out_buffer[0] = 1
                    else:
                        if v != last_values[current_index]:
                            out_buffer[current_index] += 1
                    last_values[current_index] = v
                else:
                    raise NotImplementedError('sumtype not supported')

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved because callers may rely
    # on the returned length being nr_groups — confirm intent before fixing.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

@cython.wraparound(False)
@cython.boundscheck(False)
cdef sum_int32(carray ca_input, carray ca_factor,
               Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM):
    """Aggregate an int32 column per group; see sum_float64 for semantics."""
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i, j, end_counts, start_counts

        ndarray[npy_int32] in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray[npy_int32] out_buffer
        ndarray[npy_int32] last_values

        npy_int32 v
        bint count_distinct_started = 0
        carray num_uniques

    if agg_method == _COUNT_DISTINCT:
        num_uniques = carray([], dtype='int64')
        positions, counts = groupsort_indexer(ca_factor, nr_groups)
        start_counts = 0
        end_counts = 0
        for j in range(len(counts) - 1):
            start_counts = end_counts
            end_counts = start_counts + counts[j + 1]
            num_uniques.append(
                count_unique_int32(ca_input[positions[start_counts:end_counts]])
            )

        return num_uniques

    iter_ca_factor = bz.iterblocks(ca_factor)
    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype='int32')

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        # loop through rows
        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                if agg_method == _SUM:
                    out_buffer[current_index] += in_buffer[i]
                elif agg_method == _COUNT:
                    out_buffer[current_index] += 1
                elif agg_method == _COUNT_NA:
                    # TODO: Warning: int does not support NA values, is this
                    # what we need?
                    out_buffer[current_index] += 1
                elif agg_method == _SORTED_COUNT_DISTINCT:
                    v = in_buffer[i]
                    if not count_distinct_started:
                        count_distinct_started = 1
                        last_values = np.zeros(nr_groups, dtype='int32')
                        last_values[0] = v
                        out_buffer[0] = 1
                    else:
                        if v != last_values[current_index]:
                            out_buffer[current_index] += 1
                    last_values[current_index] = v
                else:
                    raise NotImplementedError('sumtype not supported')

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved because callers may rely
    # on the returned length being nr_groups — confirm intent before fixing.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

@cython.wraparound(False)
@cython.boundscheck(False)
cdef sum_int64(carray ca_input, carray ca_factor,
               Py_ssize_t nr_groups, Py_ssize_t skip_key, agg_method=_SUM):
    """Aggregate an int64 column per group; see sum_float64 for semantics."""
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i, j, end_counts, start_counts

        ndarray[npy_int64] in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray[npy_int64] out_buffer
        ndarray[npy_int64] last_values

        npy_int64 v
        bint count_distinct_started = 0
        carray num_uniques

    if agg_method == _COUNT_DISTINCT:
        num_uniques = carray([], dtype='int64')
        positions, counts = groupsort_indexer(ca_factor, nr_groups)
        start_counts = 0
        end_counts = 0
        for j in range(len(counts) - 1):
            start_counts = end_counts
            end_counts = start_counts + counts[j + 1]
            num_uniques.append(
                count_unique_int64(ca_input[positions[start_counts:end_counts]])
            )

        return num_uniques

    iter_ca_factor = bz.iterblocks(ca_factor)
    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype='int64')

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        # loop through rows
        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                if agg_method == _SUM:
                    out_buffer[current_index] += in_buffer[i]
                elif agg_method == _COUNT:
                    out_buffer[current_index] += 1
                elif agg_method == _COUNT_NA:
                    # TODO: Warning: int does not support NA values, is this
                    # what we need?
                    out_buffer[current_index] += 1
                elif agg_method == _SORTED_COUNT_DISTINCT:
                    v = in_buffer[i]
                    if not count_distinct_started:
                        count_distinct_started = 1
                        last_values = np.zeros(nr_groups, dtype='int64')
                        last_values[0] = v
                        out_buffer[0] = 1
                    else:
                        if v != last_values[current_index]:
                            out_buffer[current_index] += 1
                    last_values[current_index] = v
                else:
                    raise NotImplementedError('sumtype not supported')

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved because callers may rely
    # on the returned length being nr_groups — confirm intent before fixing.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

@cython.wraparound(False)
@cython.boundscheck(False)
cdef groupby_value(carray ca_input, carray ca_factor, Py_ssize_t nr_groups, Py_ssize_t skip_key):
    """Return one representative value per group (the last seen per group)."""
    cdef:
        Py_ssize_t in_buffer_len, factor_buffer_len
        Py_ssize_t factor_chunk_nr, factor_chunk_row
        Py_ssize_t current_index, i

        ndarray in_buffer
        ndarray[npy_int64] factor_buffer
        ndarray out_buffer

    iter_ca_factor = bz.iterblocks(ca_factor)

    factor_chunk_nr = 0
    factor_buffer = iter_ca_factor.next()
    factor_buffer_len = len(factor_buffer)
    factor_chunk_row = 0
    out_buffer = np.zeros(nr_groups, dtype=ca_input.dtype)

    for in_buffer in bz.iterblocks(ca_input):
        in_buffer_len = len(in_buffer)

        for i in range(in_buffer_len):

            # go to next factor buffer if necessary
            if factor_chunk_row == factor_buffer_len:
                factor_chunk_nr += 1
                factor_buffer = iter_ca_factor.next()
                factor_chunk_row = 0

            # retrieve group index
            current_index = factor_buffer[factor_chunk_row]
            factor_chunk_row += 1

            # update value if it's not an invalid index
            if current_index != skip_key:
                out_buffer[current_index] = in_buffer[i]

    # NOTE(review): np.delete returns a new array; as written this is a no-op
    # and the skip_key row is NOT removed.  Preserved as-is — confirm intent.
    if skip_key < nr_groups:
        np.delete(out_buffer, skip_key)

    return out_buffer

def aggregate_groups_by_iter_2(ct_input,
                               ct_agg,
                               npy_uint64 nr_groups,
                               npy_uint64 skip_key,
                               carray factor_carray,
                               groupby_cols,
                               output_agg_ops,
                               dtype_list,
                               agg_method=_SUM
                               ):
    """Aggregate ct_input into ct_agg: group-key columns via groupby_value,
    value columns via the dtype-matched sum_* kernel."""
    total = []

    for col in groupby_cols:
        total.append(groupby_value(ct_input[col], factor_carray, nr_groups, skip_key))

    for col, agg_op in output_agg_ops:
        # TODO: input vs output column
        col_dtype = ct_agg[col].dtype
        if col_dtype == np.float64:
            total.append(
                sum_float64(ct_input[col], factor_carray, nr_groups, skip_key,
                            agg_method=agg_method)
            )
        elif col_dtype == np.int64:
            total.append(
                sum_int64(ct_input[col], factor_carray, nr_groups, skip_key,
                          agg_method=agg_method)
            )
        elif col_dtype == np.int32:
            total.append(
                sum_int32(ct_input[col], factor_carray, nr_groups, skip_key,
                          agg_method=agg_method)
            )
        else:
            raise NotImplementedError(
                'Column dtype ({0}) not supported for aggregation yet '
                '(only int32, int64 & float64)'.format(str(col_dtype)))

    ct_agg.append(total)

# ---------------------------------------------------------------------------
# Temporary Section
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef carray_is_in(carray col, set value_set, ndarray boolarr, bint reverse):
    """
    TEMPORARY WORKAROUND till numexpr supports "in"-list operations.

    Update a boolean array in place: clear entries whose column value is not
    in value_set (reverse=False) or is in value_set (reverse=True, i.e.
    "not in" semantics).

    For the 0d array work around, see https://github.com/Blosc/bcolz/issues/61

    :param col: carray column to test
    :param value_set: set of accepted (or rejected, if reverse) values
    :param boolarr: boolean ndarray, modified in place
    :param reverse: True for "not in" semantics
    :return:
    """
    cdef Py_ssize_t i
    i = 0
    if not reverse:
        for val in col.iter():
            if val not in value_set:
                boolarr[i] = False
            i += 1
    else:
        for val in col.iter():
            if val in value_set:
                boolarr[i] = False
            i += 1