From bbc6fb120e58ee8d551c64afc7044623d7c81fba Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Wed, 5 Sep 2018 19:02:23 -0300 Subject: [PATCH 01/10] Makefile: add proper dependencies to targets, and use $(CFLAGS) make test will now rebuild the library and/or test_multithread.c if needed Remove the @ so we can see which output goes with which test or build-command --- Makefile | 60 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 472a6d4..3cdff28 100644 --- a/Makefile +++ b/Makefile @@ -1,34 +1,36 @@ CC=gcc +CFLAGS=-std=gnu99 -O3 -Wall -Wextra -g +LDFLAGS=-g +LOADLIBS=-lpthread +all : bin/test_p1c1 bin/test_p4c4 bin/test_p100c10 bin/test_p10c100 bin/example + +bin/example: example.c liblfq.so.1.0.0 + gcc $(CFLAGS) $(LDFLAGS) example.c lfq.c -o bin/example -lpthread + +bin/test_p1c1: liblfq.so.1.0.0 test_multithread.c + gcc $(CFLAGS) $(LDFLAGS) test_multithread.c -o bin/test_p1c1 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=1 -D MAX_CONSUMER=1 + +bin/test_p4c4: liblfq.so.1.0.0 test_multithread.c + gcc $(CFLAGS) $(LDFLAGS) test_multithread.c -o bin/test_p4c4 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=4 -D MAX_CONSUMER=4 + +bin/test_p100c10: liblfq.so.1.0.0 test_multithread.c + gcc $(CFLAGS) $(LDFLAGS) test_multithread.c -o bin/test_p100c10 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=100 -D MAX_CONSUMER=10 + +bin/test_p10c100: liblfq.so.1.0.0 test_multithread.c + gcc $(CFLAGS) $(LDFLAGS) test_multithread.c -o bin/test_p10c100 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=10 -D MAX_CONSUMER=100 + +liblfq.so.1.0.0: lfq.c lfq.h cross-platform.h + gcc $(CFLAGS) -c lfq.c # -fno-pie for static linking? + ar rcs liblfq.a lfq.o + gcc $(CFLAGS) -fPIC -c lfq.c + gcc $(LDFLAGS) -shared -o liblfq.so.1.0.0 lfq.o + +test: bin/test_p1c1 bin/test_p4c4 bin/test_p100c10 bin/test_p10c100 + bin/test_p1c1 + bin/test_p4c4 + bin/test_p100c10 + bin/test_p10c100 -all : p1c1 p4c4 p100c10 p10c100 example - -example: liblfq - gcc -std=c99 -g example.c lfq.c -o bin/example -lpthread - -p1c1: liblfq - gcc -std=c99 -g test_multithread.c -o bin/test_p1c1 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=1 -D MAX_CONSUMER=1 - -p4c4: liblfq - gcc -std=c99 -g test_multithread.c -o bin/test_p4c4 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=4 -D MAX_CONSUMER=4 - -p100c10: liblfq - gcc -std=c99 -g test_multithread.c -o bin/test_p100c10 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=100 -D MAX_CONSUMER=10 - -p10c100: liblfq - gcc -std=c99 -g test_multithread.c -o bin/test_p10c100 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=10 -D MAX_CONSUMER=100 - -liblfq: lfq.c - @gcc -std=c99 -c -O3 lfq.c -lpthread - @ar rcs liblfq.a lfq.o - @gcc -std=c99 -fPIC -c lfq.c - @gcc -shared -o liblfq.so.1.0.0 lfq.o - -test: - @bin/test_p1c1 - @bin/test_p4c4 - @bin/test_p100c10 - @bin/test_p10c100 - clean: rm -rf *.o bin/* liblfq.so.1.0.0 liblfq.a From cae8121f0b47dd7fba5929e647e227070a4a763f Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Wed, 5 Sep 2018 19:04:40 -0300 Subject: [PATCH 02/10] cross-platform.h: fix x86 definitions of load/store memory barriers, add XCHG An actual lfence/sfence instruction isn't needed for acquire semantics, just a barrier against compile-time reordering. cygwin and mingw already imply __GNUC__. --- cross-platform.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cross-platform.h b/cross-platform.h index be5de23..85ca47d 100644 --- a/cross-platform.h +++ b/cross-platform.h @@ -38,15 +38,20 @@ #define ATOMIC_SET __sync_lock_test_and_set #define ATOMIC_RELEASE __sync_lock_release -#if defined __GNUC__ || defined __CYGWIN__ || defined __MINGW32__ +#if defined __GNUC__ #define ATOMIC_SUB __sync_sub_and_fetch #define ATOMIC_SUB64 ATOMIC_SUB #define CAS __sync_bool_compare_and_swap +#define XCHG __sync_lock_test_and_set // yes really. The 2nd arg is limited to 1 on machines with TAS but not XCHG. On x86 it's an arbitrary value #define ATOMIC_ADD __sync_add_and_fetch #define ATOMIC_ADD64 ATOMIC_ADD #define mb __sync_synchronize - #define lmb() asm volatile( "lfence" ) - #define smb() asm volatile( "sfence" ) +#if defined(__x86_64__) || defined(__i386) +// #define lmb() asm volatile( "lfence" ) +// #define smb() asm volatile( "sfence" ) + #define lmb() asm volatile("":::"memory") // compiler barrier only. runtime reordering already impossible on x86 + #define smb() asm volatile("":::"memory") +#endif // else no definition // thread #include From d452ad91a00b69e42504efb95b66c22397cfcb1a Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Wed, 5 Sep 2018 19:08:54 -0300 Subject: [PATCH 03/10] tests: make sure we empty the queue, and keep counters thread-private until end of test. Only check consumer exit condition when dequeue fails, so we don't need ctx->count to be updated. This allows us to drop that point of contention between readers and writers. Using a local counter and only doing an atomic add to the shared variable at the end also reduces contention between threads in the test harness itself. --- test_multithread.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/test_multithread.c b/test_multithread.c index 68537b8..b8bbe30 100644 --- a/test_multithread.c +++ b/test_multithread.c @@ -38,19 +38,19 @@ struct user_data{ THREAD_FN addq( void * data ) { struct lfq_ctx * ctx = data; - struct user_data * p=(struct user_data *)0xff; - long i; int ret = 0; - for ( i = 0 ; i < 500000 ; i++) { - p = malloc(sizeof(struct user_data)); + long added; + for (added = 0 ; added < 500000 ; added++) { + struct user_data * p = malloc(sizeof(struct user_data)); p->data=SOMEID; if ( ( ret = lfq_enqueue(ctx,p) ) != 0 ) { printf("lfq_enqueue failed, reason:%s\n", strerror(-ret)); + ATOMIC_ADD64(&cn_added, added); ATOMIC_SUB(&cn_producer, 1); return 0; } - ATOMIC_ADD64(&cn_added, 1); } + ATOMIC_ADD64(&cn_added, added); ATOMIC_SUB(&cn_producer, 1); printf("Producer thread [%lu] exited! Still %d running...\n",THREAD_ID(), cn_producer); return 0; @@ -60,21 +60,28 @@ THREAD_FN delq(void * data) { struct lfq_ctx * ctx = data; struct user_data * p; int tid = ATOMIC_ADD(&cn_t, 1); - - while(ctx->count || cn_producer) { + + long deleted = 0; + while(1) { p = lfq_dequeue_tid(ctx, tid); if (p) { if (p->data!=SOMEID){ printf("data wrong!!\n"); exit(1); } - + free(p); - ATOMIC_ADD64(&cn_deled, 1); - } else - THREAD_YIELD(); // queue is empty, release CPU slice + deleted++; + } else { + if (ctx->count || cn_producer) + THREAD_YIELD(); // queue is empty, release CPU slice + else + break; // queue is empty and no more producers + } } + ATOMIC_ADD64(&cn_deled, deleted); + // p = lfq_dequeue_tid(ctx, tid); printf("Consumer thread [%lu] exited %d\n",THREAD_ID(),cn_producer); return 0; From 58da8e8ae9c895c15e13b4cd50d7f12132be2917 Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Wed, 5 Sep 2018 19:10:43 -0300 Subject: [PATCH 04/10] lfq.h: separate head and tail pointers by at least a cache line reduces contention between readers and writers, which both have to CAS or XCHG their respective pointers. --- lfq.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lfq.h b/lfq.h index 8d3fe68..db74d7e 100755 --- a/lfq.h +++ b/lfq.h @@ -2,6 +2,8 @@ #define __LFQ_H__ #include "cross-platform.h" +#include // C11 + struct lfq_node{ void * data; struct lfq_node * volatile next; @@ -10,8 +12,7 @@ struct lfq_node{ }; struct lfq_ctx{ - volatile struct lfq_node * volatile head; - volatile struct lfq_node * volatile tail; + alignas(64) volatile struct lfq_node * volatile head; int volatile count; volatile struct lfq_node * * HP; volatile int * tid_map; @@ -19,6 +20,8 @@ struct lfq_ctx{ volatile struct lfq_node * volatile fph; // free pool head volatile struct lfq_node * volatile fpt; // free pool tail int MAXHPSIZE; + + alignas(64) volatile struct lfq_node * volatile tail; // in another cache line to avoid contention }; int lfq_init(struct lfq_ctx *ctx, int max_consume_thread); From a723f118c12202d8fa57dd7bfbaa0587bb86267b Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Wed, 5 Sep 2018 19:18:46 -0300 Subject: [PATCH 05/10] lfg_count_freelist: tests count freelist size after threads stop This lets us detect how many nodes we never freed and left on the free-list With some later changes, the first 3 tests (p1c1, p4c4, p100c10) all show freelist=1 while p10c100 shows Total push 5000000 elements, pop 5000000 elements. freelist=1987802, clean = 0 All the contention between consumers is a problem. --- But with this version with lots of contention: p1c1: Total push 500000 elements, pop 500000 elements. freelist=1, clean = 0 p4c4: Total push 2000000 elements, pop 2000000 elements. freelist=753408, clean = 0 bin/test_p100c10 Total push 50000000 elements, pop 50000000 elements. freelist=3536085, clean = 0 bin/test_p10c100 Total push 5000000 elements, pop 5000000 elements. freelist=1665580, clean = 0 --- lfq.c | 18 ++++++++++++++++++ lfq.h | 2 ++ test_multithread.c | 7 +++++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/lfq.c b/lfq.c index 8ce9dc9..414e28a 100755 --- a/lfq.c +++ b/lfq.c @@ -84,6 +84,24 @@ int lfq_init(struct lfq_ctx *ctx, int max_consume_thread) { return 0; } + +long lfg_count_freelist(const struct lfq_ctx *ctx) { + long count=0; + struct lfq_node *p = (struct lfq_node *)ctx->fph; // non-volatile + while(p) { + count++; + p = p->next; + } +/* + while(pn = p->free_next) { + free(p); + p = pn; + count++; + } +*/ + return count; +} + int lfq_clean(struct lfq_ctx *ctx){ if ( ctx->tail && ctx->head ) { // if have data in queue struct lfq_node *tmp; diff --git a/lfq.h b/lfq.h index db74d7e..b2d4198 100755 --- a/lfq.h +++ b/lfq.h @@ -26,6 +26,8 @@ struct lfq_ctx{ int lfq_init(struct lfq_ctx *ctx, int max_consume_thread); int lfq_clean(struct lfq_ctx *ctx); +long lfg_count_freelist(const struct lfq_ctx *ctx); + int lfq_enqueue(struct lfq_ctx *ctx, void * data); void * lfq_dequeue_tid(struct lfq_ctx *ctx, int tid ); void * lfq_dequeue(struct lfq_ctx *ctx ); diff --git a/test_multithread.c b/test_multithread.c index b8bbe30..0c7459c 100644 --- a/test_multithread.c +++ b/test_multithread.c @@ -122,8 +122,11 @@ int main() { for ( i = 0 ; i < MAX_CONSUMER ; i++ ) THREAD_WAIT(thread_d[i]); - - printf("Total push %"PRId64" elements, pop %"PRId64" elements.\n", cn_added, cn_deled ); + + long freecount = lfg_count_freelist(&ctx); + int clean = lfq_clean(&ctx); + + printf("Total push %"PRId64" elements, pop %"PRId64" elements. freelist=%ld, clean = %d\n", cn_added, cn_deled, freecount, clean); if ( cn_added == cn_deled ) printf("Test PASS!!\n"); else From c775e2d1e40bccdf1ca53065802ad83c9afd1538 Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Thu, 6 Sep 2018 04:18:05 -0300 Subject: [PATCH 06/10] Makefile: add CPPFLAGS, and TESTWRAPPER compile with make CPPFLAGS=-DNDEBUG to disable assert() make TESTWRAPPER="perf stat -d" test to profile the tests --- Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 3cdff28..5fc12a6 100644 --- a/Makefile +++ b/Makefile @@ -21,16 +21,16 @@ bin/test_p10c100: liblfq.so.1.0.0 test_multithread.c gcc $(CFLAGS) $(LDFLAGS) test_multithread.c -o bin/test_p10c100 -L. -Wl,-Bstatic -llfq -Wl,-Bdynamic -lpthread -D MAX_PRODUCER=10 -D MAX_CONSUMER=100 liblfq.so.1.0.0: lfq.c lfq.h cross-platform.h - gcc $(CFLAGS) -c lfq.c # -fno-pie for static linking? + gcc $(CFLAGS) $(CPPFLAGS) -c lfq.c # -fno-pie for static linking? ar rcs liblfq.a lfq.o - gcc $(CFLAGS) -fPIC -c lfq.c + gcc $(CFLAGS) $(CPPFLAGS) -fPIC -c lfq.c gcc $(LDFLAGS) -shared -o liblfq.so.1.0.0 lfq.o test: bin/test_p1c1 bin/test_p4c4 bin/test_p100c10 bin/test_p10c100 - bin/test_p1c1 - bin/test_p4c4 - bin/test_p100c10 - bin/test_p10c100 + $(TESTWRAPPER) bin/test_p1c1 + $(TESTWRAPPER) bin/test_p4c4 + $(TESTWRAPPER) bin/test_p100c10 + $(TESTWRAPPER) bin/test_p10c100 clean: rm -rf *.o bin/* liblfq.so.1.0.0 liblfq.a From 69eb2e4cd216bde358801b04123ee4e8c503bda5 Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Thu, 6 Sep 2018 04:20:29 -0300 Subject: [PATCH 07/10] make private functions static so they can inline even with -fPIC. TODO: weak aliases for internal calls between lfq_ functions. --- lfq.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lfq.c b/lfq.c index 414e28a..b0b8cd6 100755 --- a/lfq.c +++ b/lfq.c @@ -3,6 +3,7 @@ #include #define MAXFREE 150 +static int inHP(struct lfq_ctx *ctx, struct lfq_node * lfn) { for ( int i = 0 ; i < ctx->MAXHPSIZE ; i++ ) { lmb(); @@ -20,6 +21,7 @@ void enpool(struct lfq_ctx *ctx, struct lfq_node * lfn) { p->free_next = lfn; } +static void free_pool(struct lfq_ctx *ctx, bool freeall ) { if (!CAS(&ctx->is_freeing, 0, 1)) return; // this pool free is not support multithreading. @@ -37,6 +39,7 @@ void free_pool(struct lfq_ctx *ctx, bool freeall ) { smb(); } +static void safe_free(struct lfq_ctx *ctx, struct lfq_node * lfn) { if (lfn->can_free && !inHP(ctx,lfn)) { // free is not thread safety @@ -51,6 +54,7 @@ void safe_free(struct lfq_ctx *ctx, struct lfq_node * lfn) { free_pool(ctx, false); } +static int alloc_tid(struct lfq_ctx *ctx) { for (int i = 0; i < ctx->MAXHPSIZE; i++) if (ctx->tid_map[i] == 0) @@ -60,6 +64,7 @@ int alloc_tid(struct lfq_ctx *ctx) { return -1; } +static void free_tid(struct lfq_ctx *ctx, int tid) { ctx->tid_map[tid]=0; } From c1d365463ed6ce21524ddc5c43b03a37bfeb3504 Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Thu, 6 Sep 2018 04:21:26 -0300 Subject: [PATCH 08/10] test: remove stray clean call (TODO: fix earlier commit) --- test_multithread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_multithread.c b/test_multithread.c index 0c7459c..7c239b8 100644 --- a/test_multithread.c +++ b/test_multithread.c @@ -131,6 +131,6 @@ int main() { printf("Test PASS!!\n"); else printf("Test Failed!!\n"); - lfq_clean(&ctx); + return (cn_added != cn_deled); } From d817bdb07bba6e59e99d04164dfb7190c0c7b659 Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Thu, 6 Sep 2018 04:34:07 -0300 Subject: [PATCH 09/10] dequeue: poison p->next and assert to detect use-after-free This triggers quickly in tests other than p1c1; the complicated HP[tid] scheme apparently isn't safe. Not surprising; lock-free linked lists are *HARD* without garbage collection. http://www.yebangyu.org/LockFreeProgrammingPractice.pdf collects multiple papers, including Lock-Free Linked Lists Using CAS Reclaim in general for atomic dynamic data structures is hard. Moving nodes to a free-list without trying to free() them would probably be easier, and work reasonably as long as producers actually allocate nodes from the free list. --- lfq.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lfq.c b/lfq.c index b0b8cd6..51dee7b 100755 --- a/lfq.c +++ b/lfq.c @@ -1,5 +1,6 @@ #include "cross-platform.h" #include "lfq.h" +#include #include #define MAXFREE 150 @@ -13,6 +14,7 @@ int inHP(struct lfq_ctx *ctx, struct lfq_node * lfn) { return 0; } +static void enpool(struct lfq_ctx *ctx, struct lfq_node * lfn) { volatile struct lfq_node * p; do { @@ -42,12 +44,13 @@ void free_pool(struct lfq_ctx *ctx, bool freeall ) { static void safe_free(struct lfq_ctx *ctx, struct lfq_node * lfn) { if (lfn->can_free && !inHP(ctx,lfn)) { - // free is not thread safety + // free is not thread-safe if (CAS(&ctx->is_freeing, 0, 1)) { - free(lfn); + lfn->next = (void*)-1; // poison the pointer to detect use-after-free + free(lfn); // we got the lock; actually free ctx->is_freeing = false; smb(); - } else + } else // we didn't get the lock; only add to a freelist enpool(ctx, lfn); } else enpool(ctx, lfn); @@ -164,6 +167,7 @@ void * lfq_dequeue_tid(struct lfq_ctx *ctx, int tid ) { ctx->HP[tid] = 0; return 0; } + assert(pn != (void*)-1 && "read an already-freed node"); } while( ! CAS(&ctx->head, p, pn) ); mb(); ctx->HP[tid] = 0; From 70da751e7268949a39acc4df623fb4fbe29c779d Mon Sep 17 00:00:00 2001 From: Peter Cordes Date: Thu, 6 Sep 2018 04:47:07 -0300 Subject: [PATCH 10/10] count_freelist: use ->free_next, not ->next (another branch used a union for these, hiding this bug.) --- lfq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lfq.c b/lfq.c index 51dee7b..243577b 100755 --- a/lfq.c +++ b/lfq.c @@ -98,7 +98,7 @@ long lfg_count_freelist(const struct lfq_ctx *ctx) { struct lfq_node *p = (struct lfq_node *)ctx->fph; // non-volatile while(p) { count++; - p = p->next; + p = p->free_next; } /* while(pn = p->free_next) {