diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index ea9debb..392b777 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -16,7 +16,7 @@ jobs:
 
     strategy:
       matrix:
-        os: [ubuntu-20.04, ubuntu-18.04, macos-latest]
+        os: [ubuntu-20.04, macos-latest]
         python: ["3.11"]
         go: ["1.19"]
 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c1a1e30..7628f32 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,7 +16,7 @@ jobs:
 
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-22.04]
         python: [3.7, 3.8, "3.10", "3.11"]
         go: ["1.19", "1.16"]
 
diff --git a/lib/c/bitproto.c b/lib/c/bitproto.c
index d5d0d8b..c6c72a9 100644
--- a/lib/c/bitproto.c
+++ b/lib/c/bitproto.c
@@ -221,27 +221,44 @@ void BpCopyBufferBits(int n, unsigned char *dst, unsigned char *src, int di,
             // shifted src byte.
 
             // Number of bits to process during batch copy.
-            int bits = n + si;
+            int b = n + si;
+            int d = 8 - si;
 
-            if (bits >= 32) {
+            if (n >= 32) {
+                // Copy the remaining bits if si != 0.
+                // This way, the next iteration will get to di == 0 again.
+                ((uint32_t *)dst)[0] = ((uint32_t *)(src))[0] >> si;
+                if (si) dst[3] |= ((src[4] << d) & ~(0xff << d << si));
+                c = 32;
+            } else if (b >= 32) {
                 // Copy as an uint32 integer.
                 // This way, performance faster x2 than bits copy approach.
                 ((uint32_t *)dst)[0] = ((uint32_t *)(src))[0] >> si;
                 c = 32 - si;
-            } else if (bits >= 16) {
+            } else if (n >= 16) {
+                ((uint16_t *)dst)[0] = ((uint16_t *)(src))[0] >> si;
+                if (si) dst[1] |= ((src[2] << d) & ~(0xff << d << si));
+
+                c = 16;
+            } else if (b >= 16) {
                 // Copy as an uint16 integer.
                 ((uint16_t *)dst)[0] = ((uint16_t *)(src))[0] >> si;
                 c = 16 - si;
-            } else if (bits >= 8) {
+            } else if (n >= 8) {
+                // Copy as an unsigned char.
+                dst[0] = (src[0] >> si) & 0xff;
+                if (si) dst[0] |= ((src[1] << d) & ~(0xff << d << si));
+                c = 8;
+            } else if (b >= 8) {
                 // Copy as an unsigned char.
                 dst[0] = (src[0] >> si) & 0xff;
                 c = 8 - si;
             } else {
-                // When bits < 8 and di == 0
+                // When b < 8 and di == 0
                 // Copy partial bits inside a byte.
                 // For the original statement:
                 // c = BpMinTriple(8 - di, 8 - si, n);
-                // since di is 0 and bits <8, then 8-di is 8
+                // since di is 0 and b <8, then 8-di is 8
                 // and n <8 , the 8-di won't be the smallest, we
                 // just pick function BpMin over BpMinTriple for the little
                 // little performance improvement.
diff --git a/tests/test_encoding/encoding-cases/arrays/c/main.c b/tests/test_encoding/encoding-cases/arrays/c/main.c
index d438ee2..8d27ef5 100644
--- a/tests/test_encoding/encoding-cases/arrays/c/main.c
+++ b/tests/test_encoding/encoding-cases/arrays/c/main.c
@@ -10,12 +10,12 @@ int main(void) {
     for (int i = 0; i < 7; i++) m.b[i] = (int32_t)(i);
     for (int i = 0; i < 7; i++) m.c[i] = (int8_t)(i);
     for (int i = 0; i < 7; i++) m.d[i] = (uint8_t)(i & 7);
-    for (int i = 0; i < 7; i++) m.e[i] = (uint32_t)(i + 118);
+    for (int i = 0; i < 7; i++) m.e[i] = (uint32_t)(i + 11811);
     for (int i = 0; i < 7; i++)
         m.f[i] = (struct Note){i, false, {1, 2, 3, 4, 5, 6, 7}};
     m.g = (struct Note){2, false, {7, 2, 3, 4, 5, 6, 7}};
     for (int i = 0; i < 7; i++)
-        for (int j = 0; j < 7; j++) m.t[i][j] = (int32_t)(i + j + 129);
+        for (int j = 0; j < 7; j++) m.t[i][j] = (int32_t)(i + j + 1291291);
     m.x[0] = -13;
     m.x[1] = -89;
     m.x[2] = 13;
diff --git a/tests/test_encoding/encoding-cases/arrays/go/main.go b/tests/test_encoding/encoding-cases/arrays/go/main.go
index de7e875..7660113 100644
--- a/tests/test_encoding/encoding-cases/arrays/go/main.go
+++ b/tests/test_encoding/encoding-cases/arrays/go/main.go
@@ -27,14 +27,14 @@ func main() {
 		m.D[i] = uint8(i & 7)
 	}
 	for i := 0; i < 7; i++ {
-		m.E[i] = uint32(i + 118)
+		m.E[i] = uint32(i + 11811)
 	}
 	for i := 0; i < 7; i++ {
 		m.F[i] = bp.Note{uint8(i), false, bp.Uint3s{1, 2, 3, 4, 5, 6, 7}}
 	}
 	for i := 0; i < 7; i++ {
 		for j := 0; j < 7; j++ {
-			m.T[i][j] = int32((i + j) + 129)
+			m.T[i][j] = int32((i + j) + 1291291)
 		}
 	}
 	m.G = bp.Note{uint8(2), false, bp.Uint3s{7, 2, 3, 4, 5, 6, 7}}
diff --git a/tests/test_encoding/encoding-cases/arrays/py/main.py b/tests/test_encoding/encoding-cases/arrays/py/main.py
index 6211a69..3db5b18 100644
--- a/tests/test_encoding/encoding-cases/arrays/py/main.py
+++ b/tests/test_encoding/encoding-cases/arrays/py/main.py
@@ -12,12 +12,12 @@ def main() -> None:
     for i in range(7):
         m.d[i] = i
     for i in range(7):
-        m.e[i] = i + 118
+        m.e[i] = i + 11811
     for i in range(7):
         m.f[i] = bp.Note(i, False, [j for j in range(1, 8)])
     for i in range(7):
         for j in range(7):
-            m.t[i][j] = i + j + 129
+            m.t[i][j] = i + j + 1291291
     m.x[0] = -13
     m.x[1] = -89
     m.x[2] = 13