diff --git a/.github/workflows/publish-server.yml b/.github/workflows/publish-server.yml index 10820457b8..e1973fe47c 100644 --- a/.github/workflows/publish-server.yml +++ b/.github/workflows/publish-server.yml @@ -118,23 +118,9 @@ jobs: context: . platforms: ${{ env.platform }} labels: ${{ steps.meta.outputs.labels }} - outputs: type=image,name=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-debug,push-by-digest=true,name-canonical=true,push=true build-args: | BUILD_DEBUG=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: digests-debug-${{ env.PLATFORM_PAIR }} - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 build-arm64: permissions: write-all diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 26eaba46cf..1903c0baff 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -159,8 +159,8 @@ jobs: target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo- - - name: check libsql remote - run: cargo check -p libsql --no-default-features -F remote + - name: build libsql all features + run: cargo build -p libsql --all-features # test-rust-wasm: # runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 04ae728065..3579f76ec7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -321,60 +321,30 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" -[[package]] -name = "aws-config" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcdcf0d683fe9c23d32cf5b53c9918ea0a500375a9fb20109802552658e576c9" -dependencies = [ - 
"aws-credential-types 0.55.3", - "aws-http", - "aws-sdk-sso 0.28.0", - "aws-sdk-sts 0.28.0", - "aws-smithy-async 0.55.3", - "aws-smithy-client", - "aws-smithy-http 0.55.3", - "aws-smithy-http-tower", - "aws-smithy-json 0.55.3", - "aws-smithy-types 0.55.3", - "aws-types 0.55.3", - "bytes", - "fastrand 1.9.0", - "hex", - "http 0.2.12", - "hyper 0.14.30", - "ring 0.16.20", - "time", - "tokio", - "tower", - "tracing", - "zeroize", -] - [[package]] name = "aws-config" version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "caf6cfe2881cb1fcbba9ae946fb9a6480d3b7a714ca84c74925014a89ef3387a" dependencies = [ - "aws-credential-types 1.2.0", + "aws-credential-types", "aws-runtime", - "aws-sdk-sso 1.34.0", + "aws-sdk-sso", "aws-sdk-ssooidc", - "aws-sdk-sts 1.34.0", - "aws-smithy-async 1.2.1", - "aws-smithy-http 0.60.9", - "aws-smithy-json 0.60.7", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", - "aws-types 1.3.3", + "aws-smithy-types", + "aws-types", "bytes", - "fastrand 2.1.0", + "fastrand", "hex", "http 0.2.12", "hyper 0.14.30", - "ring 0.17.8", + "ring", "time", "tokio", "tracing", @@ -382,81 +352,34 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-credential-types" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae" -dependencies = [ - "aws-smithy-async 0.55.3", - "aws-smithy-types 0.55.3", - "fastrand 1.9.0", - "tokio", - "tracing", - "zeroize", -] - [[package]] name = "aws-credential-types" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ - "aws-smithy-async 1.2.1", + "aws-smithy-async", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", + "aws-smithy-types", "zeroize", ] 
-[[package]] -name = "aws-endpoint" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cce1c41a6cfaa726adee9ebb9a56fcd2bbfd8be49fd8a04c5e20fd968330b04" -dependencies = [ - "aws-smithy-http 0.55.3", - "aws-smithy-types 0.55.3", - "aws-types 0.55.3", - "http 0.2.12", - "regex", - "tracing", -] - -[[package]] -name = "aws-http" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aadbc44e7a8f3e71c8b374e03ecd972869eb91dd2bc89ed018954a52ba84bc44" -dependencies = [ - "aws-credential-types 0.55.3", - "aws-smithy-http 0.55.3", - "aws-smithy-types 0.55.3", - "aws-types 0.55.3", - "bytes", - "http 0.2.12", - "http-body 0.4.6", - "lazy_static", - "percent-encoding", - "pin-project-lite", - "tracing", -] - [[package]] name = "aws-runtime" version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87c5f920ffd1e0526ec9e70e50bf444db50b204395a0fa7016bbf9e31ea1698f" dependencies = [ - "aws-credential-types 1.2.0", - "aws-sigv4 1.2.3", - "aws-smithy-async 1.2.1", - "aws-smithy-eventstream 0.60.4", - "aws-smithy-http 0.60.9", + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + "aws-smithy-http", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", - "aws-types 1.3.3", + "aws-smithy-types", + "aws-types", "bytes", - "fastrand 2.1.0", + "fastrand", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -465,39 +388,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "aws-sdk-s3" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fba197193cbb4bcb6aad8d99796b2291f36fa89562ded5d4501363055b0de89f" -dependencies = [ - "aws-credential-types 0.55.3", - "aws-endpoint", - "aws-http", - "aws-sig-auth", - "aws-sigv4 0.55.3", - "aws-smithy-async 0.55.3", - "aws-smithy-checksums 0.55.3", - "aws-smithy-client", - "aws-smithy-eventstream 0.55.3", - "aws-smithy-http 0.55.3", - 
"aws-smithy-http-tower", - "aws-smithy-json 0.55.3", - "aws-smithy-types 0.55.3", - "aws-smithy-xml 0.55.3", - "aws-types 0.55.3", - "bytes", - "http 0.2.12", - "http-body 0.4.6", - "once_cell", - "percent-encoding", - "regex", - "tokio-stream", - "tower", - "tracing", - "url", -] - [[package]] name = "aws-sdk-s3" version = "1.40.0" @@ -505,21 +395,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8367c403fdf27690684b926a46ed9524099a69dd5dfcef62028bf4096b5b809f" dependencies = [ "ahash", - "aws-credential-types 1.2.0", + "aws-credential-types", "aws-runtime", - "aws-sigv4 1.2.3", - "aws-smithy-async 1.2.1", - "aws-smithy-checksums 0.60.11", - "aws-smithy-eventstream 0.60.4", - "aws-smithy-http 0.60.9", - "aws-smithy-json 0.60.7", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", - "aws-smithy-xml 0.60.8", - "aws-types 1.3.3", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", "bytes", - "fastrand 2.1.0", + "fastrand", "hex", "hmac", "http 0.2.12", @@ -533,46 +423,21 @@ dependencies = [ "url", ] -[[package]] -name = "aws-sdk-sso" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8b812340d86d4a766b2ca73f740dfd47a97c2dff0c06c8517a16d88241957e4" -dependencies = [ - "aws-credential-types 0.55.3", - "aws-endpoint", - "aws-http", - "aws-sig-auth", - "aws-smithy-async 0.55.3", - "aws-smithy-client", - "aws-smithy-http 0.55.3", - "aws-smithy-http-tower", - "aws-smithy-json 0.55.3", - "aws-smithy-types 0.55.3", - "aws-types 0.55.3", - "bytes", - "http 0.2.12", - "regex", - "tokio-stream", - "tower", - "tracing", -] - [[package]] name = "aws-sdk-sso" version = "1.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdcfae7bf8b8f14cade7579ffa8956fcee91dc23633671096b4b5de7d16f682a" dependencies = [ - 
"aws-credential-types 1.2.0", + "aws-credential-types", "aws-runtime", - "aws-smithy-async 1.2.1", - "aws-smithy-http 0.60.9", - "aws-smithy-json 0.60.7", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", - "aws-types 1.3.3", + "aws-smithy-types", + "aws-types", "bytes", "http 0.2.12", "once_cell", @@ -586,15 +451,15 @@ version = "1.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33b30def8f02ba81276d5dbc22e7bf3bed20d62d1b175eef82680d6bdc7a6f4c" dependencies = [ - "aws-credential-types 1.2.0", + "aws-credential-types", "aws-runtime", - "aws-smithy-async 1.2.1", - "aws-smithy-http 0.60.9", - "aws-smithy-json 0.60.7", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", - "aws-types 1.3.3", + "aws-smithy-types", + "aws-types", "bytes", "http 0.2.12", "once_cell", @@ -602,102 +467,40 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-sdk-sts" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "265fac131fbfc188e5c3d96652ea90ecc676a934e3174eaaee523c6cec040b3b" -dependencies = [ - "aws-credential-types 0.55.3", - "aws-endpoint", - "aws-http", - "aws-sig-auth", - "aws-smithy-async 0.55.3", - "aws-smithy-client", - "aws-smithy-http 0.55.3", - "aws-smithy-http-tower", - "aws-smithy-json 0.55.3", - "aws-smithy-query 0.55.3", - "aws-smithy-types 0.55.3", - "aws-smithy-xml 0.55.3", - "aws-types 0.55.3", - "bytes", - "http 0.2.12", - "regex", - "tower", - "tracing", -] - [[package]] name = "aws-sdk-sts" version = "1.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0804f840ad31537d5d1a4ec48d59de5e674ad05f1db7d3def2c9acadaf1f7e60" dependencies = [ - "aws-credential-types 1.2.0", + "aws-credential-types", "aws-runtime", - "aws-smithy-async 1.2.1", - "aws-smithy-http 0.60.9", - 
"aws-smithy-json 0.60.7", - "aws-smithy-query 0.60.7", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", - "aws-smithy-xml 0.60.8", - "aws-types 1.3.3", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", "http 0.2.12", "once_cell", "regex-lite", "tracing", ] -[[package]] -name = "aws-sig-auth" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b94acb10af0c879ecd5c7bdf51cda6679a0a4f4643ce630905a77673bfa3c61" -dependencies = [ - "aws-credential-types 0.55.3", - "aws-sigv4 0.55.3", - "aws-smithy-eventstream 0.55.3", - "aws-smithy-http 0.55.3", - "aws-types 0.55.3", - "http 0.2.12", - "tracing", -] - -[[package]] -name = "aws-sigv4" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d2ce6f507be68e968a33485ced670111d1cbad161ddbbab1e313c03d37d8f4c" -dependencies = [ - "aws-smithy-eventstream 0.55.3", - "aws-smithy-http 0.55.3", - "bytes", - "form_urlencoded", - "hex", - "hmac", - "http 0.2.12", - "once_cell", - "percent-encoding", - "regex", - "sha2", - "time", - "tracing", -] - [[package]] name = "aws-sigv4" version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5df1b0fa6be58efe9d4ccc257df0a53b89cd8909e86591a13ca54817c87517be" dependencies = [ - "aws-credential-types 1.2.0", - "aws-smithy-eventstream 0.60.4", - "aws-smithy-http 0.60.9", + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", + "aws-smithy-types", "bytes", "crypto-bigint 0.5.5", "form_urlencoded", @@ -708,7 +511,7 @@ dependencies = [ "once_cell", "p256", "percent-encoding", - "ring 0.17.8", + "ring", "sha2", "subtle", "time", @@ -716,18 +519,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-smithy-async" -version = "0.55.3" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bda3996044c202d75b91afeb11a9afae9db9a721c6a7a427410018e286b880" -dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", - "tokio-stream", -] - [[package]] name = "aws-smithy-async" version = "1.2.1" @@ -739,35 +530,14 @@ dependencies = [ "tokio", ] -[[package]] -name = "aws-smithy-checksums" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ed8b96d95402f3f6b8b57eb4e0e45ee365f78b1a924faf20ff6e97abf1eae6" -dependencies = [ - "aws-smithy-http 0.55.3", - "aws-smithy-types 0.55.3", - "bytes", - "crc32c", - "crc32fast", - "hex", - "http 0.2.12", - "http-body 0.4.6", - "md-5", - "pin-project-lite", - "sha1", - "sha2", - "tracing", -] - [[package]] name = "aws-smithy-checksums" version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c4134cf3adaeacff34d588dbe814200357b0c466d730cf1c0d8054384a2de4" dependencies = [ - "aws-smithy-http 0.60.9", - "aws-smithy-types 1.2.0", + "aws-smithy-http", + "aws-smithy-types", "bytes", "crc32c", "crc32fast", @@ -781,84 +551,26 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-client" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a86aa6e21e86c4252ad6a0e3e74da9617295d8d6e374d552be7d3059c41cedd" -dependencies = [ - "aws-smithy-async 0.55.3", - "aws-smithy-http 0.55.3", - "aws-smithy-http-tower", - "aws-smithy-types 0.55.3", - "bytes", - "fastrand 1.9.0", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.30", - "hyper-rustls 0.23.2", - "lazy_static", - "pin-project-lite", - "rustls 0.20.9", - "tokio", - "tower", - "tracing", -] - -[[package]] -name = "aws-smithy-eventstream" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460c8da5110835e3d9a717c61f5556b20d03c32a1dec57f8fc559b360f733bb8" -dependencies = [ - "aws-smithy-types 0.55.3", - "bytes", - 
"crc32fast", -] - [[package]] name = "aws-smithy-eventstream" version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" dependencies = [ - "aws-smithy-types 1.2.0", + "aws-smithy-types", "bytes", "crc32fast", ] -[[package]] -name = "aws-smithy-http" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b3b693869133551f135e1f2c77cb0b8277d9e3e17feaf2213f735857c4f0d28" -dependencies = [ - "aws-smithy-eventstream 0.55.3", - "aws-smithy-types 0.55.3", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.30", - "once_cell", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tokio", - "tokio-util", - "tracing", -] - [[package]] name = "aws-smithy-http" version = "0.60.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9cd0ae3d97daa0a2bf377a4d8e8e1362cae590c4a1aad0d40058ebca18eb91e" dependencies = [ - "aws-smithy-eventstream 0.60.4", + "aws-smithy-eventstream", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", + "aws-smithy-types", "bytes", "bytes-utils", "futures-core", @@ -871,48 +583,13 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-http-tower" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae4f6c5798a247fac98a867698197d9ac22643596dc3777f0c76b91917616b9" -dependencies = [ - "aws-smithy-http 0.55.3", - "aws-smithy-types 0.55.3", - "bytes", - "http 0.2.12", - "http-body 0.4.6", - "pin-project-lite", - "tower", - "tracing", -] - -[[package]] -name = "aws-smithy-json" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23f9f42fbfa96d095194a632fbac19f60077748eba536eb0b9fecc28659807f8" -dependencies = [ - "aws-smithy-types 0.55.3", -] - [[package]] name = "aws-smithy-json" version = "0.60.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ - "aws-smithy-types 1.2.0", -] - -[[package]] -name = "aws-smithy-query" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98819eb0b04020a1c791903533b638534ae6c12e2aceda3e6e6fba015608d51d" -dependencies = [ - "aws-smithy-types 0.55.3", - "urlencoding", + "aws-smithy-types", ] [[package]] @@ -921,7 +598,7 @@ version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ - "aws-smithy-types 1.2.0", + "aws-smithy-types", "urlencoding", ] @@ -931,12 +608,12 @@ version = "1.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce87155eba55e11768b8c1afa607f3e864ae82f03caf63258b37455b0ad02537" dependencies = [ - "aws-smithy-async 1.2.1", - "aws-smithy-http 0.60.9", + "aws-smithy-async", + "aws-smithy-http", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", + "aws-smithy-types", "bytes", - "fastrand 2.1.0", + "fastrand", "h2", "http 0.2.12", "http-body 0.4.6", @@ -958,8 +635,8 @@ version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30819352ed0a04ecf6a2f3477e344d2d1ba33d43e0f09ad9047c12e0d923616f" dependencies = [ - "aws-smithy-async 1.2.1", - "aws-smithy-types 1.2.0", + "aws-smithy-async", + "aws-smithy-types", "bytes", "http 0.2.12", "http 1.1.0", @@ -969,19 +646,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-smithy-types" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16a3d0bf4f324f4ef9793b86a1701d9700fbcdbd12a846da45eed104c634c6e8" -dependencies = [ - "base64-simd", - "itoa", - "num-integer", - "ryu", - "time", -] - [[package]] name = "aws-smithy-types" version = "1.2.0" @@ -1014,19 +678,10 @@ version = "0.60.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7f280f434214856abace637b1f944d50ccca216814813acd195cdd7f206ce17f" dependencies = [ - "aws-smithy-types 1.2.0", + "aws-smithy-types", "time", ] -[[package]] -name = "aws-smithy-xml" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1b9d12875731bd07e767be7baad95700c3137b56730ec9ddeedb52a5e5ca63b" -dependencies = [ - "xmlparser", -] - [[package]] name = "aws-smithy-xml" version = "0.60.8" @@ -1036,32 +691,16 @@ dependencies = [ "xmlparser", ] -[[package]] -name = "aws-types" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd209616cc8d7bfb82f87811a5c655dc97537f592689b18743bddf5dc5c4829" -dependencies = [ - "aws-credential-types 0.55.3", - "aws-smithy-async 0.55.3", - "aws-smithy-client", - "aws-smithy-http 0.55.3", - "aws-smithy-types 0.55.3", - "http 0.2.12", - "rustc_version", - "tracing", -] - [[package]] name = "aws-types" version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ - "aws-credential-types 1.2.0", - "aws-smithy-async 1.2.1", + "aws-credential-types", + "aws-smithy-async", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", + "aws-smithy-types", "rustc_version", "tracing", ] @@ -1303,8 +942,8 @@ dependencies = [ "anyhow", "arc-swap", "async-compression 0.4.11", - "aws-config 1.5.4", - "aws-sdk-s3 1.40.0", + "aws-config", + "aws-sdk-s3", "bytes", "chrono", "futures-core", @@ -1325,9 +964,9 @@ version = "0.1.14" dependencies = [ "anyhow", "async-compression 0.4.11", - "aws-config 1.5.4", - "aws-sdk-s3 1.40.0", - "aws-smithy-types 1.2.0", + "aws-config", + "aws-sdk-s3", + "aws-smithy-types", "bottomless", "bytes", "chrono", @@ -2383,15 +2022,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" -[[package]] -name = "fastrand" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - [[package]] name = "fastrand" version = "2.1.0" @@ -3016,21 +2646,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "hyper-rustls" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" -dependencies = [ - "http 0.2.12", - "hyper 0.14.30", - "log", - "rustls 0.20.9", - "rustls-native-certs 0.6.3", - "tokio", - "tokio-rustls 0.23.4", -] - [[package]] name = "hyper-rustls" version = "0.24.1" @@ -3223,15 +2838,6 @@ dependencies = [ "similar", ] -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", -] - [[package]] name = "io-extras" version = "0.17.4" @@ -3362,7 +2968,7 @@ dependencies = [ "base64 0.21.7", "js-sys", "pem", - "ring 0.17.8", + "ring", "serde", "serde_json", "simple_asn1", @@ -3430,7 +3036,7 @@ dependencies = [ [[package]] name = "libsql" -version = "0.5.0-alpha.2" +version = "0.5.0" dependencies = [ "anyhow", "async-stream", @@ -3488,7 +3094,7 @@ dependencies = [ [[package]] name = "libsql-ffi" -version = "0.3.0" +version = "0.4.0" dependencies = [ "bindgen 0.66.1", "cc", @@ -3508,7 +3114,7 @@ dependencies = [ [[package]] name = "libsql-rusqlite" -version = "0.31.0" +version = "0.32.0" dependencies = [ "bencher", "bitflags 2.6.0", @@ -3542,8 +3148,9 @@ dependencies = [ "async-stream", "async-tempfile", "async-trait", - "aws-config 0.55.3", - "aws-sdk-s3 0.28.0", + "aws-config", + "aws-sdk-s3", + "aws-smithy-runtime", "axum", "axum-extra", "base64 0.21.7", @@ -3599,7 +3206,7 @@ 
dependencies = [ "regex", "reqwest", "rheaper", - "ring 0.17.8", + "ring", "rustls 0.21.12", "rustls-pemfile 1.0.4", "s3s 0.8.1", @@ -3631,7 +3238,7 @@ dependencies = [ [[package]] name = "libsql-sqlite3-parser" -version = "0.12.0" +version = "0.13.0" dependencies = [ "bitflags 2.6.0", "cc", @@ -3687,7 +3294,7 @@ dependencies = [ [[package]] name = "libsql-sys" -version = "0.6.0" +version = "0.7.0" dependencies = [ "bytes", "libsql-ffi", @@ -3702,10 +3309,11 @@ name = "libsql-wal" version = "0.1.0" dependencies = [ "arc-swap", + "async-lock 3.4.0", "async-stream", - "aws-config 1.5.4", - "aws-credential-types 1.2.0", - "aws-sdk-s3 1.40.0", + "aws-config", + "aws-credential-types", + "aws-sdk-s3", "aws-smithy-runtime", "bitflags 2.6.0", "bytes", @@ -3768,7 +3376,7 @@ dependencies = [ [[package]] name = "libsql_replication" -version = "0.4.0" +version = "0.5.0" dependencies = [ "aes", "arbitrary", @@ -3779,6 +3387,7 @@ dependencies = [ "cbc", "libsql-rusqlite", "libsql-sys", + "libsql-wal", "parking_lot", "prost", "prost-build", @@ -5009,21 +4618,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.8" @@ -5034,8 +4628,8 @@ dependencies = [ "cfg-if", "getrandom", "libc", - "spin 0.9.8", - "untrusted 0.9.0", + "spin", + "untrusted", "windows-sys 0.52.0", ] @@ -5099,18 +4693,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rustls" -version = "0.20.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" -dependencies = [ - "log", - "ring 0.16.20", - "sct", - "webpki", -] - [[package]] name = "rustls" version = "0.21.12" @@ -5118,7 
+4700,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", - "ring 0.17.8", + "ring", "rustls-webpki 0.101.7", "sct", ] @@ -5130,7 +4712,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", - "ring 0.17.8", + "ring", "rustls-pki-types", "rustls-webpki 0.102.5", "subtle", @@ -5193,8 +4775,8 @@ version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -5203,9 +4785,9 @@ version = "0.102.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9a6fccd794a42c2c105b513a2f62bc3fd8f3ba57a4593677ceb0bd035164d78" dependencies = [ - "ring 0.17.8", + "ring", "rustls-pki-types", - "untrusted 0.9.0", + "untrusted", ] [[package]] @@ -5323,9 +4905,9 @@ version = "0.10.1-dev" source = "git+https://github.com/Nugine/s3s#29bf39cb72507505d09d9d7637f57784dbfc0a40" dependencies = [ "async-trait", - "aws-sdk-s3 1.40.0", + "aws-sdk-s3", "aws-smithy-runtime-api", - "aws-smithy-types 1.2.0", + "aws-smithy-types", "aws-smithy-types-convert", "hyper 1.4.1", "s3s 0.10.1-dev", @@ -5433,8 +5015,8 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -5756,12 +5338,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "spin" version = 
"0.9.8" @@ -5950,7 +5526,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", - "fastrand 2.1.0", + "fastrand", "rustix 0.38.34", "windows-sys 0.52.0", ] @@ -6109,17 +5685,6 @@ dependencies = [ "syn 2.0.70", ] -[[package]] -name = "tokio-rustls" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" -dependencies = [ - "rustls 0.20.9", - "tokio", - "webpki", -] - [[package]] name = "tokio-rustls" version = "0.24.1" @@ -6589,12 +6154,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "untrusted" version = "0.9.0" @@ -7153,16 +6712,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.22.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" -dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", -] - [[package]] name = "webpki-roots" version = "0.25.4" diff --git a/Cargo.toml b/Cargo.toml index 92487ecdd0..685f14964f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 @@ codegen-units = 1 panic = "unwind" [workspace.dependencies] -rusqlite = { package = "libsql-rusqlite", path = "vendored/rusqlite", version = "0.31", default-features = false, features = [ +rusqlite = { package = "libsql-rusqlite", path = "vendored/rusqlite", version = "0.32", default-features = false, features = [ "libsql-experimental", "column_decltype", "load_extension", @@ -45,6 +45,7 @@ rusqlite = { package = 
"libsql-rusqlite", path = "vendored/rusqlite", version = ] } hyper = { version = "0.14" } tower = { version = "0.4.13" } +zerocopy = { version = "0.7.32", features = ["derive", "alloc"] } # Config for 'cargo dist' [workspace.metadata.dist] diff --git a/bottomless/src/replicator.rs b/bottomless/src/replicator.rs index f2ef812f75..cd37a70165 100644 --- a/bottomless/src/replicator.rs +++ b/bottomless/src/replicator.rs @@ -17,6 +17,8 @@ use aws_sdk_s3::primitives::ByteStream; use aws_sdk_s3::{Client, Config}; use bytes::{Buf, Bytes}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; +use libsql_replication::injector::Injector as _; +use libsql_replication::rpc::replication::Frame as RpcFrame; use libsql_sys::{Cipher, EncryptionConfig}; use std::ops::Deref; use std::path::{Path, PathBuf}; @@ -1449,12 +1451,13 @@ impl Replicator { db_path: &Path, ) -> Result { let encryption_config = self.encryption_config.clone(); - let mut injector = libsql_replication::injector::Injector::new( - db_path, + let mut injector = libsql_replication::injector::SqliteInjector::new( + db_path.to_path_buf(), 4096, libsql_sys::connection::NO_AUTOCHECKPOINT, encryption_config, - )?; + ) + .await?; let prefix = format!("{}-{}/", self.db_name, generation); let mut page_buf = { let mut v = Vec::with_capacity(page_size); @@ -1552,7 +1555,11 @@ impl Replicator { }, page_buf.as_slice(), ); - injector.inject_frame(frame_to_inject)?; + let frame = RpcFrame { + data: frame_to_inject.bytes(), + timestamp: None, + }; + injector.inject_frame(frame).await?; applied_wal_frame = true; } } diff --git a/libsql-ffi/Cargo.toml b/libsql-ffi/Cargo.toml index 9b5cbced11..ef9ade1726 100644 --- a/libsql-ffi/Cargo.toml +++ b/libsql-ffi/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql-ffi" -version = "0.3.0" +version = "0.4.0" edition = "2021" build = "build.rs" license = "MIT" diff --git a/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c b/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c index 
9d51c1d645..3a76f9cff3 100644 --- a/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c +++ b/libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c @@ -28,6 +28,7 @@ ** README.md ** configure ** configure.ac +** ext/fts5/fts5_tokenize.c ** ext/jni/src/org/sqlite/jni/capi/CollationNeededCallback.java ** ext/jni/src/org/sqlite/jni/capi/CommitHookCallback.java ** ext/jni/src/org/sqlite/jni/capi/PreupdateHookCallback.java @@ -69,6 +70,7 @@ ** src/test2.c ** src/test3.c ** src/test8.c +** src/vacuum.c ** src/vdbe.c ** src/vdbeInt.h ** src/vdbeapi.c @@ -155952,6 +155954,10 @@ SQLITE_PRIVATE void sqlite3UpsertDoUpdate( /* #include "sqliteInt.h" */ /* #include "vdbeInt.h" */ +#ifndef SQLITE_OMIT_VECTOR +/* #include "vectorIndexInt.h" */ +#endif + #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* @@ -156229,6 +156235,27 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( if( rc!=SQLITE_OK ) goto end_of_vacuum; db->init.iDb = 0; +#ifndef SQLITE_OMIT_VECTOR + // shadow tables for vector index will be populated automatically during CREATE INDEX command + // so we must skip them at this step + if( sqlite3FindTable(db, VECTOR_INDEX_GLOBAL_META_TABLE, zDbMain) != NULL ){ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0 AND name NOT IN (SELECT name||'_shadow' FROM " VECTOR_INDEX_GLOBAL_META_TABLE ")", + zDbMain + ); + }else{ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0", + zDbMain + ); + } +#else /* Loop through the tables in the main database. For each, do ** an "INSERT INTO vacuum_db.xxx SELECT * FROM main.xxx;" to copy ** the contents to the temporary database. 
@@ -156240,6 +156267,7 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( "WHERE type='table'AND coalesce(rootpage,1)>0", zDbMain ); +#endif assert( (db->mDbFlags & DBFLAG_Vacuum)!=0 ); db->mDbFlags &= ~DBFLAG_Vacuum; if( rc!=SQLITE_OK ) goto end_of_vacuum; @@ -211974,6 +212002,7 @@ int diskAnnCreateIndex( int type, dims; u64 maxNeighborsParam, blockSizeBytes; char *zSql; + const char *zRowidColumnName; char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){ @@ -212041,6 +212070,7 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "index_key"; }else{ zSql = sqlite3MPrintf( db, @@ -212050,9 +212080,31 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "rowid"; } rc = sqlite3_exec(db, zSql, 0, 0, 0); sqlite3DbFree(db, zSql); + if( rc != SQLITE_OK ){ + return rc; + } + /* + * vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB) + * in this case, main table B-Tree takes on redundant shape where all leaf nodes has only 1 cell + * + * as we have a query which selects random row using OFFSET/LIMIT trick - we will need to read all these leaf nodes pages just to skip them + * so, in order to remove this overhead for random row selection - we creating an index with just single column used + * in this case B-Tree leafs will be full of rowids and the overhead for page reads will be very small + */ + zSql = sqlite3MPrintf( + db, + "CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)", + zDbSName, + zIdxName, + zIdxName, + zRowidColumnName + ); + rc = sqlite3_exec(db, zSql, 0, 0, 0); + sqlite3DbFree(db, zSql); return rc; } @@ 
-212082,8 +212134,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid) zSql = sqlite3MPrintf( pIndex->db, - "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)", - pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow + "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)", + pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow ); if( zSql == NULL ){ rc = SQLITE_NOMEM_BKPT; @@ -213658,11 +213710,6 @@ int vectorF64ParseSqliteBlob( ** VectorIdxParams utilities ****************************************************************************/ -// VACUUM creates tables and indices first and only then populate data -// we need to ignore inserts from 'INSERT INTO vacuum.t SELECT * FROM t' statements because -// all shadow tables will be populated by VACUUM process during regular process of table copy -#define IsVacuum(db) ((db->mDbFlags&DBFLAG_Vacuum)!=0) - void vectorIdxParamsInit(VectorIdxParams *pParams, u8 *pBinBuf, int nBinSize) { assert( nBinSize <= VECTOR_INDEX_PARAMS_BUF_SIZE ); @@ -214381,10 +214428,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { // this is done to prevent unrecoverable situations where index were dropped but index parameters deletion failed and second attempt will fail on first step int rcIdx, rcParams; - if( IsVacuum(db) ){ - return SQLITE_OK; - } - assert( zDbSName != NULL ); rcIdx = diskAnnDropIndex(db, zDbSName, zIdxName); @@ -214395,10 +214438,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { assert( zDbSName != NULL ); - if( IsVacuum(db) ){ - return SQLITE_OK; - } - return diskAnnClearIndex(db, zDbSName, zIdxName); } @@ -214408,7 +214447,7 @@ int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { * this made intentionally in order to 
natively support upload of SQLite dumps * * dump populates tables first and create indices after - * so we must omit them because shadow tables already filled + * so we must omit index refill setp because shadow tables already filled * * 1. in case of any error :-1 returned (and pParse errMsg is populated with some error message) * 2. if vector index must not be created : 0 returned @@ -214426,10 +214465,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co int hasLibsqlVectorIdxFn = 0, hasCollation = 0; const char *pzErrMsg; - if( IsVacuum(pParse->db) ){ - return CREATE_IGNORE; - } - assert( zDbSName != NULL ); sqlite3 *db = pParse->db; @@ -214488,11 +214523,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: must contain exactly one column wrapped into the " VECTOR_INDEX_MARKER_FUNCTION " function"); return CREATE_FAIL; } - // we are able to support this but I doubt this works for now - more polishing required to make this work - if( pIdx->pPartIdxWhere != NULL ) { - sqlite3ErrorMsg(pParse, "vector index: where condition is forbidden"); - return CREATE_FAIL; - } pArgsList = pIdx->aColExpr->a[0].pExpr->x.pList; pListItem = pArgsList->a; @@ -214582,7 +214612,6 @@ int vectorIndexSearch( VectorIdxParams idxParams; vectorIdxParamsInit(&idxParams, NULL, 0); - assert( !IsVacuum(db) ); assert( zDbSName != NULL ); if( argc != 3 ){ @@ -214667,10 +214696,6 @@ int vectorIndexInsert( int rc; VectorInRow vectorInRow; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - rc = vectorInRowAlloc(pCur->db, pRecord, &vectorInRow, pzErrMsg); if( rc != SQLITE_OK ){ return rc; @@ -214690,10 +214715,6 @@ int vectorIndexDelete( ){ VectorInRow payload; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - payload.pVector = NULL; payload.nKeys = r->nField - 1; payload.pKeyValues = r->aMem + 1; @@ -259749,40 +259770,46 @@ static int fts5TriCreate( Fts5Tokenizer **ppOut ){ int rc = 
SQLITE_OK; - TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); - UNUSED_PARAM(pUnused); - if( pNew==0 ){ - rc = SQLITE_NOMEM; + TrigramTokenizer *pNew = 0; + + if( nArg%2 ){ + rc = SQLITE_ERROR; }else{ - int i; - pNew->bFold = 1; - pNew->iFoldParam = 0; - for(i=0; rc==SQLITE_OK && ibFold = 1; + pNew->iFoldParam = 0; + for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); + } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ - pNew->bFold = (zArg[0]=='0'); - } - }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ - if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; - }else{ - pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; } - }else{ - rc = SQLITE_ERROR; } - } - if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ - rc = SQLITE_ERROR; - } + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } - if( rc!=SQLITE_OK ){ - fts5TriDelete((Fts5Tokenizer*)pNew); - pNew = 0; + if( rc!=SQLITE_OK ){ + fts5TriDelete((Fts5Tokenizer*)pNew); + pNew = 0; + } } } *ppOut = (Fts5Tokenizer*)pNew; diff --git a/libsql-ffi/bundled/bindings/bindgen.rs b/libsql-ffi/bundled/bindings/bindgen.rs index 9dec505c10..cc73807f33 100644 --- a/libsql-ffi/bundled/bindings/bindgen.rs +++ b/libsql-ffi/bundled/bindings/bindgen.rs @@ -940,7 +940,7 @@ extern "C" { extern "C" { pub fn sqlite3_vmprintf( arg1: *const ::std::os::raw::c_char, - arg2: va_list, + arg2: *mut __va_list_tag, ) -> *mut ::std::os::raw::c_char; } extern "C" { @@ -956,7 +956,7 @@ extern "C" { arg1: ::std::os::raw::c_int, arg2: *mut ::std::os::raw::c_char, arg3: *const ::std::os::raw::c_char, - arg4: va_list, + arg4: *mut __va_list_tag, ) -> *mut ::std::os::raw::c_char; } extern "C" { @@ -2503,7 +2503,7 @@ extern "C" { pub fn sqlite3_str_vappendf( arg1: *mut sqlite3_str, zFormat: *const 
::std::os::raw::c_char, - arg2: va_list, + arg2: *mut __va_list_tag, ); } extern "C" { @@ -3524,4 +3524,12 @@ extern "C" { extern "C" { pub static sqlite3_wal_manager: libsql_wal_manager; } -pub type __builtin_va_list = *mut ::std::os::raw::c_char; +pub type __builtin_va_list = [__va_list_tag; 1usize]; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct __va_list_tag { + pub gp_offset: ::std::os::raw::c_uint, + pub fp_offset: ::std::os::raw::c_uint, + pub overflow_arg_area: *mut ::std::os::raw::c_void, + pub reg_save_area: *mut ::std::os::raw::c_void, +} diff --git a/libsql-ffi/bundled/src/sqlite3.c b/libsql-ffi/bundled/src/sqlite3.c index 9d51c1d645..3a76f9cff3 100644 --- a/libsql-ffi/bundled/src/sqlite3.c +++ b/libsql-ffi/bundled/src/sqlite3.c @@ -28,6 +28,7 @@ ** README.md ** configure ** configure.ac +** ext/fts5/fts5_tokenize.c ** ext/jni/src/org/sqlite/jni/capi/CollationNeededCallback.java ** ext/jni/src/org/sqlite/jni/capi/CommitHookCallback.java ** ext/jni/src/org/sqlite/jni/capi/PreupdateHookCallback.java @@ -69,6 +70,7 @@ ** src/test2.c ** src/test3.c ** src/test8.c +** src/vacuum.c ** src/vdbe.c ** src/vdbeInt.h ** src/vdbeapi.c @@ -155952,6 +155954,10 @@ SQLITE_PRIVATE void sqlite3UpsertDoUpdate( /* #include "sqliteInt.h" */ /* #include "vdbeInt.h" */ +#ifndef SQLITE_OMIT_VECTOR +/* #include "vectorIndexInt.h" */ +#endif + #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* @@ -156229,6 +156235,27 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( if( rc!=SQLITE_OK ) goto end_of_vacuum; db->init.iDb = 0; +#ifndef SQLITE_OMIT_VECTOR + // shadow tables for vector index will be populated automatically during CREATE INDEX command + // so we must skip them at this step + if( sqlite3FindTable(db, VECTOR_INDEX_GLOBAL_META_TABLE, zDbMain) != NULL ){ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND 
coalesce(rootpage,1)>0 AND name NOT IN (SELECT name||'_shadow' FROM " VECTOR_INDEX_GLOBAL_META_TABLE ")", + zDbMain + ); + }else{ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0", + zDbMain + ); + } +#else /* Loop through the tables in the main database. For each, do ** an "INSERT INTO vacuum_db.xxx SELECT * FROM main.xxx;" to copy ** the contents to the temporary database. @@ -156240,6 +156267,7 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( "WHERE type='table'AND coalesce(rootpage,1)>0", zDbMain ); +#endif assert( (db->mDbFlags & DBFLAG_Vacuum)!=0 ); db->mDbFlags &= ~DBFLAG_Vacuum; if( rc!=SQLITE_OK ) goto end_of_vacuum; @@ -211974,6 +212002,7 @@ int diskAnnCreateIndex( int type, dims; u64 maxNeighborsParam, blockSizeBytes; char *zSql; + const char *zRowidColumnName; char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) 
if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){ @@ -212041,6 +212070,7 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "index_key"; }else{ zSql = sqlite3MPrintf( db, @@ -212050,9 +212080,31 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "rowid"; } rc = sqlite3_exec(db, zSql, 0, 0, 0); sqlite3DbFree(db, zSql); + if( rc != SQLITE_OK ){ + return rc; + } + /* + * vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB) + * in this case, main table B-Tree takes on redundant shape where all leaf nodes has only 1 cell + * + * as we have a query which selects random row using OFFSET/LIMIT trick - we will need to read all these leaf nodes pages just to skip them + * so, in order to remove this overhead for random row selection - we creating an index with just single column used + * in this case B-Tree leafs will be full of rowids and the overhead for page reads will be very small + */ + zSql = sqlite3MPrintf( + db, + "CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)", + zDbSName, + zIdxName, + zIdxName, + zRowidColumnName + ); + rc = sqlite3_exec(db, zSql, 0, 0, 0); + sqlite3DbFree(db, zSql); return rc; } @@ -212082,8 +212134,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid) zSql = sqlite3MPrintf( pIndex->db, - "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)", - pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow + "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)", + pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow ); if( zSql == NULL ){ rc = SQLITE_NOMEM_BKPT; @@ -213658,11 +213710,6 @@ int vectorF64ParseSqliteBlob( ** VectorIdxParams utilities 
****************************************************************************/ -// VACUUM creates tables and indices first and only then populate data -// we need to ignore inserts from 'INSERT INTO vacuum.t SELECT * FROM t' statements because -// all shadow tables will be populated by VACUUM process during regular process of table copy -#define IsVacuum(db) ((db->mDbFlags&DBFLAG_Vacuum)!=0) - void vectorIdxParamsInit(VectorIdxParams *pParams, u8 *pBinBuf, int nBinSize) { assert( nBinSize <= VECTOR_INDEX_PARAMS_BUF_SIZE ); @@ -214381,10 +214428,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { // this is done to prevent unrecoverable situations where index were dropped but index parameters deletion failed and second attempt will fail on first step int rcIdx, rcParams; - if( IsVacuum(db) ){ - return SQLITE_OK; - } - assert( zDbSName != NULL ); rcIdx = diskAnnDropIndex(db, zDbSName, zIdxName); @@ -214395,10 +214438,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { assert( zDbSName != NULL ); - if( IsVacuum(db) ){ - return SQLITE_OK; - } - return diskAnnClearIndex(db, zDbSName, zIdxName); } @@ -214408,7 +214447,7 @@ int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { * this made intentionally in order to natively support upload of SQLite dumps * * dump populates tables first and create indices after - * so we must omit them because shadow tables already filled + * so we must omit index refill setp because shadow tables already filled * * 1. in case of any error :-1 returned (and pParse errMsg is populated with some error message) * 2. 
if vector index must not be created : 0 returned @@ -214426,10 +214465,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co int hasLibsqlVectorIdxFn = 0, hasCollation = 0; const char *pzErrMsg; - if( IsVacuum(pParse->db) ){ - return CREATE_IGNORE; - } - assert( zDbSName != NULL ); sqlite3 *db = pParse->db; @@ -214488,11 +214523,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: must contain exactly one column wrapped into the " VECTOR_INDEX_MARKER_FUNCTION " function"); return CREATE_FAIL; } - // we are able to support this but I doubt this works for now - more polishing required to make this work - if( pIdx->pPartIdxWhere != NULL ) { - sqlite3ErrorMsg(pParse, "vector index: where condition is forbidden"); - return CREATE_FAIL; - } pArgsList = pIdx->aColExpr->a[0].pExpr->x.pList; pListItem = pArgsList->a; @@ -214582,7 +214612,6 @@ int vectorIndexSearch( VectorIdxParams idxParams; vectorIdxParamsInit(&idxParams, NULL, 0); - assert( !IsVacuum(db) ); assert( zDbSName != NULL ); if( argc != 3 ){ @@ -214667,10 +214696,6 @@ int vectorIndexInsert( int rc; VectorInRow vectorInRow; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - rc = vectorInRowAlloc(pCur->db, pRecord, &vectorInRow, pzErrMsg); if( rc != SQLITE_OK ){ return rc; @@ -214690,10 +214715,6 @@ int vectorIndexDelete( ){ VectorInRow payload; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - payload.pVector = NULL; payload.nKeys = r->nField - 1; payload.pKeyValues = r->aMem + 1; @@ -259749,40 +259770,46 @@ static int fts5TriCreate( Fts5Tokenizer **ppOut ){ int rc = SQLITE_OK; - TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); - UNUSED_PARAM(pUnused); - if( pNew==0 ){ - rc = SQLITE_NOMEM; + TrigramTokenizer *pNew = 0; + + if( nArg%2 ){ + rc = SQLITE_ERROR; }else{ - int i; - pNew->bFold = 1; - pNew->iFoldParam = 0; - for(i=0; rc==SQLITE_OK && ibFold = 1; + pNew->iFoldParam = 0; 
+ for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); + } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ - pNew->bFold = (zArg[0]=='0'); - } - }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ - if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; - }else{ - pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; } - }else{ - rc = SQLITE_ERROR; } - } - if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ - rc = SQLITE_ERROR; - } + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } - if( rc!=SQLITE_OK ){ - fts5TriDelete((Fts5Tokenizer*)pNew); - pNew = 0; + if( rc!=SQLITE_OK ){ + fts5TriDelete((Fts5Tokenizer*)pNew); + pNew = 0; + } } } *ppOut = (Fts5Tokenizer*)pNew; diff --git a/libsql-replication/Cargo.toml b/libsql-replication/Cargo.toml index 56f00d7a7d..068e23a652 100644 --- a/libsql-replication/Cargo.toml +++ b/libsql-replication/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql_replication" -version = "0.4.0" +version = "0.5.0" edition = "2021" description = "libSQL replication protocol" repository = "https://github.com/tursodatabase/libsql" @@ -11,7 +11,8 @@ license = "MIT" [dependencies] tonic = { version = "0.11", features = ["tls"] } prost = "0.12" -libsql-sys = { version = "0.6", path = "../libsql-sys", default-features = false, features = ["wal", "rusqlite", "api"] } +libsql-sys = { version = "0.7", path = "../libsql-sys", default-features = false, features = ["wal", "rusqlite", "api"] } +libsql-wal = { path = "../libsql-wal/", optional = true } rusqlite = { workspace = true } parking_lot = "0.12.1" bytes = { version = "1.5.0", features = ["serde"] } @@ -37,3 +38,4 @@ tonic-build = "0.11" [features] encryption = ["libsql-sys/encryption"] +libsql_wal = ["dep:libsql-wal"] diff --git a/libsql-replication/proto/replication_log.proto 
b/libsql-replication/proto/replication_log.proto index 6208874609..b358232705 100644 --- a/libsql-replication/proto/replication_log.proto +++ b/libsql-replication/proto/replication_log.proto @@ -5,6 +5,12 @@ import "metadata.proto"; message LogOffset { uint64 next_offset = 1; + enum WalFlavor { + Sqlite = 0; + Libsql = 1; + } + // the type of wal frames that the client is expecting + optional WalFlavor wal_flavor = 2; } message HelloRequest { diff --git a/libsql-replication/src/frame.rs b/libsql-replication/src/frame.rs index a6a2854e52..55b5b778b5 100644 --- a/libsql-replication/src/frame.rs +++ b/libsql-replication/src/frame.rs @@ -13,7 +13,6 @@ use crate::LIBSQL_PAGE_SIZE; pub type FrameNo = u64; /// The file header for the WAL log. All fields are represented in little-endian ordering. -/// See `encode` and `decode` for actual layout. // repr C for stable sizing #[repr(C)] #[derive(Debug, Clone, Copy, zerocopy::FromZeroes, zerocopy::FromBytes, zerocopy::AsBytes)] @@ -22,7 +21,7 @@ pub struct FrameHeader { pub frame_no: lu64, /// Rolling checksum of all the previous frames, including this one. pub checksum: lu64, - /// page number, if frame_type is FrameType::Page + /// page number pub page_no: lu32, /// Size of the database (in page) after committing the transaction. This is passed from sqlite, /// and serves as commit transaction boundary diff --git a/libsql-replication/src/generated/wal_log.rs b/libsql-replication/src/generated/wal_log.rs index 2d7330e732..a34d5e59dd 100644 --- a/libsql-replication/src/generated/wal_log.rs +++ b/libsql-replication/src/generated/wal_log.rs @@ -4,6 +4,48 @@ pub struct LogOffset { #[prost(uint64, tag = "1")] pub next_offset: u64, + /// the type of wal frames that the client is expecting + #[prost(enumeration = "log_offset::WalFlavor", optional, tag = "2")] + pub wal_flavor: ::core::option::Option, +} +/// Nested message and enum types in `LogOffset`. 
+pub mod log_offset { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum WalFlavor { + Sqlite = 0, + Libsql = 1, + } + impl WalFlavor { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + WalFlavor::Sqlite => "Sqlite", + WalFlavor::Libsql => "Libsql", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "Sqlite" => Some(Self::Sqlite), + "Libsql" => Some(Self::Libsql), + _ => None, + } + } + } } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/libsql-replication/src/injector/error.rs b/libsql-replication/src/injector/error.rs index 14899089ea..ac8f1be711 100644 --- a/libsql-replication/src/injector/error.rs +++ b/libsql-replication/src/injector/error.rs @@ -1,9 +1,12 @@ +pub type Result = std::result::Result; +pub type BoxError = Box; + #[derive(Debug, thiserror::Error)] pub enum Error { #[error("IO error: {0}")] Io(#[from] std::io::Error), #[error("SQLite error: {0}")] Sqlite(#[from] rusqlite::Error), - #[error("A fatal error occured injecting frames")] - FatalInjectError, + #[error("A fatal error occured injecting frames: {0}")] + FatalInjectError(BoxError), } diff --git a/libsql-replication/src/injector/libsql_injector.rs b/libsql-replication/src/injector/libsql_injector.rs new file mode 100644 index 0000000000..7c01522e1d --- /dev/null +++ b/libsql-replication/src/injector/libsql_injector.rs @@ -0,0 +1,52 @@ +use std::mem::size_of; + +use libsql_wal::io::StdIO; +use libsql_wal::replication::injector::Injector; +use libsql_wal::segment::Frame as WalFrame; +use 
zerocopy::{AsBytes, FromZeroes}; + +use crate::frame::FrameNo; +use crate::rpc::replication::Frame as RpcFrame; + +use super::error::{Error, Result}; + +pub struct LibsqlInjector { + injector: Injector, +} + +impl LibsqlInjector { + pub fn new(injector: Injector) -> Self { + Self { injector } + } +} + +impl super::Injector for LibsqlInjector { + async fn inject_frame(&mut self, frame: RpcFrame) -> Result> { + // this is a bit annoying be we want to read the frame, and it has to be aligned, so we + // must copy it... + // FIXME: optimize this. + let mut wal_frame = WalFrame::new_box_zeroed(); + if frame.data.len() != size_of::() { + todo!("invalid frame"); + } + wal_frame.as_bytes_mut().copy_from_slice(&frame.data[..]); + + Ok(self + .injector + .insert_frame(wal_frame) + .await + .map_err(|e| Error::FatalInjectError(e.into()))?) + } + + async fn rollback(&mut self) { + self.injector.rollback(); + } + + async fn flush(&mut self) -> Result> { + self.injector + .flush(None) + .await + .map_err(|e| Error::FatalInjectError(e.into()))?; + Ok(None) + } +} diff --git a/libsql-replication/src/injector/mod.rs b/libsql-replication/src/injector/mod.rs index 80443964fe..b139f07cc9 100644 --- a/libsql-replication/src/injector/mod.rs +++ b/libsql-replication/src/injector/mod.rs @@ -1,299 +1,32 @@ -use std::path::Path; -use std::sync::Arc; -use std::{collections::VecDeque, path::PathBuf}; +use std::future::Future; -use parking_lot::Mutex; -use rusqlite::OpenFlags; +use super::rpc::replication::Frame as RpcFrame; +#[cfg(feature = "libsql_wal")] +pub use libsql_injector::LibsqlInjector; +pub use sqlite_injector::SqliteInjector; -use crate::frame::{Frame, FrameNo}; +use crate::frame::FrameNo; pub use error::Error; - -use self::injector_wal::{ - InjectorWal, InjectorWalManager, LIBSQL_INJECT_FATAL, LIBSQL_INJECT_OK, LIBSQL_INJECT_OK_TXN, -}; +use error::Result; mod error; -mod headers; -mod injector_wal; - -#[derive(Debug)] -pub enum InjectError {} - -pub type FrameBuffer = Arc>>; - 
-pub struct Injector { - /// The injector is in a transaction state - is_txn: bool, - /// Buffer for holding current transaction frames - buffer: FrameBuffer, - /// Maximum capacity of the frame buffer - capacity: usize, - /// Injector connection - // connection must be dropped before the hook context - connection: Arc>>, - biggest_uncommitted_seen: FrameNo, - - // Connection config items used to recreate the injection connection - path: PathBuf, - encryption_config: Option, - auto_checkpoint: u32, -} - -/// Methods from this trait are called before and after performing a frame injection. -/// This trait trait is used to record the last committed frame_no to the log. -/// The implementer can persist the pre and post commit frame no, and compare them in the event of -/// a crash; if the pre and post commit frame_no don't match, then the log may be corrupted. -impl Injector { - pub fn new( - path: impl AsRef, - capacity: usize, - auto_checkpoint: u32, - encryption_config: Option, - ) -> Result { - let path = path.as_ref().to_path_buf(); - - let buffer = FrameBuffer::default(); - let wal_manager = InjectorWalManager::new(buffer.clone()); - let connection = libsql_sys::Connection::open( - &path, - OpenFlags::SQLITE_OPEN_READ_WRITE - | OpenFlags::SQLITE_OPEN_CREATE - | OpenFlags::SQLITE_OPEN_URI - | OpenFlags::SQLITE_OPEN_NO_MUTEX, - wal_manager, - auto_checkpoint, - encryption_config.clone(), - )?; - - Ok(Self { - is_txn: false, - buffer, - capacity, - connection: Arc::new(Mutex::new(connection)), - biggest_uncommitted_seen: 0, - - path, - encryption_config, - auto_checkpoint, - }) - } - - /// Inject a frame into the log. If this was a commit frame, returns Ok(Some(FrameNo)). 
- pub fn inject_frame(&mut self, frame: Frame) -> Result, Error> { - let frame_close_txn = frame.header().size_after.get() != 0; - self.buffer.lock().push_back(frame); - if frame_close_txn || self.buffer.lock().len() >= self.capacity { - return self.flush(); - } +#[cfg(feature = "libsql_wal")] +mod libsql_injector; +mod sqlite_injector; - Ok(None) - } +pub trait Injector { + /// Inject a singular frame. + fn inject_frame( + &mut self, + frame: RpcFrame, + ) -> impl Future>> + Send; - pub fn rollback(&mut self) { - let conn = self.connection.lock(); - let mut rollback = conn.prepare_cached("ROLLBACK").unwrap(); - let _ = rollback.execute(()); - self.is_txn = false; - } + /// Discard any uncommintted frames. + fn rollback(&mut self) -> impl Future + Send; /// Flush the buffer to libsql WAL. /// Trigger a dummy write, and flush the cache to trigger a call to xFrame. The buffer's frame /// are then injected into the wal. - pub fn flush(&mut self) -> Result, Error> { - match self.try_flush() { - Err(e) => { - // something went wrong, rollback the connection to make sure we can retry in a - // clean state - self.biggest_uncommitted_seen = 0; - self.rollback(); - Err(e) - } - Ok(ret) => Ok(ret), - } - } - - fn try_flush(&mut self) -> Result, Error> { - if !self.is_txn { - self.begin_txn()?; - } - - let lock = self.buffer.lock(); - // the frames in the buffer are either monotonically increasing (log) or decreasing - // (snapshot). 
Either way, we want to find the biggest frameno we're about to commit, and - // that is either the front or the back of the buffer - let last_frame_no = match lock.back().zip(lock.front()) { - Some((b, f)) => f.header().frame_no.get().max(b.header().frame_no.get()), - None => { - tracing::trace!("nothing to inject"); - return Ok(None); - } - }; - - self.biggest_uncommitted_seen = self.biggest_uncommitted_seen.max(last_frame_no); - - drop(lock); - - let connection = self.connection.lock(); - // use prepare cached to avoid parsing the same statement over and over again. - let mut stmt = - connection.prepare_cached("INSERT INTO libsql_temp_injection VALUES (42)")?; - - // We execute the statement, and then force a call to xframe if necesacary. If the execute - // succeeds, then xframe wasn't called, in this case, we call cache_flush, and then process - // the error. - // It is unexpected that execute flushes, but it is possible, so we handle that case. - match stmt.execute(()).and_then(|_| connection.cache_flush()) { - Ok(_) => panic!("replication hook was not called"), - Err(e) => { - if let Some(e) = e.sqlite_error() { - if e.extended_code == LIBSQL_INJECT_OK { - // refresh schema - connection.pragma_update(None, "writable_schema", "reset")?; - let mut rollback = connection.prepare_cached("ROLLBACK")?; - let _ = rollback.execute(()); - self.is_txn = false; - assert!(self.buffer.lock().is_empty()); - let commit_frame_no = self.biggest_uncommitted_seen; - self.biggest_uncommitted_seen = 0; - return Ok(Some(commit_frame_no)); - } else if e.extended_code == LIBSQL_INJECT_OK_TXN { - self.is_txn = true; - assert!(self.buffer.lock().is_empty()); - return Ok(None); - } else if e.extended_code == LIBSQL_INJECT_FATAL { - return Err(Error::FatalInjectError); - } - } - - Err(Error::FatalInjectError) - } - } - } - - fn begin_txn(&mut self) -> Result<(), Error> { - let mut conn = self.connection.lock(); - - { - let wal_manager = InjectorWalManager::new(self.buffer.clone()); - let 
new_conn = libsql_sys::Connection::open( - &self.path, - OpenFlags::SQLITE_OPEN_READ_WRITE - | OpenFlags::SQLITE_OPEN_CREATE - | OpenFlags::SQLITE_OPEN_URI - | OpenFlags::SQLITE_OPEN_NO_MUTEX, - wal_manager, - self.auto_checkpoint, - self.encryption_config.clone(), - )?; - - let _ = std::mem::replace(&mut *conn, new_conn); - } - - conn.pragma_update(None, "writable_schema", "true")?; - - let mut stmt = conn.prepare_cached("BEGIN IMMEDIATE")?; - stmt.execute(())?; - // we create a dummy table. This table MUST not be persisted, otherwise the replica schema - // would differ with the primary's. - let mut stmt = - conn.prepare_cached("CREATE TABLE IF NOT EXISTS libsql_temp_injection (x)")?; - stmt.execute(())?; - - Ok(()) - } - - pub fn clear_buffer(&mut self) { - self.buffer.lock().clear() - } - - #[cfg(test)] - pub fn is_txn(&self) -> bool { - self.is_txn - } -} - -#[cfg(test)] -mod test { - use crate::frame::FrameBorrowed; - use std::mem::size_of; - - use super::*; - /// this this is generated by creating a table test, inserting 5 rows into it, and then - /// truncating the wal file of it's header. 
- const WAL: &[u8] = include_bytes!("../../assets/test/test_wallog"); - - fn wal_log() -> impl Iterator { - WAL.chunks(size_of::()) - .map(|b| Frame::try_from(b).unwrap()) - } - - #[test] - fn test_simple_inject_frames() { - let temp = tempfile::tempdir().unwrap(); - - let mut injector = Injector::new(temp.path().join("data"), 10, 10000, None).unwrap(); - let log = wal_log(); - for frame in log { - injector.inject_frame(frame).unwrap(); - } - - let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); - - conn.query_row("SELECT COUNT(*) FROM test", (), |row| { - assert_eq!(row.get::<_, usize>(0).unwrap(), 5); - Ok(()) - }) - .unwrap(); - } - - #[test] - fn test_inject_frames_split_txn() { - let temp = tempfile::tempdir().unwrap(); - - // inject one frame at a time - let mut injector = Injector::new(temp.path().join("data"), 1, 10000, None).unwrap(); - let log = wal_log(); - for frame in log { - injector.inject_frame(frame).unwrap(); - } - - let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); - - conn.query_row("SELECT COUNT(*) FROM test", (), |row| { - assert_eq!(row.get::<_, usize>(0).unwrap(), 5); - Ok(()) - }) - .unwrap(); - } - - #[test] - fn test_inject_partial_txn_isolated() { - let temp = tempfile::tempdir().unwrap(); - - // inject one frame at a time - let mut injector = Injector::new(temp.path().join("data"), 10, 1000, None).unwrap(); - let mut frames = wal_log(); - - assert!(injector - .inject_frame(frames.next().unwrap()) - .unwrap() - .is_none()); - let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); - assert!(conn - .query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) - .is_err()); - - while injector - .inject_frame(frames.next().unwrap()) - .unwrap() - .is_none() - {} - - // reset schema - conn.pragma_update(None, "writable_schema", "reset") - .unwrap(); - conn.query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) - .unwrap(); - } + fn flush(&mut self) -> impl Future>> + Send; } diff 
--git a/libsql-replication/src/injector/headers.rs b/libsql-replication/src/injector/sqlite_injector/headers.rs similarity index 100% rename from libsql-replication/src/injector/headers.rs rename to libsql-replication/src/injector/sqlite_injector/headers.rs diff --git a/libsql-replication/src/injector/injector_wal.rs b/libsql-replication/src/injector/sqlite_injector/injector_wal.rs similarity index 100% rename from libsql-replication/src/injector/injector_wal.rs rename to libsql-replication/src/injector/sqlite_injector/injector_wal.rs diff --git a/libsql-replication/src/injector/sqlite_injector/mod.rs b/libsql-replication/src/injector/sqlite_injector/mod.rs new file mode 100644 index 0000000000..f6ce2aa89f --- /dev/null +++ b/libsql-replication/src/injector/sqlite_injector/mod.rs @@ -0,0 +1,348 @@ +use std::path::Path; +use std::sync::Arc; +use std::{collections::VecDeque, path::PathBuf}; + +use parking_lot::Mutex; +use rusqlite::OpenFlags; +use tokio::task::spawn_blocking; + +use crate::frame::{Frame, FrameNo}; +use crate::rpc::replication::Frame as RpcFrame; + +use self::injector_wal::{ + InjectorWal, InjectorWalManager, LIBSQL_INJECT_FATAL, LIBSQL_INJECT_OK, LIBSQL_INJECT_OK_TXN, +}; + +use super::error::Result; +use super::{Error, Injector}; + +mod headers; +mod injector_wal; + +pub type FrameBuffer = Arc>>; + +pub struct SqliteInjector { + pub(in super::super) inner: Arc>, +} + +impl Injector for SqliteInjector { + async fn inject_frame(&mut self, frame: RpcFrame) -> Result> { + let inner = self.inner.clone(); + let frame = + Frame::try_from(&frame.data[..]).map_err(|e| Error::FatalInjectError(e.into()))?; + spawn_blocking(move || inner.lock().inject_frame(frame)) + .await + .unwrap() + } + + async fn rollback(&mut self) { + let inner = self.inner.clone(); + spawn_blocking(move || inner.lock().rollback()) + .await + .unwrap(); + } + + async fn flush(&mut self) -> Result> { + let inner = self.inner.clone(); + spawn_blocking(move || 
inner.lock().flush()).await.unwrap() + } +} + +impl SqliteInjector { + pub async fn new( + path: PathBuf, + capacity: usize, + auto_checkpoint: u32, + encryption_config: Option, + ) -> super::Result { + let inner = spawn_blocking(move || { + SqliteInjectorInner::new(path, capacity, auto_checkpoint, encryption_config) + }) + .await + .unwrap()?; + + Ok(Self { + inner: Arc::new(Mutex::new(inner)), + }) + } +} + +pub(in super::super) struct SqliteInjectorInner { + /// The injector is in a transaction state + is_txn: bool, + /// Buffer for holding current transaction frames + buffer: FrameBuffer, + /// Maximum capacity of the frame buffer + capacity: usize, + /// Injector connection + // connection must be dropped before the hook context + connection: Arc>>, + biggest_uncommitted_seen: FrameNo, + + // Connection config items used to recreate the injection connection + path: PathBuf, + encryption_config: Option, + auto_checkpoint: u32, +} + +/// Methods from this trait are called before and after performing a frame injection. +/// This trait trait is used to record the last committed frame_no to the log. +/// The implementer can persist the pre and post commit frame no, and compare them in the event of +/// a crash; if the pre and post commit frame_no don't match, then the log may be corrupted. 
+impl SqliteInjectorInner { + fn new( + path: impl AsRef, + capacity: usize, + auto_checkpoint: u32, + encryption_config: Option, + ) -> Result { + let path = path.as_ref().to_path_buf(); + + let buffer = FrameBuffer::default(); + let wal_manager = InjectorWalManager::new(buffer.clone()); + let connection = libsql_sys::Connection::open( + &path, + OpenFlags::SQLITE_OPEN_READ_WRITE + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX, + wal_manager, + auto_checkpoint, + encryption_config.clone(), + )?; + + Ok(Self { + is_txn: false, + buffer, + capacity, + connection: Arc::new(Mutex::new(connection)), + biggest_uncommitted_seen: 0, + + path, + encryption_config, + auto_checkpoint, + }) + } + + /// Inject a frame into the log. If this was a commit frame, returns Ok(Some(FrameNo)). + pub fn inject_frame(&mut self, frame: Frame) -> Result, Error> { + let frame_close_txn = frame.header().size_after.get() != 0; + self.buffer.lock().push_back(frame); + if frame_close_txn || self.buffer.lock().len() >= self.capacity { + return self.flush(); + } + + Ok(None) + } + + pub fn rollback(&mut self) { + self.clear_buffer(); + let conn = self.connection.lock(); + let mut rollback = conn.prepare_cached("ROLLBACK").unwrap(); + let _ = rollback.execute(()); + self.is_txn = false; + } + + /// Flush the buffer to libsql WAL. + /// Trigger a dummy write, and flush the cache to trigger a call to xFrame. The buffer's frame + /// are then injected into the wal. 
+ pub fn flush(&mut self) -> Result, Error> { + match self.try_flush() { + Err(e) => { + // something went wrong, rollback the connection to make sure we can retry in a + // clean state + self.biggest_uncommitted_seen = 0; + self.rollback(); + Err(e) + } + Ok(ret) => Ok(ret), + } + } + + fn try_flush(&mut self) -> Result, Error> { + if !self.is_txn { + self.begin_txn()?; + } + + let lock = self.buffer.lock(); + // the frames in the buffer are either monotonically increasing (log) or decreasing + // (snapshot). Either way, we want to find the biggest frameno we're about to commit, and + // that is either the front or the back of the buffer + let last_frame_no = match lock.back().zip(lock.front()) { + Some((b, f)) => f.header().frame_no.get().max(b.header().frame_no.get()), + None => { + tracing::trace!("nothing to inject"); + return Ok(None); + } + }; + + self.biggest_uncommitted_seen = self.biggest_uncommitted_seen.max(last_frame_no); + + drop(lock); + + let connection = self.connection.lock(); + // use prepare cached to avoid parsing the same statement over and over again. + let mut stmt = + connection.prepare_cached("INSERT INTO libsql_temp_injection VALUES (42)")?; + + // We execute the statement, and then force a call to xframe if necesacary. If the execute + // succeeds, then xframe wasn't called, in this case, we call cache_flush, and then process + // the error. + // It is unexpected that execute flushes, but it is possible, so we handle that case. 
+ match stmt.execute(()).and_then(|_| connection.cache_flush()) { + Ok(_) => panic!("replication hook was not called"), + Err(e) => { + if let Some(err) = e.sqlite_error() { + if err.extended_code == LIBSQL_INJECT_OK { + // refresh schema + connection.pragma_update(None, "writable_schema", "reset")?; + let mut rollback = connection.prepare_cached("ROLLBACK")?; + let _ = rollback.execute(()); + self.is_txn = false; + assert!(self.buffer.lock().is_empty()); + let commit_frame_no = self.biggest_uncommitted_seen; + self.biggest_uncommitted_seen = 0; + return Ok(Some(commit_frame_no)); + } else if err.extended_code == LIBSQL_INJECT_OK_TXN { + self.is_txn = true; + assert!(self.buffer.lock().is_empty()); + return Ok(None); + } else if err.extended_code == LIBSQL_INJECT_FATAL { + return Err(Error::FatalInjectError(e.into())); + } + } + + Err(Error::FatalInjectError(e.into())) + } + } + } + + fn begin_txn(&mut self) -> Result<(), Error> { + let mut conn = self.connection.lock(); + + { + let wal_manager = InjectorWalManager::new(self.buffer.clone()); + let new_conn = libsql_sys::Connection::open( + &self.path, + OpenFlags::SQLITE_OPEN_READ_WRITE + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX, + wal_manager, + self.auto_checkpoint, + self.encryption_config.clone(), + )?; + + let _ = std::mem::replace(&mut *conn, new_conn); + } + + conn.pragma_update(None, "writable_schema", "true")?; + + let mut stmt = conn.prepare_cached("BEGIN IMMEDIATE")?; + stmt.execute(())?; + // we create a dummy table. This table MUST not be persisted, otherwise the replica schema + // would differ with the primary's. 
+ let mut stmt = + conn.prepare_cached("CREATE TABLE IF NOT EXISTS libsql_temp_injection (x)")?; + stmt.execute(())?; + + Ok(()) + } + + pub fn clear_buffer(&mut self) { + self.buffer.lock().clear() + } + + #[cfg(test)] + pub fn is_txn(&self) -> bool { + self.is_txn + } +} + +#[cfg(test)] +mod test { + use crate::frame::FrameBorrowed; + use std::mem::size_of; + + use super::*; + /// this this is generated by creating a table test, inserting 5 rows into it, and then + /// truncating the wal file of it's header. + const WAL: &[u8] = include_bytes!("../../../assets/test/test_wallog"); + + fn wal_log() -> impl Iterator { + WAL.chunks(size_of::()) + .map(|b| Frame::try_from(b).unwrap()) + } + + #[test] + fn test_simple_inject_frames() { + let temp = tempfile::tempdir().unwrap(); + + let mut injector = + SqliteInjectorInner::new(temp.path().join("data"), 10, 10000, None).unwrap(); + let log = wal_log(); + for frame in log { + injector.inject_frame(frame).unwrap(); + } + + let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); + + conn.query_row("SELECT COUNT(*) FROM test", (), |row| { + assert_eq!(row.get::<_, usize>(0).unwrap(), 5); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_inject_frames_split_txn() { + let temp = tempfile::tempdir().unwrap(); + + // inject one frame at a time + let mut injector = + SqliteInjectorInner::new(temp.path().join("data"), 1, 10000, None).unwrap(); + let log = wal_log(); + for frame in log { + injector.inject_frame(frame).unwrap(); + } + + let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); + + conn.query_row("SELECT COUNT(*) FROM test", (), |row| { + assert_eq!(row.get::<_, usize>(0).unwrap(), 5); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_inject_partial_txn_isolated() { + let temp = tempfile::tempdir().unwrap(); + + // inject one frame at a time + let mut injector = + SqliteInjectorInner::new(temp.path().join("data"), 10, 1000, None).unwrap(); + let mut frames = wal_log(); + + 
assert!(injector + .inject_frame(frames.next().unwrap()) + .unwrap() + .is_none()); + let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); + assert!(conn + .query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) + .is_err()); + + while injector + .inject_frame(frames.next().unwrap()) + .unwrap() + .is_none() + {} + + // reset schema + conn.pragma_update(None, "writable_schema", "reset") + .unwrap(); + conn.query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) + .unwrap(); + } +} diff --git a/libsql-replication/src/replicator.rs b/libsql-replication/src/replicator.rs index bc1eada7f8..38cdbf6e7c 100644 --- a/libsql-replication/src/replicator.rs +++ b/libsql-replication/src/replicator.rs @@ -1,14 +1,11 @@ use std::path::PathBuf; -use std::sync::Arc; -use parking_lot::Mutex; -use tokio::task::spawn_blocking; use tokio::time::Duration; use tokio_stream::{Stream, StreamExt}; use tonic::{Code, Status}; use crate::frame::{Frame, FrameNo}; -use crate::injector::Injector; +use crate::injector::{Injector, SqliteInjector}; use crate::rpc::replication::{ Frame as RpcFrame, NAMESPACE_DOESNT_EXIST, NEED_SNAPSHOT_ERROR_MSG, NO_HELLO_ERROR_MSG, }; @@ -66,7 +63,7 @@ impl From for Error { #[async_trait::async_trait] pub trait ReplicatorClient { - type FrameStream: Stream> + Unpin + Send; + type FrameStream: Stream> + Unpin + Send; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error>; @@ -137,9 +134,9 @@ where /// The `Replicator`'s duty is to download frames from the primary, and pass them to the injector at /// transaction boundaries. 
-pub struct Replicator { +pub struct Replicator { client: C, - injector: Arc>, + injector: I, state: ReplicatorState, frames_synced: usize, } @@ -154,33 +151,41 @@ enum ReplicatorState { Exit, } -impl Replicator { +impl Replicator +where + C: ReplicatorClient, +{ /// Creates a replicator for the db file pointed at by `db_path` - pub async fn new( + pub async fn new_sqlite( client: C, db_path: PathBuf, auto_checkpoint: u32, encryption_config: Option, ) -> Result { - let injector = { - let db_path = db_path.clone(); - spawn_blocking(move || { - Injector::new( - db_path, - INJECTOR_BUFFER_CAPACITY, - auto_checkpoint, - encryption_config, - ) - }) - .await?? - }; + let injector = SqliteInjector::new( + db_path.clone(), + INJECTOR_BUFFER_CAPACITY, + auto_checkpoint, + encryption_config, + ) + .await?; + + Ok(Self::new(client, injector)) + } +} - Ok(Self { +impl Replicator +where + C: ReplicatorClient, + I: Injector, +{ + pub fn new(client: C, injector: I) -> Self { + Self { client, - injector: Arc::new(Mutex::new(injector)), + injector, state: ReplicatorState::NeedHandshake, frames_synced: 0, - }) + } } /// for a handshake on next call to replicate. @@ -250,7 +255,7 @@ impl Replicator { // in case of error we rollback the current injector transaction, and start over. 
if ret.is_err() { self.client.rollback(); - self.injector.lock().rollback(); + self.injector.rollback().await; } self.state = match ret { @@ -293,7 +298,8 @@ impl Replicator { } async fn load_snapshot(&mut self) -> Result<(), Error> { - self.injector.lock().clear_buffer(); + self.client.rollback(); + self.injector.rollback().await; loop { match self.client.snapshot().await { Ok(mut stream) => { @@ -312,29 +318,25 @@ impl Replicator { } } - async fn inject_frame(&mut self, frame: Frame) -> Result<(), Error> { + async fn inject_frame(&mut self, frame: RpcFrame) -> Result<(), Error> { self.frames_synced += 1; - let injector = self.injector.clone(); - match spawn_blocking(move || injector.lock().inject_frame(frame)).await? { - Ok(Some(commit_fno)) => { + match self.injector.inject_frame(frame).await? { + Some(commit_fno) => { self.client.commit_frame_no(commit_fno).await?; } - Ok(None) => (), - Err(e) => Err(e)?, + None => (), } Ok(()) } pub async fn flush(&mut self) -> Result<(), Error> { - let injector = self.injector.clone(); - match spawn_blocking(move || injector.lock().flush()).await? { - Ok(Some(commit_fno)) => { + match self.injector.flush().await? 
{ + Some(commit_fno) => { self.client.commit_frame_no(commit_fno).await?; } - Ok(None) => (), - Err(e) => Err(e)?, + None => (), } Ok(()) @@ -358,6 +360,7 @@ mod test { use async_stream::stream; use crate::frame::{FrameBorrowed, FrameMut}; + use crate::rpc::replication::Frame as RpcFrame; use super::*; @@ -368,7 +371,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -395,7 +399,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); @@ -412,7 +416,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -438,7 +443,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -454,7 +459,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -482,7 +488,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we 
assume that we already received the handshake and the handshake is not valid anymore @@ -498,7 +504,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -526,7 +533,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -542,7 +549,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -568,7 +576,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -584,7 +592,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -610,7 +619,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); replicator.state = ReplicatorState::NeedSnapshot; @@ -625,7 +634,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for 
Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -653,7 +663,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -670,7 +680,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -696,7 +707,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); replicator.state = ReplicatorState::NeedHandshake; @@ -738,7 +749,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -750,15 +762,26 @@ mod test { let frames = self .frames .iter() + .map(|f| RpcFrame { + data: f.bytes(), + timestamp: None, + }) .take(2) - .cloned() .map(Ok) .chain(Some(Err(Error::Client("some client error".into())))) .collect::>(); Ok(Box::pin(tokio_stream::iter(frames))) } else { - let stream = tokio_stream::iter(self.frames.clone().into_iter().map(Ok)); - Ok(Box::pin(stream)) + let iter = self + .frames + .iter() + .map(|f| RpcFrame { + data: f.bytes(), + timestamp: None, + }) + .map(Ok) + .collect::>(); + Ok(Box::pin(tokio_stream::iter(iter))) } } /// Return a snapshot for the current 
replication index. Called after next_frame has returned a @@ -784,7 +807,7 @@ mod test { committed_frame_no: None, }; - let mut replicator = Replicator::new(client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); @@ -795,7 +818,7 @@ mod test { replicator.try_replicate_step().await.unwrap_err(), Error::Client(_) )); - assert!(!replicator.injector.lock().is_txn()); + assert!(!replicator.injector.inner.lock().is_txn()); assert!(replicator.client_mut().committed_frame_no.is_none()); assert_eq!(replicator.state, ReplicatorState::NeedHandshake); @@ -805,7 +828,7 @@ mod test { replicator.client_mut().should_error = false; replicator.try_replicate_step().await.unwrap(); - assert!(!replicator.injector.lock().is_txn()); + assert!(!replicator.injector.inner.lock().is_txn()); assert_eq!(replicator.state, ReplicatorState::Exit); assert_eq!(replicator.client_mut().committed_frame_no, Some(6)); } diff --git a/libsql-replication/src/rpc.rs b/libsql-replication/src/rpc.rs index ebc92cf10c..3b31bd2b21 100644 --- a/libsql-replication/src/rpc.rs +++ b/libsql-replication/src/rpc.rs @@ -23,8 +23,14 @@ pub mod proxy { pub mod replication { #![allow(clippy::all)] + use std::pin::Pin; use uuid::Uuid; + use tokio_stream::Stream; + + pub type BoxStream<'a, T> = Pin + Send + 'a>>; + + use self::replication_log_server::ReplicationLog; include!("generated/wal_log.rs"); pub const NO_HELLO_ERROR_MSG: &str = "NO_HELLO"; @@ -52,6 +58,45 @@ pub mod replication { } } } + + pub type BoxReplicationService = Box>, + SnapshotStream = BoxStream<'static, Result>, + >>; + + #[tonic::async_trait] + impl ReplicationLog for BoxReplicationService { + type LogEntriesStream = BoxStream<'static, Result>; + type SnapshotStream = BoxStream<'static, Result>; + + async fn log_entries( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + self.as_ref().log_entries(req).await + } + + async fn 
batch_log_entries( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + self.as_ref().batch_log_entries(req).await + } + + async fn hello( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + self.as_ref().hello(req).await + } + + async fn snapshot( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + self.as_ref().snapshot(req).await + } + } } pub mod metadata { diff --git a/libsql-server/Cargo.toml b/libsql-server/Cargo.toml index 6763c02dfb..c3b59f0766 100644 --- a/libsql-server/Cargo.toml +++ b/libsql-server/Cargo.toml @@ -36,7 +36,7 @@ hyper-tungstenite = "0.11" itertools = "0.10.5" jsonwebtoken = "9" libsql = { path = "../libsql/", optional = true } -libsql_replication = { path = "../libsql-replication" } +libsql_replication = { path = "../libsql-replication", features = ["libsql_wal"] } libsql-wal = { path = "../libsql-wal/" } libsql-storage = { path = "../libsql-storage", optional = true } metrics = "0.21.1" @@ -83,7 +83,7 @@ url = { version = "2.3", features = ["serde"] } uuid = { version = "1.3", features = ["v4", "serde", "v7"] } aes = { version = "0.8.3", optional = true } cbc = { version = "0.1.2", optional = true } -zerocopy = { version = "0.7.28", features = ["derive", "alloc"] } +zerocopy = { workspace = true } hashbrown = { version = "0.14.3", features = ["serde"] } hdrhistogram = "7.5.4" crossbeam = "0.8.4" @@ -91,11 +91,12 @@ async-recursion = "1" mimalloc = "0.1.42" rheaper = { git = "https://github.com/MarinPostma/rheaper.git", tag = "v0.2.0", default-features = false, features = ["allocator"] } tar = "0.4.41" +aws-config = "1" +aws-sdk-s3 = "1" +aws-smithy-runtime = "1.6.2" [dev-dependencies] arbitrary = { version = "1.3.0", features = ["derive_arbitrary"] } -aws-config = "0.55" -aws-sdk-s3 = "0.28" env_logger = "0.10" hyper = { workspace = true, features = ["client"] } insta = { version = "1.26.0", features = ["json"] } diff --git a/libsql-server/src/connection/connection_core.rs 
b/libsql-server/src/connection/connection_core.rs new file mode 100644 index 0000000000..216348f102 --- /dev/null +++ b/libsql-server/src/connection/connection_core.rs @@ -0,0 +1,809 @@ +use std::ffi::{c_int, c_void}; +use std::time::{Duration, Instant}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::path::{Path, PathBuf}; + +use libsql_sys::wal::{WalManager, Wal}; +use metrics::histogram; +use parking_lot::Mutex; +use tokio::sync::watch; + +use crate::stats::{Stats, StatsUpdateMessage}; +use crate::replication::FrameNo; +use crate::query_result_builder::{QueryBuilderConfig, QueryResultBuilder}; +use crate::query_analysis::StmtKind; +use crate::namespace::ResolveNamespacePathFn; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::metrics::{PROGRAM_EXEC_COUNT, QUERY_CANCELED, VACUUM_COUNT, WAL_CHECKPOINT_COUNT}; +use crate::error::Error; +use crate::connection::legacy::open_conn_active_checkpoint; +use crate::{Result, BLOCKING_RT}; + +use super::config::DatabaseConfig; +use super::program::{DescribeCol, DescribeParam, DescribeResponse, Program, Vm}; + +/// The base connection type, shared between legacy and libsql-wal implementations +pub(super) struct CoreConnection { + conn: libsql_sys::Connection, + stats: Arc, + config_store: MetaStoreHandle, + builder_config: QueryBuilderConfig, + current_frame_no_receiver: watch::Receiver>, + block_writes: Arc, + resolve_attach_path: ResolveNamespacePathFn, + forced_rollback: bool, + broadcaster: BroadcasterHandle, + hooked: bool, + canceled: Arc, +} + +fn update_stats( + stats: &Stats, + sql: String, + rows_read: u64, + rows_written: u64, + mem_used: u64, + elapsed: Duration, +) { + stats.send(StatsUpdateMessage { + sql, + elapsed, + rows_read, + rows_written, + mem_used, + }); +} + +impl CoreConnection { + pub(super) fn new>( + path: &Path, + extensions: Arc<[PathBuf]>, + wal_manager: T, + stats: Arc, + broadcaster: 
BroadcasterHandle, + config_store: MetaStoreHandle, + builder_config: QueryBuilderConfig, + current_frame_no_receiver: watch::Receiver>, + block_writes: Arc, + resolve_attach_path: ResolveNamespacePathFn, + ) -> Result { + let conn = open_conn_active_checkpoint( + path, + wal_manager, + None, + builder_config.auto_checkpoint, + builder_config.encryption_config.clone(), + )?; + + let config = config_store.get(); + conn.pragma_update(None, "max_page_count", config.max_db_pages)?; + tracing::debug!("setting PRAGMA synchronous to {}", config.durability_mode); + conn.pragma_update(None, "synchronous", config.durability_mode)?; + + conn.set_limit( + rusqlite::limits::Limit::SQLITE_LIMIT_LENGTH, + config.max_row_size as i32, + ); + + unsafe { + const MAX_RETRIES: c_int = 8; + extern "C" fn do_nothing(_: *mut c_void, n: c_int) -> c_int { + (n < MAX_RETRIES) as _ + } + libsql_sys::ffi::sqlite3_busy_handler( + conn.handle(), + Some(do_nothing), + std::ptr::null_mut(), + ); + } + + let canceled = Arc::new(AtomicBool::new(false)); + + conn.progress_handler(100, { + let canceled = canceled.clone(); + Some(move || { + let canceled = canceled.load(Ordering::Relaxed); + if canceled { + QUERY_CANCELED.increment(1); + tracing::trace!("request canceled"); + } + canceled + }) + }); + + let this = Self { + conn, + stats, + config_store, + builder_config, + current_frame_no_receiver, + block_writes, + resolve_attach_path, + forced_rollback: false, + broadcaster, + hooked: false, + canceled, + }; + + for ext in extensions.iter() { + unsafe { + let _guard = rusqlite::LoadExtensionGuard::new(&this.conn).unwrap(); + if let Err(e) = this.conn.load_extension(ext, None) { + tracing::error!("failed to load extension: {}", ext.display()); + Err(e)?; + } + tracing::trace!("Loaded extension {}", ext.display()); + } + } + + Ok(this) + } + + pub(super) fn raw_mut(&mut self) -> &mut libsql_sys::Connection { + &mut self.conn + } + + pub(super) fn raw(&self) -> &libsql_sys::Connection { + &self.conn + 
} + + pub(super) fn config(&self) -> Arc{ + self.config_store.get() + } + + pub(super) async fn run_async( + this: Arc>, + pgm: Program, + builder: B, + ) -> Result { + struct Bomb { + canceled: Arc, + defused: bool, + } + + impl Drop for Bomb { + fn drop(&mut self) { + if !self.defused { + tracing::trace!("cancelling request"); + self.canceled.store(true, Ordering::Relaxed); + } + } + } + + let canceled = { + let cancelled = this.lock().canceled.clone(); + cancelled.store(false, Ordering::Relaxed); + cancelled + }; + + PROGRAM_EXEC_COUNT.increment(1); + + // create the bomb right before spawning the blocking task. + let mut bomb = Bomb { + canceled, + defused: false, + }; + let ret = BLOCKING_RT + .spawn_blocking(move || CoreConnection::run(this, pgm, builder)) + .await + .unwrap(); + + bomb.defused = true; + + ret + } + + pub(super) fn run( + this: Arc>, + pgm: Program, + mut builder: B, + ) -> Result { + let (config, stats, block_writes, resolve_attach_path) = { + let mut lock = this.lock(); + let config = lock.config_store.get(); + let stats = lock.stats.clone(); + let block_writes = lock.block_writes.clone(); + let resolve_attach_path = lock.resolve_attach_path.clone(); + + lock.update_hooks(); + + (config, stats, block_writes, resolve_attach_path) + }; + + builder.init(&this.lock().builder_config)?; + let mut vm = Vm::new( + builder, + &pgm, + move |stmt_kind| { + let should_block = match stmt_kind { + StmtKind::Read | StmtKind::TxnBegin => config.block_reads, + StmtKind::Write => { + config.block_reads + || config.block_writes + || block_writes.load(Ordering::SeqCst) + } + StmtKind::DDL => config.block_reads || config.block_writes, + StmtKind::TxnEnd + | StmtKind::Release + | StmtKind::Savepoint + | StmtKind::Detach + | StmtKind::Attach(_) => false, + }; + + ( + should_block, + should_block.then(|| config.block_reason.clone()).flatten(), + ) + }, + move |sql, rows_read, rows_written, mem_used, elapsed| { + update_stats(&stats, sql, rows_read, rows_written, 
mem_used, elapsed) + }, + resolve_attach_path, + ); + + let mut has_timeout = false; + while !vm.finished() { + let mut conn = this.lock(); + + if conn.forced_rollback { + has_timeout = true; + conn.forced_rollback = false; + } + + // once there was a timeout, invalidate all the program steps + if has_timeout { + vm.builder().begin_step()?; + vm.builder().step_error(Error::LibSqlTxTimeout)?; + vm.builder().finish_step(0, None)?; + vm.advance(); + continue; + } + + vm.step(&conn.raw())?; + } + + { + let mut lock = this.lock(); + let is_autocommit = lock.conn.is_autocommit(); + let current_fno = *lock.current_frame_no_receiver.borrow_and_update(); + vm.builder().finish(current_fno, is_autocommit)?; + } + + Ok(vm.into_builder()) + } + + fn rollback(&self) { + if let Err(e) = self.conn.execute("ROLLBACK", ()) { + tracing::error!("failed to rollback: {e}"); + } + } + + pub(super) fn force_rollback(&mut self) { + if !self.forced_rollback { + self.rollback(); + self.forced_rollback = true; + } + } + + pub(super) fn checkpoint(&self) -> Result<()> { + let start = Instant::now(); + self.conn + .query_row("PRAGMA wal_checkpoint(TRUNCATE)", (), |row| { + let status: i32 = row.get(0)?; + let wal_frames: i32 = row.get(1)?; + let moved_frames: i32 = row.get(2)?; + tracing::info!( + "WAL checkpoint successful, status: {}, WAL frames: {}, moved frames: {}", + status, + wal_frames, + moved_frames + ); + Ok(()) + })?; + WAL_CHECKPOINT_COUNT.increment(1); + histogram!("libsql_server_wal_checkpoint_time", start.elapsed()); + Ok(()) + } + + pub(super) fn vacuum_if_needed(&self) -> Result<()> { + let page_count = self + .conn + .query_row("PRAGMA page_count", (), |row| row.get::<_, i64>(0))?; + let freelist_count = self + .conn + .query_row("PRAGMA freelist_count", (), |row| row.get::<_, i64>(0))?; + // NOTICE: don't bother vacuuming if we don't have at least 256MiB of data + if page_count >= 65536 && freelist_count * 2 > page_count { + tracing::info!("Vacuuming: pages={page_count} 
freelist={freelist_count}"); + self.conn.execute("VACUUM", ())?; + } else { + tracing::trace!("Not vacuuming: pages={page_count} freelist={freelist_count}"); + } + VACUUM_COUNT.increment(1); + Ok(()) + } + + pub(super) fn describe(&self, sql: &str) -> crate::Result { + let stmt = self.conn.prepare(sql)?; + + let params = (1..=stmt.parameter_count()) + .map(|param_i| { + let name = stmt.parameter_name(param_i).map(|n| n.into()); + DescribeParam { name } + }) + .collect(); + + let cols = stmt + .columns() + .into_iter() + .map(|col| { + let name = col.name().into(); + let decltype = col.decl_type().map(|t| t.into()); + DescribeCol { name, decltype } + }) + .collect(); + + let is_explain = stmt.is_explain() != 0; + let is_readonly = stmt.readonly(); + Ok(DescribeResponse { + params, + cols, + is_explain, + is_readonly, + }) + } + + pub(super) fn is_autocommit(&self) -> bool { + self.conn.is_autocommit() + } + + fn update_hooks(&mut self) { + let (update_fn, commit_fn, rollback_fn) = if self.hooked { + if self.broadcaster.active() { + return; + } + self.hooked = false; + (None, None, None) + } else { + let Some(broadcaster) = self.broadcaster.get() else { + return; + }; + + let update = broadcaster.clone(); + let update_fn = Some(move |action: _, _: &_, table: &_, _| { + update.notify(table, action); + }); + + let commit = broadcaster.clone(); + let commit_fn = Some(move || { + commit.commit(); + false // allow commit to go through + }); + + let rollback = broadcaster; + let rollback_fn = Some(move || rollback.rollback()); + (update_fn, commit_fn, rollback_fn) + }; + + self.conn.update_hook(update_fn); + self.conn.commit_hook(commit_fn); + self.conn.rollback_hook(rollback_fn); + } +} + +#[cfg(test)] +mod test { + use itertools::Itertools; + #[cfg(not(feature = "durable-wal"))] + use libsql_sys::wal::either::Either as EitherWAL; + #[cfg(feature = "durable-wal")] + use libsql_sys::wal::either::Either3 as EitherWAL; + use libsql_sys::wal::wrapper::PassthroughWalWrapper; + 
use libsql_sys::wal::{Sqlite3Wal, Sqlite3WalManager}; + use rand::Rng; + use tempfile::tempdir; + use tokio::task::JoinSet; + use tokio::time::Instant; + + use crate::auth::Authenticated; + use crate::connection::legacy::MakeLegacyConnection; + use crate::connection::{Connection as _, RequestContext, TXN_TIMEOUT}; + use crate::namespace::meta_store::{metastore_connection_maker, MetaStore}; + use crate::namespace::NamespaceName; + use crate::query_result_builder::test::{test_driver, TestBuilder}; + use crate::query_result_builder::QueryResultBuilder; + use crate::DEFAULT_AUTO_CHECKPOINT; + + use super::*; + + fn setup_test_conn() -> Arc>> { + let conn = CoreConnection { + conn: libsql_sys::Connection::test(), + stats: Arc::new(Stats::default()), + config_store: MetaStoreHandle::new_test(), + builder_config: QueryBuilderConfig::default(), + current_frame_no_receiver: watch::channel(None).1, + block_writes: Default::default(), + resolve_attach_path: Arc::new(|_| unreachable!()), + forced_rollback: false, + broadcaster: Default::default(), + hooked: false, + canceled: Arc::new(false.into()), + }; + + let conn = Arc::new(Mutex::new(conn)); + + let stmts = std::iter::once("create table test (x)") + .chain(std::iter::repeat("insert into test values ('hello world')").take(100)) + .collect_vec(); + CoreConnection::run(conn.clone(), Program::seq(&stmts), TestBuilder::default()).unwrap(); + + conn + } + + #[test] + fn test_libsql_conn_builder_driver() { + test_driver(1000, |b| { + let conn = setup_test_conn(); + CoreConnection::run(conn, Program::seq(&["select * from test"]), b) + }) + } + + #[ignore = "the new implementation doesn't steal if nobody is trying to acquire a write lock"] + #[tokio::test] + async fn txn_timeout_no_stealing() { + let tmp = tempdir().unwrap(); + let make_conn = MakeLegacyConnection::new( + tmp.path().into(), + PassthroughWalWrapper, + Default::default(), + Default::default(), + MetaStoreHandle::load(tmp.path()).unwrap(), + Arc::new([]), + 
100000000, + 100000000, + DEFAULT_AUTO_CHECKPOINT, + watch::channel(None).1, + None, + Default::default(), + Arc::new(|_| unreachable!()), + Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + ) + .await + .unwrap(); + + tokio::time::pause(); + let conn = make_conn.make_connection().await.unwrap(); + let _builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["BEGIN IMMEDIATE"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(!conn.inner.lock().conn.is_autocommit()); + + tokio::time::sleep(Duration::from_secs(1)).await; + + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["create table test (c)"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(!conn.is_autocommit().await.unwrap()); + assert!(matches!(builder.into_ret()[0], Err(Error::LibSqlTxTimeout))); + } + + #[tokio::test] + /// A bunch of txn try to acquire the lock, and never release it. They will try to steal the + /// lock one after the other. All txn should eventually acquire the write lock + async fn serialized_txn_timeouts() { + let tmp = tempdir().unwrap(); + let make_conn = MakeLegacyConnection::new( + tmp.path().into(), + PassthroughWalWrapper, + Default::default(), + Default::default(), + MetaStoreHandle::load(tmp.path()).unwrap(), + Arc::new([]), + 100000000, + 100000000, + DEFAULT_AUTO_CHECKPOINT, + watch::channel(None).1, + None, + Default::default(), + Arc::new(|_| unreachable!()), + Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + ) + .await + .unwrap(); + + let mut set = JoinSet::new(); + for _ in 0..10 { + let conn = make_conn.make_connection().await.unwrap(); + set.spawn_blocking(move || { + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["BEGIN IMMEDIATE"]), + TestBuilder::default(), + ) + .unwrap(); + let ret = &builder.into_ret()[0]; + assert!( + (ret.is_ok() && !conn.inner.lock().conn.is_autocommit()) + || (matches!(ret, Err(Error::RusqliteErrorExtended(_, 5))) + && 
conn.inner.lock().conn.is_autocommit()) + ); + }); + } + + tokio::time::pause(); + + while let Some(ret) = set.join_next().await { + assert!(ret.is_ok()); + // advance time by a bit more than the txn timeout + tokio::time::advance(TXN_TIMEOUT + Duration::from_millis(100)).await; + } + } + + #[tokio::test] + /// verify that releasing a txn before the timeout + async fn release_before_timeout() { + let tmp = tempdir().unwrap(); + let make_conn = MakeLegacyConnection::new( + tmp.path().into(), + PassthroughWalWrapper, + Default::default(), + Default::default(), + MetaStoreHandle::load(tmp.path()).unwrap(), + Arc::new([]), + 100000000, + 100000000, + DEFAULT_AUTO_CHECKPOINT, + watch::channel(None).1, + None, + Default::default(), + Arc::new(|_| unreachable!()), + Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + ) + .await + .unwrap(); + + let conn1 = make_conn.make_connection().await.unwrap(); + tokio::task::spawn_blocking({ + let conn = conn1.clone(); + move || { + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["BEGIN IMMEDIATE"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(!conn.inner.lock().is_autocommit()); + assert!(builder.into_ret()[0].is_ok()); + } + }) + .await + .unwrap(); + + let conn2 = make_conn.make_connection().await.unwrap(); + let handle = tokio::task::spawn_blocking({ + let conn = conn2.clone(); + move || { + let before = Instant::now(); + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["BEGIN IMMEDIATE"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(!conn.inner.lock().is_autocommit()); + assert!(builder.into_ret()[0].is_ok()); + before.elapsed() + } + }); + + let wait_time = TXN_TIMEOUT / 10; + tokio::time::sleep(wait_time).await; + + tokio::task::spawn_blocking({ + let conn = conn1.clone(); + move || { + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["COMMIT"]), + TestBuilder::default(), + ) + .unwrap(); + 
assert!(conn.inner.lock().is_autocommit()); + assert!(builder.into_ret()[0].is_ok()); + } + }) + .await + .unwrap(); + + let elapsed = handle.await.unwrap(); + + let epsilon = Duration::from_millis(100); + assert!((wait_time..wait_time + epsilon).contains(&elapsed)); + } + + /// The goal of this test is to run many concurrent transaction and hopefully catch a bug in + /// the lock stealing code. If this test becomes flaky check out the lock stealing code. + #[tokio::test] + async fn test_many_concurrent() { + let tmp = tempdir().unwrap(); + let make_conn = MakeLegacyConnection::new( + tmp.path().into(), + PassthroughWalWrapper, + Default::default(), + Default::default(), + MetaStoreHandle::load(tmp.path()).unwrap(), + Arc::new([]), + 100000000, + 100000000, + DEFAULT_AUTO_CHECKPOINT, + watch::channel(None).1, + None, + Default::default(), + Arc::new(|_| unreachable!()), + Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + ) + .await + .unwrap(); + + let conn = make_conn.make_connection().await.unwrap(); + let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); + let ctx = RequestContext::new( + Authenticated::FullAccess, + NamespaceName::default(), + MetaStore::new(Default::default(), tmp.path(), maker().unwrap(), manager) + .await + .unwrap(), + ); + conn.execute_program( + Program::seq(&["CREATE TABLE test (x)"]), + ctx.clone(), + TestBuilder::default(), + None, + ) + .await + .unwrap(); + let run_conn = |maker: Arc>| { + let ctx = ctx.clone(); + async move { + for _ in 0..1000 { + let conn = maker.make_connection().await.unwrap(); + let pgm = Program::seq(&["BEGIN IMMEDIATE", "INSERT INTO test VALUES (42)"]); + let res = conn + .execute_program(pgm, ctx.clone(), TestBuilder::default(), None) + .await + .unwrap() + .into_ret(); + for result in res { + result.unwrap(); + } + // with 99% change, commit the txn + if rand::thread_rng().gen_range(0..100) > 1 { + let pgm = Program::seq(&["INSERT INTO test VALUES (43)", "COMMIT"]); + 
let res = conn + .execute_program(pgm, ctx.clone(), TestBuilder::default(), None) + .await + .unwrap() + .into_ret(); + for result in res { + result.unwrap(); + } + } + } + } + }; + + let maker = Arc::new(make_conn); + let mut join_set = JoinSet::new(); + for _ in 0..3 { + join_set.spawn(run_conn(maker.clone())); + } + + let join_all = async move { + while let Some(next) = join_set.join_next().await { + next.unwrap(); + } + }; + + tokio::time::timeout(Duration::from_secs(60), join_all) + .await + .expect("timed out running connections"); + } + + #[tokio::test] + /// verify that releasing a txn before the timeout + async fn force_rollback_reset() { + let tmp = tempdir().unwrap(); + let make_conn = MakeLegacyConnection::new( + tmp.path().into(), + PassthroughWalWrapper, + Default::default(), + Default::default(), + MetaStoreHandle::load(tmp.path()).unwrap(), + Arc::new([]), + 100000000, + 100000000, + DEFAULT_AUTO_CHECKPOINT, + watch::channel(None).1, + None, + Default::default(), + Arc::new(|_| unreachable!()), + Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + ) + .await + .unwrap(); + + let conn1 = make_conn.make_connection().await.unwrap(); + tokio::task::spawn_blocking({ + let conn = conn1.clone(); + move || { + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["BEGIN IMMEDIATE"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(!conn.inner.lock().is_autocommit()); + assert!(builder.into_ret()[0].is_ok()); + } + }) + .await + .unwrap(); + + let conn2 = make_conn.make_connection().await.unwrap(); + tokio::task::spawn_blocking({ + let conn = conn2.clone(); + move || { + let before = Instant::now(); + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["BEGIN IMMEDIATE"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(!conn.inner.lock().is_autocommit()); + assert!(builder.into_ret()[0].is_ok()); + before.elapsed() + } + }) + .await + .unwrap(); + + tokio::time::sleep(TXN_TIMEOUT * 2).await; + + 
tokio::task::spawn_blocking({ + let conn = conn1.clone(); + move || { + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["SELECT 1;"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(conn.inner.lock().is_autocommit()); + // timeout + assert!(builder.into_ret()[0].is_err()); + + let builder = CoreConnection::run( + conn.inner.clone(), + Program::seq(&["SELECT 1;"]), + TestBuilder::default(), + ) + .unwrap(); + assert!(conn.inner.lock().is_autocommit()); + // state reset + assert!(builder.into_ret()[0].is_ok()); + } + }) + .await + .unwrap(); + } +} diff --git a/libsql-server/src/connection/connection_manager.rs b/libsql-server/src/connection/connection_manager.rs index 8a95cd8e6e..b923f65ab6 100644 --- a/libsql-server/src/connection/connection_manager.rs +++ b/libsql-server/src/connection/connection_manager.rs @@ -15,27 +15,28 @@ use libsql_sys::wal::either::Either3; use libsql_sys::wal::wrapper::{WrapWal, WrappedWal}; use libsql_sys::wal::{CheckpointMode, Sqlite3Wal, Sqlite3WalManager, Wal}; use libsql_wal::io::StdIO; -use libsql_wal::storage::NoStorage; use libsql_wal::wal::{LibsqlWal, LibsqlWalManager}; use metrics::atomics::AtomicU64; use parking_lot::{Mutex, MutexGuard}; use rusqlite::ErrorCode; -use super::libsql::Connection; +use crate::SqldStorage; + +use super::connection_core::CoreConnection; use super::TXN_TIMEOUT; pub type ConnId = u64; #[cfg(feature = "durable-wal")] pub type InnerWalManager = - Either3, DurableWalManager>; + Either3, DurableWalManager>; #[cfg(feature = "durable-wal")] pub type InnerWal = Either3, DurableWal>; -#[cfg(not(feature = "durable-wal"))] -pub type InnerWalManager = Either>; #[cfg(not(feature = "durable-wal"))] +pub type InnerWalManager = Either>; +#[cfg(not(feature = "durable-wal"))] pub type InnerWal = Either>; pub type ManagedConnectionWal = WrappedWal; @@ -50,7 +51,7 @@ struct Slot { struct Abort(Arc); impl Abort { - fn from_conn(conn: &Arc>>) -> Self { + fn from_conn(conn: &Arc>>) -> Self { 
let conn = Arc::downgrade(conn); Self(Arc::new(move || { conn.upgrade() @@ -73,7 +74,7 @@ pub struct ConnectionManager { impl ConnectionManager { pub(super) fn register_connection( &self, - conn: &Arc>>, + conn: &Arc>>, id: ConnId, ) { let abort = Abort::from_conn(conn); diff --git a/libsql-server/src/connection/legacy.rs b/libsql-server/src/connection/legacy.rs new file mode 100644 index 0000000000..c8a240647e --- /dev/null +++ b/libsql-server/src/connection/legacy.rs @@ -0,0 +1,456 @@ +use std::ffi::c_int; +use std::path::{Path, PathBuf}; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use libsql_sys::wal::wrapper::{WrapWal, WrappedWal}; +use libsql_sys::wal::{BusyHandler, CheckpointCallback, Wal, WalManager}; +use libsql_sys::EncryptionConfig; +use parking_lot::Mutex; +use rusqlite::ffi::SQLITE_BUSY; +use rusqlite::{ErrorCode, OpenFlags}; +use tokio::sync::watch; +use tokio::time::Duration; + +use crate::error::Error; +use crate::metrics:: + DESCRIBE_COUNT +; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::ResolveNamespacePathFn; +use crate::query_result_builder::{QueryBuilderConfig, QueryResultBuilder}; +use crate::replication::FrameNo; +use crate::stats::Stats; +use crate::{record_time, Result}; + +use super::connection_core::CoreConnection; + +use super::connection_manager::{ + ConnectionManager, InnerWalManager, ManagedConnectionWal, ManagedConnectionWalWrapper, +}; +use super::program::{ + check_describe_auth, check_program_auth, DescribeResponse, +}; +use super::{MakeConnection, Program, RequestContext, TXN_TIMEOUT}; + +pub struct MakeLegacyConnection { + db_path: PathBuf, + wal_wrapper: W, + stats: Arc, + broadcaster: BroadcasterHandle, + config_store: MetaStoreHandle, + extensions: Arc<[PathBuf]>, + max_response_size: u64, + max_total_response_size: u64, + auto_checkpoint: u32, + current_frame_no_receiver: watch::Receiver>, + connection_manager: 
ConnectionManager, + /// return sqlite busy. To mitigate that, we hold on to one connection + _db: Option>, + encryption_config: Option, + block_writes: Arc, + resolve_attach_path: ResolveNamespacePathFn, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +} + +impl MakeLegacyConnection +where + W: WrapWal + Send + 'static + Clone, +{ + #[allow(clippy::too_many_arguments)] + pub async fn new( + db_path: PathBuf, + wal_wrapper: W, + stats: Arc, + broadcaster: BroadcasterHandle, + config_store: MetaStoreHandle, + extensions: Arc<[PathBuf]>, + max_response_size: u64, + max_total_response_size: u64, + auto_checkpoint: u32, + current_frame_no_receiver: watch::Receiver>, + encryption_config: Option, + block_writes: Arc, + resolve_attach_path: ResolveNamespacePathFn, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + ) -> Result { + let txn_timeout = config_store.get().txn_timeout.unwrap_or(TXN_TIMEOUT); + + let mut this = Self { + db_path, + stats, + broadcaster, + config_store, + extensions, + max_response_size, + max_total_response_size, + auto_checkpoint, + current_frame_no_receiver, + _db: None, + wal_wrapper, + encryption_config, + block_writes, + resolve_attach_path, + connection_manager: ConnectionManager::new(txn_timeout), + make_wal_manager, + }; + + let db = this.try_create_db().await?; + this._db = Some(db); + + Ok(this) + } + + /// Tries to create a database, retrying if the database is busy. + async fn try_create_db(&self) -> Result> { + // try 100 times to acquire initial db connection. + let mut retries = 0; + loop { + match self.make_connection().await { + Ok(conn) => return Ok(conn), + Err( + err @ Error::RusqliteError(rusqlite::Error::SqliteFailure( + rusqlite::ffi::Error { + code: ErrorCode::DatabaseBusy, + .. 
+ }, + _, + )), + ) => { + if retries < 100 { + tracing::warn!("Database file is busy, retrying..."); + retries += 1; + tokio::time::sleep(Duration::from_millis(100)).await + } else { + Err(err)?; + } + } + Err(e) => Err(e)?, + } + } + } + + #[tracing::instrument(skip(self))] + pub(super) async fn make_connection(&self) -> Result> { + LegacyConnection::new( + self.db_path.clone(), + self.extensions.clone(), + self.wal_wrapper.clone(), + self.stats.clone(), + self.broadcaster.clone(), + self.config_store.clone(), + QueryBuilderConfig { + max_size: Some(self.max_response_size), + max_total_size: Some(self.max_total_response_size), + auto_checkpoint: self.auto_checkpoint, + encryption_config: self.encryption_config.clone(), + }, + self.current_frame_no_receiver.clone(), + self.block_writes.clone(), + self.resolve_attach_path.clone(), + self.connection_manager.clone(), + self.make_wal_manager.clone(), + ) + .await + } +} + +#[async_trait::async_trait] +impl MakeConnection for MakeLegacyConnection +where + W: WrapWal + Send + Sync + 'static + Clone, +{ + type Connection = LegacyConnection; + + async fn create(&self) -> Result { + self.make_connection().await + } +} + +pub struct LegacyConnection { + pub(super) inner: Arc>>>, +} + +#[cfg(test)] +impl LegacyConnection { + pub async fn new_test(path: &Path) -> Self { + #[cfg(not(feature = "durable-wal"))] + use libsql_sys::wal::either::Either as EitherWAL; + #[cfg(feature = "durable-wal")] + use libsql_sys::wal::either::Either3 as EitherWAL; + use libsql_sys::wal::Sqlite3WalManager; + + Self::new( + path.to_owned(), + Arc::new([]), + libsql_sys::wal::wrapper::PassthroughWalWrapper, + Default::default(), + Default::default(), + MetaStoreHandle::new_test(), + QueryBuilderConfig::default(), + tokio::sync::watch::channel(None).1, + Default::default(), + Arc::new(|_| unreachable!()), + ConnectionManager::new(TXN_TIMEOUT), + Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + ) + .await + .unwrap() + } +} + +impl Clone 
for LegacyConnection { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + } + } +} + +#[derive(Clone, Copy)] +pub struct InhibitCheckpointWalWrapper { + close_only: bool, +} + +impl InhibitCheckpointWalWrapper { + pub fn new(close_only: bool) -> Self { + Self { close_only } + } +} + +impl WrapWal for InhibitCheckpointWalWrapper { + fn checkpoint( + &mut self, + wrapped: &mut W, + db: &mut libsql_sys::wal::Sqlite3Db, + mode: libsql_sys::wal::CheckpointMode, + busy_handler: Option<&mut dyn BusyHandler>, + sync_flags: u32, + buf: &mut [u8], + checkpoint_cb: Option<&mut dyn CheckpointCallback>, + in_wal: Option<&mut i32>, + backfilled: Option<&mut i32>, + ) -> libsql_sys::wal::Result<()> { + if !self.close_only { + wrapped.checkpoint( + db, + mode, + busy_handler, + sync_flags, + buf, + checkpoint_cb, + in_wal, + backfilled, + ) + } else { + tracing::warn!( + "checkpoint inhibited: this connection is not allowed to perform checkpoints" + ); + Err(rusqlite::ffi::Error::new(SQLITE_BUSY)) + } + } + + fn close>( + &mut self, + manager: &M, + wrapped: &mut W, + db: &mut libsql_sys::wal::Sqlite3Db, + sync_flags: c_int, + _scratch: Option<&mut [u8]>, + ) -> libsql_sys::wal::Result<()> { + // sqlite3 wall will not checkpoint if it's not provided with a scratch buffer. We take + // advantage of that to prevent checpoint on such connections. + manager.close(wrapped, db, sync_flags, None) + } +} + +pub type InhibitCheckpoint = WrappedWal; + +// Opens a connection with checkpoint inhibited +pub fn open_conn( + path: &Path, + wal_manager: T, + flags: Option, + encryption_config: Option, +) -> Result>, rusqlite::Error> +where + T: WalManager, +{ + open_conn_active_checkpoint( + path, + wal_manager.wrap(InhibitCheckpointWalWrapper::new(false)), + flags, + u32::MAX, + encryption_config, + ) +} + +/// Same as open_conn, but with checkpointing activated. 
+pub fn open_conn_active_checkpoint( + path: &Path, + wal_manager: T, + flags: Option, + auto_checkpoint: u32, + encryption_config: Option, +) -> Result, rusqlite::Error> +where + T: WalManager, +{ + let flags = flags.unwrap_or( + OpenFlags::SQLITE_OPEN_READ_WRITE + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX, + ); + + libsql_sys::Connection::open( + path.join("data"), + flags, + wal_manager, + auto_checkpoint, + encryption_config, + ) +} + +impl LegacyConnection +where + W: WrapWal + Send + Clone + 'static, +{ + pub async fn new( + path: impl AsRef + Send + 'static, + extensions: Arc<[PathBuf]>, + wal_wrapper: W, + stats: Arc, + broadcaster: BroadcasterHandle, + config_store: MetaStoreHandle, + builder_config: QueryBuilderConfig, + current_frame_no_receiver: watch::Receiver>, + block_writes: Arc, + resolve_attach_path: ResolveNamespacePathFn, + connection_manager: ConnectionManager, + make_wal: Arc InnerWalManager + Sync + Send + 'static>, + ) -> crate::Result { + let (conn, id) = tokio::task::spawn_blocking({ + let connection_manager = connection_manager.clone(); + move || -> crate::Result<_> { + let manager = ManagedConnectionWalWrapper::new(connection_manager); + let id = manager.id(); + let wal = make_wal().wrap(manager).wrap(wal_wrapper); + + let conn = CoreConnection::new( + path.as_ref(), + extensions, + wal, + stats, + broadcaster, + config_store, + builder_config, + current_frame_no_receiver, + block_writes, + resolve_attach_path, + )?; + + let namespace = path + .as_ref() + .file_name() + .unwrap_or_default() + .to_os_string() + .into_string() + .unwrap_or_default(); + conn.raw().create_scalar_function( + "libsql_server_database_name", + 0, + rusqlite::functions::FunctionFlags::SQLITE_UTF8 + | rusqlite::functions::FunctionFlags::SQLITE_DETERMINISTIC, + move |_| Ok(namespace.clone()), + )?; + Ok((conn, id)) + } + }) + .await + .unwrap()?; + + let inner = Arc::new(Mutex::new(conn)); + + 
connection_manager.register_connection(&inner, id); + + Ok(Self { inner }) + } + + pub async fn execute( + &self, + pgm: Program, + ctx: RequestContext, + builder: B, + ) -> Result { + check_program_auth(&ctx, &pgm, &self.inner.lock().config())?; + let conn = self.inner.clone(); + CoreConnection::run_async(conn, pgm, builder).await + } +} + +#[async_trait::async_trait] +impl super::Connection for LegacyConnection +where + W: WrapWal + Clone + Send + 'static, +{ + async fn execute_program( + &self, + pgm: Program, + ctx: RequestContext, + builder: B, + _replication_index: Option, + ) -> Result { + record_time! { + "libsql_query_exec"; + self.execute(pgm, ctx, builder).await + } + } + + async fn describe( + &self, + sql: String, + ctx: RequestContext, + _replication_index: Option, + ) -> Result> { + DESCRIBE_COUNT.increment(1); + check_describe_auth(ctx)?; + let conn = self.inner.clone(); + let res = tokio::task::spawn_blocking(move || conn.lock().describe(&sql)) + .await + .unwrap(); + + Ok(res) + } + + async fn is_autocommit(&self) -> Result { + Ok(self.inner.lock().is_autocommit()) + } + + async fn checkpoint(&self) -> Result<()> { + let conn = self.inner.clone(); + tokio::task::spawn_blocking(move || conn.lock().checkpoint()) + .await + .unwrap()?; + Ok(()) + } + + async fn vacuum_if_needed(&self) -> Result<()> { + let conn = self.inner.clone(); + tokio::task::spawn_blocking(move || conn.lock().vacuum_if_needed()) + .await + .unwrap()?; + Ok(()) + } + + fn diagnostics(&self) -> String { + String::new() + } + + fn with_raw(&self, f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + let mut inner = self.inner.lock(); + f(inner.raw_mut()) + } +} + diff --git a/libsql-server/src/connection/libsql.rs b/libsql-server/src/connection/libsql.rs index aadff6190b..1f31e5be5b 100644 --- a/libsql-server/src/connection/libsql.rs +++ b/libsql-server/src/connection/libsql.rs @@ -1,711 +1,101 @@ -use std::ffi::{c_int, c_void}; -use std::ops::Deref; use std::path::{Path, 
PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::AtomicBool; use std::sync::Arc; -use libsql_sys::wal::wrapper::{WrapWal, WrappedWal}; -use libsql_sys::wal::{BusyHandler, CheckpointCallback, Wal, WalManager}; use libsql_sys::EncryptionConfig; -use metrics::histogram; +use libsql_wal::wal::{LibsqlWal, LibsqlWalManager}; +use libsql_wal::io::StdIO; use parking_lot::Mutex; -use rusqlite::ffi::SQLITE_BUSY; -use rusqlite::{ErrorCode, OpenFlags}; use tokio::sync::watch; -use tokio::time::{Duration, Instant}; -use crate::error::Error; -use crate::metrics::{DESCRIBE_COUNT, PROGRAM_EXEC_COUNT, VACUUM_COUNT, WAL_CHECKPOINT_COUNT}; +use crate::connection::program::check_program_auth; +use crate::metrics::DESCRIBE_COUNT; use crate::namespace::broadcasters::BroadcasterHandle; use crate::namespace::meta_store::MetaStoreHandle; use crate::namespace::ResolveNamespacePathFn; -use crate::query_analysis::StmtKind; use crate::query_result_builder::{QueryBuilderConfig, QueryResultBuilder}; +use crate::{record_time, SqldStorage, BLOCKING_RT}; use crate::replication::FrameNo; -use crate::stats::{Stats, StatsUpdateMessage}; -use crate::{record_time, Result, BLOCKING_RT}; +use crate::stats::Stats; +use crate::Result; -use super::connection_manager::{ - ConnectionManager, InnerWalManager, ManagedConnectionWal, ManagedConnectionWalWrapper, -}; -use super::program::{ - check_describe_auth, check_program_auth, DescribeCol, DescribeParam, DescribeResponse, Vm, -}; -use super::{MakeConnection, Program, RequestContext, TXN_TIMEOUT}; +use super::connection_core::CoreConnection; +use super::program::{check_describe_auth, DescribeResponse, Program}; +use super::{MakeConnection, RequestContext}; -pub struct MakeLibSqlConn { - db_path: PathBuf, - wal_wrapper: W, - stats: Arc, - broadcaster: BroadcasterHandle, - config_store: MetaStoreHandle, - extensions: Arc<[PathBuf]>, - max_response_size: u64, - max_total_response_size: u64, - auto_checkpoint: u32, - 
current_frame_no_receiver: watch::Receiver>, - connection_manager: ConnectionManager, - /// return sqlite busy. To mitigate that, we hold on to one connection - _db: Option>, - encryption_config: Option, - block_writes: Arc, - resolve_attach_path: ResolveNamespacePathFn, - make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +pub struct MakeLibsqlConnection { + pub(crate) inner: Arc, } -impl MakeLibSqlConn -where - W: WrapWal + Send + 'static + Clone, -{ - #[allow(clippy::too_many_arguments)] - pub async fn new( - db_path: PathBuf, - wal_wrapper: W, - stats: Arc, - broadcaster: BroadcasterHandle, - config_store: MetaStoreHandle, - extensions: Arc<[PathBuf]>, - max_response_size: u64, - max_total_response_size: u64, - auto_checkpoint: u32, - current_frame_no_receiver: watch::Receiver>, - encryption_config: Option, - block_writes: Arc, - resolve_attach_path: ResolveNamespacePathFn, - make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, - ) -> Result { - let txn_timeout = config_store.get().txn_timeout.unwrap_or(TXN_TIMEOUT); - - let mut this = Self { - db_path, - stats, - broadcaster, - config_store, - extensions, - max_response_size, - max_total_response_size, - auto_checkpoint, - current_frame_no_receiver, - _db: None, - wal_wrapper, - encryption_config, - block_writes, - resolve_attach_path, - connection_manager: ConnectionManager::new(txn_timeout), - make_wal_manager, - }; - - let db = this.try_create_db().await?; - this._db = Some(db); - - Ok(this) - } - - /// Tries to create a database, retrying if the database is busy. - async fn try_create_db(&self) -> Result> { - // try 100 times to acquire initial db connection. - let mut retries = 0; - loop { - match self.make_connection().await { - Ok(conn) => return Ok(conn), - Err( - err @ Error::RusqliteError(rusqlite::Error::SqliteFailure( - rusqlite::ffi::Error { - code: ErrorCode::DatabaseBusy, - .. 
- }, - _, - )), - ) => { - if retries < 100 { - tracing::warn!("Database file is busy, retrying..."); - retries += 1; - tokio::time::sleep(Duration::from_millis(100)).await - } else { - Err(err)?; - } - } - Err(e) => Err(e)?, - } - } - } - - #[tracing::instrument(skip(self))] - async fn make_connection(&self) -> Result> { - LibSqlConnection::new( - self.db_path.clone(), - self.extensions.clone(), - self.wal_wrapper.clone(), - self.stats.clone(), - self.broadcaster.clone(), - self.config_store.clone(), - QueryBuilderConfig { - max_size: Some(self.max_response_size), - max_total_size: Some(self.max_total_response_size), - auto_checkpoint: self.auto_checkpoint, - encryption_config: self.encryption_config.clone(), - }, - self.current_frame_no_receiver.clone(), - self.block_writes.clone(), - self.resolve_attach_path.clone(), - self.connection_manager.clone(), - self.make_wal_manager.clone(), - ) - .await - } +pub struct MakeLibsqlConnectionInner { + pub(crate) db_path: Arc, + pub(crate) stats: Arc, + pub(crate) broadcaster: BroadcasterHandle, + pub(crate) config_store: MetaStoreHandle, + pub(crate) extensions: Arc<[PathBuf]>, + pub(crate) max_response_size: u64, + pub(crate) max_total_response_size: u64, + pub(crate) auto_checkpoint: u32, + pub(crate) current_frame_no_receiver: watch::Receiver>, + pub(crate) encryption_config: Option, + pub(crate) block_writes: Arc, + pub(crate) resolve_attach_path: ResolveNamespacePathFn, + pub(crate) wal_manager: LibsqlWalManager, } #[async_trait::async_trait] -impl MakeConnection for MakeLibSqlConn -where - W: WrapWal + Send + Sync + 'static + Clone, -{ - type Connection = LibSqlConnection; - - async fn create(&self) -> Result { - self.make_connection().await - } -} - -pub struct LibSqlConnection { - inner: Arc>>>, -} - -#[cfg(test)] -impl LibSqlConnection { - pub async fn new_test(path: &Path) -> Self { - #[cfg(not(feature = "durable-wal"))] - use libsql_sys::wal::either::Either as EitherWAL; - #[cfg(feature = "durable-wal")] - use 
libsql_sys::wal::either::Either3 as EitherWAL; - use libsql_sys::wal::Sqlite3WalManager; - - Self::new( - path.to_owned(), - Arc::new([]), - libsql_sys::wal::wrapper::PassthroughWalWrapper, - Default::default(), - Default::default(), - MetaStoreHandle::new_test(), - QueryBuilderConfig::default(), - tokio::sync::watch::channel(None).1, - Default::default(), - Arc::new(|_| unreachable!()), - ConnectionManager::new(TXN_TIMEOUT), - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - ) - .await - .unwrap() - } -} - -impl Clone for LibSqlConnection { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone(), - } - } -} - -#[derive(Clone, Copy)] -pub struct InhibitCheckpointWalWrapper { - close_only: bool, -} - -impl InhibitCheckpointWalWrapper { - pub fn new(close_only: bool) -> Self { - Self { close_only } - } -} +impl MakeConnection for MakeLibsqlConnection { + type Connection = LibsqlConnection; + + async fn create(&self) -> crate::Result { + let inner = self.inner.clone(); + let core = BLOCKING_RT.spawn_blocking(move || -> crate::Result<_> { + let builder_config = QueryBuilderConfig { + max_size: Some(inner.max_response_size), + max_total_size: Some(inner.max_total_response_size), + auto_checkpoint: inner.auto_checkpoint, + encryption_config: inner.encryption_config.clone(), + }; -impl WrapWal for InhibitCheckpointWalWrapper { - fn checkpoint( - &mut self, - wrapped: &mut W, - db: &mut libsql_sys::wal::Sqlite3Db, - mode: libsql_sys::wal::CheckpointMode, - busy_handler: Option<&mut dyn BusyHandler>, - sync_flags: u32, - buf: &mut [u8], - checkpoint_cb: Option<&mut dyn CheckpointCallback>, - in_wal: Option<&mut i32>, - backfilled: Option<&mut i32>, - ) -> libsql_sys::wal::Result<()> { - if !self.close_only { - wrapped.checkpoint( - db, - mode, - busy_handler, - sync_flags, - buf, - checkpoint_cb, - in_wal, - backfilled, + // todo: handle retries + CoreConnection::new( + &inner.db_path, + inner.extensions.clone(), + inner.wal_manager.clone(), + 
inner.stats.clone(), + inner.broadcaster.clone(), + inner.config_store.clone(), + builder_config, + inner.current_frame_no_receiver.clone(), + inner.block_writes.clone(), + inner.resolve_attach_path.clone() ) - } else { - tracing::warn!( - "checkpoint inhibited: this connection is not allowed to perform checkpoints" - ); - Err(rusqlite::ffi::Error::new(SQLITE_BUSY)) - } - } + }).await.unwrap()?; - fn close>( - &mut self, - manager: &M, - wrapped: &mut W, - db: &mut libsql_sys::wal::Sqlite3Db, - sync_flags: c_int, - _scratch: Option<&mut [u8]>, - ) -> libsql_sys::wal::Result<()> { - // sqlite3 wall will not checkpoint if it's not provided with a scratch buffer. We take - // advantage of that to prevent checpoint on such connections. - manager.close(wrapped, db, sync_flags, None) + Ok(LibsqlConnection { inner: Arc::new(Mutex::new(core)) }) } } -pub type InhibitCheckpoint = WrappedWal; - -// Opens a connection with checkpoint inhibited -pub fn open_conn( - path: &Path, - wal_manager: T, - flags: Option, - encryption_config: Option, -) -> Result>, rusqlite::Error> -where - T: WalManager, -{ - open_conn_active_checkpoint( - path, - wal_manager.wrap(InhibitCheckpointWalWrapper::new(false)), - flags, - u32::MAX, - encryption_config, - ) -} - -/// Same as open_conn, but with checkpointing activated. 
-pub fn open_conn_active_checkpoint( - path: &Path, - wal_manager: T, - flags: Option, - auto_checkpoint: u32, - encryption_config: Option, -) -> Result, rusqlite::Error> -where - T: WalManager, -{ - let flags = flags.unwrap_or( - OpenFlags::SQLITE_OPEN_READ_WRITE - | OpenFlags::SQLITE_OPEN_CREATE - | OpenFlags::SQLITE_OPEN_URI - | OpenFlags::SQLITE_OPEN_NO_MUTEX, - ); - - libsql_sys::Connection::open( - path.join("data"), - flags, - wal_manager, - auto_checkpoint, - encryption_config, - ) +#[derive(Clone)] +pub struct LibsqlConnection { + inner: Arc>>>, } -impl LibSqlConnection -where - W: WrapWal + Send + Clone + 'static, -{ - pub async fn new( - path: impl AsRef + Send + 'static, - extensions: Arc<[PathBuf]>, - wal_wrapper: W, - stats: Arc, - broadcaster: BroadcasterHandle, - config_store: MetaStoreHandle, - builder_config: QueryBuilderConfig, - current_frame_no_receiver: watch::Receiver>, - block_writes: Arc, - resolve_attach_path: ResolveNamespacePathFn, - connection_manager: ConnectionManager, - make_wal: Arc InnerWalManager + Sync + Send + 'static>, - ) -> crate::Result { - let (conn, id) = tokio::task::spawn_blocking({ - let connection_manager = connection_manager.clone(); - move || -> crate::Result<_> { - let manager = ManagedConnectionWalWrapper::new(connection_manager); - let id = manager.id(); - let wal = make_wal().wrap(manager).wrap(wal_wrapper); - - let conn = Connection::new( - path.as_ref(), - extensions, - wal, - stats, - broadcaster, - config_store, - builder_config, - current_frame_no_receiver, - block_writes, - resolve_attach_path, - )?; - - let namespace = path - .as_ref() - .file_name() - .unwrap_or_default() - .to_os_string() - .into_string() - .unwrap_or_default(); - conn.conn.create_scalar_function( - "libsql_server_database_name", - 0, - rusqlite::functions::FunctionFlags::SQLITE_UTF8 - | rusqlite::functions::FunctionFlags::SQLITE_DETERMINISTIC, - move |_| Ok(namespace.clone()), - )?; - Ok((conn, id)) - } - }) - .await - .unwrap()?; - - 
let inner = Arc::new(Mutex::new(conn)); - - connection_manager.register_connection(&inner, id); - - Ok(Self { inner }) - } - - pub fn with_raw(&self, f: F) -> R - where - F: FnOnce(&mut rusqlite::Connection) -> R, - { - let mut inner = self.inner.lock(); - f(&mut inner.conn) - } - +impl LibsqlConnection { pub async fn execute( &self, pgm: Program, ctx: RequestContext, builder: B, - ) -> Result<(B, Program)> { - PROGRAM_EXEC_COUNT.increment(1); - - check_program_auth(&ctx, &pgm, &self.inner.lock().config_store.get())?; + ) -> Result { + check_program_auth(&ctx, &pgm, &self.inner.lock().config())?; let conn = self.inner.clone(); - BLOCKING_RT - .spawn_blocking(move || Connection::run(conn, pgm, builder)) - .await - .unwrap() - } -} - -pub(super) struct Connection { - conn: libsql_sys::Connection, - stats: Arc, - config_store: MetaStoreHandle, - builder_config: QueryBuilderConfig, - current_frame_no_receiver: watch::Receiver>, - block_writes: Arc, - resolve_attach_path: ResolveNamespacePathFn, - forced_rollback: bool, - broadcaster: BroadcasterHandle, - hooked: bool, -} - -fn update_stats( - stats: &Stats, - sql: String, - rows_read: u64, - rows_written: u64, - mem_used: u64, - elapsed: Duration, -) { - stats.send(StatsUpdateMessage { - sql, - elapsed, - rows_read, - rows_written, - mem_used, - }); -} - -impl Connection { - fn new>( - path: &Path, - extensions: Arc<[PathBuf]>, - wal_manager: T, - stats: Arc, - broadcaster: BroadcasterHandle, - config_store: MetaStoreHandle, - builder_config: QueryBuilderConfig, - current_frame_no_receiver: watch::Receiver>, - block_writes: Arc, - resolve_attach_path: ResolveNamespacePathFn, - ) -> Result { - let conn = open_conn_active_checkpoint( - path, - wal_manager, - None, - builder_config.auto_checkpoint, - builder_config.encryption_config.clone(), - )?; - - let config = config_store.get(); - conn.pragma_update(None, "max_page_count", config.max_db_pages)?; - tracing::debug!("setting PRAGMA synchronous to {}", 
config.durability_mode); - conn.pragma_update(None, "synchronous", config.durability_mode)?; - - conn.set_limit( - rusqlite::limits::Limit::SQLITE_LIMIT_LENGTH, - config.max_row_size as i32, - ); - - unsafe { - const MAX_RETRIES: c_int = 8; - extern "C" fn do_nothing(_: *mut c_void, n: c_int) -> c_int { - (n < MAX_RETRIES) as _ - } - libsql_sys::ffi::sqlite3_busy_handler( - conn.handle(), - Some(do_nothing), - std::ptr::null_mut(), - ); - } - - let this = Self { - conn, - stats, - config_store, - builder_config, - current_frame_no_receiver, - block_writes, - resolve_attach_path, - forced_rollback: false, - broadcaster, - hooked: false, - }; - - for ext in extensions.iter() { - unsafe { - let _guard = rusqlite::LoadExtensionGuard::new(&this.conn).unwrap(); - if let Err(e) = this.conn.load_extension(ext, None) { - tracing::error!("failed to load extension: {}", ext.display()); - Err(e)?; - } - tracing::trace!("Loaded extension {}", ext.display()); - } - } - - Ok(this) - } - - fn run( - this: Arc>, - pgm: Program, - mut builder: B, - ) -> Result<(B, Program)> { - let (config, stats, block_writes, resolve_attach_path) = { - let mut lock = this.lock(); - let config = lock.config_store.get(); - let stats = lock.stats.clone(); - let block_writes = lock.block_writes.clone(); - let resolve_attach_path = lock.resolve_attach_path.clone(); - - lock.update_hooks(); - - (config, stats, block_writes, resolve_attach_path) - }; - - builder.init(&this.lock().builder_config)?; - let mut vm = Vm::new( - builder, - &pgm, - move |stmt_kind| { - let should_block = match stmt_kind { - StmtKind::Read | StmtKind::TxnBegin => config.block_reads, - StmtKind::Write => { - config.block_reads - || config.block_writes - || block_writes.load(Ordering::SeqCst) - } - StmtKind::DDL => config.block_reads || config.block_writes, - StmtKind::TxnEnd - | StmtKind::Release - | StmtKind::Savepoint - | StmtKind::Detach - | StmtKind::Attach(_) => false, - }; - - ( - should_block, - should_block.then(|| 
config.block_reason.clone()).flatten(), - ) - }, - move |sql, rows_read, rows_written, mem_used, elapsed| { - update_stats(&stats, sql, rows_read, rows_written, mem_used, elapsed) - }, - resolve_attach_path, - ); - - let mut has_timeout = false; - while !vm.finished() { - let mut conn = this.lock(); - - if conn.forced_rollback { - has_timeout = true; - conn.forced_rollback = false; - } - - // once there was a timeout, invalidate all the program steps - if has_timeout { - vm.builder().begin_step()?; - vm.builder().step_error(Error::LibSqlTxTimeout)?; - vm.builder().finish_step(0, None)?; - vm.advance(); - continue; - } - - let conn = conn.conn.deref(); - vm.step(conn)?; - } - - { - let mut lock = this.lock(); - let is_autocommit = lock.conn.is_autocommit(); - let current_fno = *lock.current_frame_no_receiver.borrow_and_update(); - vm.builder().finish(current_fno, is_autocommit)?; - } - - Ok((vm.into_builder(), pgm)) - } - - fn rollback(&self) { - if let Err(e) = self.conn.execute("ROLLBACK", ()) { - tracing::error!("failed to rollback: {e}"); - } - } - - pub(super) fn force_rollback(&mut self) { - if !self.forced_rollback { - self.rollback(); - self.forced_rollback = true; - } - } - - fn checkpoint(&self) -> Result<()> { - let start = Instant::now(); - self.conn - .query_row("PRAGMA wal_checkpoint(TRUNCATE)", (), |row| { - let status: i32 = row.get(0)?; - let wal_frames: i32 = row.get(1)?; - let moved_frames: i32 = row.get(2)?; - tracing::info!( - "WAL checkpoint successful, status: {}, WAL frames: {}, moved frames: {}", - status, - wal_frames, - moved_frames - ); - Ok(()) - })?; - WAL_CHECKPOINT_COUNT.increment(1); - histogram!("libsql_server_wal_checkpoint_time", start.elapsed()); - Ok(()) - } - - fn vacuum_if_needed(&self) -> Result<()> { - let page_count = self - .conn - .query_row("PRAGMA page_count", (), |row| row.get::<_, i64>(0))?; - let freelist_count = self - .conn - .query_row("PRAGMA freelist_count", (), |row| row.get::<_, i64>(0))?; - // NOTICE: don't 
bother vacuuming if we don't have at least 256MiB of data - if page_count >= 65536 && freelist_count * 2 > page_count { - tracing::info!("Vacuuming: pages={page_count} freelist={freelist_count}"); - self.conn.execute("VACUUM", ())?; - } else { - tracing::trace!("Not vacuuming: pages={page_count} freelist={freelist_count}"); - } - VACUUM_COUNT.increment(1); - Ok(()) - } - - fn describe(&self, sql: &str) -> crate::Result { - let stmt = self.conn.prepare(sql)?; - - let params = (1..=stmt.parameter_count()) - .map(|param_i| { - let name = stmt.parameter_name(param_i).map(|n| n.into()); - DescribeParam { name } - }) - .collect(); - - let cols = stmt - .columns() - .into_iter() - .map(|col| { - let name = col.name().into(); - let decltype = col.decl_type().map(|t| t.into()); - DescribeCol { name, decltype } - }) - .collect(); - - let is_explain = stmt.is_explain() != 0; - let is_readonly = stmt.readonly(); - Ok(DescribeResponse { - params, - cols, - is_explain, - is_readonly, - }) - } - - fn is_autocommit(&self) -> bool { - self.conn.is_autocommit() - } - - fn update_hooks(&mut self) { - let (update_fn, commit_fn, rollback_fn) = if self.hooked { - if self.broadcaster.active() { - return; - } - self.hooked = false; - (None, None, None) - } else { - let Some(broadcaster) = self.broadcaster.get() else { - return; - }; - - let update = broadcaster.clone(); - let update_fn = Some(move |action: _, _: &_, table: &_, _| { - update.notify(table, action); - }); - - let commit = broadcaster.clone(); - let commit_fn = Some(move || { - commit.commit(); - false // allow commit to go through - }); - - let rollback = broadcaster; - let rollback_fn = Some(move || rollback.rollback()); - (update_fn, commit_fn, rollback_fn) - }; - - self.conn.update_hook(update_fn); - self.conn.commit_hook(commit_fn); - self.conn.rollback_hook(rollback_fn); + CoreConnection::run_async(conn, pgm, builder).await } } #[async_trait::async_trait] -impl super::Connection for LibSqlConnection -where - W: WrapWal 
+ Clone + Send + 'static, -{ +impl super::Connection for LibsqlConnection { async fn execute_program( &self, pgm: Program, @@ -715,7 +105,7 @@ where ) -> Result { record_time! { "libsql_query_exec"; - self.execute(pgm, ctx, builder).await.map(|(b, _)| b) + self.execute(pgm, ctx, builder).await } } @@ -758,426 +148,9 @@ where fn diagnostics(&self) -> String { String::new() } -} - -#[cfg(test)] -mod test { - use itertools::Itertools; - #[cfg(not(feature = "durable-wal"))] - use libsql_sys::wal::either::Either as EitherWAL; - #[cfg(feature = "durable-wal")] - use libsql_sys::wal::either::Either3 as EitherWAL; - use libsql_sys::wal::wrapper::PassthroughWalWrapper; - use libsql_sys::wal::{Sqlite3Wal, Sqlite3WalManager}; - use rand::Rng; - use tempfile::tempdir; - use tokio::task::JoinSet; - - use crate::auth::Authenticated; - use crate::connection::{Connection as _, TXN_TIMEOUT}; - use crate::namespace::meta_store::{metastore_connection_maker, MetaStore}; - use crate::namespace::NamespaceName; - use crate::query_result_builder::test::{test_driver, TestBuilder}; - use crate::query_result_builder::QueryResultBuilder; - use crate::DEFAULT_AUTO_CHECKPOINT; - - use super::*; - - fn setup_test_conn() -> Arc>> { - let conn = Connection { - conn: libsql_sys::Connection::test(), - stats: Arc::new(Stats::default()), - config_store: MetaStoreHandle::new_test(), - builder_config: QueryBuilderConfig::default(), - current_frame_no_receiver: watch::channel(None).1, - block_writes: Default::default(), - resolve_attach_path: Arc::new(|_| unreachable!()), - forced_rollback: false, - broadcaster: Default::default(), - hooked: false, - }; - - let conn = Arc::new(Mutex::new(conn)); - - let stmts = std::iter::once("create table test (x)") - .chain(std::iter::repeat("insert into test values ('hello world')").take(100)) - .collect_vec(); - Connection::run(conn.clone(), Program::seq(&stmts), TestBuilder::default()).unwrap(); - - conn - } - - #[test] - fn test_libsql_conn_builder_driver() { - 
test_driver(1000, |b| { - let conn = setup_test_conn(); - Connection::run(conn, Program::seq(&["select * from test"]), b) - }) - } - - #[ignore = "the new implementation doesn't steal if nobody is trying to acquire a write lock"] - #[tokio::test] - async fn txn_timeout_no_stealing() { - let tmp = tempdir().unwrap(); - let make_conn = MakeLibSqlConn::new( - tmp.path().into(), - PassthroughWalWrapper, - Default::default(), - Default::default(), - MetaStoreHandle::load(tmp.path()).unwrap(), - Arc::new([]), - 100000000, - 100000000, - DEFAULT_AUTO_CHECKPOINT, - watch::channel(None).1, - None, - Default::default(), - Arc::new(|_| unreachable!()), - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - ) - .await - .unwrap(); - - tokio::time::pause(); - let conn = make_conn.make_connection().await.unwrap(); - let _builder = Connection::run( - conn.inner.clone(), - Program::seq(&["BEGIN IMMEDIATE"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(!conn.inner.lock().conn.is_autocommit()); - - tokio::time::sleep(Duration::from_secs(1)).await; - - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["create table test (c)"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(!conn.is_autocommit().await.unwrap()); - assert!(matches!(builder.into_ret()[0], Err(Error::LibSqlTxTimeout))); - } - - #[tokio::test] - /// A bunch of txn try to acquire the lock, and never release it. They will try to steal the - /// lock one after the other. 
All txn should eventually acquire the write lock - async fn serialized_txn_timeouts() { - let tmp = tempdir().unwrap(); - let make_conn = MakeLibSqlConn::new( - tmp.path().into(), - PassthroughWalWrapper, - Default::default(), - Default::default(), - MetaStoreHandle::load(tmp.path()).unwrap(), - Arc::new([]), - 100000000, - 100000000, - DEFAULT_AUTO_CHECKPOINT, - watch::channel(None).1, - None, - Default::default(), - Arc::new(|_| unreachable!()), - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - ) - .await - .unwrap(); - - let mut set = JoinSet::new(); - for _ in 0..10 { - let conn = make_conn.make_connection().await.unwrap(); - set.spawn_blocking(move || { - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["BEGIN IMMEDIATE"]), - TestBuilder::default(), - ) - .unwrap() - .0; - let ret = &builder.into_ret()[0]; - assert!( - (ret.is_ok() && !conn.inner.lock().conn.is_autocommit()) - || (matches!(ret, Err(Error::RusqliteErrorExtended(_, 5))) - && conn.inner.lock().conn.is_autocommit()) - ); - }); - } - - tokio::time::pause(); - - while let Some(ret) = set.join_next().await { - assert!(ret.is_ok()); - // advance time by a bit more than the txn timeout - tokio::time::advance(TXN_TIMEOUT + Duration::from_millis(100)).await; - } - } - - #[tokio::test] - /// verify that releasing a txn before the timeout - async fn release_before_timeout() { - let tmp = tempdir().unwrap(); - let make_conn = MakeLibSqlConn::new( - tmp.path().into(), - PassthroughWalWrapper, - Default::default(), - Default::default(), - MetaStoreHandle::load(tmp.path()).unwrap(), - Arc::new([]), - 100000000, - 100000000, - DEFAULT_AUTO_CHECKPOINT, - watch::channel(None).1, - None, - Default::default(), - Arc::new(|_| unreachable!()), - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - ) - .await - .unwrap(); - - let conn1 = make_conn.make_connection().await.unwrap(); - tokio::task::spawn_blocking({ - let conn = conn1.clone(); - move || { - let builder = 
Connection::run( - conn.inner.clone(), - Program::seq(&["BEGIN IMMEDIATE"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(!conn.inner.lock().is_autocommit()); - assert!(builder.into_ret()[0].is_ok()); - } - }) - .await - .unwrap(); - - let conn2 = make_conn.make_connection().await.unwrap(); - let handle = tokio::task::spawn_blocking({ - let conn = conn2.clone(); - move || { - let before = Instant::now(); - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["BEGIN IMMEDIATE"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(!conn.inner.lock().is_autocommit()); - assert!(builder.into_ret()[0].is_ok()); - before.elapsed() - } - }); - - let wait_time = TXN_TIMEOUT / 10; - tokio::time::sleep(wait_time).await; - tokio::task::spawn_blocking({ - let conn = conn1.clone(); - move || { - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["COMMIT"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(conn.inner.lock().is_autocommit()); - assert!(builder.into_ret()[0].is_ok()); - } - }) - .await - .unwrap(); - - let elapsed = handle.await.unwrap(); - - let epsilon = Duration::from_millis(100); - assert!((wait_time..wait_time + epsilon).contains(&elapsed)); - } - - /// The goal of this test is to run many concurrent transaction and hopefully catch a bug in - /// the lock stealing code. If this test becomes flaky check out the lock stealing code. 
- #[tokio::test] - async fn test_many_concurrent() { - let tmp = tempdir().unwrap(); - let make_conn = MakeLibSqlConn::new( - tmp.path().into(), - PassthroughWalWrapper, - Default::default(), - Default::default(), - MetaStoreHandle::load(tmp.path()).unwrap(), - Arc::new([]), - 100000000, - 100000000, - DEFAULT_AUTO_CHECKPOINT, - watch::channel(None).1, - None, - Default::default(), - Arc::new(|_| unreachable!()), - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - ) - .await - .unwrap(); - - let conn = make_conn.make_connection().await.unwrap(); - let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); - let ctx = RequestContext::new( - Authenticated::FullAccess, - NamespaceName::default(), - MetaStore::new(Default::default(), tmp.path(), maker().unwrap(), manager) - .await - .unwrap(), - ); - conn.execute_program( - Program::seq(&["CREATE TABLE test (x)"]), - ctx.clone(), - TestBuilder::default(), - None, - ) - .await - .unwrap(); - let run_conn = |maker: Arc>| { - let ctx = ctx.clone(); - async move { - for _ in 0..1000 { - let conn = maker.make_connection().await.unwrap(); - let pgm = Program::seq(&["BEGIN IMMEDIATE", "INSERT INTO test VALUES (42)"]); - let res = conn - .execute_program(pgm, ctx.clone(), TestBuilder::default(), None) - .await - .unwrap() - .into_ret(); - for result in res { - result.unwrap(); - } - // with 99% change, commit the txn - if rand::thread_rng().gen_range(0..100) > 1 { - let pgm = Program::seq(&["INSERT INTO test VALUES (43)", "COMMIT"]); - let res = conn - .execute_program(pgm, ctx.clone(), TestBuilder::default(), None) - .await - .unwrap() - .into_ret(); - for result in res { - result.unwrap(); - } - } - } - } - }; - - let maker = Arc::new(make_conn); - let mut join_set = JoinSet::new(); - for _ in 0..3 { - join_set.spawn(run_conn(maker.clone())); - } - - let join_all = async move { - while let Some(next) = join_set.join_next().await { - next.unwrap(); - } - }; - - 
tokio::time::timeout(Duration::from_secs(60), join_all) - .await - .expect("timed out running connections"); - } - - #[tokio::test] - /// verify that releasing a txn before the timeout - async fn force_rollback_reset() { - let tmp = tempdir().unwrap(); - let make_conn = MakeLibSqlConn::new( - tmp.path().into(), - PassthroughWalWrapper, - Default::default(), - Default::default(), - MetaStoreHandle::load(tmp.path()).unwrap(), - Arc::new([]), - 100000000, - 100000000, - DEFAULT_AUTO_CHECKPOINT, - watch::channel(None).1, - None, - Default::default(), - Arc::new(|_| unreachable!()), - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - ) - .await - .unwrap(); - - let conn1 = make_conn.make_connection().await.unwrap(); - tokio::task::spawn_blocking({ - let conn = conn1.clone(); - move || { - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["BEGIN IMMEDIATE"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(!conn.inner.lock().is_autocommit()); - assert!(builder.into_ret()[0].is_ok()); - } - }) - .await - .unwrap(); - - let conn2 = make_conn.make_connection().await.unwrap(); - tokio::task::spawn_blocking({ - let conn = conn2.clone(); - move || { - let before = Instant::now(); - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["BEGIN IMMEDIATE"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(!conn.inner.lock().is_autocommit()); - assert!(builder.into_ret()[0].is_ok()); - before.elapsed() - } - }) - .await - .unwrap(); - - tokio::time::sleep(TXN_TIMEOUT * 2).await; - - tokio::task::spawn_blocking({ - let conn = conn1.clone(); - move || { - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["SELECT 1;"]), - TestBuilder::default(), - ) - .unwrap() - .0; - assert!(conn.inner.lock().is_autocommit()); - // timeout - assert!(builder.into_ret()[0].is_err()); - - let builder = Connection::run( - conn.inner.clone(), - Program::seq(&["SELECT 1;"]), - TestBuilder::default(), - ) - .unwrap() - 
.0; - assert!(conn.inner.lock().is_autocommit()); - // state reset - assert!(builder.into_ret()[0].is_ok()); - } - }) - .await - .unwrap(); + fn with_raw(&self, f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + let mut inner = self.inner.lock(); + f(inner.raw_mut()) } } diff --git a/libsql-server/src/connection/mod.rs b/libsql-server/src/connection/mod.rs index e554130a80..65be8b2533 100644 --- a/libsql-server/src/connection/mod.rs +++ b/libsql-server/src/connection/mod.rs @@ -26,9 +26,11 @@ use self::program::{Cond, DescribeResponse, Program, Step}; pub mod config; pub mod connection_manager; pub mod dump; -pub mod libsql; +pub mod legacy; pub mod program; pub mod write_proxy; +pub mod libsql; +mod connection_core; #[cfg(not(test))] const TXN_TIMEOUT: Duration = Duration::from_secs(5); @@ -169,6 +171,8 @@ pub trait Connection: Send + Sync + 'static { async fn vacuum_if_needed(&self) -> Result<()>; fn diagnostics(&self) -> String; + + fn with_raw(&self, f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R; } fn make_batch_program(batch: Vec) -> Vec { @@ -444,6 +448,10 @@ impl Connection for TrackedConnection { fn diagnostics(&self) -> String { self.inner.diagnostics() } + + fn with_raw(&self, f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + self.inner.with_raw(f) + } } #[cfg(test)] @@ -489,6 +497,10 @@ pub mod test { fn diagnostics(&self) -> String { "dummy".into() } + + fn with_raw(&self, _f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + todo!() + } } #[tokio::test] diff --git a/libsql-server/src/connection/program.rs b/libsql-server/src/connection/program.rs index f128c7538f..785a7b894b 100644 --- a/libsql-server/src/connection/program.rs +++ b/libsql-server/src/connection/program.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use std::time::{Duration, Instant}; use metrics::{histogram, increment_counter}; @@ -14,14 +15,14 @@ use crate::query_result_builder::QueryResultBuilder; use super::config::DatabaseConfig; use super::RequestContext; 
-#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[derive(Debug, serde::Serialize, serde::Deserialize, Clone)] pub struct Program { - pub steps: Vec, + pub steps: Arc>, } impl Program { pub fn new(steps: Vec) -> Self { - Self { steps } + Self { steps: steps.into() } } pub fn is_read_only(&self) -> bool { @@ -29,7 +30,11 @@ impl Program { } pub fn steps(&self) -> &[Step] { - self.steps.as_slice() + &self.steps + } + + pub fn steps_mut(&mut self) -> Option<&mut Vec> { + Arc::get_mut(&mut self.steps) } #[cfg(test)] diff --git a/libsql-server/src/connection/write_proxy.rs b/libsql-server/src/connection/write_proxy.rs index 6e66dce37b..1531206637 100644 --- a/libsql-server/src/connection/write_proxy.rs +++ b/libsql-server/src/connection/write_proxy.rs @@ -1,5 +1,3 @@ -use std::path::PathBuf; -use std::sync::atomic::AtomicBool; use std::sync::Arc; use futures_core::future::BoxFuture; @@ -8,7 +6,6 @@ use libsql_replication::rpc::proxy::proxy_client::ProxyClient; use libsql_replication::rpc::proxy::{ exec_req, exec_resp, ExecReq, ExecResp, StreamDescribeReq, StreamProgramReq, }; -use libsql_sys::wal::wrapper::PassthroughWalWrapper; use libsql_sys::EncryptionConfig; use parking_lot::Mutex as PMutex; use tokio::sync::{mpsc, watch, Mutex}; @@ -19,72 +16,45 @@ use tonic::{Request, Streaming}; use crate::connection::program::{DescribeCol, DescribeParam}; use crate::error::Error; use crate::metrics::{REPLICA_LOCAL_EXEC_MISPREDICT, REPLICA_LOCAL_PROGRAM_EXEC}; -use crate::namespace::broadcasters::BroadcasterHandle; -use crate::namespace::meta_store::MetaStoreHandle; -use crate::namespace::ResolveNamespacePathFn; use crate::query_analysis::TxnStatus; use crate::query_result_builder::{QueryBuilderConfig, QueryResultBuilder}; use crate::replication::FrameNo; use crate::stats::Stats; use crate::{Result, DEFAULT_AUTO_CHECKPOINT}; -use super::connection_manager::InnerWalManager; -use super::libsql::{LibSqlConnection, MakeLibSqlConn}; use super::program::DescribeResponse; use 
super::{Connection, RequestContext}; use super::{MakeConnection, Program}; pub type RpcStream = Streaming; -pub struct MakeWriteProxyConn { +pub struct MakeWriteProxyConn { client: ProxyClient, stats: Arc, applied_frame_no_receiver: watch::Receiver>, max_response_size: u64, max_total_response_size: u64, primary_replication_index: Option, - make_read_only_conn: MakeLibSqlConn, + // make_read_only_conn: MakeLegacyConnection, + make_read_only_conn: M, encryption_config: Option, } -impl MakeWriteProxyConn { +impl MakeWriteProxyConn { #[allow(clippy::too_many_arguments)] - pub async fn new( - db_path: PathBuf, - extensions: Arc<[PathBuf]>, + pub fn new( channel: Channel, uri: tonic::transport::Uri, stats: Arc, - broadcaster: BroadcasterHandle, - config_store: MetaStoreHandle, applied_frame_no_receiver: watch::Receiver>, max_response_size: u64, max_total_response_size: u64, primary_replication_index: Option, encryption_config: Option, - resolve_attach_path: ResolveNamespacePathFn, - make_wal_manager: Arc InnerWalManager + Send + Sync + 'static>, - ) -> crate::Result { + make_read_only_conn: M, + ) -> Self { let client = ProxyClient::with_origin(channel, uri); - let make_read_only_conn = MakeLibSqlConn::new( - db_path.clone(), - PassthroughWalWrapper, - stats.clone(), - broadcaster, - config_store.clone(), - extensions.clone(), - max_response_size, - max_total_response_size, - DEFAULT_AUTO_CHECKPOINT, - applied_frame_no_receiver.clone(), - encryption_config.clone(), - Arc::new(AtomicBool::new(false)), // this is always false for write proxy - resolve_attach_path, - make_wal_manager, - ) - .await?; - - Ok(Self { + Self { client, stats, applied_frame_no_receiver, @@ -93,13 +63,15 @@ impl MakeWriteProxyConn { make_read_only_conn, primary_replication_index, encryption_config, - }) + } } } #[async_trait::async_trait] -impl MakeConnection for MakeWriteProxyConn { - type Connection = WriteProxyConnection; +impl MakeConnection for MakeWriteProxyConn +where M: MakeConnection, +{ + 
type Connection = WriteProxyConnection; async fn create(&self) -> Result { Ok(WriteProxyConnection::new( self.client.clone(), @@ -117,9 +89,9 @@ impl MakeConnection for MakeWriteProxyConn { } } -pub struct WriteProxyConnection { +pub struct WriteProxyConnection { /// Lazily initialized read connection - read_conn: LibSqlConnection, + read_conn: C, write_proxy: ProxyClient, state: Mutex, /// FrameNo of the last write performed by this connection on the primary. @@ -136,7 +108,7 @@ pub struct WriteProxyConnection { primary_replication_index: Option, } -impl WriteProxyConnection { +impl WriteProxyConnection { #[allow(clippy::too_many_arguments)] fn new( write_proxy: ProxyClient, @@ -144,7 +116,7 @@ impl WriteProxyConnection { applied_frame_no_receiver: watch::Receiver>, builder_config: QueryBuilderConfig, primary_replication_index: Option, - read_conn: LibSqlConnection, + read_conn: C, ) -> Result { Ok(Self { read_conn, @@ -190,7 +162,7 @@ impl WriteProxyConnection { *status = TxnStatus::Invalid; let res = self .with_remote_conn(ctx, self.builder_config.clone(), |conn| { - Box::pin(conn.execute(pgm, builder)) + Box::pin(conn.execute(pgm.clone(), builder)) }) .await; @@ -452,7 +424,7 @@ where } #[async_trait::async_trait] -impl Connection for WriteProxyConnection { +impl Connection for WriteProxyConnection { async fn execute_program( &self, pgm: Program, @@ -471,7 +443,9 @@ impl Connection for WriteProxyConnection { // We know that this program won't perform any writes. We attempt to run it on the // replica. If it leaves an open transaction, then this program is an interactive // transaction, so we rollback the replica, and execute again on the primary. - let (builder, pgm) = self.read_conn.execute(pgm, ctx.clone(), builder).await?; + let builder = self + .read_conn + .execute_program(pgm.clone(), ctx.clone(), builder, replication_index).await?; if !self.read_conn.is_autocommit().await? 
{ REPLICA_LOCAL_EXEC_MISPREDICT.increment(1); self.read_conn.rollback(ctx.clone()).await?; @@ -517,6 +491,10 @@ impl Connection for WriteProxyConnection { fn diagnostics(&self) -> String { format!("{:?}", self.state) } + + fn with_raw(&self, _f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + panic!("no raw connection for write proxy"); + } } #[cfg(test)] diff --git a/libsql-server/src/database/libsql_primary.rs b/libsql-server/src/database/libsql_primary.rs new file mode 100644 index 0000000000..5e9e530954 --- /dev/null +++ b/libsql-server/src/database/libsql_primary.rs @@ -0,0 +1,23 @@ +use std::sync::Arc; +use std::sync::atomic::AtomicBool; + +use crate::connection::libsql::{LibsqlConnection, MakeLibsqlConnection}; +use crate::connection::{MakeThrottledConnection, TrackedConnection}; + +pub type LibsqlPrimaryConnection = TrackedConnection; +pub type LibsqlPrimaryConnectionMaker = MakeThrottledConnection; + +pub struct LibsqlPrimaryDatabase { + pub connection_maker: Arc, + pub block_writes: Arc, +} + +impl LibsqlPrimaryDatabase { + pub fn connection_maker(&self) -> Arc { + self.connection_maker.clone() + } + + pub fn destroy(self) { } + + pub async fn shutdown(self) -> anyhow::Result<()> { Ok(()) } +} diff --git a/libsql-server/src/database/libsql_replica.rs b/libsql-server/src/database/libsql_replica.rs new file mode 100644 index 0000000000..86435be292 --- /dev/null +++ b/libsql-server/src/database/libsql_replica.rs @@ -0,0 +1,31 @@ +use std::sync::Arc; + +use libsql_replication::rpc::proxy::ExecResp; +use tonic::Streaming; + +use crate::connection::libsql::{LibsqlConnection, MakeLibsqlConnection}; +use crate::connection::write_proxy::{MakeWriteProxyConn, WriteProxyConnection}; +use crate::connection::{MakeThrottledConnection, TrackedConnection}; + +use super::Result; + +pub type LibsqlReplicaConnection = TrackedConnection< + WriteProxyConnection, LibsqlConnection>, +>; +type LibsqlReplicaConnectionMaker = MakeThrottledConnection>; + +pub struct 
LibsqlReplicaDatabase { + pub connection_maker: Arc, +} + +impl LibsqlReplicaDatabase { + pub fn connection_maker(&self) -> Arc { + self.connection_maker.clone() + } + + pub fn destroy(self) {} + + pub async fn shutdown(self) -> Result<()> { + Ok(()) + } +} diff --git a/libsql-server/src/database/mod.rs b/libsql-server/src/database/mod.rs index 5af73decc0..e6d59d3998 100644 --- a/libsql-server/src/database/mod.rs +++ b/libsql-server/src/database/mod.rs @@ -1,4 +1,5 @@ use std::fmt; +use std::sync::atomic::AtomicBool; use std::sync::Arc; use bottomless::replicator::Replicator; @@ -7,6 +8,8 @@ use tokio::sync::watch; use crate::connection::{MakeConnection, RequestContext}; use crate::replication::{FrameNo, ReplicationLogger}; +pub use self::libsql_replica::{LibsqlReplicaConnection, LibsqlReplicaDatabase}; +pub use self::libsql_primary::{LibsqlPrimaryConnection, LibsqlPrimaryDatabase, LibsqlPrimaryConnectionMaker}; pub use self::primary::{PrimaryConnection, PrimaryConnectionMaker, PrimaryDatabase}; pub use self::replica::{ReplicaConnection, ReplicaDatabase}; pub use self::schema::{SchemaConnection, SchemaDatabase}; @@ -14,6 +17,8 @@ pub use self::schema::{SchemaConnection, SchemaDatabase}; mod primary; mod replica; mod schema; +mod libsql_primary; +mod libsql_replica; #[derive(Debug, Clone, serde::Deserialize, Copy)] #[serde(rename_all = "snake_case")] @@ -45,7 +50,10 @@ pub type Result = anyhow::Result; pub enum Connection { Primary(PrimaryConnection), Replica(ReplicaConnection), - Schema(SchemaConnection), + Schema(SchemaConnection), + LibsqlPrimary(LibsqlPrimaryConnection), + LibsqlReplica(LibsqlReplicaConnection), + LibsqlSchema(SchemaConnection), } impl fmt::Debug for Connection { @@ -54,6 +62,9 @@ impl fmt::Debug for Connection { Self::Primary(_) => write!(f, "Primary"), Self::Replica(_) => write!(f, "Replica"), Self::Schema(_) => write!(f, "Schema"), + Self::LibsqlPrimary(_) => write!(f, "LibsqlPrimaryConnection"), + Self::LibsqlReplica(_) => write!(f, 
"LibsqlReplicaConnection"), + Self::LibsqlSchema(_) => write!(f, "LibsqlSchema"), } } } @@ -64,7 +75,7 @@ impl Connection { /// [`Primary`]: Connection::Primary #[must_use] pub fn is_primary(&self) -> bool { - matches!(self, Self::Primary(..)) + matches!(self, Self::Primary(..) | Self::LibsqlPrimary(_)) } } @@ -90,6 +101,18 @@ impl crate::connection::Connection for Connection { conn.execute_program(pgm, ctx, response_builder, replication_index) .await } + Connection::LibsqlPrimary(conn) => { + conn.execute_program(pgm, ctx, response_builder, replication_index) + .await + } + Connection::LibsqlReplica(conn) => { + conn.execute_program(pgm, ctx, response_builder, replication_index) + .await + } + Connection::LibsqlSchema(conn) => { + conn.execute_program(pgm, ctx, response_builder, replication_index) + .await + } } } @@ -103,6 +126,9 @@ impl crate::connection::Connection for Connection { Connection::Primary(conn) => conn.describe(sql, ctx, replication_index).await, Connection::Replica(conn) => conn.describe(sql, ctx, replication_index).await, Connection::Schema(conn) => conn.describe(sql, ctx, replication_index).await, + Connection::LibsqlPrimary(conn) => conn.describe(sql, ctx, replication_index).await, + Connection::LibsqlReplica(conn) => conn.describe(sql, ctx, replication_index).await, + Connection::LibsqlSchema(conn) => conn.describe(sql, ctx, replication_index).await, } } @@ -111,6 +137,9 @@ impl crate::connection::Connection for Connection { Connection::Primary(conn) => conn.is_autocommit().await, Connection::Replica(conn) => conn.is_autocommit().await, Connection::Schema(conn) => conn.is_autocommit().await, + Connection::LibsqlPrimary(conn) => conn.is_autocommit().await, + Connection::LibsqlReplica(conn) => conn.is_autocommit().await, + Connection::LibsqlSchema(conn) => conn.is_autocommit().await, } } @@ -119,6 +148,9 @@ impl crate::connection::Connection for Connection { Connection::Primary(conn) => conn.checkpoint().await, Connection::Replica(conn) => 
conn.checkpoint().await, Connection::Schema(conn) => conn.checkpoint().await, + Connection::LibsqlPrimary(conn) => conn.checkpoint().await, + Connection::LibsqlReplica(conn) => conn.checkpoint().await, + Connection::LibsqlSchema(conn) => conn.checkpoint().await, } } @@ -127,6 +159,9 @@ impl crate::connection::Connection for Connection { Connection::Primary(conn) => conn.vacuum_if_needed().await, Connection::Replica(conn) => conn.vacuum_if_needed().await, Connection::Schema(conn) => conn.vacuum_if_needed().await, + Connection::LibsqlPrimary(conn) => conn.vacuum_if_needed().await, + Connection::LibsqlReplica(conn) => conn.vacuum_if_needed().await, + Connection::LibsqlSchema(conn) => conn.vacuum_if_needed().await, } } @@ -135,6 +170,20 @@ impl crate::connection::Connection for Connection { Connection::Primary(conn) => conn.diagnostics(), Connection::Replica(conn) => conn.diagnostics(), Connection::Schema(conn) => conn.diagnostics(), + Connection::LibsqlPrimary(conn) => conn.diagnostics(), + Connection::LibsqlReplica(conn) => conn.diagnostics(), + Connection::LibsqlSchema(conn) => conn.diagnostics(), + } + } + + fn with_raw(&self, f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + match self { + Connection::Primary(c) => c.with_raw(f), + Connection::Replica(c) => c.with_raw(f), + Connection::Schema(c) => c.with_raw(f), + Connection::LibsqlPrimary(c) => c.with_raw(f), + Connection::LibsqlReplica(c) => c.with_raw(f), + Connection::LibsqlSchema(c) => c.with_raw(f), } } } @@ -142,7 +191,10 @@ impl crate::connection::Connection for Connection { pub enum Database { Primary(PrimaryDatabase), Replica(ReplicaDatabase), - Schema(SchemaDatabase), + Schema(SchemaDatabase), + LibsqlPrimary(LibsqlPrimaryDatabase), + LibsqlReplica(LibsqlReplicaDatabase), + LibsqlSchema(SchemaDatabase), } impl fmt::Debug for Database { @@ -150,7 +202,10 @@ impl fmt::Debug for Database { match self { Self::Primary(_) => write!(f, "Primary"), Self::Replica(_) => write!(f, "Replica"), - 
Database::Schema(_) => write!(f, "Schema"), + Self::Schema(_) => write!(f, "Schema"), + Self::LibsqlPrimary(_) => write!(f, "LibsqlPrimary"), + Self::LibsqlReplica(_) => write!(f, "LibsqlReplica"), + Self::LibsqlSchema(_) => write!(f, "LibsqlSchema"), } } } @@ -161,6 +216,9 @@ impl Database { Database::Primary(db) => Arc::new(db.connection_maker().map(Connection::Primary)), Database::Replica(db) => Arc::new(db.connection_maker().map(Connection::Replica)), Database::Schema(db) => Arc::new(db.connection_maker().map(Connection::Schema)), + Database::LibsqlPrimary(db) => Arc::new(db.connection_maker().map(Connection::LibsqlPrimary)), + Database::LibsqlReplica(db) => Arc::new(db.connection_maker().map(Connection::LibsqlReplica)), + Database::LibsqlSchema(db) => Arc::new(db.connection_maker().map(Connection::LibsqlSchema)), } } @@ -169,6 +227,9 @@ impl Database { Database::Primary(db) => db.destroy(), Database::Replica(db) => db.destroy(), Database::Schema(db) => db.destroy(), + Database::LibsqlPrimary(db) => db.destroy(), + Database::LibsqlReplica(db) => db.destroy(), + Database::LibsqlSchema(db) => db.destroy(), } } @@ -177,6 +238,9 @@ impl Database { Database::Primary(db) => db.shutdown().await, Database::Replica(db) => db.shutdown().await, Database::Schema(db) => db.shutdown().await, + Database::LibsqlPrimary(db) => db.shutdown().await, + Database::LibsqlReplica(db) => db.shutdown().await, + Database::LibsqlSchema(db) => db.shutdown().await, } } @@ -184,7 +248,18 @@ impl Database { match self { Database::Primary(p) => Some(p.wal_wrapper.wrapper().logger()), Database::Replica(_) => None, - Database::Schema(s) => Some(s.wal_wrapper.wrapper().logger()), + Database::Schema(s) => Some(s.wal_wrapper.as_ref().unwrap().wrapper().logger()), + Database::LibsqlPrimary(_) => None, + Database::LibsqlReplica(_) => None, + Database::LibsqlSchema(s) => Some(s.wal_wrapper.as_ref().unwrap().wrapper().logger()), + } + } + + pub fn block_writes(&self) -> Option> { + match self { + 
Self::Primary(p) => Some(p.block_writes.clone()), + Self::LibsqlPrimary(p) => Some(p.block_writes.clone()), + _ => None, } } @@ -199,28 +274,32 @@ impl Database { ), Database::Replica(_) => None, Database::Schema(s) => Some( - s.wal_wrapper - .wrapper() - .logger() - .new_frame_notifier - .subscribe(), + s + .wal_wrapper + .as_ref() + .unwrap() + .wrapper() + .logger() + .new_frame_notifier + .subscribe(), ), + Database::LibsqlPrimary(_) => todo!(), + Database::LibsqlReplica(_) => todo!(), + Database::LibsqlSchema(_) => todo!(), } } - pub fn as_primary(&self) -> Option<&PrimaryDatabase> { - if let Self::Primary(v) = self { - Some(v) - } else { - None + pub fn is_primary(&self) -> bool { + match self { + Self::LibsqlPrimary(_) | Self::Primary(_) => true, + _ => false, } } - pub(crate) fn as_schema(&self) -> Option<&SchemaDatabase> { - if let Self::Schema(v) = self { - Some(v) - } else { - None + pub(crate) fn is_schema(&self) -> bool { + match self { + Self::Schema(_) => true, + _ => false, } } @@ -229,6 +308,9 @@ impl Database { Database::Primary(db) => db.replicator(), Database::Replica(_) => None, Database::Schema(db) => db.replicator(), + Database::LibsqlPrimary(_) => None, + Database::LibsqlReplica(_) => None, + Database::LibsqlSchema(_) => None, } } } diff --git a/libsql-server/src/database/primary.rs b/libsql-server/src/database/primary.rs index 95f146684f..c8c9ac9890 100644 --- a/libsql-server/src/database/primary.rs +++ b/libsql-server/src/database/primary.rs @@ -1,14 +1,14 @@ use std::sync::atomic::AtomicBool; use std::sync::Arc; -use crate::connection::libsql::{LibSqlConnection, MakeLibSqlConn}; +use crate::connection::legacy::{LegacyConnection, MakeLegacyConnection}; use crate::connection::{MakeThrottledConnection, TrackedConnection}; use crate::namespace::replication_wal::ReplicationWalWrapper; use super::Result; -pub type PrimaryConnection = TrackedConnection>; -pub type PrimaryConnectionMaker = MakeThrottledConnection>; +pub type PrimaryConnection = 
TrackedConnection>; +pub type PrimaryConnectionMaker = MakeThrottledConnection>; pub struct PrimaryDatabase { pub wal_wrapper: ReplicationWalWrapper, diff --git a/libsql-server/src/database/replica.rs b/libsql-server/src/database/replica.rs index c559d68a58..3ffa12587d 100644 --- a/libsql-server/src/database/replica.rs +++ b/libsql-server/src/database/replica.rs @@ -1,15 +1,19 @@ use std::sync::Arc; use libsql_replication::rpc::proxy::ExecResp; +use libsql_sys::wal::wrapper::PassthroughWalWrapper; use tonic::Streaming; +use crate::connection::legacy::{LegacyConnection, MakeLegacyConnection}; use crate::connection::write_proxy::{MakeWriteProxyConn, WriteProxyConnection}; use crate::connection::{MakeThrottledConnection, TrackedConnection}; use super::Result; -pub type ReplicaConnection = TrackedConnection>>; -type ReplicaConnectionMaker = MakeThrottledConnection; +pub type ReplicaConnection = TrackedConnection< + WriteProxyConnection, LegacyConnection>, +>; +type ReplicaConnectionMaker = MakeThrottledConnection>>; pub struct ReplicaDatabase { pub connection_maker: Arc, diff --git a/libsql-server/src/database/schema.rs b/libsql-server/src/database/schema.rs index 0b9674bd60..182247fe63 100644 --- a/libsql-server/src/database/schema.rs +++ b/libsql-server/src/database/schema.rs @@ -10,24 +10,21 @@ use crate::namespace::NamespaceName; use crate::query_result_builder::QueryBuilderConfig; use crate::schema::{perform_migration, validate_migration, MigrationJobStatus, SchedulerHandle}; -use super::primary::PrimaryConnectionMaker; -use super::PrimaryConnection; - -pub struct SchemaConnection { +pub struct SchemaConnection { migration_scheduler: SchedulerHandle, schema: NamespaceName, - connection: Arc, + connection: Arc, config: MetaStoreHandle, } -impl SchemaConnection { - pub(crate) fn connection(&self) -> &PrimaryConnection { +impl SchemaConnection { + pub(crate) fn connection(&self) -> &C { &self.connection } } #[async_trait::async_trait] -impl 
crate::connection::Connection for SchemaConnection { +impl crate::connection::Connection for SchemaConnection { async fn execute_program( &self, mut migration: Program, @@ -140,20 +137,35 @@ impl crate::connection::Connection for SchemaConnection { fn diagnostics(&self) -> String { self.connection.diagnostics() } + + fn with_raw(&self, f: impl FnOnce(&mut rusqlite::Connection) -> R) -> R { + self.connection().with_raw(f) + } } -#[derive(Clone)] -pub struct SchemaDatabase { +pub struct SchemaDatabase { migration_scheduler: SchedulerHandle, schema: NamespaceName, - connection_maker: Arc, - pub wal_wrapper: ReplicationWalWrapper, + connection_maker: Arc, + pub wal_wrapper: Option, config: MetaStoreHandle, } +impl Clone for SchemaDatabase { + fn clone(&self) -> Self { + Self { + migration_scheduler: self.migration_scheduler.clone(), + schema: self.schema.clone(), + connection_maker: self.connection_maker.clone(), + wal_wrapper: self.wal_wrapper.clone(), + config: self.config.clone(), + } + } +} + #[async_trait::async_trait] -impl MakeConnection for SchemaDatabase { - type Connection = SchemaConnection; +impl MakeConnection for SchemaDatabase { + type Connection = SchemaConnection; async fn create(&self) -> crate::Result { let connection = Arc::new(self.connection_maker.create().await?); @@ -166,16 +178,16 @@ impl MakeConnection for SchemaDatabase { } } -impl SchemaDatabase { +impl SchemaDatabase { pub fn new( migration_scheduler: SchedulerHandle, schema: NamespaceName, - connection_maker: PrimaryConnectionMaker, - wal_wrapper: ReplicationWalWrapper, + connection_maker: Arc, + wal_wrapper: Option, config: MetaStoreHandle, ) -> Self { Self { - connection_maker: connection_maker.into(), + connection_maker, migration_scheduler, schema, wal_wrapper, @@ -184,16 +196,18 @@ impl SchemaDatabase { } pub(crate) async fn shutdown(self) -> Result<(), anyhow::Error> { - self.wal_wrapper - .wrapper() - .logger() - .closed_signal - .send_replace(true); - let wal_manager = 
self.wal_wrapper; - - if let Some(maybe_replicator) = wal_manager.wrapped().as_ref() { - if let Some(mut replicator) = maybe_replicator.shutdown().await { - replicator.shutdown_gracefully().await?; + if let Some(wrapper) = self.wal_wrapper { + wrapper + .wrapper() + .logger() + .closed_signal + .send_replace(true); + let wal_manager = wrapper; + + if let Some(maybe_replicator) = wal_manager.wrapped().as_ref() { + if let Some(mut replicator) = maybe_replicator.shutdown().await { + replicator.shutdown_gracefully().await?; + } } } @@ -201,11 +215,13 @@ impl SchemaDatabase { } pub(crate) fn destroy(&self) { - self.wal_wrapper - .wrapper() - .logger() - .closed_signal - .send_replace(true); + if let Some(ref wrapper) = self.wal_wrapper { + wrapper + .wrapper() + .logger() + .closed_signal + .send_replace(true); + } } pub(crate) fn connection_maker(&self) -> Self { @@ -215,8 +231,10 @@ impl SchemaDatabase { pub(crate) fn replicator( &self, ) -> Option>>> { - if let Some(wal) = self.wal_wrapper.wrapped() { - return Some(wal.replicator()); + if let Some(ref wrapper) = self.wal_wrapper { + if let Some(wal) = wrapper.wrapped() { + return Some(wal.replicator()); + } } None } diff --git a/libsql-server/src/error.rs b/libsql-server/src/error.rs index 371630abdf..9cd0b81485 100644 --- a/libsql-server/src/error.rs +++ b/libsql-server/src/error.rs @@ -4,7 +4,7 @@ use tonic::metadata::errors::InvalidMetadataValueBytes; use crate::{ auth::AuthError, - namespace::{ForkError, NamespaceName}, + namespace::{configurator::fork::ForkError, NamespaceName}, query_result_builder::QueryResultBuilderError, }; diff --git a/libsql-server/src/hrana/batch.rs b/libsql-server/src/hrana/batch.rs index a8cabb1d8e..3fb31e2a8c 100644 --- a/libsql-server/src/hrana/batch.rs +++ b/libsql-server/src/hrana/batch.rs @@ -1,5 +1,6 @@ use anyhow::{anyhow, bail, Result}; use std::collections::HashMap; +use std::sync::Arc; use crate::connection::program::{Cond, Program, Step}; use crate::connection::{Connection, 
RequestContext}; @@ -139,7 +140,7 @@ pub fn proto_sequence_to_program(sql: &str) -> Result { Step { cond, query } }) .collect(); - Ok(Program { steps }) + Ok(Program { steps: Arc::new(steps) }) } pub async fn execute_sequence( diff --git a/libsql-server/src/http/user/dump.rs b/libsql-server/src/http/user/dump.rs index ec3486f9a3..41f88bcff6 100644 --- a/libsql-server/src/http/user/dump.rs +++ b/libsql-server/src/http/user/dump.rs @@ -10,7 +10,7 @@ use serde::Deserialize; use crate::auth::Authenticated; use crate::connection::dump::exporter::export_dump; -use crate::connection::MakeConnection; +use crate::connection::Connection as _; use crate::error::Error; use crate::BLOCKING_RT; @@ -98,7 +98,8 @@ pub(super) async fn handle_dump( let conn_maker = state .namespaces .with(namespace, |ns| { - ns.db.as_primary().unwrap().connection_maker() + assert!(ns.db.is_primary()); + ns.db.connection_maker() }) .await .unwrap(); diff --git a/libsql-server/src/http/user/mod.rs b/libsql-server/src/http/user/mod.rs index 2432370aab..d7f5b30209 100644 --- a/libsql-server/src/http/user/mod.rs +++ b/libsql-server/src/http/user/mod.rs @@ -22,11 +22,11 @@ use axum_extra::middleware::option_layer; use base64::prelude::BASE64_STANDARD_NO_PAD; use base64::Engine; use hyper::{header, Body, Request, Response, StatusCode}; +use libsql_replication::rpc::replication::replication_log_server::{ReplicationLog, ReplicationLogServer}; use serde::de::DeserializeOwned; use serde::Serialize; use serde_json::Number; -use tokio::sync::{mpsc, oneshot, Notify}; -use tokio::task::JoinSet; +use tokio::sync::{mpsc, oneshot}; use tonic::transport::Server; use tower_http::compression::predicate::NotForContentType; @@ -36,7 +36,7 @@ use tower_http::{compression::CompressionLayer, cors}; use crate::auth::{Auth, AuthError, Authenticated, Jwt, Permission, UserAuthContext}; use crate::connection::{Connection, RequestContext}; use crate::error::Error; -use crate::hrana; +use crate::{hrana, TaskManager}; use 
crate::http::user::db_factory::MakeConnectionExtractorPath; use crate::http::user::timing::timings_middleware; use crate::http::user::types::HttpQuery; @@ -47,8 +47,6 @@ use crate::query::{self, Query}; use crate::query_analysis::{predict_final_state, Statement, TxnStatus}; use crate::query_result_builder::QueryResultBuilder; use crate::rpc::proxy::rpc::proxy_server::{Proxy, ProxyServer}; -use crate::rpc::replication_log::rpc::replication_log_server::ReplicationLog; -use crate::rpc::ReplicationLogServer; use crate::schema::{MigrationDetails, MigrationSummary}; use crate::utils::services::idle_shutdown::IdleShutdownKicker; use crate::version; @@ -256,7 +254,6 @@ pub struct UserApi { pub enable_console: bool, pub self_url: Option, pub primary_url: Option, - pub shutdown: Arc, } impl UserApi @@ -265,12 +262,12 @@ where P: Proxy, S: ReplicationLog, { - pub fn configure(self, join_set: &mut JoinSet>) -> Arc { + pub fn configure(self, task_manager: &mut TaskManager) -> Arc { let (hrana_accept_tx, hrana_accept_rx) = mpsc::channel(8); let (hrana_upgrade_tx, hrana_upgrade_rx) = mpsc::channel(8); let hrana_http_srv = Arc::new(hrana::http::Server::new(self.self_url.clone())); - join_set.spawn({ + task_manager.spawn_until_shutdown({ let namespaces = self.namespaces.clone(); let user_auth_strategy = self.user_auth_strategy.clone(); let idle_kicker = self @@ -296,7 +293,7 @@ where } }); - join_set.spawn({ + task_manager.spawn_until_shutdown({ let server = hrana_http_srv.clone(); async move { server.run_expire().await; @@ -305,7 +302,7 @@ where }); if let Some(acceptor) = self.hrana_ws_acceptor { - join_set.spawn(async move { + task_manager.spawn_until_shutdown(async move { hrana::ws::listen(acceptor, hrana_accept_tx).await; Ok(()) }); @@ -446,10 +443,10 @@ where let router = router.fallback(handle_fallback); let h2c = crate::h2c::H2cMaker::new(router); - join_set.spawn(async move { + task_manager.spawn_with_shutdown_notify(|shutdown| async move { 
hyper::server::Server::builder(acceptor) .serve(h2c) - .with_graceful_shutdown(self.shutdown.notified()) + .with_graceful_shutdown(shutdown.notified()) .await .context("http server")?; Ok(()) diff --git a/libsql-server/src/lib.rs b/libsql-server/src/lib.rs index 5404a11108..71320fc486 100644 --- a/libsql-server/src/lib.rs +++ b/libsql-server/src/lib.rs @@ -4,7 +4,6 @@ use std::alloc::Layout; use std::ffi::c_void; use std::mem::{align_of, size_of}; use std::path::{Path, PathBuf}; -use std::pin::Pin; use std::str::FromStr; use std::sync::{Arc, Weak}; @@ -18,14 +17,18 @@ use crate::pager::{make_pager, PAGER_CACHE_SIZE}; use crate::rpc::proxy::rpc::proxy_server::Proxy; use crate::rpc::proxy::ProxyService; use crate::rpc::replica_proxy::ReplicaProxyService; -use crate::rpc::replication_log::rpc::replication_log_server::ReplicationLog; -use crate::rpc::replication_log::ReplicationLogService; -use crate::rpc::replication_log_proxy::ReplicationLogProxyService; +use crate::rpc::replication::libsql_replicator::LibsqlReplicationService; +use crate::rpc::replication::replication_log::rpc::replication_log_server::ReplicationLog; +use crate::rpc::replication::replication_log::ReplicationLogService; +use crate::rpc::replication::replication_log_proxy::ReplicationLogProxyService; use crate::rpc::run_rpc_server; use crate::schema::Scheduler; use crate::stats::Stats; use anyhow::Context as AnyhowContext; use auth::Auth; +use aws_config::{BehaviorVersion, Region}; +use aws_sdk_s3::config::{Credentials, SharedCredentialsProvider}; +use aws_smithy_runtime::client::http::hyper_014::HyperClientBuilder; use config::{ AdminApiConfig, DbConfig, HeartbeatConfig, RpcClientConfig, RpcServerConfig, UserApiConfig, }; @@ -33,20 +36,26 @@ use futures::future::ready; use futures::Future; use http::user::UserApi; use hyper::client::HttpConnector; +use hyper::Uri; use hyper_rustls::HttpsConnector; +use libsql_replication::rpc::replication::BoxReplicationService; #[cfg(feature = "durable-wal")] use 
libsql_storage::{DurableWalManager, LockManager}; +use libsql_sys::wal::either::Either; #[cfg(not(feature = "durable-wal"))] use libsql_sys::wal::either::Either as EitherWAL; #[cfg(feature = "durable-wal")] use libsql_sys::wal::either::Either3 as EitherWAL; use libsql_sys::wal::Sqlite3WalManager; use libsql_wal::checkpointer::LibsqlCheckpointer; +use libsql_wal::io::StdIO; use libsql_wal::registry::WalRegistry; +use libsql_wal::segment::sealed::SealedSegment; +use libsql_wal::storage::async_storage::{AsyncStorage, AsyncStorageInitConfig}; +use libsql_wal::storage::backend::s3::S3Backend; use libsql_wal::storage::NoStorage; -use libsql_wal::wal::LibsqlWalManager; use namespace::meta_store::MetaStoreHandle; -use namespace::{NamespaceConfig, NamespaceName}; +use namespace::NamespaceName; use net::Connector; use once_cell::sync::Lazy; use rusqlite::ffi::SQLITE_CONFIG_MALLOC; @@ -55,14 +64,21 @@ use tokio::runtime::Runtime; use tokio::sync::{mpsc, Notify, Semaphore}; use tokio::task::JoinSet; use tokio::time::Duration; +use tonic::transport::Channel; use url::Url; use utils::services::idle_shutdown::IdleShutdownKicker; use self::config::MetaStoreConfig; use self::connection::connection_manager::InnerWalManager; +use self::namespace::configurator::{ + BaseNamespaceConfig, LibsqlPrimaryConfigurator, LibsqlReplicaConfigurator, + LibsqlSchemaConfigurator, NamespaceConfigurators, PrimaryConfig, PrimaryConfigurator, + ReplicaConfigurator, SchemaConfigurator, +}; use self::namespace::NamespaceStore; use self::net::AddrIncoming; use self::replication::script_backup_manager::{CommandHandler, ScriptBackupManager}; +use self::schema::SchedulerHandle; pub mod auth; mod broadcaster; @@ -108,6 +124,16 @@ pub(crate) static BLOCKING_RT: Lazy = Lazy::new(|| { type Result = std::result::Result; type StatsSender = mpsc::Sender<(NamespaceName, MetaStoreHandle, Weak)>; +type MakeReplicationSvc = Box< + dyn FnOnce( + NamespaceStore, + Option, + Option, + bool, + ) -> BoxReplicationService + 
+ Send + + 'static, +>; // #[global_allocator] // static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; @@ -142,6 +168,7 @@ pub struct Server, pub storage_server_address: String, + pub connector: Option, } impl Default for Server { @@ -165,6 +192,7 @@ impl Default for Server { shutdown_timeout: Duration::from_secs(30), use_custom_wal: None, storage_server_address: Default::default(), + connector: None, } } } @@ -180,9 +208,77 @@ struct Services { disable_default_namespace: bool, db_config: DbConfig, user_auth_strategy: Auth, +} + +struct TaskManager { + join_set: JoinSet>, shutdown: Arc, } +impl TaskManager { + /// pass a shutdown notifier to the task. The task must shutdown upon receiving a signal + pub fn spawn_with_shutdown_notify(&mut self, f: F) + where F: FnOnce(Arc) -> Fut, + Fut: Future> + Send + 'static, + { + let fut = f(self.shutdown.clone()); + self.join_set.spawn(fut); + } + + pub fn spawn_until_shutdown(&mut self, fut: F) + where + F: Future> + Send + 'static, + { + self.spawn_until_shutdown_with_teardown(fut, ready(Ok(()))) + } + + /// run the passed future until shutdown is called, then call the passed teardown future + #[track_caller] + pub fn spawn_until_shutdown_with_teardown( + &mut self, + fut: F, + teardown: T, + ) where + F: Future> + Send + 'static, + T: Future> + Send + 'static, + { + let shutdown = self.shutdown.clone(); + self.join_set.spawn(async move { + tokio::select! { + _ = shutdown.notified() => { + let ret = teardown.await; + if let Err(ref e) = ret { + let caller = std::panic::Location::caller(); + tracing::error!(caller = caller.to_string(), "task teardown returned an error: {e}"); + } + ret + }, + ret = fut => ret + } + }); + } + + fn new() -> Self { + Self { join_set: JoinSet::new(), shutdown: Arc::new(Notify::new()) } + } + + pub async fn shutdown(&mut self) -> anyhow::Result<()> { + self.shutdown.notify_waiters(); + while let Some(ret) = self.join_set.join_next().await { + ret?? 
+ } + + Ok(()) + } + + pub async fn join_next(&mut self) -> anyhow::Result<()> { + if let Some(ret) = self.join_set.join_next().await { + ret??; + } + Ok(()) + } +} + impl Services where A: crate::net::Accept, @@ -190,7 +286,7 @@ where S: ReplicationLog, C: Connector, { - fn configure(self, join_set: &mut JoinSet>) { + fn configure(self, task_manager: &mut TaskManager) { let user_http = UserApi { http_acceptor: self.user_api_config.http_acceptor, hrana_ws_acceptor: self.user_api_config.hrana_ws_acceptor, @@ -205,10 +301,9 @@ where enable_console: self.user_api_config.enable_http_console, self_url: self.user_api_config.self_url, primary_url: self.user_api_config.primary_url, - shutdown: self.shutdown.clone(), }; - let user_http_service = user_http.configure(join_set); + let user_http_service = user_http.configure(task_manager); if let Some(AdminApiConfig { acceptor, @@ -216,8 +311,7 @@ where disable_metrics, }) = self.admin_api_config { - let shutdown = self.shutdown.clone(); - join_set.spawn(http::admin::run( + task_manager.spawn_with_shutdown_notify(|shutdown| http::admin::run( acceptor, user_http_service, self.namespace_store, @@ -229,6 +323,9 @@ where } } +pub type SqldStorage = + Either, SealedSegment>, NoStorage>; + #[tracing::instrument(skip(connection_maker))] async fn run_periodic_checkpoint( connection_maker: Arc, @@ -326,7 +423,7 @@ where fn spawn_monitoring_tasks( &self, - join_set: &mut JoinSet>, + task_manager: &mut TaskManager, stats_receiver: mpsc::Receiver<(NamespaceName, MetaStoreHandle, Weak)>, ) -> anyhow::Result<()> { match self.heartbeat_config { @@ -336,7 +433,8 @@ where config.heartbeat_url.as_deref().unwrap_or(""), config.heartbeat_period, ); - join_set.spawn({ + + task_manager.spawn_until_shutdown({ let heartbeat_auth = config.heartbeat_auth.clone(); let heartbeat_period = config.heartbeat_period; let heartbeat_url = if let Some(url) = &config.heartbeat_url { @@ -373,7 +471,6 @@ where proxy_service: P, replication_service: L, 
user_auth_strategy: Auth, - shutdown: Arc, ) -> Services { Services { namespace_store, @@ -386,13 +483,12 @@ where disable_default_namespace: self.disable_default_namespace, db_config: self.db_config, user_auth_strategy, - shutdown, } } pub async fn start(mut self) -> anyhow::Result<()> { static INIT: std::sync::Once = std::sync::Once::new(); - let mut join_set = JoinSet::new(); + let mut task_manager = TaskManager::new(); if std::env::var("LIBSQL_SQLITE_MIMALLOC").is_ok() { setup_sqlite_alloc(); @@ -423,60 +519,47 @@ where let extensions = self.db_config.validate_extensions()?; let user_auth_strategy = self.user_api_config.auth_strategy.clone(); - let service_shutdown = Arc::new(Notify::new()); - let db_kind = if self.rpc_client_config.is_some() { - DatabaseKind::Replica - } else { - DatabaseKind::Primary - }; - let scripted_backup = match self.db_config.snapshot_exec { Some(ref command) => { let (scripted_backup, script_backup_task) = ScriptBackupManager::new(&self.path, CommandHandler::new(command.to_string())) .await?; - join_set.spawn(script_backup_task.run()); + task_manager.spawn_until_shutdown(script_backup_task.run()); Some(scripted_backup) } None => None, }; - let (channel, uri) = match self.rpc_client_config { - Some(ref config) => { - let (channel, uri) = config.configure().await?; - (Some(channel), Some(uri)) - } - None => (None, None), + let db_kind = match self.rpc_client_config { + Some(_) => DatabaseKind::Replica, + _ => DatabaseKind::Primary, }; + let client_config = self.get_client_config().await?; let (scheduler_sender, scheduler_receiver) = mpsc::channel(128); - let (stats_sender, stats_receiver) = mpsc::channel(1024); - // chose the wal backend - let (make_wal_manager, registry_shutdown) = self.configure_wal_manager(&mut join_set)?; - - let ns_config = NamespaceConfig { - db_kind, + let base_config = BaseNamespaceConfig { base_path: self.path.clone(), - max_log_size: self.db_config.max_log_size, - max_log_duration: 
self.db_config.max_log_duration.map(Duration::from_secs_f32), - bottomless_replication: self.db_config.bottomless_replication.clone(), extensions, - stats_sender: stats_sender.clone(), + stats_sender, max_response_size: self.db_config.max_response_size, max_total_response_size: self.db_config.max_total_response_size, - checkpoint_interval: self.db_config.checkpoint_interval, - encryption_config: self.db_config.encryption_config.clone(), max_concurrent_connections: Arc::new(Semaphore::new(self.max_concurrent_connections)), - scripted_backup, max_concurrent_requests: self.db_config.max_concurrent_requests, - channel: channel.clone(), - uri: uri.clone(), - migration_scheduler: scheduler_sender.into(), - make_wal_manager, + encryption_config: self.db_config.encryption_config.clone(), }; + let (configurators, make_replication_svc) = self + .make_configurators_and_replication_svc( + base_config, + client_config.clone(), + &mut task_manager, + scheduler_sender.into(), + scripted_backup, + ) + .await?; + let (metastore_conn_maker, meta_store_wal_manager) = metastore_connection_maker(self.meta_store_config.bottomless.clone(), &self.path) .await?; @@ -488,35 +571,18 @@ where meta_store_wal_manager, ) .await?; + let namespace_store: NamespaceStore = NamespaceStore::new( db_kind.is_replica(), self.db_config.snapshot_at_shutdown, self.max_active_namespaces, - ns_config, meta_store, + configurators, + db_kind, ) .await?; - let meta_conn = metastore_conn_maker()?; - let scheduler = Scheduler::new(namespace_store.clone(), meta_conn).await?; - - join_set.spawn(async move { - scheduler.run(scheduler_receiver).await; - Ok(()) - }); - - self.spawn_monitoring_tasks(&mut join_set, stats_receiver)?; - - // eagerly load the default namespace when namespaces are disabled - if self.disable_namespaces && db_kind.is_primary() { - namespace_store - .create( - NamespaceName::default(), - namespace::RestoreOption::Latest, - Default::default(), - ) - .await?; - } + 
self.spawn_monitoring_tasks(&mut task_manager, stats_receiver)?; // if namespaces are enabled, then bottomless must have set DB ID if !self.disable_namespaces { @@ -532,7 +598,7 @@ where let proxy_service = ProxyService::new(namespace_store.clone(), None, self.disable_namespaces); // Garbage collect proxy clients every 30 seconds - join_set.spawn({ + task_manager.spawn_until_shutdown({ let clients = proxy_service.clients(); async move { loop { @@ -541,21 +607,49 @@ where } } }); - join_set.spawn(run_rpc_server( - proxy_service, - config.acceptor, - config.tls_config, - idle_shutdown_kicker.clone(), + + let replication_service = make_replication_svc( namespace_store.clone(), - self.disable_namespaces, - )); + None, + idle_shutdown_kicker.clone(), + false, + ); + + task_manager.spawn_until_shutdown( + run_rpc_server( + proxy_service, + config.acceptor, + config.tls_config, + idle_shutdown_kicker.clone(), + replication_service, + ), + ); } let shutdown_timeout = self.shutdown_timeout.clone(); let shutdown = self.shutdown.clone(); + let service_shutdown = Arc::new(Notify::new()); // setup user-facing rpc services match db_kind { DatabaseKind::Primary => { + // The migration scheduler is only useful on the primary + let meta_conn = metastore_conn_maker()?; + let scheduler = Scheduler::new(namespace_store.clone(), meta_conn).await?; + task_manager.spawn_until_shutdown(async move { + scheduler.run(scheduler_receiver).await; + Ok(()) + }); + + if self.disable_namespaces { + namespace_store + .create( + NamespaceName::default(), + namespace::RestoreOption::Latest, + Default::default(), + ) + .await?; + } + let replication_svc = ReplicationLogService::new( namespace_store.clone(), idle_shutdown_kicker.clone(), @@ -571,7 +665,7 @@ where ); // Garbage collect proxy clients every 30 seconds - join_set.spawn({ + task_manager.spawn_until_shutdown({ let clients = proxy_svc.clients(); async move { loop { @@ -587,16 +681,15 @@ where proxy_svc, replication_svc, 
user_auth_strategy.clone(), - service_shutdown.clone(), ) - .configure(&mut join_set); + .configure(&mut task_manager); } DatabaseKind::Replica => { - let replication_svc = - ReplicationLogProxyService::new(channel.clone().unwrap(), uri.clone().unwrap()); + let (channel, uri) = client_config.clone().unwrap(); + let replication_svc = ReplicationLogProxyService::new(channel.clone(), uri.clone()); let proxy_svc = ReplicaProxyService::new( - channel.clone().unwrap(), - uri.clone().unwrap(), + channel, + uri, namespace_store.clone(), user_auth_strategy.clone(), self.disable_namespaces, @@ -608,19 +701,18 @@ where proxy_svc, replication_svc, user_auth_strategy, - service_shutdown.clone(), ) - .configure(&mut join_set); + .configure(&mut task_manager); } }; tokio::select! { _ = shutdown.notified() => { let shutdown = async { - join_set.shutdown().await; + task_manager.shutdown().await?; + // join_set.shutdown().await; service_shutdown.notify_waiters(); namespace_store.shutdown().await?; - registry_shutdown.await?; Ok::<_, crate::Error>(()) }; @@ -640,8 +732,8 @@ where } } - Some(res) = join_set.join_next() => { - res??; + res = task_manager.join_next() => { + res?; }, else => (), } @@ -649,21 +741,15 @@ where Ok(()) } - fn setup_shutdown(&self) -> Option { - let shutdown_notify = self.shutdown.clone(); - self.idle_shutdown_timeout.map(|d| { - IdleShutdownKicker::new(d, self.initial_idle_shutdown_timeout, shutdown_notify) - }) - } - - fn configure_wal_manager( + async fn make_configurators_and_replication_svc( &self, - join_set: &mut JoinSet>, - ) -> anyhow::Result<( - Arc InnerWalManager + Sync + Send + 'static>, - Pin> + Send + Sync + 'static>>, - )> { - let wal_path = self.path.join("wals"); + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + task_manager: &mut TaskManager, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result<(NamespaceConfigurators, MakeReplicationSvc)> { + let wal_path = 
base_config.base_path.join("wals"); let enable_libsql_wal_test = { let is_primary = self.rpc_server_config.is_some(); let is_libsql_wal_test = std::env::var("LIBSQL_WAL_TEST").is_ok(); @@ -677,16 +763,107 @@ where } } - if self.use_custom_wal.is_some() { + #[cfg(feature = "durable-wal")] + if let Some(CustomWAL::DurableWal) = self.use_custom_wal { if self.db_config.bottomless_replication.is_some() { - anyhow::bail!("bottomless not supported with custom WAL"); + anyhow::bail!("bottomless not supported with durable WAL"); } - if self.rpc_client_config.is_some() { - anyhow::bail!("custom WAL not supported in replica mode"); + } + + match self.use_custom_wal { + Some(CustomWAL::LibsqlWal) => self.libsql_wal_configurators( + base_config, + client_config, + task_manager, + migration_scheduler_handle, + scripted_backup, + wal_path, + ).await, + #[cfg(feature = "durable-wal")] + Some(CustomWAL::DurableWal) => self.durable_wal_configurators( + base_config, + client_config, + migration_scheduler_handle, + scripted_backup, + ), + None => { + self.legacy_configurators( + base_config, + client_config, + migration_scheduler_handle, + scripted_backup, + ) + .await } } + } - let namespace_resolver = |path: &Path| { + async fn libsql_wal_configurators( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + task_manager: &mut TaskManager, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + wal_path: PathBuf, + ) -> anyhow::Result<(NamespaceConfigurators, MakeReplicationSvc)> { + tracing::info!("using libsql wal"); + let (sender, receiver) = tokio::sync::mpsc::channel(64); + let storage = if let Some(ref opt) = self.db_config.bottomless_replication { + if client_config.is_some() { + anyhow::bail!("bottomless cannot be enabled on replicas"); + } + + let config = aws_config::load_defaults(BehaviorVersion::latest()).await; + let http_client = HyperClientBuilder::new().build(self.connector.clone().unwrap()); + let mut builder = 
config.into_builder(); + builder.set_http_client(Some(http_client)); + builder.set_endpoint_url(opt.aws_endpoint.clone()); + builder.set_region(Region::new( + opt.region.clone().expect("expected aws region"), + )); + let cred = Credentials::new( + opt.access_key_id.as_ref().unwrap(), + opt.secret_access_key.as_ref().unwrap(), + None, + None, + "", + ); + builder.set_credentials_provider(Some(SharedCredentialsProvider::new(cred))); + let config = builder.build(); + let backend = S3Backend::from_sdk_config( + config, + opt.bucket_name.clone(), + opt.db_id.clone().expect("expected db id") + ).await?; + let config = AsyncStorageInitConfig { + backend: Arc::new(backend), + max_in_flight_jobs: 16, + }; + let (storage, storage_loop) = AsyncStorage::new(config).await; + + task_manager.spawn_with_shutdown_notify(|_| async move { + storage_loop.run().await; + Ok(()) + }); + + Either::A(storage) + } else { + Either::B(NoStorage) + }; + + if self.rpc_server_config.is_some() && matches!(storage, Either::B(_)) { + anyhow::bail!("replication without bottomless not supported yet"); + } + + let registry = Arc::new(WalRegistry::new(wal_path, storage, sender)?); + let checkpointer = LibsqlCheckpointer::new(registry.clone(), receiver, 8); + task_manager.spawn_with_shutdown_notify(|_| async move { + checkpointer.run().await; + Ok(()) + }); + + let namespace_resolver = Arc::new(|path: &Path| { NamespaceName::from_string( path.parent() .unwrap() @@ -698,50 +875,233 @@ where ) .unwrap() .into() - }; - - match self.use_custom_wal { - Some(CustomWAL::LibsqlWal) => { - let (sender, receiver) = tokio::sync::mpsc::channel(64); - let registry = Arc::new(WalRegistry::new(wal_path, NoStorage, sender)?); - let checkpointer = LibsqlCheckpointer::new(registry.clone(), receiver, 8); - join_set.spawn(async move { - checkpointer.run().await; - Ok(()) - }); + }); - let wal = LibsqlWalManager::new(registry.clone(), Arc::new(namespace_resolver)); - let shutdown_notify = self.shutdown.clone(); - let 
shutdown_fut = Box::pin(async move { - shutdown_notify.notified().await; - registry.shutdown().await?; - Ok(()) - }); + task_manager.spawn_with_shutdown_notify(|shutdown| { + let registry = registry.clone(); + async move { + shutdown.notified().await; + registry.shutdown().await?; + Ok(()) + } + }); - tracing::info!("using libsql wal"); - Ok((Arc::new(move || EitherWAL::B(wal.clone())), shutdown_fut)) + let make_replication_svc = Box::new({ + let registry = registry.clone(); + let disable_namespaces = self.disable_namespaces; + move |store, user_auth, _, _| -> BoxReplicationService { + Box::new(LibsqlReplicationService::new( + registry, + store, + user_auth, + disable_namespaces, + )) } - #[cfg(feature = "durable-wal")] - Some(CustomWAL::DurableWal) => { - tracing::info!("using durable wal"); - let lock_manager = Arc::new(std::sync::Mutex::new(LockManager::new())); - let wal = DurableWalManager::new( - lock_manager, + }); + let mut configurators = NamespaceConfigurators::empty(); + + match client_config { + // configure replica + Some((channel, uri)) => { + let replica_configurator = LibsqlReplicaConfigurator::new( + base_config, + registry.clone(), + uri, + channel, namespace_resolver, - self.storage_server_address.clone(), ); - Ok(( - Arc::new(move || EitherWAL::C(wal.clone())), - Box::pin(ready(Ok(()))), - )) + configurators.with_replica(replica_configurator); } + // configure primary None => { - tracing::info!("using sqlite3 wal"); - Ok(( - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - Box::pin(ready(Ok(()))), + let primary_config = PrimaryConfig { + max_log_size: self.db_config.max_log_size, + max_log_duration: self.db_config.max_log_duration.map(Duration::from_secs_f32), + bottomless_replication: self.db_config.bottomless_replication.clone(), + scripted_backup, + checkpoint_interval: self.db_config.checkpoint_interval, + }; + let primary_configurator = LibsqlPrimaryConfigurator::new( + base_config.clone(), + primary_config.clone(), + 
registry.clone(), + namespace_resolver.clone(), + ); + + let schema_configurator = LibsqlSchemaConfigurator::new( + base_config, + primary_config, + migration_scheduler_handle, + registry, + namespace_resolver, + ); + + configurators.with_primary(primary_configurator); + configurators.with_schema(schema_configurator); + } + } + + Ok((configurators, make_replication_svc)) + } + + #[cfg(feature = "durable-wal")] + fn durable_wal_configurators( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result<(NamespaceConfigurators, MakeReplicationSvc)> { + tracing::info!("using durable wal"); + let lock_manager = Arc::new(std::sync::Mutex::new(LockManager::new())); + let namespace_resolver = |path: &Path| { + NamespaceName::from_string( + path.parent() + .unwrap() + .file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(), + ) + .unwrap() + .into() + }; + let wal = DurableWalManager::new( + lock_manager, + namespace_resolver, + self.storage_server_address.clone(), + ); + let make_wal_manager = Arc::new(move || EitherWAL::C(wal.clone())); + let configurators = self.configurators_common( + base_config, + client_config, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + )?; + + let make_replication_svc = Box::new({ + let disable_namespaces = self.disable_namespaces; + move |store, client_auth, idle_shutdown, collect_stats| -> BoxReplicationService { + Box::new(ReplicationLogService::new( + store, + idle_shutdown, + client_auth, + disable_namespaces, + collect_stats, )) } + }); + + Ok((configurators, make_replication_svc)) + } + + async fn legacy_configurators( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result<(NamespaceConfigurators, MakeReplicationSvc)> { + let make_wal_manager = Arc::new(|| 
EitherWAL::A(Sqlite3WalManager::default())); + let configurators = self.configurators_common( + base_config, + client_config, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + )?; + + let make_replication_svc = Box::new({ + let disable_namespaces = self.disable_namespaces; + move |store, client_auth, idle_shutdown, collect_stats| -> BoxReplicationService { + Box::new(ReplicationLogService::new( + store, + idle_shutdown, + client_auth, + disable_namespaces, + collect_stats, + )) + } + }); + + Ok((configurators, make_replication_svc)) + } + + fn configurators_common( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result { + let mut configurators = NamespaceConfigurators::empty(); + match client_config { + // replica mode + Some((channel, uri)) => { + let replica_configurator = + ReplicaConfigurator::new(base_config, channel, uri, make_wal_manager); + configurators.with_replica(replica_configurator); + } + // primary mode + None => self.configure_primary_common( + base_config, + &mut configurators, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + ), + } + + Ok(configurators) + } + + fn configure_primary_common( + &self, + base_config: BaseNamespaceConfig, + configurators: &mut NamespaceConfigurators, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) { + let primary_config = PrimaryConfig { + max_log_size: self.db_config.max_log_size, + max_log_duration: self.db_config.max_log_duration.map(Duration::from_secs_f32), + bottomless_replication: self.db_config.bottomless_replication.clone(), + scripted_backup, + checkpoint_interval: self.db_config.checkpoint_interval, + }; + + let primary_configurator = PrimaryConfigurator::new( + 
base_config.clone(), + primary_config.clone(), + make_wal_manager.clone(), + ); + + let schema_configurator = SchemaConfigurator::new( + base_config.clone(), + primary_config, + make_wal_manager.clone(), + migration_scheduler_handle, + ); + + configurators.with_schema(schema_configurator); + configurators.with_primary(primary_configurator); + } + + fn setup_shutdown(&self) -> Option { + let shutdown_notify = self.shutdown.clone(); + self.idle_shutdown_timeout.map(|d| { + IdleShutdownKicker::new(d, self.initial_idle_shutdown_timeout, shutdown_notify) + }) + } + + async fn get_client_config(&self) -> anyhow::Result> { + match self.rpc_client_config { + Some(ref config) => Ok(Some(config.configure().await?)), + None => Ok(None), } } } diff --git a/libsql-server/src/main.rs b/libsql-server/src/main.rs index 6fce78a06a..edd448e5b9 100644 --- a/libsql-server/src/main.rs +++ b/libsql-server/src/main.rs @@ -651,6 +651,7 @@ async fn build_server(config: &Cli) -> anyhow::Result { .unwrap_or(Duration::from_secs(30)), use_custom_wal: config.use_custom_wal, storage_server_address: config.storage_server_address.clone(), + connector: Some(HttpConnector::new()), }) } diff --git a/libsql-server/src/metrics.rs b/libsql-server/src/metrics.rs index a71b5ca979..1ac97435b3 100644 --- a/libsql-server/src/metrics.rs +++ b/libsql-server/src/metrics.rs @@ -153,3 +153,8 @@ pub static LISTEN_EVENTS_DROPPED: Lazy = Lazy::new(|| { describe_counter!(NAME, "Number of listen events dropped"); register_counter!(NAME) }); +pub static QUERY_CANCELED: Lazy = Lazy::new(|| { + const NAME: &str = "libsql_server_query_canceled"; + describe_counter!(NAME, "Number of canceled queries"); + register_counter!(NAME) +}); diff --git a/libsql-server/src/namespace/fork.rs b/libsql-server/src/namespace/configurator/fork.rs similarity index 74% rename from libsql-server/src/namespace/fork.rs rename to libsql-server/src/namespace/configurator/fork.rs index dfa053b43d..4b3e58ee85 100644 --- 
a/libsql-server/src/namespace/fork.rs +++ b/libsql-server/src/namespace/configurator/fork.rs @@ -12,17 +12,71 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio::time::Duration; use tokio_stream::StreamExt; -use crate::namespace::ResolveNamespacePathFn; +use crate::database::Database; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{Namespace, NamespaceBottomlessDbId}; use crate::replication::primary::frame_stream::FrameStream; use crate::replication::{LogReadError, ReplicationLogger}; use crate::{BLOCKING_RT, LIBSQL_PAGE_SIZE}; -use super::broadcasters::BroadcasterHandle; -use super::meta_store::MetaStoreHandle; -use super::{Namespace, NamespaceConfig, NamespaceName, NamespaceStore, RestoreOption}; +use super::helpers::make_bottomless_options; +use super::{NamespaceName, NamespaceStore, PrimaryConfig, RestoreOption}; type Result = crate::Result; +pub(super) async fn fork( + from_ns: &Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + primary_config: &PrimaryConfig, + base_path: Arc, +) -> crate::Result { + let from_config = from_config.get(); + let bottomless_db_id = NamespaceBottomlessDbId::from_config(&from_config); + let restore_to = if let Some(timestamp) = timestamp { + if let Some(ref options) = primary_config.bottomless_replication { + Some(PointInTimeRestore { + timestamp, + replicator_options: make_bottomless_options( + options, + bottomless_db_id.clone(), + from_ns.name().clone(), + ), + }) + } else { + return Err(crate::Error::Fork(ForkError::BackupServiceNotConfigured)); + } + } else { + None + }; + + let logger = match &from_ns.db { + Database::Primary(db) => db.wal_wrapper.wrapper().logger(), + Database::Schema(db) => db.wal_wrapper.as_ref().unwrap().wrapper().logger(), + _ => { + return Err(crate::Error::Fork(ForkError::Internal(anyhow::Error::msg( + "Invalid source database type for fork", + )))); + } + }; + + let 
fork_task = ForkTask { + base_path, + to_namespace: to_ns.clone(), + logger, + restore_to, + to_config, + store, + }; + + let ns = fork_task.fork().await?; + + Ok(ns) +} + #[derive(Debug, thiserror::Error)] pub enum ForkError { #[error("internal error: {0}")] @@ -54,16 +108,13 @@ async fn write_frame(frame: &FrameBorrowed, temp_file: &mut tokio::fs::File) -> Ok(()) } -pub struct ForkTask<'a> { +pub struct ForkTask { pub base_path: Arc, pub logger: Arc, pub to_namespace: NamespaceName, pub to_config: MetaStoreHandle, pub restore_to: Option, - pub ns_config: &'a NamespaceConfig, - pub resolve_attach: ResolveNamespacePathFn, pub store: NamespaceStore, - pub broadcaster: BroadcasterHandle, } pub struct PointInTimeRestore { @@ -71,7 +122,7 @@ pub struct PointInTimeRestore { pub replicator_options: bottomless::replicator::Options, } -impl<'a> ForkTask<'a> { +impl ForkTask { pub async fn fork(self) -> Result { let base_path = self.base_path.clone(); let dest_namespace = self.to_namespace.clone(); @@ -105,18 +156,10 @@ impl<'a> ForkTask<'a> { let dest_path = self.base_path.join("dbs").join(self.to_namespace.as_str()); tokio::fs::rename(temp_dir.path(), dest_path).await?; - Namespace::from_config( - self.ns_config, - self.to_config.clone(), - RestoreOption::Latest, - &self.to_namespace, - Box::new(|_op| {}), - self.resolve_attach.clone(), - self.store.clone(), - self.broadcaster, - ) - .await - .map_err(|e| ForkError::CreateNamespace(Box::new(e))) + self.store + .make_namespace(&self.to_namespace, self.to_config, RestoreOption::Latest) + .await + .map_err(|e| ForkError::CreateNamespace(Box::new(e))) } /// Restores the database state from a local log file. 
diff --git a/libsql-server/src/namespace/configurator/helpers.rs b/libsql-server/src/namespace/configurator/helpers.rs new file mode 100644 index 0000000000..558ccee7ca --- /dev/null +++ b/libsql-server/src/namespace/configurator/helpers.rs @@ -0,0 +1,460 @@ +use std::path::{Path, PathBuf}; +use std::sync::Weak; +use std::sync::{atomic::AtomicBool, Arc}; +use std::time::Duration; + +use anyhow::Context as _; +use bottomless::replicator::Options; +use bytes::Bytes; +use enclose::enclose; +use futures::Stream; +use libsql_sys::wal::Sqlite3WalManager; +use libsql_sys::EncryptionConfig; +use tokio::io::AsyncBufReadExt as _; +use tokio::sync::watch; +use tokio::task::JoinSet; +use tokio_util::io::StreamReader; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::legacy::{open_conn, MakeLegacyConnection}; +use crate::connection::{Connection as _, MakeConnection as _}; +use crate::database::{PrimaryConnection, PrimaryConnectionMaker}; +use crate::error::LoadDumpError; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::replication_wal::{make_replication_wal_wrapper, ReplicationWalWrapper}; +use crate::namespace::{ + NamespaceBottomlessDbId, NamespaceBottomlessDbIdInit, NamespaceName, ResolveNamespacePathFn, + RestoreOption, +}; +use crate::replication::{FrameNo, ReplicationLogger}; +use crate::stats::Stats; +use crate::{StatsSender, BLOCKING_RT, DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT}; + +use super::{BaseNamespaceConfig, PrimaryConfig}; + +const WASM_TABLE_CREATE: &str = + "CREATE TABLE libsql_wasm_func_table (name text PRIMARY KEY, body text) WITHOUT ROWID;"; + +#[tracing::instrument(skip_all)] +pub(super) async fn make_primary_connection_maker( + primary_config: &PrimaryConfig, + base_config: &BaseNamespaceConfig, + meta_store_handle: &MetaStoreHandle, + db_path: &Path, + name: &NamespaceName, + 
restore_option: RestoreOption, + block_writes: Arc, + join_set: &mut JoinSet>, + resolve_attach_path: ResolveNamespacePathFn, + broadcaster: BroadcasterHandle, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + encryption_config: Option, +) -> crate::Result<(PrimaryConnectionMaker, ReplicationWalWrapper, Arc)> { + let db_config = meta_store_handle.get(); + let bottomless_db_id = NamespaceBottomlessDbId::from_config(&db_config); + // FIXME: figure how to to it per-db + let mut is_dirty = { + let sentinel_path = db_path.join(".sentinel"); + if sentinel_path.try_exists()? { + true + } else { + tokio::fs::File::create(&sentinel_path).await?; + false + } + }; + + // FIXME: due to a bug in logger::checkpoint_db we call regular checkpointing code + // instead of our virtual WAL one. It's a bit tangled to fix right now, because + // we need WAL context for checkpointing, and WAL context needs the ReplicationLogger... + // So instead we checkpoint early, *before* bottomless gets initialized. That way + // we're sure bottomless won't try to back up any existing WAL frames and will instead + // treat the existing db file as the source of truth. 
+ + let bottomless_replicator = match primary_config.bottomless_replication { + Some(ref options) => { + tracing::debug!("Checkpointing before initializing bottomless"); + crate::replication::primary::logger::checkpoint_db(&db_path.join("data"))?; + tracing::debug!("Checkpointed before initializing bottomless"); + let options = make_bottomless_options(options, bottomless_db_id, name.clone()); + let (replicator, did_recover) = + init_bottomless_replicator(db_path.join("data"), options, &restore_option).await?; + tracing::debug!("Completed init of bottomless replicator"); + is_dirty |= did_recover; + Some(replicator) + } + None => None, + }; + + tracing::debug!("Checking fresh db"); + let is_fresh_db = check_fresh_db(&db_path)?; + // switch frame-count checkpoint to time-based one + let auto_checkpoint = if primary_config.checkpoint_interval.is_some() { + 0 + } else { + DEFAULT_AUTO_CHECKPOINT + }; + + let logger = Arc::new(ReplicationLogger::open( + &db_path, + primary_config.max_log_size, + primary_config.max_log_duration, + is_dirty, + auto_checkpoint, + primary_config.scripted_backup.clone(), + name.clone(), + encryption_config.clone(), + )?); + + tracing::debug!("sending stats"); + + let stats = make_stats( + &db_path, + join_set, + meta_store_handle.clone(), + base_config.stats_sender.clone(), + name.clone(), + logger.new_frame_notifier.subscribe(), + base_config.encryption_config.clone(), + ) + .await?; + + tracing::debug!("Making replication wal wrapper"); + let wal_wrapper = make_replication_wal_wrapper(bottomless_replicator, logger.clone()); + + tracing::debug!("Opening libsql connection"); + + let connection_maker = MakeLegacyConnection::new( + db_path.to_path_buf(), + wal_wrapper.clone(), + stats.clone(), + broadcaster, + meta_store_handle.clone(), + base_config.extensions.clone(), + base_config.max_response_size, + base_config.max_total_response_size, + auto_checkpoint, + logger.new_frame_notifier.subscribe(), + encryption_config, + block_writes, + 
resolve_attach_path, + make_wal_manager.clone(), + ) + .await? + .throttled( + base_config.max_concurrent_connections.clone(), + Some(DB_CREATE_TIMEOUT), + base_config.max_total_response_size, + base_config.max_concurrent_requests, + ); + + tracing::debug!("Completed opening libsql connection"); + + // this must happen after we create the connection maker. The connection maker old on a + // connection to ensure that no other connection is closing while we try to open the dump. + // that would cause a SQLITE_LOCKED error. + match restore_option { + RestoreOption::Dump(_) if !is_fresh_db => { + Err(LoadDumpError::LoadDumpExistingDb)?; + } + RestoreOption::Dump(dump) => { + let conn = connection_maker.create().await?; + tracing::debug!("Loading dump"); + load_dump(dump, conn).await?; + tracing::debug!("Done loading dump"); + } + _ => { /* other cases were already handled when creating bottomless */ } + } + + join_set.spawn(run_periodic_compactions(logger.clone())); + + tracing::debug!("Done making primary connection"); + + Ok((connection_maker, wal_wrapper, stats)) +} + +pub(super) fn make_bottomless_options( + options: &Options, + namespace_db_id: NamespaceBottomlessDbId, + name: NamespaceName, +) -> Options { + let mut options = options.clone(); + let mut db_id = match namespace_db_id { + NamespaceBottomlessDbId::Namespace(id) => id, + // FIXME(marin): I don't like that, if bottomless is enabled, proper config must be passed. + NamespaceBottomlessDbId::NotProvided => options.db_id.unwrap_or_default(), + }; + + db_id = format!("ns-{db_id}:{name}"); + options.db_id = Some(db_id); + options +} + +async fn init_bottomless_replicator( + path: impl AsRef, + options: bottomless::replicator::Options, + restore_option: &RestoreOption, +) -> anyhow::Result<(bottomless::replicator::Replicator, bool)> { + tracing::debug!("Initializing bottomless replication"); + let path = path + .as_ref() + .to_str() + .ok_or_else(|| anyhow::anyhow!("Invalid db path"))? 
+ .to_owned(); + let mut replicator = bottomless::replicator::Replicator::with_options(path, options).await?; + + let (generation, timestamp) = match restore_option { + RestoreOption::Latest | RestoreOption::Dump(_) => (None, None), + RestoreOption::Generation(generation) => (Some(*generation), None), + RestoreOption::PointInTime(timestamp) => (None, Some(*timestamp)), + }; + + let (action, did_recover) = replicator.restore(generation, timestamp).await?; + match action { + bottomless::replicator::RestoreAction::SnapshotMainDbFile => { + replicator.new_generation().await; + if let Some(_handle) = replicator.snapshot_main_db_file(true).await? { + tracing::trace!("got snapshot handle after restore with generation upgrade"); + } + // Restoration process only leaves the local WAL file if it was + // detected to be newer than its remote counterpart. + replicator.maybe_replicate_wal().await? + } + bottomless::replicator::RestoreAction::ReuseGeneration(gen) => { + replicator.set_generation(gen); + } + } + + Ok((replicator, did_recover)) +} + +async fn run_periodic_compactions(logger: Arc) -> anyhow::Result<()> { + // calling `ReplicationLogger::maybe_compact()` is cheap if the compaction does not actually + // take place, so we can afford to poll it very often for simplicity + let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(1000)); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + + loop { + interval.tick().await; + let handle = BLOCKING_RT.spawn_blocking(enclose! 
{(logger) move || { + logger.maybe_compact() + }}); + handle + .await + .expect("Compaction task crashed") + .context("Compaction failed")?; + } +} + +async fn load_dump(dump: S, conn: PrimaryConnection) -> crate::Result<(), LoadDumpError> +where + S: Stream> + Unpin, +{ + let mut reader = tokio::io::BufReader::new(StreamReader::new(dump)); + let mut curr = String::new(); + let mut line = String::new(); + let mut skipped_wasm_table = false; + let mut n_stmt = 0; + let mut line_id = 0; + + while let Ok(n) = reader.read_line(&mut curr).await { + line_id += 1; + if n == 0 { + break; + } + let trimmed = curr.trim(); + if trimmed.is_empty() || trimmed.starts_with("--") { + curr.clear(); + continue; + } + // FIXME: it's well known bug that comment ending with semicolon will be handled incorrectly by currend dump processing code + let statement_end = trimmed.ends_with(';'); + + // we want to concat original(non-trimmed) lines as trimming will join all them in one + // single-line statement which is incorrect if comments in the end are present + line.push_str(&curr); + curr.clear(); + + // This is a hack to ignore the libsql_wasm_func_table table because it is already created + // by the system. 
+ if !skipped_wasm_table && line.trim() == WASM_TABLE_CREATE { + skipped_wasm_table = true; + line.clear(); + continue; + } + + if statement_end { + n_stmt += 1; + // dump must be performd within a txn + if n_stmt > 2 && conn.is_autocommit().await.unwrap() { + return Err(LoadDumpError::NoTxn); + } + + line = tokio::task::spawn_blocking({ + let conn = conn.clone(); + move || -> crate::Result { + conn.with_raw(|conn| conn.execute(&line, ())).map_err(|e| { + LoadDumpError::Internal(format!("line: {}, error: {}", line_id, e)) + })?; + Ok(line) + } + }) + .await??; + line.clear(); + } else { + line.push(' '); + } + } + tracing::debug!("loaded {} lines from dump", line_id); + + if !conn.is_autocommit().await.unwrap() { + tokio::task::spawn_blocking({ + let conn = conn.clone(); + move || -> crate::Result<(), LoadDumpError> { + conn.with_raw(|conn| conn.execute("rollback", ()))?; + Ok(()) + } + }) + .await??; + return Err(LoadDumpError::NoCommit); + } + + Ok(()) +} + +fn check_fresh_db(path: &Path) -> crate::Result { + let is_fresh = !path.join("wallog").try_exists()?; + Ok(is_fresh) +} + +pub(super) async fn make_stats( + db_path: &Path, + join_set: &mut JoinSet>, + meta_store_handle: MetaStoreHandle, + stats_sender: StatsSender, + name: NamespaceName, + mut current_frame_no: watch::Receiver>, + encryption_config: Option, +) -> anyhow::Result> { + tracing::debug!("creating stats type"); + let stats = Stats::new(name.clone(), db_path, join_set).await?; + + // the storage monitor is optional, so we ignore the error here. 
+ tracing::debug!("stats created, sending stats"); + let _ = stats_sender + .send((name.clone(), meta_store_handle, Arc::downgrade(&stats))) + .await; + + join_set.spawn({ + let stats = stats.clone(); + // initialize the current_frame_no value + current_frame_no + .borrow_and_update() + .map(|fno| stats.set_current_frame_no(fno)); + async move { + while current_frame_no.changed().await.is_ok() { + current_frame_no + .borrow_and_update() + .map(|fno| stats.set_current_frame_no(fno)); + } + Ok(()) + } + }); + + // join_set.spawn(run_storage_monitor( + // db_path.into(), + // Arc::downgrade(&stats), + // encryption_config, + // )); + + tracing::debug!("done sending stats, and creating bg tasks"); + + Ok(stats) +} + +// Periodically check the storage used by the database and save it in the Stats structure. +// TODO: Once we have a separate fiber that does WAL checkpoints, running this routine +// right after checkpointing is exactly where it should be done. +async fn run_storage_monitor( + db_path: PathBuf, + stats: Weak, + encryption_config: Option, +) -> anyhow::Result<()> { + // on initialization, the database file doesn't exist yet, so we wait a bit for it to be + // created + tokio::time::sleep(Duration::from_secs(1)).await; + + let duration = tokio::time::Duration::from_secs(60); + let db_path: Arc = db_path.into(); + loop { + let db_path = db_path.clone(); + let Some(stats) = stats.upgrade() else { + return Ok(()); + }; + + let encryption_config = encryption_config.clone(); + let _ = tokio::task::spawn_blocking(move || { + // because closing the last connection interferes with opening a new one, we lazily + // initialize a connection here, and keep it alive for the entirety of the program. If we + // fail to open it, we wait for `duration` and try again later. 
+ match open_conn(&db_path, Sqlite3WalManager::new(), Some(rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY), encryption_config) { + Ok(mut conn) => { + if let Ok(tx) = conn.transaction() { + let page_count = tx.query_row("pragma page_count;", [], |row| { row.get::(0) }); + let freelist_count = tx.query_row("pragma freelist_count;", [], |row| { row.get::(0) }); + if let (Ok(page_count), Ok(freelist_count)) = (page_count, freelist_count) { + let storage_bytes_used = (page_count - freelist_count) * 4096; + stats.set_storage_bytes_used(storage_bytes_used); + } + } + }, + Err(e) => { + tracing::warn!("failed to open connection for storager monitor: {e}, trying again in {duration:?}"); + }, + } + }).await; + + tokio::time::sleep(duration).await; + } +} + +pub(super) async fn cleanup_primary( + base: &BaseNamespaceConfig, + primary_config: &PrimaryConfig, + namespace: &NamespaceName, + db_config: &DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, +) -> crate::Result<()> { + let ns_path = base.base_path.join("dbs").join(namespace.as_str()); + if let Some(ref options) = primary_config.bottomless_replication { + let bottomless_db_id = match bottomless_db_id_init { + NamespaceBottomlessDbIdInit::Provided(db_id) => db_id, + NamespaceBottomlessDbIdInit::FetchFromConfig => { + NamespaceBottomlessDbId::from_config(db_config) + } + }; + let options = make_bottomless_options(options, bottomless_db_id, namespace.clone()); + let replicator = bottomless::replicator::Replicator::with_options( + ns_path.join("data").to_str().unwrap(), + options, + ) + .await?; + if prune_all { + let delete_all = replicator.delete_all(None).await?; + // perform hard deletion in the background + tokio::spawn(delete_all.commit()); + } else { + // for soft delete make sure that local db is fully backed up + replicator.savepoint().confirmed().await?; + } + } + + if ns_path.try_exists()? 
{ + tracing::debug!("removing database directory: {}", ns_path.display()); + tokio::fs::remove_dir_all(ns_path).await?; + } + + Ok(()) +} diff --git a/libsql-server/src/namespace/configurator/libsql_primary.rs b/libsql-server/src/namespace/configurator/libsql_primary.rs new file mode 100644 index 0000000000..3966c969a9 --- /dev/null +++ b/libsql-server/src/namespace/configurator/libsql_primary.rs @@ -0,0 +1,241 @@ +use std::path::Path; +use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use futures::prelude::Future; +use libsql_sys::name::NamespaceResolver; +use libsql_wal::io::StdIO; +use libsql_wal::registry::WalRegistry; +use libsql_wal::wal::LibsqlWalManager; +use tokio::task::JoinSet; + +use crate::connection::config::DatabaseConfig; +use crate::connection::libsql::{MakeLibsqlConnection, MakeLibsqlConnectionInner}; +use crate::connection::{Connection as _, MakeConnection}; +use crate::database::{Database, LibsqlPrimaryConnectionMaker, LibsqlPrimaryDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::configurator::helpers::make_stats; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, + ResolveNamespacePathFn, RestoreOption, +}; +use crate::stats::Stats; +use crate::{SqldStorage, DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT}; +use crate::schema::{has_pending_migration_task, setup_migration_table}; + +use super::{BaseNamespaceConfig, ConfigureNamespace, PrimaryConfig}; + +pub struct LibsqlPrimaryConfigurator { + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + registry: Arc>, + namespace_resolver: Arc, +} + +pub struct LibsqlPrimaryCommon { + pub stats: Arc, + pub connection_maker: Arc, + pub join_set: JoinSet>, + pub block_writes: Arc, +} + +pub(super) async fn libsql_primary_common( + db_path: Arc, + db_config: MetaStoreHandle, + base_config: &BaseNamespaceConfig, + 
primary_config: &PrimaryConfig, + namespace: NamespaceName, + broadcaster: BroadcasterHandle, + resolve_attach_path: ResolveNamespacePathFn, + registry: Arc>, + namespace_resolver: Arc, +) -> crate::Result { + let mut join_set = JoinSet::new(); + + tokio::fs::create_dir_all(&db_path).await?; + + + tracing::debug!("Done making new primary"); + let (_snd, rcv) = tokio::sync::watch::channel(None); + let stats = make_stats( + &db_path, + &mut join_set, + db_config.clone(), + base_config.stats_sender.clone(), + namespace.clone(), + rcv.clone(), + base_config.encryption_config.clone(), + ) + .await?; + + let auto_checkpoint = if primary_config.checkpoint_interval.is_some() { + 0 + } else { + DEFAULT_AUTO_CHECKPOINT + }; + let block_writes = Arc::new(AtomicBool::new(false)); + let connection_maker = MakeLibsqlConnection { + inner: Arc::new(MakeLibsqlConnectionInner { + db_path: db_path.into(), + stats: stats.clone(), + broadcaster, + config_store: db_config.clone(), + extensions: base_config.extensions.clone(), + max_response_size: base_config.max_response_size, + max_total_response_size: base_config.max_total_response_size, + auto_checkpoint, + current_frame_no_receiver: rcv.clone(), + encryption_config: base_config.encryption_config.clone(), + block_writes: block_writes.clone(), + resolve_attach_path, + wal_manager: LibsqlWalManager::new(registry.clone(), namespace_resolver.clone()), + }) + } + .throttled( + base_config.max_concurrent_connections.clone(), + Some(DB_CREATE_TIMEOUT), + base_config.max_total_response_size, + base_config.max_concurrent_requests, + ); + let connection_maker = Arc::new(connection_maker); + + if db_config.get().shared_schema_name.is_some() { + let block_writes = block_writes.clone(); + let conn = connection_maker.create().await?; + tokio::task::spawn_blocking(move || { + conn.with_raw(|conn| -> crate::Result<()> { + setup_migration_table(conn)?; + if has_pending_migration_task(conn)? 
{ + block_writes.store(true, Ordering::SeqCst); + } + Ok(()) + }) + }) + .await + .unwrap()?; + } + + Ok(LibsqlPrimaryCommon { + stats, + connection_maker, + join_set, + block_writes, + }) +} + +impl LibsqlPrimaryConfigurator { + pub fn new( + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + registry: Arc>, + namespace_resolver: Arc, + ) -> Self { + Self { + base, + primary_config, + registry, + namespace_resolver + } + } + + #[tracing::instrument(skip_all, fields(namespace))] + async fn try_new_primary( + &self, + namespace: NamespaceName, + db_config: MetaStoreHandle, + _restore_option: RestoreOption, + resolve_attach_path: ResolveNamespacePathFn, + db_path: Arc, + broadcaster: BroadcasterHandle, + ) -> crate::Result { + let common = libsql_primary_common( + db_path.clone(), + db_config.clone(), + &self.base, + &self.primary_config, + namespace.clone(), + broadcaster, + resolve_attach_path, + self.registry.clone(), + self.namespace_resolver.clone() + ).await?; + + Ok(Namespace { + tasks: common.join_set, + db: Database::LibsqlPrimary(LibsqlPrimaryDatabase { + connection_maker: common.connection_maker, + block_writes: common.block_writes, + }), + name: namespace, + stats: common.stats, + db_config_store: db_config, + path: db_path.into(), + }) + } +} + +impl ConfigureNamespace for LibsqlPrimaryConfigurator { + fn setup<'a>( + &'a self, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + _reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + _store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let db_path: Arc = self.base.base_path.join("dbs").join(name.as_str()).into(); + let fresh_namespace = !db_path.try_exists()?; + // FIXME: make that truly atomic. 
explore the idea of using temp directories, and its implications + match self + .try_new_primary( + name.clone(), + meta_store_handle, + restore_option, + resolve_attach_path, + db_path.clone(), + broadcaster, + ) + .await + { + Ok(this) => Ok(this), + Err(e) if fresh_namespace => { + tracing::error!( + "an error occured while deleting creating namespace, cleaning..." + ); + if let Err(e) = tokio::fs::remove_dir_all(&db_path).await { + tracing::error!("failed to remove dirty namespace directory: {e}") + } + Err(e) + } + Err(e) => Err(e), + } + }) + } + + fn cleanup<'a>( + &'a self, + _namespace: &'a NamespaceName, + _db_config: &'a DatabaseConfig, + _prune_all: bool, + _bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + unimplemented!() + } + + fn fork<'a>( + &'a self, + _from_ns: &'a Namespace, + _from_config: MetaStoreHandle, + _to_ns: NamespaceName, + _to_config: MetaStoreHandle, + _timestamp: Option, + _store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + unimplemented!() + } +} diff --git a/libsql-server/src/namespace/configurator/libsql_replica.rs b/libsql-server/src/namespace/configurator/libsql_replica.rs new file mode 100644 index 0000000000..d641cb36fd --- /dev/null +++ b/libsql-server/src/namespace/configurator/libsql_replica.rs @@ -0,0 +1,281 @@ +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + +use hyper::Uri; +use libsql_replication::injector::LibsqlInjector; +use libsql_replication::replicator::Replicator; +use libsql_replication::rpc::replication::log_offset::WalFlavor; +use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; +use libsql_sys::name::NamespaceResolver; +use libsql_wal::io::StdIO; +use libsql_wal::registry::WalRegistry; +use libsql_wal::replication::injector::Injector; +use libsql_wal::transaction::Transaction; +use libsql_wal::wal::LibsqlWalManager; +use tokio::task::JoinSet; +use tonic::transport::Channel; + +use 
crate::connection::config::DatabaseConfig; +use crate::connection::libsql::{MakeLibsqlConnection, MakeLibsqlConnectionInner}; +use crate::connection::write_proxy::MakeWriteProxyConn; +use crate::connection::MakeConnection; +use crate::database::{Database, LibsqlReplicaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::configurator::helpers::make_stats; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, ResetOp, + ResolveNamespacePathFn, RestoreOption, +}; +use crate::{SqldStorage, DB_CREATE_TIMEOUT}; + +use super::{BaseNamespaceConfig, ConfigureNamespace}; + +pub struct LibsqlReplicaConfigurator { + base: BaseNamespaceConfig, + registry: Arc>, + uri: Uri, + channel: Channel, + namespace_resolver: Arc, +} + +impl LibsqlReplicaConfigurator { + pub fn new( + base: BaseNamespaceConfig, + registry: Arc>, + uri: Uri, + channel: Channel, + namespace_resolver: Arc, + ) -> Self { + Self { + base, + registry, + uri, + channel, + namespace_resolver, + } + } +} + +impl ConfigureNamespace for LibsqlReplicaConfigurator { + fn setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + tracing::debug!("creating replica namespace"); + let mut join_set = JoinSet::new(); + let db_path = self.base.base_path.join("dbs").join(name.as_str()); + let channel = self.channel.clone(); + let uri = self.uri.clone(); + let rpc_client = ReplicationLogClient::with_origin(channel.clone(), uri.clone()); + let client = crate::replication::replicator_client::Client::new( + name.clone(), + rpc_client, + &db_path, + db_config.clone(), + store.clone(), + WalFlavor::Libsql, + ) + .await?; + let applied_frame_no_receiver = 
client.current_frame_no_notifier.subscribe(); + let stats = make_stats( + &db_path, + &mut join_set, + db_config.clone(), + self.base.stats_sender.clone(), + name.clone(), + applied_frame_no_receiver.clone(), + self.base.encryption_config.clone(), + ) + .await?; + + let connection_maker = MakeLibsqlConnection { + inner: Arc::new(MakeLibsqlConnectionInner { + db_path: db_path.clone().into(), + stats: stats.clone(), + broadcaster: broadcaster.clone(), + config_store: db_config.clone(), + extensions: self.base.extensions.clone(), + max_response_size: self.base.max_response_size, + max_total_response_size: self.base.max_total_response_size, + auto_checkpoint: 0, + current_frame_no_receiver: applied_frame_no_receiver.clone(), + encryption_config: self.base.encryption_config.clone(), + block_writes: Arc::new(true.into()), + resolve_attach_path: resolve_attach_path.clone(), + wal_manager: LibsqlWalManager::new( + self.registry.clone(), + self.namespace_resolver.clone(), + ), + }), + }; + + let connection_maker = MakeWriteProxyConn::new( + channel.clone(), + uri.clone(), + stats.clone(), + applied_frame_no_receiver.clone(), + self.base.max_response_size, + self.base.max_total_response_size, + // FIXME: we need to fetch the primary index before + None, + self.base.encryption_config.clone(), + connection_maker, + ) + .throttled( + self.base.max_concurrent_connections.clone(), + Some(DB_CREATE_TIMEOUT), + self.base.max_total_response_size, + self.base.max_concurrent_requests, + ); + + // FIXME: hack, this is necessary for the registry to open the SharedWal + let _ = connection_maker.create().await?; + let shared = self + .registry + .get_async(&(name.clone().into())) + .await + .unwrap(); + + let mut tx = Transaction::Read(shared.begin_read(u64::MAX)); + shared.upgrade(&mut tx).unwrap(); + let guard = tx + .into_write() + .unwrap_or_else(|_| panic!()) + .into_lock_owned(); + let injector = Injector::new(shared, guard, 10).unwrap(); + let injector = 
LibsqlInjector::new(injector); + let mut replicator = Replicator::new(client, injector); + + tracing::debug!("try perform handshake"); + // force a handshake now, to retrieve the primary's current replication index + match replicator.try_perform_handshake().await { + Err(libsql_replication::replicator::Error::Meta( + libsql_replication::meta::Error::LogIncompatible, + )) => { + tracing::error!( + "trying to replicate incompatible logs, reseting replica and nuking db dir" + ); + std::fs::remove_dir_all(&db_path).unwrap(); + return self + .setup( + db_config, + restore_option, + name, + reset, + resolve_attach_path, + store, + broadcaster, + ) + .await; + } + Err(e) => Err(e)?, + Ok(_) => (), + } + + tracing::debug!("done performing handshake"); + + + let namespace = name.clone(); + join_set.spawn(async move { + use libsql_replication::replicator::Error; + loop { + match replicator.run().await { + err @ Error::Fatal(_) => Err(err)?, + err @ Error::NamespaceDoesntExist => { + tracing::error!("namespace {namespace} doesn't exist, destroying..."); + (reset)(ResetOp::Destroy(namespace.clone())); + Err(err)?; + } + e @ Error::Injector(_) => { + tracing::error!("potential corruption detected while replicating, reseting replica: {e}"); + (reset)(ResetOp::Reset(namespace.clone())); + Err(e)?; + }, + Error::Meta(err) => { + use libsql_replication::meta::Error; + match err { + Error::LogIncompatible => { + tracing::error!("trying to replicate incompatible logs, reseting replica"); + (reset)(ResetOp::Reset(namespace.clone())); + Err(err)?; + } + Error::InvalidMetaFile + | Error::Io(_) + | Error::InvalidLogId + | Error::FailedToCommit(_) + | Error::InvalidReplicationPath + | Error::RequiresCleanDatabase => { + // We retry from last frame index? 
+ tracing::warn!("non-fatal replication error, retrying from last commit index: {err}"); + }, + } + } + e @ (Error::Internal(_) + | Error::Client(_) + | Error::PrimaryHandshakeTimeout + | Error::NeedSnapshot) => { + tracing::warn!("non-fatal replication error, retrying from last commit index: {e}"); + }, + Error::NoHandshake => { + // not strictly necessary, but in case the handshake error goes uncaught, + // we reset the client state. + replicator.client_mut().reset_token(); + } + Error::SnapshotPending => unreachable!(), + } + } + }); + + Ok(Namespace { + tasks: join_set, + db: Database::LibsqlReplica(LibsqlReplicaDatabase { + connection_maker: Arc::new(connection_maker), + }), + name: name.clone(), + stats, + db_config_store: db_config, + path: db_path.into(), + }) + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + _db_config: &DatabaseConfig, + _prune_all: bool, + _bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let ns_path = self.base.base_path.join("dbs").join(namespace.as_str()); + if ns_path.try_exists()? 
{ + tracing::debug!("removing database directory: {}", ns_path.display()); + tokio::fs::remove_dir_all(ns_path).await?; + } + Ok(()) + }) + } + + fn fork<'a>( + &'a self, + _from_ns: &'a Namespace, + _from_config: MetaStoreHandle, + _to_ns: NamespaceName, + _to_config: MetaStoreHandle, + _timestamp: Option, + _store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + Box::pin(std::future::ready(Err(crate::Error::Fork( + super::fork::ForkError::ForkReplica, + )))) + } +} diff --git a/libsql-server/src/namespace/configurator/libsql_schema.rs b/libsql-server/src/namespace/configurator/libsql_schema.rs new file mode 100644 index 0000000000..fc30a86885 --- /dev/null +++ b/libsql-server/src/namespace/configurator/libsql_schema.rs @@ -0,0 +1,168 @@ +use std::path::Path; +use std::sync::Arc; + +use futures::prelude::Future; +use libsql_sys::name::NamespaceResolver; +use libsql_wal::io::StdIO; +use libsql_wal::registry::WalRegistry; + +use crate::connection::config::DatabaseConfig; +use crate::database::{Database, SchemaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceName, NamespaceStore, ResetCb, ResolveNamespacePathFn, RestoreOption, +}; +use crate::schema::SchedulerHandle; +use crate::SqldStorage; + +use super::helpers::cleanup_primary; +use super::libsql_primary::libsql_primary_common; +use super::{BaseNamespaceConfig, ConfigureNamespace, PrimaryConfig}; + +pub struct LibsqlSchemaConfigurator { + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + migration_scheduler: SchedulerHandle, + registry: Arc>, + namespace_resolver: Arc, +} + +impl LibsqlSchemaConfigurator { + pub fn new( + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + migration_scheduler: SchedulerHandle, + registry: Arc>, + namespace_resolver: Arc, + ) -> Self { + Self { + base, + primary_config, + migration_scheduler, + registry, + namespace_resolver, + } + } + + 
#[tracing::instrument(skip_all, fields(namespace))] + async fn try_new_schema( + &self, + namespace: NamespaceName, + db_config: MetaStoreHandle, + _restore_option: RestoreOption, + resolve_attach_path: ResolveNamespacePathFn, + db_path: Arc, + broadcaster: BroadcasterHandle, + ) -> crate::Result { + let common = libsql_primary_common( + db_path.clone(), + db_config.clone(), + &self.base, + &self.primary_config, + namespace.clone(), + broadcaster, + resolve_attach_path, + self.registry.clone(), + self.namespace_resolver.clone() + ).await?; + + Ok(Namespace { + tasks: common.join_set, + db: Database::LibsqlSchema(SchemaDatabase::new( + self.migration_scheduler.clone(), + namespace.clone(), + common.connection_maker, + None, + db_config.clone(), + )), + name: namespace, + stats: common.stats, + db_config_store: db_config, + path: db_path.into(), + }) + } +} + +impl ConfigureNamespace for LibsqlSchemaConfigurator { + fn setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + _reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + _store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + let db_path: Arc = self.base.base_path.join("dbs").join(name.as_str()).into(); + let fresh_namespace = !db_path.try_exists()?; + // FIXME: make that truly atomic. explore the idea of using temp directories, and its implications + match self + .try_new_schema( + name.clone(), + db_config, + restore_option, + resolve_attach_path, + db_path.clone(), + broadcaster, + ) + .await + { + Ok(this) => Ok(this), + Err(e) if fresh_namespace => { + tracing::error!( + "an error occured while deleting creating namespace, cleaning..." 
+ ); + if let Err(e) = tokio::fs::remove_dir_all(&db_path).await { + tracing::error!("failed to remove dirty namespace directory: {e}") + } + Err(e) + } + Err(e) => Err(e), + } + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: crate::namespace::NamespaceBottomlessDbIdInit, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + cleanup_primary( + &self.base, + &self.primary_config, + namespace, + db_config, + prune_all, + bottomless_db_id_init, + ) + .await + }) + } + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(super::fork::fork( + from_ns, + from_config, + to_ns, + to_config, + timestamp, + store, + &self.primary_config, + self.base.base_path.clone(), + )) + } +} diff --git a/libsql-server/src/namespace/configurator/mod.rs b/libsql-server/src/namespace/configurator/mod.rs new file mode 100644 index 0000000000..2517a1113c --- /dev/null +++ b/libsql-server/src/namespace/configurator/mod.rs @@ -0,0 +1,145 @@ +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use chrono::NaiveDateTime; +use futures::Future; +use libsql_sys::EncryptionConfig; +use tokio::sync::Semaphore; + +use crate::connection::config::DatabaseConfig; +use crate::replication::script_backup_manager::ScriptBackupManager; +use crate::StatsSender; + +use super::broadcasters::BroadcasterHandle; +use super::meta_store::MetaStoreHandle; +use super::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, + ResolveNamespacePathFn, RestoreOption, +}; + +pub mod fork; +mod helpers; +mod libsql_replica; +mod primary; +mod replica; +mod schema; +mod libsql_primary; +mod libsql_schema; + +pub use primary::PrimaryConfigurator; +pub use 
replica::ReplicaConfigurator; +pub use schema::SchemaConfigurator; +pub use libsql_primary::LibsqlPrimaryConfigurator; +pub use libsql_replica::LibsqlReplicaConfigurator; +pub use libsql_schema::LibsqlSchemaConfigurator; + +#[derive(Clone, Debug)] +pub struct BaseNamespaceConfig { + pub(crate) base_path: Arc, + pub(crate) extensions: Arc<[PathBuf]>, + pub(crate) stats_sender: StatsSender, + pub(crate) max_response_size: u64, + pub(crate) max_total_response_size: u64, + pub(crate) max_concurrent_connections: Arc, + pub(crate) max_concurrent_requests: u64, + pub(crate) encryption_config: Option, +} + +#[derive(Clone)] +pub struct PrimaryConfig { + pub(crate) max_log_size: u64, + pub(crate) max_log_duration: Option, + pub(crate) bottomless_replication: Option, + pub(crate) scripted_backup: Option, + pub(crate) checkpoint_interval: Option, +} + +pub type DynConfigurator = dyn ConfigureNamespace + Send + Sync + 'static; + +pub(crate) struct NamespaceConfigurators { + replica_configurator: Option>, + primary_configurator: Option>, + schema_configurator: Option>, +} + +impl Default for NamespaceConfigurators { + fn default() -> Self { + Self::empty() + } +} + +impl NamespaceConfigurators { + pub fn empty() -> Self { + Self { + replica_configurator: None, + primary_configurator: None, + schema_configurator: None, + } + } + + pub fn with_primary( + &mut self, + c: impl ConfigureNamespace + Send + Sync + 'static, + ) -> &mut Self { + self.primary_configurator = Some(Box::new(c)); + self + } + + pub fn with_replica( + &mut self, + c: impl ConfigureNamespace + Send + Sync + 'static, + ) -> &mut Self { + self.replica_configurator = Some(Box::new(c)); + self + } + + pub fn with_schema(&mut self, c: impl ConfigureNamespace + Send + Sync + 'static) -> &mut Self { + self.schema_configurator = Some(Box::new(c)); + self + } + + pub fn configure_schema(&self) -> crate::Result<&DynConfigurator> { + self.schema_configurator.as_deref().ok_or_else(|| todo!()) + } + + pub fn 
configure_primary(&self) -> crate::Result<&DynConfigurator> { + self.primary_configurator.as_deref().ok_or_else(|| todo!()) + } + + pub fn configure_replica(&self) -> crate::Result<&DynConfigurator> { + self.replica_configurator.as_deref().ok_or_else(|| todo!()) + } +} + +pub trait ConfigureNamespace { + fn setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>>; + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>>; + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + ) -> Pin> + Send + 'a>>; +} diff --git a/libsql-server/src/namespace/configurator/primary.rs b/libsql-server/src/namespace/configurator/primary.rs new file mode 100644 index 0000000000..66570e9c27 --- /dev/null +++ b/libsql-server/src/namespace/configurator/primary.rs @@ -0,0 +1,202 @@ +use std::path::Path; +use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use futures::prelude::Future; +use libsql_sys::EncryptionConfig; +use tokio::task::JoinSet; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::{Connection as _, MakeConnection}; +use crate::database::{Database, PrimaryDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::configurator::helpers::make_primary_connection_maker; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, + ResolveNamespacePathFn, 
RestoreOption, +}; +use crate::run_periodic_checkpoint; +use crate::schema::{has_pending_migration_task, setup_migration_table}; + +use super::helpers::cleanup_primary; +use super::{BaseNamespaceConfig, ConfigureNamespace, PrimaryConfig}; + +pub struct PrimaryConfigurator { + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +} + +impl PrimaryConfigurator { + pub fn new( + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + ) -> Self { + Self { + base, + primary_config, + make_wal_manager, + } + } + + #[tracing::instrument(skip_all, fields(namespace))] + async fn try_new_primary( + &self, + namespace: NamespaceName, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + resolve_attach_path: ResolveNamespacePathFn, + db_path: Arc, + broadcaster: BroadcasterHandle, + encryption_config: Option, + ) -> crate::Result { + let mut join_set = JoinSet::new(); + + tokio::fs::create_dir_all(&db_path).await?; + + let block_writes = Arc::new(AtomicBool::new(false)); + let (connection_maker, wal_wrapper, stats) = make_primary_connection_maker( + &self.primary_config, + &self.base, + &meta_store_handle, + &db_path, + &namespace, + restore_option, + block_writes.clone(), + &mut join_set, + resolve_attach_path, + broadcaster, + self.make_wal_manager.clone(), + encryption_config, + ) + .await?; + let connection_maker = Arc::new(connection_maker); + + if meta_store_handle.get().shared_schema_name.is_some() { + let block_writes = block_writes.clone(); + let conn = connection_maker.create().await?; + tokio::task::spawn_blocking(move || { + conn.with_raw(|conn| -> crate::Result<()> { + setup_migration_table(conn)?; + if has_pending_migration_task(conn)? 
{ + block_writes.store(true, Ordering::SeqCst); + } + Ok(()) + }) + }) + .await + .unwrap()?; + } + + if let Some(checkpoint_interval) = self.primary_config.checkpoint_interval { + join_set.spawn(run_periodic_checkpoint( + connection_maker.clone(), + checkpoint_interval, + namespace.clone(), + )); + } + + tracing::debug!("Done making new primary"); + + Ok(Namespace { + tasks: join_set, + db: Database::Primary(PrimaryDatabase { + wal_wrapper, + connection_maker, + block_writes, + }), + name: namespace, + stats, + db_config_store: meta_store_handle, + path: db_path.into(), + }) + } +} + +impl ConfigureNamespace for PrimaryConfigurator { + fn setup<'a>( + &'a self, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + _reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + _store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let db_path: Arc = self.base.base_path.join("dbs").join(name.as_str()).into(); + let fresh_namespace = !db_path.try_exists()?; + // FIXME: make that truly atomic. explore the idea of using temp directories, and its implications + match self + .try_new_primary( + name.clone(), + meta_store_handle, + restore_option, + resolve_attach_path, + db_path.clone(), + broadcaster, + self.base.encryption_config.clone(), + ) + .await + { + Ok(this) => Ok(this), + Err(e) if fresh_namespace => { + tracing::error!( + "an error occured while deleting creating namespace, cleaning..." 
+ ); + if let Err(e) = tokio::fs::remove_dir_all(&db_path).await { + tracing::error!("failed to remove dirty namespace directory: {e}") + } + Err(e) + } + Err(e) => Err(e), + } + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + cleanup_primary( + &self.base, + &self.primary_config, + namespace, + db_config, + prune_all, + bottomless_db_id_init, + ) + .await + }) + } + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + Box::pin(super::fork::fork( + from_ns, + from_config, + to_ns, + to_config, + timestamp, + store, + &self.primary_config, + self.base.base_path.clone(), + )) + } +} diff --git a/libsql-server/src/namespace/configurator/replica.rs b/libsql-server/src/namespace/configurator/replica.rs new file mode 100644 index 0000000000..97342fbe16 --- /dev/null +++ b/libsql-server/src/namespace/configurator/replica.rs @@ -0,0 +1,260 @@ +use std::pin::Pin; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use futures::Future; +use hyper::Uri; +use libsql_replication::rpc::replication::log_offset::WalFlavor; +use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; +use libsql_sys::wal::wrapper::PassthroughWalWrapper; +use tokio::task::JoinSet; +use tonic::transport::Channel; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::legacy::MakeLegacyConnection; +use crate::connection::write_proxy::MakeWriteProxyConn; +use crate::connection::MakeConnection; +use crate::database::{Database, ReplicaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use 
crate::namespace::configurator::helpers::make_stats; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{Namespace, NamespaceBottomlessDbIdInit, RestoreOption}; +use crate::namespace::{NamespaceName, NamespaceStore, ResetCb, ResetOp, ResolveNamespacePathFn}; +use crate::{DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT}; + +use super::{BaseNamespaceConfig, ConfigureNamespace}; + +pub struct ReplicaConfigurator { + base: BaseNamespaceConfig, + channel: Channel, + uri: Uri, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +} + +impl ReplicaConfigurator { + pub fn new( + base: BaseNamespaceConfig, + channel: Channel, + uri: Uri, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + ) -> Self { + Self { + base, + channel, + uri, + make_wal_manager, + } + } +} + +impl ConfigureNamespace for ReplicaConfigurator { + fn setup<'a>( + &'a self, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + tracing::debug!("creating replica namespace"); + let db_path = self.base.base_path.join("dbs").join(name.as_str()); + let channel = self.channel.clone(); + let uri = self.uri.clone(); + + let rpc_client = ReplicationLogClient::with_origin(channel.clone(), uri.clone()); + let client = crate::replication::replicator_client::Client::new( + name.clone(), + rpc_client, + &db_path, + meta_store_handle.clone(), + store.clone(), + WalFlavor::Sqlite, + ) + .await?; + let applied_frame_no_receiver = client.current_frame_no_notifier.subscribe(); + let mut replicator = libsql_replication::replicator::Replicator::new_sqlite( + client, + db_path.join("data"), + DEFAULT_AUTO_CHECKPOINT, + None, + ) + .await?; + + tracing::debug!("try perform handshake"); + // force a handshake now, to retrieve the primary's current replication 
index + match replicator.try_perform_handshake().await { + Err(libsql_replication::replicator::Error::Meta( + libsql_replication::meta::Error::LogIncompatible, + )) => { + tracing::error!( + "trying to replicate incompatible logs, reseting replica and nuking db dir" + ); + std::fs::remove_dir_all(&db_path).unwrap(); + return self + .setup( + meta_store_handle, + restore_option, + name, + reset, + resolve_attach_path, + store, + broadcaster, + ) + .await; + } + Err(e) => Err(e)?, + Ok(_) => (), + } + + tracing::debug!("done performing handshake"); + + let primary_current_replicatio_index = + replicator.client_mut().primary_replication_index; + + let mut join_set = JoinSet::new(); + let namespace = name.clone(); + join_set.spawn(async move { + use libsql_replication::replicator::Error; + loop { + match replicator.run().await { + err @ Error::Fatal(_) => Err(err)?, + err @ Error::NamespaceDoesntExist => { + tracing::error!("namespace {namespace} doesn't exist, destroying..."); + (reset)(ResetOp::Destroy(namespace.clone())); + Err(err)?; + } + e @ Error::Injector(_) => { + tracing::error!("potential corruption detected while replicating, reseting replica: {e}"); + (reset)(ResetOp::Reset(namespace.clone())); + Err(e)?; + }, + Error::Meta(err) => { + use libsql_replication::meta::Error; + match err { + Error::LogIncompatible => { + tracing::error!("trying to replicate incompatible logs, reseting replica"); + (reset)(ResetOp::Reset(namespace.clone())); + Err(err)?; + } + Error::InvalidMetaFile + | Error::Io(_) + | Error::InvalidLogId + | Error::FailedToCommit(_) + | Error::InvalidReplicationPath + | Error::RequiresCleanDatabase => { + // We retry from last frame index? 
+ tracing::warn!("non-fatal replication error, retrying from last commit index: {err}"); + }, + } + } + e @ (Error::Internal(_) + | Error::Client(_) + | Error::PrimaryHandshakeTimeout + | Error::NeedSnapshot) => { + tracing::warn!("non-fatal replication error, retrying from last commit index: {e}"); + }, + Error::NoHandshake => { + // not strictly necessary, but in case the handshake error goes uncaught, + // we reset the client state. + replicator.client_mut().reset_token(); + } + Error::SnapshotPending => unreachable!(), + } + } + }); + + let stats = make_stats( + &db_path, + &mut join_set, + meta_store_handle.clone(), + self.base.stats_sender.clone(), + name.clone(), + applied_frame_no_receiver.clone(), + self.base.encryption_config.clone(), + ) + .await?; + + let connection_maker = MakeLegacyConnection::new( + db_path.clone(), + PassthroughWalWrapper, + stats.clone(), + broadcaster, + meta_store_handle.clone(), + self.base.extensions.clone(), + self.base.max_response_size, + self.base.max_total_response_size, + DEFAULT_AUTO_CHECKPOINT, + applied_frame_no_receiver.clone(), + self.base.encryption_config.clone(), + Arc::new(AtomicBool::new(false)), // this is always false for write proxy + resolve_attach_path, + self.make_wal_manager.clone(), + ) + .await?; + + let connection_maker = MakeWriteProxyConn::new( + channel.clone(), + uri.clone(), + stats.clone(), + applied_frame_no_receiver, + self.base.max_response_size, + self.base.max_total_response_size, + primary_current_replicatio_index, + self.base.encryption_config.clone(), + connection_maker, + ) + .throttled( + self.base.max_concurrent_connections.clone(), + Some(DB_CREATE_TIMEOUT), + self.base.max_total_response_size, + self.base.max_concurrent_requests, + ); + + Ok(Namespace { + tasks: join_set, + db: Database::Replica(ReplicaDatabase { + connection_maker: Arc::new(connection_maker), + }), + name: name.clone(), + stats, + db_config_store: meta_store_handle, + path: db_path.into(), + }) + }) + } + + fn 
cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + _db_config: &DatabaseConfig, + _prune_all: bool, + _bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let ns_path = self.base.base_path.join("dbs").join(namespace.as_str()); + if ns_path.try_exists()? { + tracing::debug!("removing database directory: {}", ns_path.display()); + tokio::fs::remove_dir_all(ns_path).await?; + } + Ok(()) + }) + } + + fn fork<'a>( + &'a self, + _from_ns: &'a Namespace, + _from_config: MetaStoreHandle, + _to_ns: NamespaceName, + _to_config: MetaStoreHandle, + _timestamp: Option, + _store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + Box::pin(std::future::ready(Err(crate::Error::Fork( + super::fork::ForkError::ForkReplica, + )))) + } +} diff --git a/libsql-server/src/namespace/configurator/schema.rs b/libsql-server/src/namespace/configurator/schema.rs new file mode 100644 index 0000000000..add6d0e918 --- /dev/null +++ b/libsql-server/src/namespace/configurator/schema.rs @@ -0,0 +1,132 @@ +use std::sync::{atomic::AtomicBool, Arc}; + +use futures::prelude::Future; +use tokio::task::JoinSet; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::database::{Database, SchemaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceName, NamespaceStore, ResetCb, ResolveNamespacePathFn, RestoreOption, +}; +use crate::schema::SchedulerHandle; + +use super::helpers::{cleanup_primary, make_primary_connection_maker}; +use super::{BaseNamespaceConfig, ConfigureNamespace, PrimaryConfig}; + +pub struct SchemaConfigurator { + base: BaseNamespaceConfig, + primary_config: PrimaryConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler: SchedulerHandle, +} + +impl SchemaConfigurator { + pub fn new( + base: BaseNamespaceConfig, + 
primary_config: PrimaryConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler: SchedulerHandle, + ) -> Self { + Self { + base, + primary_config, + make_wal_manager, + migration_scheduler, + } + } +} + +impl ConfigureNamespace for SchemaConfigurator { + fn setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + _reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + _store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + let mut join_set = JoinSet::new(); + let db_path = self.base.base_path.join("dbs").join(name.as_str()); + + tokio::fs::create_dir_all(&db_path).await?; + + let (connection_maker, wal_manager, stats) = make_primary_connection_maker( + &self.primary_config, + &self.base, + &db_config, + &db_path, + &name, + restore_option, + Arc::new(AtomicBool::new(false)), // this is always false for schema + &mut join_set, + resolve_attach_path, + broadcaster, + self.make_wal_manager.clone(), + self.base.encryption_config.clone(), + ) + .await?; + + Ok(Namespace { + db: Database::Schema(SchemaDatabase::new( + self.migration_scheduler.clone(), + name.clone(), + Arc::new(connection_maker), + Some(wal_manager), + db_config.clone(), + )), + name: name.clone(), + tasks: join_set, + stats, + db_config_store: db_config.clone(), + path: db_path.into(), + }) + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: crate::namespace::NamespaceBottomlessDbIdInit, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + cleanup_primary( + &self.base, + &self.primary_config, + namespace, + db_config, + prune_all, + bottomless_db_id_init, + ) + .await + }) + } + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + 
timestamp: Option, + store: NamespaceStore, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(super::fork::fork( + from_ns, + from_config, + to_ns, + to_config, + timestamp, + store, + &self.primary_config, + self.base.base_path.clone(), + )) + } +} diff --git a/libsql-server/src/namespace/meta_store.rs b/libsql-server/src/namespace/meta_store.rs index 599dab9360..83f9c856ed 100644 --- a/libsql-server/src/namespace/meta_store.rs +++ b/libsql-server/src/namespace/meta_store.rs @@ -24,7 +24,7 @@ use crate::config::BottomlessConfig; use crate::connection::config::DatabaseConfig; use crate::schema::{MigrationDetails, MigrationSummary}; use crate::{ - config::MetaStoreConfig, connection::libsql::open_conn_active_checkpoint, error::Error, Result, + config::MetaStoreConfig, connection::legacy::open_conn_active_checkpoint, error::Error, Result, }; use super::NamespaceName; diff --git a/libsql-server/src/namespace/mod.rs b/libsql-server/src/namespace/mod.rs index 6e48e7f1d8..2a2e3eb211 100644 --- a/libsql-server/src/namespace/mod.rs +++ b/libsql-server/src/namespace/mod.rs @@ -1,62 +1,31 @@ -pub mod broadcasters; -mod fork; -pub mod meta_store; -mod name; -pub mod replication_wal; -mod schema_lock; -mod store; - -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Weak}; +use std::path::Path; +use std::sync::Arc; -use anyhow::{Context as _, Error}; -use bottomless::replicator::Options; -use broadcasters::BroadcasterHandle; +use anyhow::Context as _; use bytes::Bytes; use chrono::NaiveDateTime; -use enclose::enclose; use futures_core::{Future, Stream}; -use hyper::Uri; -use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; -use libsql_sys::wal::Sqlite3WalManager; -use libsql_sys::EncryptionConfig; -use tokio::io::AsyncBufReadExt; -use tokio::sync::{watch, Semaphore}; use tokio::task::JoinSet; -use tokio::time::Duration; -use tokio_util::io::StreamReader; -use tonic::transport::Channel; use 
uuid::Uuid; use crate::auth::parse_jwt_keys; use crate::connection::config::DatabaseConfig; -use crate::connection::connection_manager::InnerWalManager; -use crate::connection::libsql::{open_conn, MakeLibSqlConn}; -use crate::connection::write_proxy::MakeWriteProxyConn; -use crate::connection::Connection; -use crate::connection::MakeConnection; -use crate::database::{ - Database, DatabaseKind, PrimaryConnection, PrimaryConnectionMaker, PrimaryDatabase, - ReplicaDatabase, SchemaDatabase, -}; -use crate::error::LoadDumpError; -use crate::replication::script_backup_manager::ScriptBackupManager; -use crate::replication::{FrameNo, ReplicationLogger}; -use crate::schema::{has_pending_migration_task, setup_migration_table, SchedulerHandle}; +use crate::connection::Connection as _; +use crate::database::Database; use crate::stats::Stats; -use crate::{ - run_periodic_checkpoint, StatsSender, BLOCKING_RT, DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT, -}; -pub use fork::ForkError; - -use self::fork::{ForkTask, PointInTimeRestore}; use self::meta_store::MetaStoreHandle; pub use self::name::NamespaceName; -use self::replication_wal::{make_replication_wal_wrapper, ReplicationWalWrapper}; pub use self::store::NamespaceStore; +pub mod broadcasters; +pub(crate) mod configurator; +pub mod meta_store; +mod name; +pub mod replication_wal; +mod schema_lock; +mod store; + pub type ResetCb = Box; pub type ResolveNamespacePathFn = Arc crate::Result> + Sync + Send + 'static>; @@ -100,103 +69,10 @@ pub struct Namespace { } impl Namespace { - async fn from_config( - ns_config: &NamespaceConfig, - db_config: MetaStoreHandle, - restore_option: RestoreOption, - name: &NamespaceName, - reset: ResetCb, - resolve_attach_path: ResolveNamespacePathFn, - store: NamespaceStore, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - match ns_config.db_kind { - DatabaseKind::Primary if db_config.get().is_shared_schema => { - Self::new_schema( - ns_config, - name.clone(), - db_config, - restore_option, 
- resolve_attach_path, - broadcaster, - ) - .await - } - DatabaseKind::Primary => { - Self::new_primary( - ns_config, - name.clone(), - db_config, - restore_option, - resolve_attach_path, - broadcaster, - ) - .await - } - DatabaseKind::Replica => { - Self::new_replica( - ns_config, - name.clone(), - db_config, - reset, - resolve_attach_path, - store, - broadcaster, - ) - .await - } - } - } - pub(crate) fn name(&self) -> &NamespaceName { &self.name } - /// completely remove resources associated with the namespace - pub(crate) async fn cleanup( - ns_config: &NamespaceConfig, - name: &NamespaceName, - db_config: &DatabaseConfig, - prune_all: bool, - bottomless_db_id_init: NamespaceBottomlessDbIdInit, - ) -> crate::Result<()> { - let ns_path = ns_config.base_path.join("dbs").join(name.as_str()); - match ns_config.db_kind { - DatabaseKind::Primary => { - if let Some(ref options) = ns_config.bottomless_replication { - let bottomless_db_id = match bottomless_db_id_init { - NamespaceBottomlessDbIdInit::Provided(db_id) => db_id, - NamespaceBottomlessDbIdInit::FetchFromConfig => { - NamespaceBottomlessDbId::from_config(&db_config) - } - }; - let options = make_bottomless_options(options, bottomless_db_id, name.clone()); - let replicator = bottomless::replicator::Replicator::with_options( - ns_path.join("data").to_str().unwrap(), - options, - ) - .await?; - if prune_all { - let delete_all = replicator.delete_all(None).await?; - // perform hard deletion in the background - tokio::spawn(delete_all.commit()); - } else { - // for soft delete make sure that local db is fully backed up - replicator.savepoint().confirmed().await?; - } - } - } - DatabaseKind::Replica => (), - } - - if ns_path.try_exists()? 
{ - tracing::debug!("removing database directory: {}", ns_path.display()); - tokio::fs::remove_dir_all(ns_path).await?; - } - - Ok(()) - } - async fn destroy(mut self) -> anyhow::Result<()> { self.tasks.shutdown().await; self.db.destroy(); @@ -246,605 +122,11 @@ impl Namespace { pub fn config_changed(&self) -> impl Future { self.db_config_store.changed() } - - async fn new_primary( - config: &NamespaceConfig, - name: NamespaceName, - meta_store_handle: MetaStoreHandle, - restore_option: RestoreOption, - resolve_attach_path: ResolveNamespacePathFn, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let db_path: Arc = config.base_path.join("dbs").join(name.as_str()).into(); - let fresh_namespace = !db_path.try_exists()?; - // FIXME: make that truly atomic. explore the idea of using temp directories, and it's implications - match Self::try_new_primary( - config, - name.clone(), - meta_store_handle, - restore_option, - resolve_attach_path, - db_path.clone(), - broadcaster, - ) - .await - { - Ok(this) => Ok(this), - Err(e) if fresh_namespace => { - tracing::error!("an error occured while deleting creating namespace, cleaning..."); - if let Err(e) = tokio::fs::remove_dir_all(&db_path).await { - tracing::error!("failed to remove dirty namespace directory: {e}") - } - Err(e) - } - Err(e) => Err(e), - } - } - - #[tracing::instrument(skip_all)] - async fn make_primary_connection_maker( - ns_config: &NamespaceConfig, - meta_store_handle: &MetaStoreHandle, - db_path: &Path, - name: &NamespaceName, - restore_option: RestoreOption, - block_writes: Arc, - join_set: &mut JoinSet>, - resolve_attach_path: ResolveNamespacePathFn, - broadcaster: BroadcasterHandle, - ) -> crate::Result<(PrimaryConnectionMaker, ReplicationWalWrapper, Arc)> { - let db_config = meta_store_handle.get(); - let bottomless_db_id = NamespaceBottomlessDbId::from_config(&db_config); - // FIXME: figure how to to it per-db - let mut is_dirty = { - let sentinel_path = db_path.join(".sentinel"); - if 
sentinel_path.try_exists()? { - true - } else { - tokio::fs::File::create(&sentinel_path).await?; - false - } - }; - - // FIXME: due to a bug in logger::checkpoint_db we call regular checkpointing code - // instead of our virtual WAL one. It's a bit tangled to fix right now, because - // we need WAL context for checkpointing, and WAL context needs the ReplicationLogger... - // So instead we checkpoint early, *before* bottomless gets initialized. That way - // we're sure bottomless won't try to back up any existing WAL frames and will instead - // treat the existing db file as the source of truth. - - let bottomless_replicator = match ns_config.bottomless_replication { - Some(ref options) => { - tracing::debug!("Checkpointing before initializing bottomless"); - crate::replication::primary::logger::checkpoint_db(&db_path.join("data"))?; - tracing::debug!("Checkpointed before initializing bottomless"); - let options = make_bottomless_options(options, bottomless_db_id, name.clone()); - let (replicator, did_recover) = - init_bottomless_replicator(db_path.join("data"), options, &restore_option) - .await?; - tracing::debug!("Completed init of bottomless replicator"); - is_dirty |= did_recover; - Some(replicator) - } - None => None, - }; - - tracing::debug!("Checking fresh db"); - let is_fresh_db = check_fresh_db(&db_path)?; - // switch frame-count checkpoint to time-based one - let auto_checkpoint = if ns_config.checkpoint_interval.is_some() { - 0 - } else { - DEFAULT_AUTO_CHECKPOINT - }; - - let logger = Arc::new(ReplicationLogger::open( - &db_path, - ns_config.max_log_size, - ns_config.max_log_duration, - is_dirty, - auto_checkpoint, - ns_config.scripted_backup.clone(), - name.clone(), - ns_config.encryption_config.clone(), - )?); - - tracing::debug!("sending stats"); - - let stats = make_stats( - &db_path, - join_set, - meta_store_handle.clone(), - ns_config.stats_sender.clone(), - name.clone(), - logger.new_frame_notifier.subscribe(), - 
ns_config.encryption_config.clone(), - ) - .await?; - - tracing::debug!("Making replication wal wrapper"); - let wal_wrapper = make_replication_wal_wrapper(bottomless_replicator, logger.clone()); - - tracing::debug!("Opening libsql connection"); - - let connection_maker = MakeLibSqlConn::new( - db_path.to_path_buf(), - wal_wrapper.clone(), - stats.clone(), - broadcaster, - meta_store_handle.clone(), - ns_config.extensions.clone(), - ns_config.max_response_size, - ns_config.max_total_response_size, - auto_checkpoint, - logger.new_frame_notifier.subscribe(), - ns_config.encryption_config.clone(), - block_writes, - resolve_attach_path, - ns_config.make_wal_manager.clone(), - ) - .await? - .throttled( - ns_config.max_concurrent_connections.clone(), - Some(DB_CREATE_TIMEOUT), - ns_config.max_total_response_size, - ns_config.max_concurrent_requests, - ); - - tracing::debug!("Completed opening libsql connection"); - - // this must happen after we create the connection maker. The connection maker old on a - // connection to ensure that no other connection is closing while we try to open the dump. - // that would cause a SQLITE_LOCKED error. 
- match restore_option { - RestoreOption::Dump(_) if !is_fresh_db => { - Err(LoadDumpError::LoadDumpExistingDb)?; - } - RestoreOption::Dump(dump) => { - let conn = connection_maker.create().await?; - tracing::debug!("Loading dump"); - load_dump(dump, conn).await?; - tracing::debug!("Done loading dump"); - } - _ => { /* other cases were already handled when creating bottomless */ } - } - - join_set.spawn(run_periodic_compactions(logger.clone())); - - tracing::debug!("Done making primary connection"); - - Ok((connection_maker, wal_wrapper, stats)) - } - - #[tracing::instrument(skip_all, fields(namespace))] - async fn try_new_primary( - ns_config: &NamespaceConfig, - namespace: NamespaceName, - meta_store_handle: MetaStoreHandle, - restore_option: RestoreOption, - resolve_attach_path: ResolveNamespacePathFn, - db_path: Arc, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let mut join_set = JoinSet::new(); - - tokio::fs::create_dir_all(&db_path).await?; - - let block_writes = Arc::new(AtomicBool::new(false)); - let (connection_maker, wal_wrapper, stats) = Self::make_primary_connection_maker( - ns_config, - &meta_store_handle, - &db_path, - &namespace, - restore_option, - block_writes.clone(), - &mut join_set, - resolve_attach_path, - broadcaster, - ) - .await?; - let connection_maker = Arc::new(connection_maker); - - if meta_store_handle.get().shared_schema_name.is_some() { - let block_writes = block_writes.clone(); - let conn = connection_maker.create().await?; - tokio::task::spawn_blocking(move || { - conn.with_raw(|conn| -> crate::Result<()> { - setup_migration_table(conn)?; - if has_pending_migration_task(conn)? 
{ - block_writes.store(true, Ordering::SeqCst); - } - Ok(()) - }) - }) - .await - .unwrap()?; - } - - if let Some(checkpoint_interval) = ns_config.checkpoint_interval { - join_set.spawn(run_periodic_checkpoint( - connection_maker.clone(), - checkpoint_interval, - namespace.clone(), - )); - } - - tracing::debug!("Done making new primary"); - - Ok(Self { - tasks: join_set, - db: Database::Primary(PrimaryDatabase { - wal_wrapper, - connection_maker, - block_writes, - }), - name: namespace, - stats, - db_config_store: meta_store_handle, - path: db_path.into(), - }) - } - - #[tracing::instrument(skip_all, fields(name))] - #[async_recursion::async_recursion] - async fn new_replica( - config: &NamespaceConfig, - name: NamespaceName, - meta_store_handle: MetaStoreHandle, - reset: ResetCb, - resolve_attach_path: ResolveNamespacePathFn, - store: NamespaceStore, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - tracing::debug!("creating replica namespace"); - let db_path = config.base_path.join("dbs").join(name.as_str()); - let channel = config.channel.clone().expect("bad replica config"); - let uri = config.uri.clone().expect("bad replica config"); - - let rpc_client = ReplicationLogClient::with_origin(channel.clone(), uri.clone()); - let client = crate::replication::replicator_client::Client::new( - name.clone(), - rpc_client, - &db_path, - meta_store_handle.clone(), - store.clone(), - ) - .await?; - let applied_frame_no_receiver = client.current_frame_no_notifier.subscribe(); - let mut replicator = libsql_replication::replicator::Replicator::new( - client, - db_path.join("data"), - DEFAULT_AUTO_CHECKPOINT, - config.encryption_config.clone(), - ) - .await?; - - tracing::debug!("try perform handshake"); - // force a handshake now, to retrieve the primary's current replication index - match replicator.try_perform_handshake().await { - Err(libsql_replication::replicator::Error::Meta( - libsql_replication::meta::Error::LogIncompatible, - )) => { - tracing::error!( - 
"trying to replicate incompatible logs, reseting replica and nuking db dir" - ); - std::fs::remove_dir_all(&db_path).unwrap(); - return Self::new_replica( - config, - name, - meta_store_handle, - reset, - resolve_attach_path, - store, - broadcaster, - ) - .await; - } - Err(e) => Err(e)?, - Ok(_) => (), - } - - tracing::debug!("done performing handshake"); - - let primary_current_replicatio_index = replicator.client_mut().primary_replication_index; - - let mut join_set = JoinSet::new(); - let namespace = name.clone(); - join_set.spawn(async move { - use libsql_replication::replicator::Error; - loop { - match replicator.run().await { - err @ Error::Fatal(_) => Err(err)?, - err @ Error::NamespaceDoesntExist => { - tracing::error!("namespace {namespace} doesn't exist, destroying..."); - (reset)(ResetOp::Destroy(namespace.clone())); - Err(err)?; - } - e @ Error::Injector(_) => { - tracing::error!("potential corruption detected while replicating, reseting replica: {e}"); - (reset)(ResetOp::Reset(namespace.clone())); - Err(e)?; - }, - Error::Meta(err) => { - use libsql_replication::meta::Error; - match err { - Error::LogIncompatible => { - tracing::error!("trying to replicate incompatible logs, reseting replica"); - (reset)(ResetOp::Reset(namespace.clone())); - Err(err)?; - } - Error::InvalidMetaFile - | Error::Io(_) - | Error::InvalidLogId - | Error::FailedToCommit(_) - | Error::InvalidReplicationPath - | Error::RequiresCleanDatabase => { - // We retry from last frame index? - tracing::warn!("non-fatal replication error, retrying from last commit index: {err}"); - }, - } - } - e @ (Error::Internal(_) - | Error::Client(_) - | Error::PrimaryHandshakeTimeout - | Error::NeedSnapshot) => { - tracing::warn!("non-fatal replication error, retrying from last commit index: {e}"); - }, - Error::NoHandshake => { - // not strictly necessary, but in case the handshake error goes uncaught, - // we reset the client state. 
- replicator.client_mut().reset_token(); - } - Error::SnapshotPending => unreachable!(), - } - } - }); - - let stats = make_stats( - &db_path, - &mut join_set, - meta_store_handle.clone(), - config.stats_sender.clone(), - name.clone(), - applied_frame_no_receiver.clone(), - config.encryption_config.clone(), - ) - .await?; - - let connection_maker = MakeWriteProxyConn::new( - db_path.clone(), - config.extensions.clone(), - channel.clone(), - uri.clone(), - stats.clone(), - broadcaster, - meta_store_handle.clone(), - applied_frame_no_receiver, - config.max_response_size, - config.max_total_response_size, - primary_current_replicatio_index, - config.encryption_config.clone(), - resolve_attach_path, - config.make_wal_manager.clone(), - ) - .await? - .throttled( - config.max_concurrent_connections.clone(), - Some(DB_CREATE_TIMEOUT), - config.max_total_response_size, - config.max_concurrent_requests, - ); - - Ok(Self { - tasks: join_set, - db: Database::Replica(ReplicaDatabase { - connection_maker: Arc::new(connection_maker), - }), - name, - stats, - db_config_store: meta_store_handle, - path: db_path.into(), - }) - } - - async fn fork( - ns_config: &NamespaceConfig, - from_ns: &Namespace, - from_config: MetaStoreHandle, - to_ns: NamespaceName, - to_config: MetaStoreHandle, - timestamp: Option, - resolve_attach: ResolveNamespacePathFn, - store: NamespaceStore, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let from_config = from_config.get(); - match ns_config.db_kind { - DatabaseKind::Primary => { - let bottomless_db_id = NamespaceBottomlessDbId::from_config(&from_config); - let restore_to = if let Some(timestamp) = timestamp { - if let Some(ref options) = ns_config.bottomless_replication { - Some(PointInTimeRestore { - timestamp, - replicator_options: make_bottomless_options( - options, - bottomless_db_id.clone(), - from_ns.name().clone(), - ), - }) - } else { - return Err(crate::Error::Fork(ForkError::BackupServiceNotConfigured)); - } - } else { - None - 
}; - - let logger = match &from_ns.db { - Database::Primary(db) => db.wal_wrapper.wrapper().logger(), - Database::Schema(db) => db.wal_wrapper.wrapper().logger(), - _ => { - return Err(crate::Error::Fork(ForkError::Internal(Error::msg( - "Invalid source database type for fork", - )))); - } - }; - - let fork_task = ForkTask { - base_path: ns_config.base_path.clone(), - to_namespace: to_ns.clone(), - logger, - restore_to, - to_config, - ns_config, - resolve_attach, - store, - broadcaster: broadcaster.handle(to_ns), - }; - - let ns = fork_task.fork().await?; - Ok(ns) - } - DatabaseKind::Replica => Err(ForkError::ForkReplica.into()), - } - } - - async fn new_schema( - ns_config: &NamespaceConfig, - name: NamespaceName, - meta_store_handle: MetaStoreHandle, - restore_option: RestoreOption, - resolve_attach_path: ResolveNamespacePathFn, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let mut join_set = JoinSet::new(); - let db_path = ns_config.base_path.join("dbs").join(name.as_str()); - - tokio::fs::create_dir_all(&db_path).await?; - - let (connection_maker, wal_manager, stats) = Self::make_primary_connection_maker( - ns_config, - &meta_store_handle, - &db_path, - &name, - restore_option, - Arc::new(AtomicBool::new(false)), // this is always false for schema - &mut join_set, - resolve_attach_path, - broadcaster, - ) - .await?; - - Ok(Namespace { - db: Database::Schema(SchemaDatabase::new( - ns_config.migration_scheduler.clone(), - name.clone(), - connection_maker, - wal_manager, - meta_store_handle.clone(), - )), - name, - tasks: join_set, - stats, - db_config_store: meta_store_handle, - path: db_path.into(), - }) - } -} - -pub struct NamespaceConfig { - /// Default database kind the store should be Creating - pub(crate) db_kind: DatabaseKind, - // Common config - pub(crate) base_path: Arc, - pub(crate) max_log_size: u64, - pub(crate) max_log_duration: Option, - pub(crate) extensions: Arc<[PathBuf]>, - pub(crate) stats_sender: StatsSender, - pub(crate) 
max_response_size: u64, - pub(crate) max_total_response_size: u64, - pub(crate) checkpoint_interval: Option, - pub(crate) max_concurrent_connections: Arc, - pub(crate) max_concurrent_requests: u64, - pub(crate) encryption_config: Option, - - // Replica specific config - /// grpc channel for replica - pub channel: Option, - /// grpc uri - pub uri: Option, - - // primary only config - pub(crate) bottomless_replication: Option, - pub(crate) scripted_backup: Option, - pub(crate) migration_scheduler: SchedulerHandle, - pub(crate) make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, } pub type DumpStream = Box> + Send + Sync + 'static + Unpin>; -fn make_bottomless_options( - options: &Options, - namespace_db_id: NamespaceBottomlessDbId, - name: NamespaceName, -) -> Options { - let mut options = options.clone(); - let mut db_id = match namespace_db_id { - NamespaceBottomlessDbId::Namespace(id) => id, - // FIXME(marin): I don't like that, if bottomless is enabled, proper config must be passed. - NamespaceBottomlessDbId::NotProvided => options.db_id.unwrap_or_default(), - }; - - db_id = format!("ns-{db_id}:{name}"); - options.db_id = Some(db_id); - options -} - -async fn make_stats( - db_path: &Path, - join_set: &mut JoinSet>, - meta_store_handle: MetaStoreHandle, - stats_sender: StatsSender, - name: NamespaceName, - mut current_frame_no: watch::Receiver>, - encryption_config: Option, -) -> anyhow::Result> { - tracing::debug!("creating stats type"); - let stats = Stats::new(name.clone(), db_path, join_set).await?; - - // the storage monitor is optional, so we ignore the error here. 
- tracing::debug!("stats created, sending stats"); - let _ = stats_sender - .send((name.clone(), meta_store_handle, Arc::downgrade(&stats))) - .await; - - join_set.spawn({ - let stats = stats.clone(); - // initialize the current_frame_no value - current_frame_no - .borrow_and_update() - .map(|fno| stats.set_current_frame_no(fno)); - async move { - while current_frame_no.changed().await.is_ok() { - current_frame_no - .borrow_and_update() - .map(|fno| stats.set_current_frame_no(fno)); - } - Ok(()) - } - }); - - join_set.spawn(run_storage_monitor( - db_path.into(), - Arc::downgrade(&stats), - encryption_config, - )); - - tracing::debug!("done sending stats, and creating bg tasks"); - - Ok(stats) -} - #[derive(Default)] pub enum RestoreOption { /// Restore database state from the most recent version found in a backup. @@ -858,189 +140,3 @@ pub enum RestoreOption { /// Granularity depends of how frequently WAL log pages are being snapshotted. PointInTime(NaiveDateTime), } - -const WASM_TABLE_CREATE: &str = - "CREATE TABLE libsql_wasm_func_table (name text PRIMARY KEY, body text) WITHOUT ROWID;"; - -async fn load_dump(dump: S, conn: PrimaryConnection) -> crate::Result<(), LoadDumpError> -where - S: Stream> + Unpin, -{ - let mut reader = tokio::io::BufReader::new(StreamReader::new(dump)); - let mut curr = String::new(); - let mut line = String::new(); - let mut skipped_wasm_table = false; - let mut n_stmt = 0; - let mut line_id = 0; - - while let Ok(n) = reader.read_line(&mut curr).await { - line_id += 1; - if n == 0 { - break; - } - let trimmed = curr.trim(); - if trimmed.is_empty() || trimmed.starts_with("--") { - curr.clear(); - continue; - } - // FIXME: it's well known bug that comment ending with semicolon will be handled incorrectly by currend dump processing code - let statement_end = trimmed.ends_with(';'); - - // we want to concat original(non-trimmed) lines as trimming will join all them in one - // single-line statement which is incorrect if comments in the end 
are present - line.push_str(&curr); - curr.clear(); - - // This is a hack to ignore the libsql_wasm_func_table table because it is already created - // by the system. - if !skipped_wasm_table && line.trim() == WASM_TABLE_CREATE { - skipped_wasm_table = true; - line.clear(); - continue; - } - - if statement_end { - n_stmt += 1; - // dump must be performd within a txn - if n_stmt > 2 && conn.is_autocommit().await.unwrap() { - return Err(LoadDumpError::NoTxn); - } - - line = tokio::task::spawn_blocking({ - let conn = conn.clone(); - move || -> crate::Result { - conn.with_raw(|conn| conn.execute(&line, ())).map_err(|e| { - LoadDumpError::Internal(format!("line: {}, error: {}", line_id, e)) - })?; - Ok(line) - } - }) - .await??; - line.clear(); - } else { - line.push(' '); - } - } - tracing::debug!("loaded {} lines from dump", line_id); - - if !conn.is_autocommit().await.unwrap() { - tokio::task::spawn_blocking({ - let conn = conn.clone(); - move || -> crate::Result<(), LoadDumpError> { - conn.with_raw(|conn| conn.execute("rollback", ()))?; - Ok(()) - } - }) - .await??; - return Err(LoadDumpError::NoCommit); - } - - Ok(()) -} - -pub async fn init_bottomless_replicator( - path: impl AsRef, - options: bottomless::replicator::Options, - restore_option: &RestoreOption, -) -> anyhow::Result<(bottomless::replicator::Replicator, bool)> { - tracing::debug!("Initializing bottomless replication"); - let path = path - .as_ref() - .to_str() - .ok_or_else(|| anyhow::anyhow!("Invalid db path"))? 
- .to_owned(); - let mut replicator = bottomless::replicator::Replicator::with_options(path, options).await?; - - let (generation, timestamp) = match restore_option { - RestoreOption::Latest | RestoreOption::Dump(_) => (None, None), - RestoreOption::Generation(generation) => (Some(*generation), None), - RestoreOption::PointInTime(timestamp) => (None, Some(*timestamp)), - }; - - let (action, did_recover) = replicator.restore(generation, timestamp).await?; - match action { - bottomless::replicator::RestoreAction::SnapshotMainDbFile => { - replicator.new_generation().await; - if let Some(_handle) = replicator.snapshot_main_db_file(true).await? { - tracing::trace!("got snapshot handle after restore with generation upgrade"); - } - // Restoration process only leaves the local WAL file if it was - // detected to be newer than its remote counterpart. - replicator.maybe_replicate_wal().await? - } - bottomless::replicator::RestoreAction::ReuseGeneration(gen) => { - replicator.set_generation(gen); - } - } - - Ok((replicator, did_recover)) -} - -async fn run_periodic_compactions(logger: Arc) -> anyhow::Result<()> { - // calling `ReplicationLogger::maybe_compact()` is cheap if the compaction does not actually - // take place, so we can afford to poll it very often for simplicity - let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(1000)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); - - loop { - interval.tick().await; - let handle = BLOCKING_RT.spawn_blocking(enclose! {(logger) move || { - logger.maybe_compact() - }}); - handle - .await - .expect("Compaction task crashed") - .context("Compaction failed")?; - } -} - -fn check_fresh_db(path: &Path) -> crate::Result { - let is_fresh = !path.join("wallog").try_exists()?; - Ok(is_fresh) -} - -// Periodically check the storage used by the database and save it in the Stats structure. 
-// TODO: Once we have a separate fiber that does WAL checkpoints, running this routine -// right after checkpointing is exactly where it should be done. -async fn run_storage_monitor( - db_path: PathBuf, - stats: Weak, - encryption_config: Option, -) -> anyhow::Result<()> { - // on initialization, the database file doesn't exist yet, so we wait a bit for it to be - // created - tokio::time::sleep(Duration::from_secs(1)).await; - - let duration = tokio::time::Duration::from_secs(60); - let db_path: Arc = db_path.into(); - loop { - let db_path = db_path.clone(); - let Some(stats) = stats.upgrade() else { - return Ok(()); - }; - - let encryption_config = encryption_config.clone(); - let _ = tokio::task::spawn_blocking(move || { - // because closing the last connection interferes with opening a new one, we lazily - // initialize a connection here, and keep it alive for the entirety of the program. If we - // fail to open it, we wait for `duration` and try again later. - match open_conn(&db_path, Sqlite3WalManager::new(), Some(rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY), encryption_config) { - Ok(mut conn) => { - if let Ok(tx) = conn.transaction() { - let page_count = tx.query_row("pragma page_count;", [], |row| { row.get::(0) }); - let freelist_count = tx.query_row("pragma freelist_count;", [], |row| { row.get::(0) }); - if let (Ok(page_count), Ok(freelist_count)) = (page_count, freelist_count) { - let storage_bytes_used = (page_count - freelist_count) * 4096; - stats.set_storage_bytes_used(storage_bytes_used); - } - } - }, - Err(e) => { - tracing::warn!("failed to open connection for storager monitor: {e}, trying again in {duration:?}"); - }, - } - }).await; - - tokio::time::sleep(duration).await; - } -} diff --git a/libsql-server/src/namespace/name.rs b/libsql-server/src/namespace/name.rs index e6335372a5..98fcc3d38b 100644 --- a/libsql-server/src/namespace/name.rs +++ b/libsql-server/src/namespace/name.rs @@ -57,6 +57,10 @@ impl NamespaceName { unsafe { 
std::str::from_utf8_unchecked(&self.0) } } + pub fn bytes(&self) -> Bytes { + self.0.clone() + } + pub fn from_bytes(bytes: Bytes) -> crate::Result { let s = std::str::from_utf8(&bytes).map_err(|_| Error::InvalidNamespace)?; Self::validate(s)?; diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index e0147fc2e8..a78e4f59b0 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -13,15 +13,17 @@ use tokio_stream::wrappers::BroadcastStream; use crate::auth::Authenticated; use crate::broadcaster::BroadcastMsg; use crate::connection::config::DatabaseConfig; +use crate::database::DatabaseKind; use crate::error::Error; use crate::metrics::NAMESPACE_LOAD_LATENCY; use crate::namespace::{NamespaceBottomlessDbId, NamespaceBottomlessDbIdInit, NamespaceName}; use crate::stats::Stats; use super::broadcasters::{BroadcasterHandle, BroadcasterRegistry}; +use super::configurator::{DynConfigurator, NamespaceConfigurators}; use super::meta_store::{MetaStore, MetaStoreHandle}; use super::schema_lock::SchemaLocksRegistry; -use super::{Namespace, NamespaceConfig, ResetCb, ResetOp, ResolveNamespacePathFn, RestoreOption}; +use super::{Namespace, ResetCb, ResetOp, ResolveNamespacePathFn, RestoreOption}; type NamespaceEntry = Arc>>; @@ -44,18 +46,20 @@ pub struct NamespaceStoreInner { allow_lazy_creation: bool, has_shutdown: AtomicBool, snapshot_at_shutdown: bool, - pub config: NamespaceConfig, schema_locks: SchemaLocksRegistry, broadcasters: BroadcasterRegistry, + configurators: NamespaceConfigurators, + db_kind: DatabaseKind, } impl NamespaceStore { - pub async fn new( + pub(crate) async fn new( allow_lazy_creation: bool, snapshot_at_shutdown: bool, max_active_namespaces: usize, - config: NamespaceConfig, metadata: MetaStore, + configurators: NamespaceConfigurators, + db_kind: DatabaseKind, ) -> crate::Result { tracing::trace!("Max active namespaces: {max_active_namespaces}"); let store = Cache::::builder() @@ 
-87,9 +91,10 @@ impl NamespaceStore { allow_lazy_creation, has_shutdown: AtomicBool::new(false), snapshot_at_shutdown, - config, schema_locks: Default::default(), broadcasters: Default::default(), + configurators, + db_kind, }), }) } @@ -127,14 +132,8 @@ impl NamespaceStore { } } - Namespace::cleanup( - &self.inner.config, - &namespace, - &db_config, - prune_all, - bottomless_db_id_init, - ) - .await?; + self.cleanup(&namespace, &db_config, prune_all, bottomless_db_id_init) + .await?; tracing::info!("destroyed namespace: {namespace}"); @@ -174,27 +173,18 @@ impl NamespaceStore { ns.destroy().await?; } - let handle = self.inner.metadata.handle(namespace.clone()); + let db_config = self.inner.metadata.handle(namespace.clone()); // destroy on-disk database - Namespace::cleanup( - &self.inner.config, + self.cleanup( &namespace, - &handle.get(), + &db_config.get(), false, NamespaceBottomlessDbIdInit::FetchFromConfig, ) .await?; - let ns = Namespace::from_config( - &self.inner.config, - handle, - restore_option, - &namespace, - self.make_reset_cb(), - self.resolve_attach_fn(), - self.clone(), - self.broadcaster(namespace.clone()), - ) - .await?; + let ns = self + .make_namespace(&namespace, db_config, restore_option) + .await?; lock.replace(ns); @@ -294,18 +284,17 @@ impl NamespaceStore { handle .store_and_maybe_flush(Some(to_config.into()), false) .await?; - let to_ns = Namespace::fork( - &self.inner.config, - from_ns, - from_config, - to.clone(), - handle.clone(), - timestamp, - self.resolve_attach_fn(), - self.clone(), - self.broadcaster(to), - ) - .await?; + let to_ns = self + .get_configurator(&from_config.get()) + .fork( + from_ns, + from_config, + to.clone(), + handle.clone(), + timestamp, + self.clone(), + ) + .await?; to_lock.replace(to_ns); handle.flush().await?; @@ -378,30 +367,39 @@ impl NamespaceStore { .clone() } + pub(crate) async fn make_namespace( + &self, + namespace: &NamespaceName, + config: MetaStoreHandle, + restore_option: RestoreOption, + ) -> 
crate::Result { + let ns = self + .get_configurator(&config.get()) + .setup( + config, + restore_option, + namespace, + self.make_reset_cb(), + self.resolve_attach_fn(), + self.clone(), + self.broadcaster(namespace.clone()), + ) + .await?; + + Ok(ns) + } + async fn load_namespace( &self, namespace: &NamespaceName, db_config: MetaStoreHandle, restore_option: RestoreOption, ) -> crate::Result { - let init = { - let namespace = namespace.clone(); - async move { - let ns = Namespace::from_config( - &self.inner.config, - db_config, - restore_option, - &namespace, - self.make_reset_cb(), - self.resolve_attach_fn(), - self.clone(), - self.broadcaster(namespace.clone()), - ) + let init = async { + let ns = self + .make_namespace(namespace, db_config, restore_option) .await?; - tracing::info!("loaded namespace: `{namespace}`"); - - Ok(Some(ns)) - } + Ok(Some(ns)) }; let before_load = Instant::now(); @@ -516,4 +514,26 @@ impl NamespaceStore { pub(crate) fn schema_locks(&self) -> &SchemaLocksRegistry { &self.inner.schema_locks } + + fn get_configurator(&self, db_config: &DatabaseConfig) -> &DynConfigurator { + match self.inner.db_kind { + DatabaseKind::Primary if db_config.is_shared_schema => { + self.inner.configurators.configure_schema().unwrap() + } + DatabaseKind::Primary => self.inner.configurators.configure_primary().unwrap(), + DatabaseKind::Replica => self.inner.configurators.configure_replica().unwrap(), + } + } + + async fn cleanup( + &self, + namespace: &NamespaceName, + db_config: &DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> crate::Result<()> { + self.get_configurator(db_config) + .cleanup(namespace, db_config, prune_all, bottomless_db_id_init) + .await + } } diff --git a/libsql-server/src/query_result_builder.rs b/libsql-server/src/query_result_builder.rs index 9ba51a57d0..68dba3d478 100644 --- a/libsql-server/src/query_result_builder.rs +++ b/libsql-server/src/query_result_builder.rs @@ -525,7 +525,6 @@ impl 
QueryResultBuilder for Take { pub mod test { use std::fmt; - use crate::connection::program::Program; use arbitrary::{Arbitrary, Unstructured}; use itertools::Itertools; use rand::{ @@ -1037,14 +1036,14 @@ pub mod test { pub fn test_driver( iter: usize, - f: impl Fn(FsmQueryBuilder) -> crate::Result<(FsmQueryBuilder, Program)>, + f: impl Fn(FsmQueryBuilder) -> crate::Result, ) { for _ in 0..iter { // inject random errors let builder = FsmQueryBuilder::new(true); match f(builder) { Ok(b) => { - assert_eq!(b.0.state, Finish); + assert_eq!(b.state, Finish); } Err(e) => { assert!(matches!(e, crate::Error::BuilderError(_))); diff --git a/libsql-server/src/replication/primary/logger.rs b/libsql-server/src/replication/primary/logger.rs index 2b14db32a2..6213f0da50 100644 --- a/libsql-server/src/replication/primary/logger.rs +++ b/libsql-server/src/replication/primary/logger.rs @@ -882,7 +882,7 @@ mod test { use libsql_sys::wal::{Sqlite3WalManager, WalManager}; use super::*; - use crate::connection::libsql::open_conn; + use crate::connection::legacy::open_conn; use crate::replication::primary::replication_logger_wal::ReplicationLoggerWalWrapper; use crate::DEFAULT_AUTO_CHECKPOINT; diff --git a/libsql-server/src/replication/primary/replication_logger_wal.rs b/libsql-server/src/replication/primary/replication_logger_wal.rs index 6d7a268a81..defd8abf87 100644 --- a/libsql-server/src/replication/primary/replication_logger_wal.rs +++ b/libsql-server/src/replication/primary/replication_logger_wal.rs @@ -152,7 +152,7 @@ mod test { ); let wal_manager = ReplicationLoggerWalWrapper::new(logger.clone()); - let db = crate::connection::libsql::open_conn_active_checkpoint( + let db = crate::connection::legacy::open_conn_active_checkpoint( tmp.path(), Sqlite3WalManager::default().wrap(wal_manager), None, diff --git a/libsql-server/src/replication/replicator_client.rs b/libsql-server/src/replication/replicator_client.rs index 4d12ff7f83..753baac996 100644 --- 
a/libsql-server/src/replication/replicator_client.rs +++ b/libsql-server/src/replication/replicator_client.rs @@ -4,15 +4,17 @@ use std::pin::Pin; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::TryStreamExt; -use libsql_replication::frame::Frame; use libsql_replication::meta::WalIndexMeta; -use libsql_replication::replicator::{map_frame_err, Error, ReplicatorClient}; +use libsql_replication::replicator::{Error, ReplicatorClient}; +use libsql_replication::rpc::replication::log_offset::WalFlavor; use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; use libsql_replication::rpc::replication::{ - verify_session_token, HelloRequest, LogOffset, NAMESPACE_METADATA_KEY, SESSION_TOKEN_KEY, + verify_session_token, Frame as RpcFrame, HelloRequest, LogOffset, NAMESPACE_METADATA_KEY, + SESSION_TOKEN_KEY, }; use tokio::sync::watch; -use tokio_stream::{Stream, StreamExt}; +use tokio_stream::Stream; + use tonic::metadata::{AsciiMetadataValue, BinaryMetadataValue}; use tonic::transport::Channel; use tonic::{Code, Request, Status}; @@ -35,6 +37,7 @@ pub struct Client { // the primary current replication index, as reported by the last handshake pub primary_replication_index: Option, store: NamespaceStore, + wal_flavor: WalFlavor, } impl Client { @@ -44,6 +47,7 @@ impl Client { path: &Path, meta_store_handle: MetaStoreHandle, store: NamespaceStore, + wal_flavor: WalFlavor, ) -> crate::Result { let (current_frame_no_notifier, _) = watch::channel(None); let meta = WalIndexMeta::open(path).await?; @@ -57,6 +61,7 @@ impl Client { meta_store_handle, primary_replication_index: None, store, + wal_flavor, }) } @@ -91,7 +96,7 @@ impl Client { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = Pin> + Send + 'static>>; #[tracing::instrument(skip(self))] async fn handshake(&mut self) -> Result<(), Error> { @@ -138,6 +143,7 @@ impl ReplicatorClient for Client { async fn 
next_frames(&mut self) -> Result { let offset = LogOffset { next_offset: self.next_frame_no(), + wal_flavor: Some(self.wal_flavor.into()), }; let req = self.make_request(offset); let stream = self @@ -165,7 +171,7 @@ impl ReplicatorClient for Client { None => REPLICATION_LATENCY_CACHE_MISS.increment(1), } }) - .map(map_frame_err); + .map_err(Into::into); Ok(Box::pin(stream)) } @@ -173,11 +179,12 @@ impl ReplicatorClient for Client { async fn snapshot(&mut self) -> Result { let offset = LogOffset { next_offset: self.next_frame_no(), + wal_flavor: Some(self.wal_flavor.into()), }; let req = self.make_request(offset); match self.client.snapshot(req).await { Ok(resp) => { - let stream = resp.into_inner().map(map_frame_err); + let stream = resp.into_inner().map_err(Into::into); Ok(Box::pin(stream)) } Err(e) if e.code() == Code::Unavailable => Err(Error::SnapshotPending), diff --git a/libsql-server/src/replication/snapshot_store.rs b/libsql-server/src/replication/snapshot_store.rs index e30d1659b1..21cec057e8 100644 --- a/libsql-server/src/replication/snapshot_store.rs +++ b/libsql-server/src/replication/snapshot_store.rs @@ -16,7 +16,7 @@ use tempfile::NamedTempFile; use uuid::Uuid; use zerocopy::{AsBytes, FromZeroes}; -use crate::connection::libsql::open_conn_active_checkpoint; +use crate::connection::legacy::open_conn_active_checkpoint; use crate::namespace::NamespaceName; use super::FrameNo; diff --git a/libsql-server/src/rpc/mod.rs b/libsql-server/src/rpc/mod.rs index 6359556518..5a15de5af2 100644 --- a/libsql-server/src/rpc/mod.rs +++ b/libsql-server/src/rpc/mod.rs @@ -1,7 +1,8 @@ use std::sync::Arc; use hyper_rustls::TlsAcceptor; -use libsql_replication::rpc::replication::NAMESPACE_METADATA_KEY; +use libsql_replication::rpc::replication::replication_log_server::ReplicationLogServer; +use libsql_replication::rpc::replication::{BoxReplicationService, NAMESPACE_METADATA_KEY}; use rustls::server::AllowAnyAuthenticatedClient; use rustls::RootCertStore; use 
tonic::Status; @@ -12,35 +13,23 @@ use tracing::Span; use crate::config::TlsConfig; use crate::metrics::CLIENT_VERSION; -use crate::namespace::{NamespaceName, NamespaceStore}; +use crate::namespace::NamespaceName; use crate::rpc::proxy::rpc::proxy_server::ProxyServer; use crate::rpc::proxy::ProxyService; -pub use crate::rpc::replication_log::rpc::replication_log_server::ReplicationLogServer; -use crate::rpc::replication_log::ReplicationLogService; use crate::utils::services::idle_shutdown::IdleShutdownKicker; pub mod proxy; pub mod replica_proxy; -pub mod replication_log; -pub mod replication_log_proxy; pub mod streaming_exec; +pub mod replication; pub async fn run_rpc_server( proxy_service: ProxyService, acceptor: A, maybe_tls: Option, idle_shutdown_layer: Option, - namespaces: NamespaceStore, - disable_namespaces: bool, + service: BoxReplicationService, ) -> anyhow::Result<()> { - let logger_service = ReplicationLogService::new( - namespaces.clone(), - idle_shutdown_layer.clone(), - None, - disable_namespaces, - false, - ); - if let Some(tls_config) = maybe_tls { let cert_pem = tokio::fs::read_to_string(&tls_config.cert).await?; let certs = rustls_pemfile::certs(&mut cert_pem.as_bytes())?; @@ -76,7 +65,7 @@ pub async fn run_rpc_server( let router = tonic::transport::Server::builder() .layer(&option_layer(idle_shutdown_layer)) .add_service(ProxyServer::new(proxy_service)) - .add_service(ReplicationLogServer::new(logger_service)) + .add_service(ReplicationLogServer::new(service)) .into_router(); let svc = ServiceBuilder::new() @@ -96,7 +85,7 @@ pub async fn run_rpc_server( hyper::server::Server::builder(acceptor).serve(h2c).await?; } else { let proxy = ProxyServer::new(proxy_service); - let replication = ReplicationLogServer::new(logger_service); + let replication = ReplicationLogServer::new(service); let router = tonic::transport::Server::builder() .layer(&option_layer(idle_shutdown_layer)) diff --git a/libsql-server/src/rpc/proxy.rs 
b/libsql-server/src/rpc/proxy.rs index 2899ab435c..80cbd37088 100644 --- a/libsql-server/src/rpc/proxy.rs +++ b/libsql-server/src/rpc/proxy.rs @@ -222,7 +222,7 @@ pub mod rpc { impl From for Program { fn from(pgm: connection::program::Program) -> Self { Self { - steps: pgm.steps.into_iter().map(|s| s.into()).collect(), + steps: pgm.steps.iter().map(|s| s.clone().into()).collect(), } } } diff --git a/libsql-server/src/rpc/replication/auth.rs b/libsql-server/src/rpc/replication/auth.rs new file mode 100644 index 0000000000..5d451804a6 --- /dev/null +++ b/libsql-server/src/rpc/replication/auth.rs @@ -0,0 +1,39 @@ +use tonic::Status; + +use crate::auth::parsers::parse_grpc_auth_header; +use crate::auth::{Auth, Jwt}; +use crate::namespace::{NamespaceName, NamespaceStore}; + +pub async fn authenticate( + namespaces: &NamespaceStore, + req: &tonic::Request, + namespace: NamespaceName, + user_auth_strategy: &Option, +) -> Result<(), Status> { + // todo dupe #auth + let namespace_jwt_keys = namespaces.with(namespace.clone(), |ns| ns.jwt_keys()).await; + + let auth = match namespace_jwt_keys { + Ok(Ok(Some(key))) => Some(Auth::new(Jwt::new(key))), + Ok(Ok(None)) => user_auth_strategy.clone(), + Err(e) => match e.as_ref() { + crate::error::Error::NamespaceDoesntExist(_) => user_auth_strategy.clone(), + _ => Err(Status::internal(format!( + "Error fetching jwt key for a namespace: {}", + e + )))?, + }, + Ok(Err(e)) => Err(Status::internal(format!( + "Error fetching jwt key for a namespace: {}", + e + )))?, + }; + + if let Some(auth) = auth { + let context = parse_grpc_auth_header(req.metadata(), &auth.user_strategy.required_fields()) + .map_err(|e| tonic::Status::internal(format!("Error parsing auth header: {}", e)))?; + auth.authenticate(context)?; + } + + Ok(()) +} diff --git a/libsql-server/src/rpc/replication/libsql_replicator.rs b/libsql-server/src/rpc/replication/libsql_replicator.rs new file mode 100644 index 0000000000..6376b12962 --- /dev/null +++ 
b/libsql-server/src/rpc/replication/libsql_replicator.rs @@ -0,0 +1,217 @@ +use std::mem::size_of; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + +use bytes::Bytes; +use futures::stream::BoxStream; +use libsql_replication::rpc::replication::log_offset::WalFlavor; +use libsql_replication::rpc::replication::replication_log_server::ReplicationLog; +use libsql_replication::rpc::replication::{ + Frame as RpcFrame, Frames, HelloRequest, HelloResponse, LogOffset, NAMESPACE_DOESNT_EXIST, +}; +use libsql_wal::io::StdIO; +use libsql_wal::registry::WalRegistry; +use libsql_wal::segment::Frame; +use md5::{Digest as _, Md5}; +use tokio_stream::Stream; +use tonic::Status; +use uuid::Uuid; + +use crate::auth::Auth; +use crate::namespace::{NamespaceName, NamespaceStore}; +use crate::SqldStorage; + +pub struct LibsqlReplicationService { + registry: Arc>, + store: NamespaceStore, + user_auth_strategy: Option, + disable_namespaces: bool, + session_token: Bytes, +} + +impl LibsqlReplicationService { + pub fn new( + registry: Arc>, + store: NamespaceStore, + user_auth_strategy: Option, + disable_namespaces: bool, + ) -> Self { + let session_token = Uuid::new_v4().to_string().into(); + Self { + registry, + disable_namespaces, + store, + user_auth_strategy, + session_token, + } + } + + async fn authenticate( + &self, + req: &tonic::Request, + namespace: NamespaceName, + ) -> Result<(), Status> { + super::auth::authenticate(&self.store, req, namespace, &self.user_auth_strategy).await + } + + fn encode_session_token(&self, version: usize) -> Uuid { + let mut sha = Md5::new(); + sha.update(&self.session_token[..]); + sha.update(version.to_le_bytes()); + + let num = sha.finalize(); + let num = u128::from_le_bytes(num.into()); + Uuid::from_u128(num) + } +} + +pin_project_lite::pin_project! 
{ + struct FrameStreamAdapter { + #[pin] + inner: S, + flavor: WalFlavor, + } +} + +impl FrameStreamAdapter { + fn new(inner: S, flavor: WalFlavor) -> Self { + Self { inner, flavor } + } +} + +impl Stream for FrameStreamAdapter +where + S: Stream, libsql_wal::replication::Error>>, +{ + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + match ready!(this.inner.poll_next(cx)) { + Some(Ok(f)) => { + match this.flavor { + WalFlavor::Libsql => { + // safety: frame implemements zerocopy traits, so it can safely be interpreted as a + // byte slize of the same size + let bytes: Box<[u8; size_of::()]> = unsafe { std::mem::transmute(f) }; + + let data = Bytes::from(bytes as Box<[u8]>); + Poll::Ready(Some(Ok(RpcFrame { + data, + timestamp: None, + }))) + } + WalFlavor::Sqlite => { + let header = libsql_replication::frame::FrameHeader { + frame_no: f.header().frame_no().into(), + checksum: 0.into(), + page_no: f.header().page_no().into(), + size_after: f.header().size_after().into(), + }; + + let frame = libsql_replication::frame::Frame::from_parts(&header, f.data()); + Poll::Ready(Some(Ok(RpcFrame { + data: frame.bytes(), + timestamp: None, + }))) + }, + } + } + Some(Err(_e)) => todo!(), + None => Poll::Ready(None), + } + } +} + +#[tonic::async_trait] +impl ReplicationLog for LibsqlReplicationService { + type LogEntriesStream = BoxStream<'static, Result>; + type SnapshotStream = BoxStream<'static, Result>; + + async fn log_entries( + &self, + req: tonic::Request, + ) -> Result, Status> { + let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; + self.authenticate(&req, namespace.clone()).await?; + let shared = self.registry.get_async(&namespace.into()).await.unwrap(); + let req = req.into_inner(); + // TODO: replicator should only accecpt NonZero + let replicator = libsql_wal::replication::replicator::Replicator::new( + shared, + req.next_offset.max(1), + ); + + let flavor = 
req.wal_flavor(); + let stream = FrameStreamAdapter::new(replicator.into_frame_stream(), flavor); + Ok(tonic::Response::new(Box::pin(stream))) + } + + async fn batch_log_entries( + &self, + _req: tonic::Request, + ) -> Result, Status> { + todo!() + // let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; + // self.authenticate(&req, namespace.clone()).await?; + // let shared = self.registry.get_async(&namespace.into()).await.unwrap(); + // let replicator = libsql_wal::replication::replicator::Replicator::new(shared, req.into_inner().next_offset); + // + // let frames = FrameStreamAdapter::new(replicator.into_frame_stream()) + // .take_while(|) + // .collect::, Status>>().await?; + // Ok(tonic::Response::new(Frames { frames })) + } + + async fn hello( + &self, + req: tonic::Request, + ) -> Result, Status> { + let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; + self.authenticate(&req, namespace.clone()).await?; + + let shared = self + .registry + .get_async(&namespace.clone().into()) + .await + .unwrap(); + let log_id = shared.log_id(); + let current_replication_index = shared.last_committed_frame_no(); + let (config, version) = self + .store + .with(namespace, |ns| -> Result<_, Status> { + let config = ns.config(); + let version = ns.config_version(); + Ok((config, version)) + }) + .await + .map_err(|e| { + if let crate::error::Error::NamespaceDoesntExist(_) = e.as_ref() { + Status::failed_precondition(NAMESPACE_DOESNT_EXIST) + } else { + Status::internal(e.to_string()) + } + })??; + + let session_hash = self.encode_session_token(version); + + let response = HelloResponse { + log_id: log_id.to_string(), + session_token: session_hash.to_string().into(), + generation_id: Uuid::from_u128(0).to_string(), + generation_start_index: 0, + current_replication_index: Some(current_replication_index), + config: Some(config.as_ref().into()), + }; + + Ok(tonic::Response::new(response)) + } + + async fn snapshot( + &self, + 
_req: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("no snapshot required with libsql wal")) + } +} diff --git a/libsql-server/src/rpc/replication/mod.rs b/libsql-server/src/rpc/replication/mod.rs new file mode 100644 index 0000000000..9058a40a91 --- /dev/null +++ b/libsql-server/src/rpc/replication/mod.rs @@ -0,0 +1,5 @@ +pub mod libsql_replicator; +pub mod replication_log; +pub mod replication_log_proxy; +mod auth; + diff --git a/libsql-server/src/rpc/replication_log.rs b/libsql-server/src/rpc/replication/replication_log.rs similarity index 89% rename from libsql-server/src/rpc/replication_log.rs rename to libsql-server/src/rpc/replication/replication_log.rs index c0b216739e..22c184392b 100644 --- a/libsql-server/src/rpc/replication_log.rs +++ b/libsql-server/src/rpc/replication/replication_log.rs @@ -8,6 +8,7 @@ use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures_core::Future; pub use libsql_replication::rpc::replication as rpc; +use libsql_replication::rpc::replication::log_offset::WalFlavor; use libsql_replication::rpc::replication::replication_log_server::ReplicationLog; use libsql_replication::rpc::replication::{ Frame, Frames, HelloRequest, HelloResponse, LogOffset, NAMESPACE_DOESNT_EXIST, @@ -19,8 +20,7 @@ use tonic::transport::server::TcpConnectInfo; use tonic::Status; use uuid::Uuid; -use crate::auth::Jwt; -use crate::auth::{parsers::parse_grpc_auth_header, Auth}; +use crate::auth::Auth; use crate::connection::config::DatabaseConfig; use crate::namespace::{NamespaceName, NamespaceStore}; use crate::replication::primary::frame_stream::FrameStream; @@ -28,7 +28,7 @@ use crate::replication::{LogReadError, ReplicationLogger}; use crate::stats::Stats; use crate::utils::services::idle_shutdown::IdleShutdownKicker; -use super::extract_namespace; +use crate::rpc::extract_namespace; pub struct ReplicationLogService { namespaces: NamespaceStore, @@ -71,38 +71,7 @@ impl ReplicationLogService { req: &tonic::Request, 
namespace: NamespaceName, ) -> Result<(), Status> { - // todo dupe #auth - let namespace_jwt_keys = self - .namespaces - .with(namespace.clone(), |ns| ns.jwt_keys()) - .await; - - let auth = match namespace_jwt_keys { - Ok(Ok(Some(key))) => Some(Auth::new(Jwt::new(key))), - Ok(Ok(None)) => self.user_auth_strategy.clone(), - Err(e) => match e.as_ref() { - crate::error::Error::NamespaceDoesntExist(_) => self.user_auth_strategy.clone(), - _ => Err(Status::internal(format!( - "Error fetching jwt key for a namespace: {}", - e - )))?, - }, - Ok(Err(e)) => Err(Status::internal(format!( - "Error fetching jwt key for a namespace: {}", - e - )))?, - }; - - if let Some(auth) = auth { - let context = - parse_grpc_auth_header(req.metadata(), &auth.user_strategy.required_fields()) - .map_err(|e| { - tonic::Status::internal(format!("Error parsing auth header: {}", e)) - })?; - auth.authenticate(context)?; - } - - Ok(()) + super::auth::authenticate(&self.namespaces, req, namespace, &self.user_auth_strategy).await } fn verify_session_token( @@ -259,7 +228,10 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { - let namespace = super::extract_namespace(self.disable_namespaces, &req)?; + if let WalFlavor::Libsql = req.get_ref().wal_flavor() { + return Err(Status::invalid_argument("libsql wal not supported")); + } + let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; self.authenticate(&req, namespace.clone()).await?; @@ -304,7 +276,10 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { - let namespace = super::extract_namespace(self.disable_namespaces, &req)?; + if let WalFlavor::Libsql = req.get_ref().wal_flavor() { + return Err(Status::invalid_argument("libsql wal not supported")); + } + let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; self.authenticate(&req, namespace.clone()).await?; let (logger, _, _, stats, _) = 
self.logger_from_namespace(namespace, &req, true).await?; @@ -339,7 +314,7 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { - let namespace = super::extract_namespace(self.disable_namespaces, &req)?; + let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; self.authenticate(&req, namespace.clone()).await?; // legacy support @@ -354,7 +329,6 @@ impl ReplicationLog for ReplicationLogService { guard.insert((replica_addr, namespace.clone())); } } - let (logger, config, version, _, _) = self.logger_from_namespace(namespace, &req, false).await?; @@ -376,7 +350,10 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { - let namespace = super::extract_namespace(self.disable_namespaces, &req)?; + if let WalFlavor::Libsql = req.get_ref().wal_flavor() { + return Err(Status::invalid_argument("libsql wal not supported")); + } + let namespace = super::super::extract_namespace(self.disable_namespaces, &req)?; self.authenticate(&req, namespace.clone()).await?; let (logger, _, _, stats, _) = self.logger_from_namespace(namespace, &req, true).await?; diff --git a/libsql-server/src/rpc/replication_log_proxy.rs b/libsql-server/src/rpc/replication/replication_log_proxy.rs similarity index 100% rename from libsql-server/src/rpc/replication_log_proxy.rs rename to libsql-server/src/rpc/replication/replication_log_proxy.rs diff --git a/libsql-server/src/rpc/streaming_exec.rs b/libsql-server/src/rpc/streaming_exec.rs index 5214ace732..87a24b851a 100644 --- a/libsql-server/src/rpc/streaming_exec.rs +++ b/libsql-server/src/rpc/streaming_exec.rs @@ -367,7 +367,7 @@ pub mod test { use tokio_stream::wrappers::ReceiverStream; use crate::auth::Authenticated; - use crate::connection::libsql::LibSqlConnection; + use crate::connection::legacy::LegacyConnection; use crate::connection::program::Program; use crate::namespace::meta_store::{metastore_connection_maker, MetaStore}; 
use crate::namespace::NamespaceName; @@ -390,7 +390,7 @@ pub mod test { #[tokio::test] async fn invalid_request() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -416,7 +416,7 @@ pub mod test { #[tokio::test] async fn request_stream_dropped() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -438,7 +438,7 @@ pub mod test { #[tokio::test] async fn perform_query_simple() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -462,7 +462,7 @@ pub mod test { #[tokio::test] async fn single_query_split_response() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -519,7 +519,7 @@ pub mod test { #[tokio::test] async fn request_interupted() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(2); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -546,7 +546,7 @@ pub mod test { #[tokio::test] 
async fn perform_multiple_queries() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -573,7 +573,7 @@ pub mod test { #[tokio::test] async fn query_number_less_than_previous_query() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( @@ -602,7 +602,7 @@ pub mod test { #[tokio::test] async fn describe() { let tmp = tempdir().unwrap(); - let conn = LibSqlConnection::new_test(tmp.path()).await; + let conn = LegacyConnection::new_test(tmp.path()).await; let (snd, rcv) = mpsc::channel(1); let (maker, manager) = metastore_connection_maker(None, tmp.path()).await.unwrap(); let ctx = RequestContext::new( diff --git a/libsql-server/src/schema/migration.rs b/libsql-server/src/schema/migration.rs index 73135db470..e0cfe52759 100644 --- a/libsql-server/src/schema/migration.rs +++ b/libsql-server/src/schema/migration.rs @@ -240,7 +240,7 @@ mod test { use libsql_sys::wal::Sqlite3WalManager; use tempfile::tempdir; - use crate::connection::libsql::open_conn_active_checkpoint; + use crate::connection::legacy::open_conn_active_checkpoint; use crate::namespace::NamespaceName; use crate::schema::status::MigrationTask; diff --git a/libsql-server/src/schema/mod.rs b/libsql-server/src/schema/mod.rs index d024586ee1..e7c7681262 100644 --- a/libsql-server/src/schema/mod.rs +++ b/libsql-server/src/schema/mod.rs @@ -60,7 +60,7 @@ pub fn validate_migration(migration: &mut Program) -> Result<(), Error> { ) { return Err(Error::MigrationContainsTransactionStatements); } - migration.steps[0].query = Query 
{ + migration.steps_mut().unwrap()[0].query = Query { stmt: Statement::parse("PRAGMA max_page_count") .next() .unwrap() @@ -72,7 +72,7 @@ pub fn validate_migration(migration: &mut Program) -> Result<(), Error> { if !matches!(step.query.stmt.kind, StmtKind::TxnEnd) { break; } - migration.steps.pop(); + migration.steps_mut().unwrap().pop(); } } if migration.steps().iter().any(|s| s.query.stmt.kind.is_txn()) { diff --git a/libsql-server/src/schema/scheduler.rs b/libsql-server/src/schema/scheduler.rs index 17fdfb3143..d3ea6ddea7 100644 --- a/libsql-server/src/schema/scheduler.rs +++ b/libsql-server/src/schema/scheduler.rs @@ -10,7 +10,6 @@ use tokio::task::JoinSet; use crate::connection::program::Program; use crate::connection::{Connection, MakeConnection}; -use crate::database::PrimaryConnectionMaker; use crate::namespace::meta_store::{MetaStore, MetaStoreConnection}; use crate::namespace::{NamespaceName, NamespaceStore}; use crate::query_result_builder::{IgnoreResult, QueryBuilderConfig}; @@ -349,10 +348,8 @@ impl Scheduler { let (connection_maker, block_writes) = self.namespace_store .with(task.namespace(), move |ns| { - let db = ns.db.as_primary().expect( - "attempting to perform schema migration on non-primary database", - ); - (db.connection_maker().clone(), db.block_writes.clone()) + assert!(ns.db.is_primary(), "attempting to perform schema migration on non-primary database"); + (ns.db.connection_maker().clone(), ns.db.block_writes().unwrap()) }) .await .map_err(|e| Error::NamespaceLoad(Box::new(e)))?; @@ -426,7 +423,7 @@ async fn try_step_task( _permit: OwnedSemaphorePermit, namespace_store: NamespaceStore, migration_db: Arc>, - connection_maker: Arc, + connection_maker: Arc>, job_status: MigrationJobStatus, migration: Arc, mut task: MigrationTask, @@ -477,7 +474,7 @@ async fn try_step_task( async fn try_step_task_inner( namespace_store: NamespaceStore, - connection_maker: Arc, + connection_maker: Arc>, job_status: MigrationJobStatus, migration: Arc, task: 
&MigrationTask, @@ -739,11 +736,8 @@ async fn step_job_run_success( // TODO: check that all tasks actually reported success before migration let connection_maker = namespace_store .with(schema.clone(), |ns| { - ns.db - .as_schema() - .expect("expected database to be a schema database") - .connection_maker() - .clone() + assert!(ns.db.is_schema(), "expected database to be a schema database"); + ns.db.connection_maker() }) .await .map_err(|e| Error::NamespaceLoad(Box::new(e)))?; @@ -754,7 +748,6 @@ async fn step_job_run_success( .map_err(|e| Error::FailedToConnect(schema.clone(), e.into()))?; tokio::task::spawn_blocking(move || -> Result<(), Error> { connection - .connection() .with_raw(|conn| -> Result<(), Error> { let mut txn = conn.transaction()?; let schema_version = @@ -808,8 +801,12 @@ mod test { use crate::connection::config::DatabaseConfig; use crate::database::DatabaseKind; + use crate::namespace::configurator::{ + BaseNamespaceConfig, NamespaceConfigurators, PrimaryConfigurator, PrimaryConfig, + SchemaConfigurator, + }; use crate::namespace::meta_store::{metastore_connection_maker, MetaStore}; - use crate::namespace::{NamespaceConfig, RestoreOption}; + use crate::namespace::RestoreOption; use crate::schema::SchedulerHandle; use super::super::migration::has_pending_migration_task; @@ -826,9 +823,10 @@ mod test { .unwrap(); let (sender, mut receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let mut scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); @@ -858,9 +856,10 @@ mod test { let (block_write, ns_conn_maker) = store .with("ns".into(), |ns| { + assert!(ns.db.is_primary()); ( - ns.db.as_primary().unwrap().block_writes.clone(), - ns.db.as_primary().unwrap().connection_maker(), + 
ns.db.block_writes().unwrap(), + ns.db.connection_maker(), ) }) .await @@ -902,27 +901,42 @@ mod test { assert!(!block_write.load(std::sync::atomic::Ordering::Relaxed)); } - fn make_config(migration_scheduler: SchedulerHandle, path: &Path) -> NamespaceConfig { - NamespaceConfig { - db_kind: DatabaseKind::Primary, + fn make_config(migration_scheduler: SchedulerHandle, path: &Path) -> NamespaceConfigurators { + let mut configurators = NamespaceConfigurators::empty(); + let base_config = BaseNamespaceConfig { base_path: path.to_path_buf().into(), - max_log_size: 1000000000, - max_log_duration: None, extensions: Arc::new([]), stats_sender: tokio::sync::mpsc::channel(1).0, max_response_size: 100000000000000, max_total_response_size: 100000000000, - checkpoint_interval: None, max_concurrent_connections: Arc::new(Semaphore::new(10)), max_concurrent_requests: 10000, encryption_config: None, - channel: None, - uri: None, + }; + + let primary_config = PrimaryConfig { + max_log_size: 1000000000, + max_log_duration: None, bottomless_replication: None, scripted_backup: None, + checkpoint_interval: None, + }; + + let make_wal_manager = Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())); + + configurators.with_schema(SchemaConfigurator::new( + base_config.clone(), + primary_config.clone(), + make_wal_manager.clone(), migration_scheduler, - make_wal_manager: Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - } + )); + configurators.with_primary(PrimaryConfigurator::new( + base_config, + primary_config, + make_wal_manager.clone(), + )); + + configurators } #[tokio::test] @@ -936,9 +950,10 @@ mod test { .unwrap(); let (sender, mut receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let mut scheduler = 
Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); @@ -968,9 +983,10 @@ mod test { let (block_write, ns_conn_maker) = store .with("ns".into(), |ns| { + assert!(ns.db.is_primary()); ( - ns.db.as_primary().unwrap().block_writes.clone(), - ns.db.as_primary().unwrap().connection_maker(), + ns.db.block_writes().unwrap(), + ns.db.connection_maker(), ) }) .await @@ -1012,18 +1028,20 @@ mod test { .unwrap(); let (sender, _receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); store .with("ns".into(), |ns| { + assert!(ns.db.is_primary()); assert!(ns .db - .as_primary() + .block_writes() .unwrap() - .block_writes - .load(std::sync::atomic::Ordering::Relaxed)); + .load(std::sync::atomic::Ordering::Relaxed) + ); }) .await .unwrap(); @@ -1039,9 +1057,10 @@ mod test { .unwrap(); let (sender, mut receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let mut scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); @@ -1112,9 +1131,10 @@ mod test { .unwrap(); let (sender, _receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); diff --git a/libsql-server/src/test/bottomless.rs b/libsql-server/src/test/bottomless.rs index 
5f3015e11e..159dae0226 100644 --- a/libsql-server/src/test/bottomless.rs +++ b/libsql-server/src/test/bottomless.rs @@ -458,7 +458,6 @@ async fn remove_snapshots(bucket: &str) { if let Ok(out) = client.list_objects().bucket(bucket).send().await { let keys = out .contents() - .unwrap() .iter() .map(|o| { let key = o.key().unwrap(); @@ -466,7 +465,7 @@ async fn remove_snapshots(bucket: &str) { format!("{}/db.gz", prefix) }) .unique() - .map(|key| ObjectIdentifier::builder().key(key).build()) + .map(|key| ObjectIdentifier::builder().key(key).build().unwrap()) .collect(); client @@ -476,7 +475,7 @@ async fn remove_snapshots(bucket: &str) { Delete::builder() .set_objects(Some(keys)) .quiet(true) - .build(), + .build().unwrap(), ) .send() .await @@ -489,7 +488,7 @@ async fn remove_snapshots(bucket: &str) { async fn assert_bucket_occupancy(bucket: &str, expect_empty: bool) { let client = s3_client().await.unwrap(); if let Ok(out) = client.list_objects().bucket(bucket).send().await { - let contents = out.contents().unwrap_or_default(); + let contents = out.contents(); if expect_empty { assert!( contents.is_empty(), @@ -545,17 +544,17 @@ impl S3BucketCleaner { let client = s3_client().await?; let objects = client.list_objects().bucket(bucket).send().await?; let mut delete_keys = Vec::new(); - for o in objects.contents().unwrap_or_default() { + for o in objects.contents() { let id = ObjectIdentifier::builder() .set_key(o.key().map(String::from)) - .build(); + .build().unwrap(); delete_keys.push(id); } let _ = client .delete_objects() .bucket(bucket) - .delete(Delete::builder().set_objects(Some(delete_keys)).build()) + .delete(Delete::builder().set_objects(Some(delete_keys)).build().unwrap()) .send() .await?; diff --git a/libsql-sqlite3/ext/fts5/fts5_tokenize.c b/libsql-sqlite3/ext/fts5/fts5_tokenize.c index f12056170f..7e239b6ca5 100644 --- a/libsql-sqlite3/ext/fts5/fts5_tokenize.c +++ b/libsql-sqlite3/ext/fts5/fts5_tokenize.c @@ -1290,40 +1290,46 @@ static int 
fts5TriCreate( Fts5Tokenizer **ppOut ){ int rc = SQLITE_OK; - TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); - UNUSED_PARAM(pUnused); - if( pNew==0 ){ - rc = SQLITE_NOMEM; + TrigramTokenizer *pNew = 0; + + if( nArg%2 ){ + rc = SQLITE_ERROR; }else{ - int i; - pNew->bFold = 1; - pNew->iFoldParam = 0; - for(i=0; rc==SQLITE_OK && ibFold = 1; + pNew->iFoldParam = 0; + for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); + } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ - pNew->bFold = (zArg[0]=='0'); - } - }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ - if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; - }else{ - pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; } - }else{ - rc = SQLITE_ERROR; } - } - if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ - rc = SQLITE_ERROR; - } + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } - if( rc!=SQLITE_OK ){ - fts5TriDelete((Fts5Tokenizer*)pNew); - pNew = 0; + if( rc!=SQLITE_OK ){ + fts5TriDelete((Fts5Tokenizer*)pNew); + pNew = 0; + } } } *ppOut = (Fts5Tokenizer*)pNew; diff --git a/libsql-sqlite3/src/vacuum.c b/libsql-sqlite3/src/vacuum.c index c0ae4bc1e1..f8e848aca6 100644 --- a/libsql-sqlite3/src/vacuum.c +++ b/libsql-sqlite3/src/vacuum.c @@ -17,6 +17,10 @@ #include "sqliteInt.h" #include "vdbeInt.h" +#ifndef SQLITE_OMIT_VECTOR +#include "vectorIndexInt.h" +#endif + #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* @@ -294,6 +298,27 @@ SQLITE_NOINLINE int sqlite3RunVacuum( if( rc!=SQLITE_OK ) goto end_of_vacuum; db->init.iDb = 0; +#ifndef SQLITE_OMIT_VECTOR + // shadow tables for vector index will be populated automatically during CREATE INDEX command + // so we must skip them at this step + if( sqlite3FindTable(db, VECTOR_INDEX_GLOBAL_META_TABLE, zDbMain) 
!= NULL ){ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0 AND name NOT IN (SELECT name||'_shadow' FROM " VECTOR_INDEX_GLOBAL_META_TABLE ")", + zDbMain + ); + }else{ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0", + zDbMain + ); + } +#else /* Loop through the tables in the main database. For each, do ** an "INSERT INTO vacuum_db.xxx SELECT * FROM main.xxx;" to copy ** the contents to the temporary database. @@ -305,6 +330,7 @@ SQLITE_NOINLINE int sqlite3RunVacuum( "WHERE type='table'AND coalesce(rootpage,1)>0", zDbMain ); +#endif assert( (db->mDbFlags & DBFLAG_Vacuum)!=0 ); db->mDbFlags &= ~DBFLAG_Vacuum; if( rc!=SQLITE_OK ) goto end_of_vacuum; diff --git a/libsql-sqlite3/src/vectorIndex.c b/libsql-sqlite3/src/vectorIndex.c index f627c98e00..001a1aae10 100644 --- a/libsql-sqlite3/src/vectorIndex.c +++ b/libsql-sqlite3/src/vectorIndex.c @@ -49,11 +49,6 @@ ** VectorIdxParams utilities ****************************************************************************/ -// VACUUM creates tables and indices first and only then populate data -// we need to ignore inserts from 'INSERT INTO vacuum.t SELECT * FROM t' statements because -// all shadow tables will be populated by VACUUM process during regular process of table copy -#define IsVacuum(db) ((db->mDbFlags&DBFLAG_Vacuum)!=0) - void vectorIdxParamsInit(VectorIdxParams *pParams, u8 *pBinBuf, int nBinSize) { assert( nBinSize <= VECTOR_INDEX_PARAMS_BUF_SIZE ); @@ -772,10 +767,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { // this is done to prevent unrecoverable situations where index were dropped but index parameters deletion failed and second attempt will fail on first step int rcIdx, 
rcParams; - if( IsVacuum(db) ){ - return SQLITE_OK; - } - assert( zDbSName != NULL ); rcIdx = diskAnnDropIndex(db, zDbSName, zIdxName); @@ -786,10 +777,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { assert( zDbSName != NULL ); - if( IsVacuum(db) ){ - return SQLITE_OK; - } - return diskAnnClearIndex(db, zDbSName, zIdxName); } @@ -799,7 +786,7 @@ int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { * this made intentionally in order to natively support upload of SQLite dumps * * dump populates tables first and create indices after - * so we must omit them because shadow tables already filled + * so we must omit index refill step because shadow tables already filled * * 1. in case of any error :-1 returned (and pParse errMsg is populated with some error message) * 2. if vector index must not be created : 0 returned @@ -817,10 +804,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co int hasLibsqlVectorIdxFn = 0, hasCollation = 0; const char *pzErrMsg; - if( IsVacuum(pParse->db) ){ - return CREATE_IGNORE; - } - assert( zDbSName != NULL ); sqlite3 *db = pParse->db; @@ -879,11 +862,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: must contain exactly one column wrapped into the " VECTOR_INDEX_MARKER_FUNCTION " function"); return CREATE_FAIL; } - // we are able to support this but I doubt this works for now - more polishing required to make this work - if( pIdx->pPartIdxWhere != NULL ) { - sqlite3ErrorMsg(pParse, "vector index: where condition is forbidden"); - return CREATE_FAIL; - } pArgsList = pIdx->aColExpr->a[0].pExpr->x.pList; pListItem = pArgsList->a; @@ -973,7 +951,6 @@ int vectorIndexSearch( VectorIdxParams idxParams; vectorIdxParamsInit(&idxParams, NULL, 0); - assert( !IsVacuum(db) ); assert( zDbSName != NULL );
if( argc != 3 ){ @@ -1058,10 +1035,6 @@ int vectorIndexInsert( int rc; VectorInRow vectorInRow; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - rc = vectorInRowAlloc(pCur->db, pRecord, &vectorInRow, pzErrMsg); if( rc != SQLITE_OK ){ return rc; @@ -1081,10 +1054,6 @@ int vectorIndexDelete( ){ VectorInRow payload; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - payload.pVector = NULL; payload.nKeys = r->nField - 1; payload.pKeyValues = r->aMem + 1; diff --git a/libsql-sqlite3/src/vectordiskann.c b/libsql-sqlite3/src/vectordiskann.c index 95d473b630..fc39e00d30 100644 --- a/libsql-sqlite3/src/vectordiskann.c +++ b/libsql-sqlite3/src/vectordiskann.c @@ -442,6 +442,7 @@ int diskAnnCreateIndex( int type, dims; u64 maxNeighborsParam, blockSizeBytes; char *zSql; + const char *zRowidColumnName; char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) 
if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){ @@ -509,6 +510,7 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "index_key"; }else{ zSql = sqlite3MPrintf( db, @@ -518,9 +520,31 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "rowid"; } rc = sqlite3_exec(db, zSql, 0, 0, 0); sqlite3DbFree(db, zSql); + if( rc != SQLITE_OK ){ + return rc; + } + /* + * vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB) + * in this case, main table B-Tree takes on redundant shape where all leaf nodes have only 1 cell + * + * as we have a query which selects random row using OFFSET/LIMIT trick - we will need to read all these leaf nodes pages just to skip them + * so, in order to remove this overhead for random row selection - we create an index with just single column used + * in this case B-Tree leafs will be full of rowids and the overhead for page reads will be very small + */ + zSql = sqlite3MPrintf( + db, + "CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)", + zDbSName, + zIdxName, + zIdxName, + zRowidColumnName + ); + rc = sqlite3_exec(db, zSql, 0, 0, 0); + sqlite3DbFree(db, zSql); return rc; } @@ -550,8 +574,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid) zSql = sqlite3MPrintf( pIndex->db, - "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)", - pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow + "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)", + pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow ); if( zSql == NULL ){ rc = SQLITE_NOMEM_BKPT; diff --git a/libsql-sqlite3/test/libsql_vector_index.test b/libsql-sqlite3/test/libsql_vector_index.test index 19d31ba19c..a173c773d3 100644 ---
a/libsql-sqlite3/test/libsql_vector_index.test +++ b/libsql-sqlite3/test/libsql_vector_index.test @@ -140,7 +140,7 @@ do_execsql_test vector-sql { INSERT INTO t_sql VALUES(vector('[1,2,3]')), (vector('[2,3,4]')); SELECT sql FROM sqlite_master WHERE name LIKE '%t_sql%'; SELECT name FROM libsql_vector_meta_shadow WHERE name = 't_sql_idx'; -} {{CREATE TABLE t_sql( v FLOAT32(3))} {CREATE TABLE t_sql_idx_shadow (index_key INTEGER , data BLOB, PRIMARY KEY (index_key))} {CREATE INDEX t_sql_idx ON t_sql( libsql_vector_idx(v) )} {t_sql_idx}} +} {{CREATE TABLE t_sql( v FLOAT32(3))} {CREATE TABLE t_sql_idx_shadow (index_key INTEGER , data BLOB, PRIMARY KEY (index_key))} {CREATE INDEX t_sql_idx_shadow_idx ON t_sql_idx_shadow (index_key)} {CREATE INDEX t_sql_idx ON t_sql( libsql_vector_idx(v) )} {t_sql_idx}} do_execsql_test vector-drop-index { CREATE TABLE t_index_drop( v FLOAT32(3)); @@ -236,12 +236,17 @@ do_execsql_test vector-attach { do_execsql_test vector-vacuum { CREATE TABLE t_vacuum ( emb FLOAT32(2) ); - INSERT INTO t_vacuum VALUES (vector('[1,2]')), (vector('[3,4]')); + INSERT INTO t_vacuum VALUES (vector('[1,2]')), (vector('[3,4]')), (vector('[5,6]')); CREATE INDEX t_vacuum_idx ON t_vacuum(libsql_vector_idx(emb)); VACUUM; SELECT COUNT(*) FROM t_vacuum; SELECT COUNT(*) FROM t_vacuum_idx_shadow; -} {2 2} + DELETE FROM t_vacuum WHERE rowid = 2; + VACUUM; + SELECT * FROM vector_top_k('t_vacuum_idx', vector('[1,2]'), 3); + SELECT * FROM vector_top_k('t_vacuum_idx', vector('[5,6]'), 3); + SELECT * FROM vector_top_k('t_vacuum_idx', vector('[3,4]'), 3); +} {3 3 1 2 2 1 2 1} do_execsql_test vector-many-columns { CREATE TABLE t_many ( i INTEGER PRIMARY KEY, e1 FLOAT32(2), e2 FLOAT32(2) ); @@ -270,6 +275,36 @@ do_execsql_test vector-all-params { SELECT * FROM vector_top_k('t_all_params_idx', vector('[1,2]'), 2); } {1 2} +do_execsql_test vector-partial { + CREATE TABLE t_partial( name TEXT, type INT, v FLOAT32(3)); + INSERT INTO t_partial VALUES ( 'a', 0, vector('[1,2,3]') ); + 
INSERT INTO t_partial VALUES ( 'b', 1, vector('[3,4,5]') ); + INSERT INTO t_partial VALUES ( 'c', 2, vector('[4,5,6]') ); + INSERT INTO t_partial VALUES ( 'd', 0, vector('[5,6,7]') ); + INSERT INTO t_partial VALUES ( 'e', 1, vector('[6,7,8]') ); + INSERT INTO t_partial VALUES ( 'f', 2, vector('[7,8,9]') ); + CREATE INDEX t_partial_idx_0 ON t_partial( libsql_vector_idx(v) ) WHERE type = 0; + CREATE INDEX t_partial_idx_1 ON t_partial( libsql_vector_idx(v) ) WHERE type = 1; + CREATE INDEX t_partial_idx_not_0 ON t_partial( libsql_vector_idx(v) ) WHERE type != 0; + SELECT id FROM vector_top_k('t_partial_idx_0', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_1', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_not_0', vector('[1,2,3]'), 10); + INSERT INTO t_partial VALUES ( 'g', 0, vector('[8,9,10]') ); + INSERT INTO t_partial VALUES ( 'h', 1, vector('[9,10,11]') ); + INSERT INTO t_partial VALUES ( 'i', 2, vector('[10,11,12]') ); + SELECT id FROM vector_top_k('t_partial_idx_0', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_1', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_not_0', vector('[1,2,3]'), 10); +} { + 1 4 + 2 5 + 2 3 5 6 + + 1 4 7 + 2 5 8 + 2 3 5 6 8 9 +} + proc error_messages {sql} { set ret "" catch { @@ -304,8 +339,6 @@ do_test vector-errors { sqlite3_exec db { CREATE TABLE t_mixed_t( v FLOAT32(3)); } sqlite3_exec db { INSERT INTO t_mixed_t VALUES('[1]'); } lappend ret [error_messages {CREATE INDEX t_mixed_t_idx ON t_mixed_t( libsql_vector_idx(v) )}] - sqlite3_exec db { CREATE TABLE t_partial( name TEXT, type INT, v FLOAT32(3)); } - lappend ret [error_messages {CREATE INDEX t_partial_idx ON t_partial( libsql_vector_idx(v) ) WHERE type = 0}] } [list {*}{ {no such table: main.t_no} {no such column: v} @@ -323,5 +356,4 @@ do_test vector-errors { {vector index(insert): only f32 vectors are supported} {vector index(search): dimensions are different: 2 != 4} {vector 
index(insert): dimensions are different: 1 != 3} - {vector index: where condition is forbidden} }] diff --git a/libsql-sys/Cargo.toml b/libsql-sys/Cargo.toml index 26dd091ea9..8351012d9f 100644 --- a/libsql-sys/Cargo.toml +++ b/libsql-sys/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql-sys" -version = "0.6.0" +version = "0.7.0" edition = "2021" license = "MIT" description = "Native bindings to libSQL" @@ -12,7 +12,7 @@ categories = ["external-ffi-bindings"] [dependencies] bytes = "1.5.0" -libsql-ffi = { version = "0.3", path = "../libsql-ffi/" } +libsql-ffi = { version = "0.4", path = "../libsql-ffi/" } once_cell = "1.18.0" rusqlite = { workspace = true, features = ["trace"], optional = true } tracing = "0.1.37" diff --git a/libsql-wal/Cargo.toml b/libsql-wal/Cargo.toml index 9624596c28..f24f2e4c59 100644 --- a/libsql-wal/Cargo.toml +++ b/libsql-wal/Cargo.toml @@ -9,6 +9,7 @@ publish = false [dependencies] arc-swap = "1.7.1" async-stream = "0.3.5" +async-lock = "3.4.0" bitflags = "2.5.0" bytes = "1.6.0" chrono = "0.4.38" @@ -29,7 +30,7 @@ tokio-stream = "0.1.15" tracing = "0.1.40" uuid = { version = "1.8.0", features = ["v4"] } walkdir = "2.5.0" -zerocopy = { version = "0.7.32", features = ["derive", "alloc"] } +zerocopy = { workspace = true } aws-config = { version = "1", optional = true, features = ["behavior-version-latest"] } aws-sdk-s3 = { version = "1", optional = true } diff --git a/libsql-wal/src/checkpointer.rs b/libsql-wal/src/checkpointer.rs index 1f389bc265..049ceea6f3 100644 --- a/libsql-wal/src/checkpointer.rs +++ b/libsql-wal/src/checkpointer.rs @@ -135,6 +135,7 @@ where fn should_exit(&self) -> bool { self.shutting_down + && self.recv.is_empty() && self.scheduled.is_empty() && self.checkpointing.is_empty() && self.join_set.is_empty() @@ -158,9 +159,11 @@ where notified = self.recv.recv(), if !self.shutting_down => { match notified { Some(CheckpointMessage::Namespace(namespace)) => { + tracing::info!(namespace = namespace.as_str(), "notified for 
checkpoint"); self.scheduled.insert(namespace); } None | Some(CheckpointMessage::Shutdown) => { + tracing::info!("checkpointed is shutting down. {} namespaces to checkpoint", self.checkpointing.len()); self.shutting_down = true; } } diff --git a/libsql-wal/src/error.rs b/libsql-wal/src/error.rs index 003c4b4062..92bd51504a 100644 --- a/libsql-wal/src/error.rs +++ b/libsql-wal/src/error.rs @@ -17,6 +17,8 @@ pub enum Error { InvalidHeaderVersion, #[error("Invalid page size, only 4095 is supported")] InvalidPageSize, + #[error("Registry is shutting down")] + ShuttingDown, } impl Into for Error { diff --git a/libsql-wal/src/io/file.rs b/libsql-wal/src/io/file.rs index 63e5cda235..d7f581c145 100644 --- a/libsql-wal/src/io/file.rs +++ b/libsql-wal/src/io/file.rs @@ -2,6 +2,8 @@ use std::fs::File; use std::future::Future; use std::io::{self, ErrorKind, IoSlice, Result, Write}; +use libsql_sys::wal::either::Either; + use super::buf::{IoBuf, IoBufMut}; pub trait FileExt: Send + Sync + 'static { @@ -73,6 +75,91 @@ pub trait FileExt: Send + Sync + 'static { ) -> impl Future)> + Send; } +impl FileExt for Either +where V: FileExt, U: FileExt, +{ + fn len(&self) -> io::Result { + match self { + Either::A(x) => x.len(), + Either::B(x) => x.len(), + } + } + + fn write_at_vectored(&self, bufs: &[IoSlice], offset: u64) -> Result { + match self { + Either::A(x) => x.write_at_vectored(bufs, offset), + Either::B(x) => x.write_at_vectored(bufs, offset), + } + } + + fn write_at(&self, buf: &[u8], offset: u64) -> Result { + match self { + Either::A(x) => x.write_at(buf, offset), + Either::B(x) => x.write_at(buf, offset), + } + } + + fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { + match self { + Either::A(x) => x.read_at(buf, offset), + Either::B(x) => x.read_at(buf, offset), + } + } + + fn sync_all(&self) -> Result<()> { + match self { + Either::A(x) => x.sync_all(), + Either::B(x) => x.sync_all(), + } + } + + fn set_len(&self, len: u64) -> Result<()> { + match self { + 
Either::A(x) => x.set_len(len), + Either::B(x) => x.set_len(len), + } + } + + fn read_exact_at_async( + &self, + buf: B, + offset: u64, + ) -> impl Future)> + Send { + async move { + match self { + Either::A(x) => x.read_exact_at_async(buf, offset).await, + Either::B(x) => x.read_exact_at_async(buf, offset).await, + } + } + } + + fn read_at_async( + &self, + buf: B, + offset: u64, + ) -> impl Future)> + Send { + async move { + match self { + Either::A(x) => x.read_at_async(buf, offset).await, + Either::B(x) => x.read_at_async(buf, offset).await, + } + } + } + + fn write_all_at_async( + &self, + buf: B, + offset: u64, + ) -> impl Future)> + Send { + async move { + match self { + Either::A(x) => x.write_all_at_async(buf, offset).await, + Either::B(x) => x.write_all_at_async(buf, offset).await, + } + } + } +} + impl FileExt for File { fn write_at_vectored(&self, bufs: &[IoSlice], offset: u64) -> Result { Ok(nix::sys::uio::pwritev(self, bufs, offset as _)?) diff --git a/libsql-wal/src/io/mod.rs b/libsql-wal/src/io/mod.rs index b7a618617d..4f370065aa 100644 --- a/libsql-wal/src/io/mod.rs +++ b/libsql-wal/src/io/mod.rs @@ -1,4 +1,4 @@ -use std::io; +use std::{future::Future, io}; use std::path::Path; use std::sync::Arc; @@ -30,11 +30,18 @@ pub trait Io: Send + Sync + 'static { // todo: create an async counterpart fn tempfile(&self) -> io::Result; fn now(&self) -> DateTime; - fn uuid(&self) -> Uuid; fn hard_link(&self, src: &Path, dst: &Path) -> io::Result<()>; fn with_rng(&self, f: F) -> R where F: FnOnce(&mut Self::Rng) -> R; + fn uuid(&self) -> uuid::Uuid { + self.with_rng(|rng| { + let n: u128 = rng.gen(); + Uuid::from_u128(n) + }) + } + + fn remove_file_async(&self, path: &Path) -> impl Future> + Send; } #[derive(Default, Debug, Clone, Copy)] @@ -85,6 +92,10 @@ impl Io for StdIO { { f(&mut thread_rng()) } + + async fn remove_file_async(&self, path: &Path) -> io::Result<()> { + tokio::fs::remove_file(path).await + } } impl Io for Arc { @@ -128,6 +139,10 @@ impl Io for 
Arc { { self.as_ref().with_rng(f) } + + async fn remove_file_async(&self, path: &Path) ->io::Result<()> { + self.as_ref().remove_file_async(path).await + } } pub struct Inspect { diff --git a/libsql-wal/src/lib.rs b/libsql-wal/src/lib.rs index df104eda49..e8309541c4 100644 --- a/libsql-wal/src/lib.rs +++ b/libsql-wal/src/lib.rs @@ -15,6 +15,31 @@ const LIBSQL_MAGIC: u64 = u64::from_be_bytes(*b"LIBSQL\0\0"); const LIBSQL_PAGE_SIZE: u16 = 4096; const LIBSQL_WAL_VERSION: u16 = 1; +use uuid::Uuid; +use zerocopy::byteorder::big_endian::{U16 as bu16, U64 as bu64, U128 as bu128}; +/// LibsqlFooter is located at the end of the libsql file. It contains libsql specific metadata, +/// while remaining fully compatible with sqlite (which just ignores that footer) +/// +/// The fields are in big endian to remain coherent with sqlite +#[derive(Copy, Clone, Debug, zerocopy::FromBytes, zerocopy::FromZeroes, zerocopy::AsBytes)] +#[repr(C)] +pub struct LibsqlFooter { + pub magic: bu64, + pub version: bu16, + /// Replication index checkpointed into this file. + /// only valid if there are no outstanding segments to checkpoint, since a checkpoint could be + /// partial.
+ pub replication_index: bu64, + /// Id of the log for this this database + pub log_id: bu128, +} + +impl LibsqlFooter { + pub fn log_id(&self) -> Uuid { + Uuid::from_u128(self.log_id.get()) + } +} + #[cfg(any(debug_assertions, test))] pub mod test { use std::fs::OpenOptions; @@ -101,6 +126,7 @@ pub mod test { self.tmp.path().join(namespace) } + #[track_caller] pub fn open_conn(&self, namespace: &'static str) -> libsql_sys::Connection> { let path = self.db_path(namespace); let wal = self.wal.clone(); diff --git a/libsql-wal/src/registry.rs b/libsql-wal/src/registry.rs index 103bbf8631..290fdf196e 100644 --- a/libsql-wal/src/registry.rs +++ b/libsql-wal/src/registry.rs @@ -1,3 +1,4 @@ +use std::io; use std::num::NonZeroU64; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, Ordering}; @@ -9,6 +10,7 @@ use parking_lot::{Condvar, Mutex}; use rand::Rng; use tokio::sync::{mpsc, Notify, Semaphore}; use tokio::task::JoinSet; +use uuid::Uuid; use zerocopy::{AsBytes, FromZeroes}; use crate::checkpointer::CheckpointMessage; @@ -21,6 +23,7 @@ use crate::segment::{current::CurrentSegment, sealed::SealedSegment}; use crate::shared_wal::{SharedWal, SwapLog}; use crate::storage::{OnStoreCallback, Storage}; use crate::transaction::TxGuard; +use crate::{LibsqlFooter, LIBSQL_PAGE_SIZE}; use libsql_sys::name::NamespaceName; enum Slot { @@ -33,7 +36,7 @@ enum Slot { /// Wal Registry maintains a set of shared Wal, and their respective set of files. 
pub struct WalRegistry { - io: IO, + io: Arc, path: PathBuf, shutdown: AtomicBool, opened: DashMap>, @@ -60,7 +63,7 @@ impl WalRegistry { ) -> Result { io.create_dir_all(&path)?; let registry = Self { - io, + io: io.into(), path, opened: Default::default(), shutdown: Default::default(), @@ -115,6 +118,7 @@ where current.db_size(), current.tail().clone(), salt, + current.log_id() )?; // sealing must the last fallible operation, because we don't want to end up in a situation // where the current log is sealed and it wasn't swapped. @@ -166,7 +170,7 @@ where namespace: &NamespaceName, ) -> Result>> { if self.shutdown.load(Ordering::SeqCst) { - todo!("open after shutdown"); + return Err(crate::error::Error::ShuttingDown) } loop { @@ -267,10 +271,29 @@ where } let db_file = self.io.open(false, true, true, db_path)?; - let mut header: Sqlite3DbHeader = Sqlite3DbHeader::new_zeroed(); db_file.read_exact_at(header.as_bytes_mut(), 0)?; + let log_id = if db_file.len()? <= LIBSQL_PAGE_SIZE as u64 && tail.is_empty() { + // this is a new database + self.io.uuid() + } else if let Some(log_id) = tail.with_head(|h| h.header().log_id.get()) { + // there is a segment list, read the logid from there. + let log_id = Uuid::from_u128(log_id); + #[cfg(debug_assertions)] + { + // if the main db file has footer, then the logid must match that of the segment + if let Ok(db_log_id) = read_log_id_from_footer(&db_file, header.db_size.get() as u64) { + assert_eq!(db_log_id, log_id); + } + } + + log_id + } else { + read_log_id_from_footer(&db_file, header.db_size.get() as u64)? 
+ }; + + let (db_size, next_frame_no) = tail .with_head(|segment| { let header = segment.header(); @@ -294,6 +317,7 @@ where db_size, tail.into(), salt, + log_id, )?)); let (new_frame_notifier, _) = tokio::sync::watch::channel(next_frame_no.get() - 1); @@ -313,6 +337,8 @@ where )), shutdown: false.into(), checkpoint_notifier: self.checkpoint_notifier.clone(), + max_segment_size: 1000.into(), + io: self.io.clone(), }); self.opened @@ -324,6 +350,7 @@ where // On shutdown, we checkpoint all the WALs. This require sealing the current segment, and when // checkpointing all the segments pub async fn shutdown(self: Arc) -> Result<()> { + tracing::info!("shutting down registry"); self.shutdown.store(true, Ordering::SeqCst); let mut join_set = JoinSet::>::new(); @@ -362,7 +389,12 @@ where } while join_set.join_next().await.is_some() {} + dbg!(); + + // we process any pending storage job, then checkpoint everything + self.storage.shutdown().await; + dbg!(); // wait for checkpointer to exit let _ = self .checkpoint_notifier @@ -370,6 +402,18 @@ where .await; self.checkpoint_notifier.closed().await; + tracing::info!("registry shutdown gracefully"); + Ok(()) } } + +fn read_log_id_from_footer(db_file: &F, db_size: u64) -> io::Result { + let mut footer: LibsqlFooter = LibsqlFooter::new_zeroed(); + let footer_offset = LIBSQL_PAGE_SIZE as u64 * db_size; + // FIXME: failing to read the footer here is a sign of corrupted database: either we + // have a tail to the segment list, or we have fully checkpointed the database. Can we + // recover from that? 
+ db_file.read_exact_at(footer.as_bytes_mut(), footer_offset)?; + Ok(footer.log_id()) +} diff --git a/libsql-wal/src/replication/injector.rs b/libsql-wal/src/replication/injector.rs index 66710bbb22..3a152b412e 100644 --- a/libsql-wal/src/replication/injector.rs +++ b/libsql-wal/src/replication/injector.rs @@ -6,23 +6,23 @@ use crate::error::Result; use crate::io::Io; use crate::segment::Frame; use crate::shared_wal::SharedWal; -use crate::transaction::TxGuard; +use crate::transaction::TxGuardOwned; /// The injector takes frames and injects them in the wal. -pub struct Injector<'a, IO: Io> { +pub struct Injector { // The wal to which we are injecting wal: Arc>, buffer: Vec>, /// capacity of the frame buffer capacity: usize, - tx: TxGuard<'a, IO::File>, + tx: TxGuardOwned, max_tx_frame_no: u64, } -impl<'a, IO: Io> Injector<'a, IO> { +impl Injector { pub fn new( wal: Arc>, - tx: TxGuard<'a, IO::File>, + tx: TxGuardOwned, buffer_capacity: usize, ) -> Result { Ok(Self { @@ -34,7 +34,7 @@ impl<'a, IO: Io> Injector<'a, IO> { }) } - pub async fn insert_frame(&mut self, frame: Box) -> Result<()> { + pub async fn insert_frame(&mut self, frame: Box) -> Result> { let size_after = frame.size_after(); self.max_tx_frame_no = self.max_tx_frame_no.max(frame.header().frame_no()); self.buffer.push(frame); @@ -43,10 +43,10 @@ impl<'a, IO: Io> Injector<'a, IO> { self.flush(size_after).await?; } - Ok(()) + Ok(size_after.map(|_| self.max_tx_frame_no)) } - async fn flush(&mut self, size_after: Option) -> Result<()> { + pub async fn flush(&mut self, size_after: Option) -> Result<()> { let buffer = std::mem::take(&mut self.buffer); let current = self.wal.current.load(); let commit_data = size_after.map(|size| (size, self.max_tx_frame_no)); @@ -57,9 +57,15 @@ impl<'a, IO: Io> Injector<'a, IO> { .inject_frames(buffer, commit_data, &mut self.tx) .await?; self.buffer = buffer; + self.buffer.clear(); Ok(()) } + + pub fn rollback(&mut self) { + self.buffer.clear(); + self.tx.reset(0); + } } 
#[cfg(test)] @@ -77,8 +83,8 @@ mod test { let primary_conn = primary_env.open_conn("test"); let primary_shared = primary_env.shared("test"); - let mut replicator = Replicator::new(primary_shared.clone(), 1); - let stream = replicator.frame_stream(); + let replicator = Replicator::new(primary_shared.clone(), 1); + let stream = replicator.into_frame_stream(); tokio::pin!(stream); @@ -89,7 +95,10 @@ mod test { let mut tx = crate::transaction::Transaction::Read(replica_shared.begin_read(42)); replica_shared.upgrade(&mut tx).unwrap(); - let guard = tx.as_write_mut().unwrap().lock(); + let guard = tx + .into_write() + .unwrap_or_else(|_| panic!()) + .into_lock_owned(); let mut injector = Injector::new(replica_shared.clone(), guard, 10).unwrap(); primary_conn.execute("create table test (x)", ()).unwrap(); diff --git a/libsql-wal/src/replication/replicator.rs b/libsql-wal/src/replication/replicator.rs index acb5230ed7..46937f01a8 100644 --- a/libsql-wal/src/replication/replicator.rs +++ b/libsql-wal/src/replication/replicator.rs @@ -40,20 +40,20 @@ impl Replicator { /// /// In a single replication step, the replicator guarantees that a minimal set of frames is /// sent to the replica. - pub fn frame_stream(&mut self) -> impl Stream>> + '_ { + pub fn into_frame_stream(mut self) -> impl Stream>> + Send { async_stream::try_stream! { loop { // First we decide up to what frame_no we want to replicate in this step. If we are // already up to date, wait for something to happen let most_recent_frame_no = *self .new_frame_notifier - .wait_for(|fno| *fno > self.next_frame_no) + .wait_for(|fno| *fno >= self.next_frame_no) .await .expect("channel cannot be closed because we hold a ref to the sending end"); let mut commit_frame_no = 0; // we have stuff to replicate - if most_recent_frame_no > self.next_frame_no { + if most_recent_frame_no >= self.next_frame_no { // first replicate the most recent version of each page from the current // segment. 
We also return how far we have replicated from the current log let current = self.shared.current.load(); @@ -162,10 +162,10 @@ mod test { .unwrap(); } - let mut replicator = Replicator::new(shared.clone(), 1); + let replicator = Replicator::new(shared.clone(), 1); let tmp = NamedTempFile::new().unwrap(); - let stream = replicator.frame_stream(); + let stream = replicator.into_frame_stream(); tokio::pin!(stream); let mut last_frame_no = 0; let mut size_after; @@ -233,8 +233,8 @@ mod test { // replicate everything from scratch again { let tmp = NamedTempFile::new().unwrap(); - let mut replicator = Replicator::new(shared.clone(), 1); - let stream = replicator.frame_stream(); + let replicator = Replicator::new(shared.clone(), 1); + let stream = replicator.into_frame_stream(); tokio::pin!(stream); @@ -295,8 +295,8 @@ mod test { let db_content = std::fs::read(&env.db_path("test").join("data")).unwrap(); - let mut replicator = Replicator::new(shared, 1); - let stream = replicator.frame_stream().take(3); + let replicator = Replicator::new(shared, 1); + let stream = replicator.into_frame_stream().take(3); tokio::pin!(stream); diff --git a/libsql-wal/src/replication/storage.rs b/libsql-wal/src/replication/storage.rs index 6972c0c6e2..35ea89fb09 100644 --- a/libsql-wal/src/replication/storage.rs +++ b/libsql-wal/src/replication/storage.rs @@ -18,7 +18,7 @@ pub trait ReplicateFromStorage: Sync + Send + 'static { seen: &'a mut RoaringBitmap, current: u64, until: u64, - ) -> Pin>> + 'a>>; + ) -> Pin>> + 'a + Send>>; } pub struct StorageReplicator { @@ -41,7 +41,7 @@ where seen: &'a mut roaring::RoaringBitmap, mut current: u64, until: u64, - ) -> Pin>> + 'a>> { + ) -> Pin>> + Send + 'a>> { Box::pin(async_stream::try_stream! 
{ loop { let key = self.storage.find_segment(&self.namespace, current, None).await?; diff --git a/libsql-wal/src/segment/compacted.rs b/libsql-wal/src/segment/compacted.rs index 9fd65f045b..f8bcf340d5 100644 --- a/libsql-wal/src/segment/compacted.rs +++ b/libsql-wal/src/segment/compacted.rs @@ -52,6 +52,17 @@ pub struct CompactedSegment { file: F, } +impl CompactedSegment { + pub fn remap_file_type(self, f: FN) -> CompactedSegment + where FN: FnOnce(F) -> T, + { + CompactedSegment { + header: self.header, + file: f(self.file) + } + } +} + impl CompactedSegment { pub(crate) async fn open(file: F) -> Result { let buf = ZeroCopyBuf::new_uninit(); diff --git a/libsql-wal/src/segment/current.rs b/libsql-wal/src/segment/current.rs index d8d720a145..60f7428a54 100644 --- a/libsql-wal/src/segment/current.rs +++ b/libsql-wal/src/segment/current.rs @@ -14,6 +14,7 @@ use fst::MapBuilder; use parking_lot::{Mutex, RwLock}; use roaring::RoaringBitmap; use tokio_stream::Stream; +use uuid::Uuid; use zerocopy::little_endian::U32; use zerocopy::{AsBytes, FromZeroes}; @@ -22,7 +23,7 @@ use crate::io::file::FileExt; use crate::io::Inspect; use crate::segment::{checked_frame_offset, SegmentFlags}; use crate::segment::{frame_offset, page_offset, sealed::SealedSegment}; -use crate::transaction::{Transaction, TxGuard}; +use crate::transaction::{Transaction, TxGuard, TxGuardOwned}; use crate::{LIBSQL_MAGIC, LIBSQL_PAGE_SIZE, LIBSQL_WAL_VERSION}; use super::list::SegmentList; @@ -54,6 +55,7 @@ impl CurrentSegment { db_size: u32, tail: Arc>>, salt: u32, + log_id: Uuid, ) -> Result where F: FileExt, @@ -70,6 +72,7 @@ impl CurrentSegment { version: LIBSQL_WAL_VERSION.into(), salt: salt.into(), page_size: LIBSQL_PAGE_SIZE.into(), + log_id: log_id.as_u128().into(), }; header.recompute_checksum(); @@ -88,6 +91,10 @@ impl CurrentSegment { }) } + pub fn log_id(&self) -> Uuid { + Uuid::from_u128(self.header.lock().log_id.get()) + } + pub fn is_empty(&self) -> bool { self.count_committed() == 0 } @@ 
-125,7 +132,7 @@ impl CurrentSegment { frames: Vec>, // (size_after, last_frame_no) commit_data: Option<(u32, u64)>, - tx: &mut TxGuard<'_, F>, + tx: &mut TxGuardOwned, ) -> Result>> where F: FileExt, @@ -1015,6 +1022,12 @@ mod test { { f(&mut rand::thread_rng()) } + + fn remove_file_async(&self, path: &std::path::Path) -> impl std::future::Future> + Send { + async move { + std::fs::remove_file(path) + } + } } let tmp = Arc::new(tempdir().unwrap()); diff --git a/libsql-wal/src/segment/list.rs b/libsql-wal/src/segment/list.rs index 25dfa3a32a..0c5e66df1a 100644 --- a/libsql-wal/src/segment/list.rs +++ b/libsql-wal/src/segment/list.rs @@ -4,17 +4,18 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; use arc_swap::ArcSwapOption; -use fst::map::{OpBuilder, Union}; use fst::raw::IndexedValue; use fst::Streamer; use roaring::RoaringBitmap; use tokio_stream::Stream; +use uuid::Uuid; use zerocopy::FromZeroes; use crate::error::Result; use crate::io::buf::{ZeroCopyBoxIoBuf, ZeroCopyBuf}; -use crate::io::FileExt; +use crate::io::{FileExt, Io}; use crate::segment::Frame; +use crate::{LibsqlFooter, LIBSQL_MAGIC, LIBSQL_PAGE_SIZE, LIBSQL_WAL_VERSION}; use super::Segment; @@ -77,9 +78,13 @@ where /// Checkpoints as many segments as possible to the main db file, and return the checkpointed /// frame_no, if anything was checkpointed - pub async fn checkpoint(&self, db_file: &F, until_frame_no: u64) -> Result> - where - F: FileExt, + pub async fn checkpoint( + &self, + db_file: &IO::File, + until_frame_no: u64, + log_id: Uuid, + io: &IO, + ) -> Result> { struct Guard<'a>(&'a AtomicBool); impl<'a> Drop for Guard<'a> { @@ -121,24 +126,15 @@ where let size_after = segs.first().unwrap().size_after(); - let union = segs + let index_iter = segs .iter() - .map(|s| s.index()) - .collect::() - .union(); - - /// Safety: Union contains a Box that doesn't require Send, to it's not send. 
- /// That's an issue for us, but all the indexes we have are safe to send, so we're good. - /// FIXME: we could implement union ourselves. - unsafe impl Send for SendUnion<'_> {} - unsafe impl Sync for SendUnion<'_> {} - struct SendUnion<'a>(Union<'a>); + .map(|s| s.index()); - let mut union = SendUnion(union); + let mut union = send_fst_ops::SendUnion::from_index_iter(index_iter); let mut buf = ZeroCopyBuf::::new_uninit(); let mut last_replication_index = 0; - while let Some((k, v)) = union.0.next() { + while let Some((k, v)) = union.next() { let page_no = u32::from_be_bytes(k.try_into().unwrap()); let v = v.iter().min_by_key(|i| i.index).unwrap(); let offset = v.value as u32; @@ -157,9 +153,30 @@ where buf = read_buf.into_inner(); } - //// todo: make async + // update the footer at the end of the db file. + let footer = LibsqlFooter { + magic: LIBSQL_MAGIC.into(), + version: LIBSQL_WAL_VERSION.into(), + replication_index: last_replication_index.into(), + log_id: log_id.as_u128().into(), + }; + + db_file.set_len(size_after as u64 * LIBSQL_PAGE_SIZE as u64)?; + + let footer_offset = size_after as usize * LIBSQL_PAGE_SIZE as usize; + let (_, ret) = db_file + .write_all_at_async(ZeroCopyBuf::new_init(footer), footer_offset as u64) + .await; + ret?; + + // todo: truncate if necessary + //// TODO: make async db_file.sync_all()?; + for seg in segs.iter() { + seg.destroy(io).await; + } + let mut current = self.head.compare_and_swap(&segs[0], None); if Arc::ptr_eq(&segs[0], current.as_ref().unwrap()) { // nothing to do @@ -180,12 +197,10 @@ where self.len.fetch_sub(segs.len(), Ordering::Relaxed); - db_file.set_len(size_after as u64 * 4096)?; - Ok(Some(last_replication_index)) } - /// returnsstream pages from the sealed segment list, and what's the lowest replication index + /// returns a stream of pages from the sealed segment list, and what's the lowest replication index /// that was covered. 
If the returned index is less than start frame_no, the missing frames /// must be read somewhere else. pub async fn stream_pages_from<'a>( @@ -225,7 +240,8 @@ where .max(until_fno); let stream = async_stream::try_stream! { - let mut union = fst::map::OpBuilder::from_iter(segments.iter().map(|s| s.index())).union(); + let index_iter = segments.iter().map(|s| s.index()); + let mut union = send_fst_ops::SendUnion::from_index_iter(index_iter); while let Some((key_bytes, indexes)) = union.next() { let page_no = u32::from_be_bytes(key_bytes.try_into().unwrap()); // we already have a more recent version of this page. @@ -321,6 +337,47 @@ impl List { } } +mod send_fst_ops { + use std::sync::Arc; + use std::ops::{Deref, DerefMut}; + + use fst::map::{OpBuilder, Union}; + + /// Safety: Union contains a Box that doesn't require Send, to it's not send. + /// That's an issue for us, but all the indexes we have are safe to send, so we're good. + /// FIXME: we could implement union ourselves. + unsafe impl Send for SendUnion<'_> {} + unsafe impl Sync for SendUnion<'_> {} + + #[repr(transparent)] + pub(super) struct SendUnion<'a>(Union<'a>); + + impl<'a> SendUnion<'a> { + pub fn from_index_iter(iter: I) -> Self + where + I: Iterator>>, + { + let op = iter.collect::().union(); + Self(op) + } + } + + impl<'a> Deref for SendUnion<'a> { + type Target = Union<'a>; + + fn deref(&self) -> &Self::Target { + &self.0 + } + } + + impl<'a> DerefMut for SendUnion<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } + } + +} + #[cfg(test)] mod test { use std::io::{Read, Seek, Write}; diff --git a/libsql-wal/src/segment/mod.rs b/libsql-wal/src/segment/mod.rs index 98d93bfe50..2dc6c7c1fb 100644 --- a/libsql-wal/src/segment/mod.rs +++ b/libsql-wal/src/segment/mod.rs @@ -15,12 +15,13 @@ use std::mem::size_of; use std::num::NonZeroU64; use std::sync::Arc; -use zerocopy::byteorder::little_endian::{U16, U32, U64}; +use zerocopy::byteorder::little_endian::{U16, U32, U64, U128}; use 
zerocopy::AsBytes; use crate::error::{Error, Result}; use crate::io::buf::IoBufMut; use crate::io::FileExt; +use crate::io::Io; use crate::LIBSQL_MAGIC; use crate::LIBSQL_PAGE_SIZE; @@ -62,6 +63,7 @@ pub struct SegmentHeader { /// right now we only support 4096, but if se decided to support other sizes, /// we could do it without changing the header pub page_size: U16, + pub log_id: U128, /// checksum of the header fields, excluding the checksum itself. This field must be the last pub header_cheksum: U32, @@ -167,6 +169,8 @@ pub trait Segment: Send + Sync + 'static { async fn read_frame_offset_async(&self, offset: u32, buf: B) -> (B, Result<()>) where B: IoBufMut + Send + 'static; + + fn destroy(&self, io: &IO) -> impl Future; } impl Segment for Arc { @@ -208,6 +212,10 @@ impl Segment for Arc { fn size_after(&self) -> u32 { self.as_ref().size_after() } + + fn destroy(&self, io: &IO) -> impl Future { + self.as_ref().destroy(io) + } } #[repr(C)] diff --git a/libsql-wal/src/segment/sealed.rs b/libsql-wal/src/segment/sealed.rs index 39f8cc9039..fa0cc3ce40 100644 --- a/libsql-wal/src/segment/sealed.rs +++ b/libsql-wal/src/segment/sealed.rs @@ -183,6 +183,14 @@ where fn size_after(&self) -> u32 { self.header().size_after() } + + fn destroy(&self, io: &IO) -> impl std::future::Future { + async move { + if let Err(e) = io.remove_file_async(&self.path).await { + tracing::error!("failed to remove segment file {:?}: {e}", self.path); + } + } + } } impl SealedSegment { @@ -202,6 +210,7 @@ impl SealedSegment { // This happens in case of crash: the segment is not empty, but it wasn't sealed. We need to // recover the index, and seal the segment. 
+ // FIXME: we have a bung here if !header.flags().contains(SegmentFlags::SEALED) { assert_eq!(header.index_offset.get(), 0); return Self::recover(file, path, header).map(Some); diff --git a/libsql-wal/src/shared_wal.rs b/libsql-wal/src/shared_wal.rs index 09a2747c5a..6661c3a2a1 100644 --- a/libsql-wal/src/shared_wal.rs +++ b/libsql-wal/src/shared_wal.rs @@ -1,5 +1,5 @@ use std::collections::BTreeMap; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; @@ -8,6 +8,7 @@ use crossbeam::deque::Injector; use crossbeam::sync::Unparker; use parking_lot::{Mutex, MutexGuard}; use tokio::sync::mpsc; +use uuid::Uuid; use crate::checkpointer::CheckpointMessage; use crate::error::{Error, Result}; @@ -20,7 +21,7 @@ use libsql_sys::name::NamespaceName; #[derive(Default)] pub struct WalLock { - pub(crate) tx_id: Arc>>, + pub(crate) tx_id: Arc>>, /// When a writer is popped from the write queue, its write transaction may not be reading from the most recent /// snapshot. In this case, we return `SQLITE_BUSY_SNAPHSOT` to the caller. 
If no reads were performed /// with that transaction before upgrading, then the caller will call us back immediately after re-acquiring @@ -51,13 +52,26 @@ pub struct SharedWal { pub(crate) stored_segments: Box, pub(crate) shutdown: AtomicBool, pub(crate) checkpoint_notifier: mpsc::Sender, + /// maximum size the segment is allowed to grow + pub(crate) max_segment_size: AtomicUsize, + pub(crate) io: Arc, } impl SharedWal { + #[tracing::instrument(skip(self), fields(namespace = self.namespace.as_str()))] pub fn shutdown(&self) -> Result<()> { + tracing::info!("started namespace shutdown"); self.shutdown.store(true, Ordering::SeqCst); - let mut tx = Transaction::Read(self.begin_read(u64::MAX)); - self.upgrade(&mut tx)?; + // fixme: for infinite loop + let mut tx = loop { + let mut tx = Transaction::Read(self.begin_read(u64::MAX)); + match self.upgrade(&mut tx) { + Ok(_) => break tx, + Err(Error::BusySnapshot) => continue, + Err(e) => return Err(e), + } + }; + { let mut tx = tx.as_write_mut().unwrap().lock(); tx.commit(); @@ -66,6 +80,7 @@ impl SharedWal { // The current segment will not be used anymore. It's empty, but we still seal it so that // the next startup doesn't find an unsealed segment. 
self.current.load().seal()?; + tracing::info!("namespace shutdown"); Ok(()) } @@ -73,6 +88,10 @@ impl SharedWal { self.current.load().db_size() } + pub fn log_id(&self) -> Uuid { + self.current.load().log_id() + } + #[tracing::instrument(skip_all)] pub fn begin_read(&self, conn_id: u64) -> ReadTransaction { // FIXME: this is not enough to just increment the counter, we must make sure that the segment @@ -101,41 +120,38 @@ impl SharedWal { match tx { Transaction::Write(_) => unreachable!("already in a write transaction"), Transaction::Read(read_tx) => { - { let mut reserved = self.wal_lock.reserved.lock(); match *reserved { // we have already reserved the slot, go ahead and try to acquire Some(id) if id == read_tx.conn_id => { tracing::trace!("taking reserved slot"); reserved.take(); - let lock = self.wal_lock.tx_id.lock(); + let lock = self.wal_lock.tx_id.lock_blocking(); + assert!(lock.is_none()); let write_tx = self.acquire_write(read_tx, lock, reserved)?; *tx = Transaction::Write(write_tx); return Ok(()); } + None => { + let lock = self.wal_lock.tx_id.lock_blocking(); + if lock.is_none() && self.wal_lock.waiters.is_empty() { + let write_tx = self.acquire_write(read_tx, lock, reserved)?; + *tx = Transaction::Write(write_tx); + return Ok(()); + } + } _ => (), } - } - let lock = self.wal_lock.tx_id.lock(); - match *lock { - None if self.wal_lock.waiters.is_empty() => { - let write_tx = - self.acquire_write(read_tx, lock, self.wal_lock.reserved.lock())?; - *tx = Transaction::Write(write_tx); - return Ok(()); - } - Some(_) | None => { - tracing::trace!( - "txn currently held by another connection, registering to wait queue" - ); - let parker = crossbeam::sync::Parker::new(); - let unparker = parker.unparker().clone(); - self.wal_lock.waiters.push((unparker, read_tx.conn_id)); - drop(lock); - parker.park(); - } - } + tracing::trace!( + "txn currently held by another connection, registering to wait queue" + ); + + let parker = crossbeam::sync::Parker::new(); + let 
unparker = parker.unparker().clone(); + self.wal_lock.waiters.push((unparker, read_tx.conn_id)); + drop(reserved); + parker.park(); } } } @@ -144,9 +160,11 @@ impl SharedWal { fn acquire_write( &self, read_tx: &ReadTransaction, - mut tx_id_lock: MutexGuard>, + mut tx_id_lock: async_lock::MutexGuard>, mut reserved: MutexGuard>, ) -> Result> { + assert!(reserved.is_none() || *reserved == Some(read_tx.conn_id)); + assert!(tx_id_lock.is_none()); // we read two fields in the header. There is no risk that a transaction commit in // between the two reads because this would require that: // 1) there would be a running txn @@ -230,21 +248,6 @@ impl SharedWal { } } - // The replication index from page 1 must match that of the SharedWal - #[cfg(debug_assertions)] - { - use libsql_sys::ffi::Sqlite3DbHeader; - use zerocopy::FromBytes; - - if page_no == 1 { - let header = Sqlite3DbHeader::read_from_prefix(buffer).unwrap(); - assert_eq!( - header.replication_index.get(), - self.checkpointed_frame_no.load(Ordering::Relaxed) - ); - } - } - tx.pages_read += 1; Ok(()) @@ -264,7 +267,7 @@ impl SharedWal { } // TODO: use config for max log size - if tx.is_commited() && current.count_committed() > 1000 { + if tx.is_commited() && current.count_committed() > self.max_segment_size.load(Ordering::Relaxed) { self.swap_current(&tx)?; } @@ -297,7 +300,7 @@ impl SharedWal { .current .load() .tail() - .checkpoint(&self.db_file, durable_frame_no) + .checkpoint(&self.db_file, durable_frame_no, self.log_id(), &self.io) .await?; if let Some(checkpointed_frame_no) = checkpointed_frame_no { self.checkpointed_frame_no diff --git a/libsql-wal/src/storage/async_storage.rs b/libsql-wal/src/storage/async_storage.rs index aca4cbe6c4..6b3f4d06a3 100644 --- a/libsql-wal/src/storage/async_storage.rs +++ b/libsql-wal/src/storage/async_storage.rs @@ -1,7 +1,6 @@ //! `AsyncStorage` is a `Storage` implementation that defer storage to a background thread. The //! durable frame_no is notified asynchronously. 
-use std::any::Any; use std::sync::Arc; use chrono::Utc; @@ -23,9 +22,9 @@ use super::{OnStoreCallback, RestoreOptions, Storage, StoreSegmentRequest}; /// /// On shutdown, attempts to empty the queue, and flush the receiver. When the last handle of the /// receiver is dropped, and the queue is empty, exit. -pub struct AsyncStorageLoop { - receiver: mpsc::UnboundedReceiver>, - scheduler: Scheduler, +pub struct AsyncStorageLoop { + receiver: mpsc::UnboundedReceiver>, + scheduler: Scheduler, backend: Arc, io: Arc, max_in_flight: usize, @@ -50,6 +49,7 @@ where pub async fn run(mut self) { let mut shutting_down = false; let mut in_flight_futs = JoinSet::new(); + let mut notify_shutdown = None; // run the loop until shutdown. loop { if shutting_down && self.scheduler.is_empty() { @@ -92,6 +92,11 @@ where Some(StorageLoopMessage::DurableFrameNoReq { namespace, ret, config_override }) => { self.fetch_durable_frame_no_async(namespace, ret, config_override); } + Some(StorageLoopMessage::Shutdown(ret)) => { + notify_shutdown.replace(ret); + shutting_down = true; + tracing::info!("Storage shutting down"); + } None => { shutting_down = true; } @@ -108,25 +113,23 @@ where } } } + + tracing::info!("Storage shutdown"); + if let Some(notify) = notify_shutdown { + let _ = notify.send(()); + } } fn fetch_durable_frame_no_async( &self, namespace: NamespaceName, ret: oneshot::Sender>, - config_override: Option>, + config_override: Option, ) { let backend = self.backend.clone(); - let config = match config_override - .map(|c| c.downcast::()) - .transpose() - { - Ok(Some(config)) => config, - Ok(None) => backend.default_config(), - Err(_) => { - let _ = ret.send(Err(super::Error::InvalidConfigType)); - return; - } + let config = match config_override { + Some(config) => config, + None => backend.default_config(), }; tokio::spawn(async move { @@ -147,18 +150,19 @@ pub struct BottomlessConfig { pub config: C, } -enum StorageLoopMessage { - StoreReq(StoreSegmentRequest), +enum 
StorageLoopMessage { + StoreReq(StoreSegmentRequest), DurableFrameNoReq { namespace: NamespaceName, - config_override: Option>, + config_override: Option, ret: oneshot::Sender>, }, + Shutdown(oneshot::Sender<()>), } -pub struct AsyncStorage { +pub struct AsyncStorage { /// send request to the main loop - job_sender: mpsc::UnboundedSender>, + job_sender: mpsc::UnboundedSender>, force_shutdown: oneshot::Sender<()>, backend: Arc, } @@ -171,22 +175,24 @@ where type Segment = S; type Config = B::Config; + async fn shutdown(&self) { + let (snd, rcv) = oneshot::channel(); + let _ = self.job_sender.send(StorageLoopMessage::Shutdown(snd)); + let _ = rcv.await; + } + fn store( &self, namespace: &NamespaceName, segment: Self::Segment, - config_override: Option>, + config_override: Option, on_store_callback: OnStoreCallback, ) { - fn into_any(t: Arc) -> Arc { - t - } - let req = StoreSegmentRequest { namespace: namespace.clone(), segment, created_at: Utc::now(), - storage_config_override: config_override.map(into_any), + storage_config_override: config_override, on_store_callback, }; @@ -198,7 +204,7 @@ where async fn durable_frame_no( &self, namespace: &NamespaceName, - config_override: Option>, + config_override: Option, ) -> u64 { let config = config_override.unwrap_or_else(|| self.backend.default_config()); let meta = self.backend.meta(&config, namespace).await.unwrap(); @@ -210,7 +216,7 @@ where file: impl crate::io::FileExt, namespace: &NamespaceName, restore_options: RestoreOptions, - config_override: Option>, + config_override: Option, ) -> super::Result<()> { let config = config_override.unwrap_or_else(|| self.backend.default_config()); self.backend @@ -221,7 +227,7 @@ where fn durable_frame_no_sync( &self, namespace: &NamespaceName, - config_override: Option>, + config_override: Option, ) -> u64 { tokio::runtime::Handle::current() .block_on(self.durable_frame_no(namespace, config_override)) @@ -231,7 +237,7 @@ where &self, namespace: &NamespaceName, frame_no: u64, - 
config_override: Option>, + config_override: Option, ) -> super::Result { let config = config_override.unwrap_or_else(|| self.backend.default_config()); let key = self @@ -245,7 +251,7 @@ where &self, namespace: &NamespaceName, key: &super::SegmentKey, - config_override: Option>, + config_override: Option, ) -> super::Result>> { let config = config_override.unwrap_or_else(|| self.backend.default_config()); let index = self @@ -259,7 +265,7 @@ where &self, namespace: &NamespaceName, key: &super::SegmentKey, - config_override: Option>, + config_override: Option, ) -> super::Result> { // TODO: make async let config = config_override.unwrap_or_else(|| self.backend.default_config()); @@ -270,6 +276,7 @@ where let segment = CompactedSegment::open(file).await?; Ok(segment) } + } pub struct AsyncStorageInitConfig { diff --git a/libsql-wal/src/storage/backend/mod.rs b/libsql-wal/src/storage/backend/mod.rs index 6c40903eb2..25cc57f1a0 100644 --- a/libsql-wal/src/storage/backend/mod.rs +++ b/libsql-wal/src/storage/backend/mod.rs @@ -31,7 +31,7 @@ pub struct DbMeta { pub trait Backend: Send + Sync + 'static { /// Config type associated with the Storage - type Config: Send + Sync + 'static; + type Config: Clone + Send + Sync + 'static; /// Store `segment_data` with its associated `meta` fn store( @@ -42,19 +42,19 @@ pub trait Backend: Send + Sync + 'static { segment_index: Vec, ) -> impl Future> + Send; - async fn find_segment( + fn find_segment( &self, config: &Self::Config, namespace: &NamespaceName, frame_no: u64, - ) -> Result; + ) -> impl Future> + Send; - async fn fetch_segment_index( + fn fetch_segment_index( &self, config: &Self::Config, namespace: &NamespaceName, key: &SegmentKey, - ) -> Result>>; + ) -> impl Future>>> + Send; /// Fetch a segment for `namespace` containing `frame_no`, and writes it to `dest`. 
async fn fetch_segment_data_to_file( @@ -67,12 +67,12 @@ pub trait Backend: Send + Sync + 'static { // this method taking self: Arc is an infortunate consequence of rust type system making // impl FileExt variant with all the arguments, with no escape hatch... - async fn fetch_segment_data( + fn fetch_segment_data( self: Arc, - config: Arc, + config: Self::Config, namespace: NamespaceName, key: SegmentKey, - ) -> Result; + ) -> impl Future> + Send; // /// Fetch a segment for `namespace` containing `frame_no`, and writes it to `dest`. async fn fetch_segment( @@ -99,7 +99,7 @@ pub trait Backend: Send + Sync + 'static { ) -> Result<()>; /// Returns the default configuration for this storage - fn default_config(&self) -> Arc; + fn default_config(&self) -> Self::Config; } impl Backend for Arc { @@ -132,7 +132,7 @@ impl Backend for Arc { self.as_ref().meta(config, namespace).await } - fn default_config(&self) -> Arc { + fn default_config(&self) -> Self::Config { self.as_ref().default_config() } @@ -184,7 +184,7 @@ impl Backend for Arc { async fn fetch_segment_data( self: Arc, - config: Arc, + config: Self::Config, namespace: NamespaceName, key: SegmentKey, ) -> Result { diff --git a/libsql-wal/src/storage/backend/s3.rs b/libsql-wal/src/storage/backend/s3.rs index ee09169961..811c84178b 100644 --- a/libsql-wal/src/storage/backend/s3.rs +++ b/libsql-wal/src/storage/backend/s3.rs @@ -334,7 +334,7 @@ impl Backend for S3Backend where IO: Io, { - type Config = S3Config; + type Config = Arc; async fn store( &self, @@ -425,7 +425,7 @@ where }) } - fn default_config(&self) -> Arc { + fn default_config(&self) -> Self::Config { self.default_config.clone() } @@ -489,7 +489,7 @@ where async fn fetch_segment_data( self: Arc, - config: Arc, + config: Self::Config, namespace: NamespaceName, key: SegmentKey, ) -> Result { @@ -650,11 +650,11 @@ mod tests { let dir = tempfile::tempdir().unwrap(); let (aws_config, _s3) = setup(&dir); - let s3_config = S3Config { + let s3_config = 
Arc::new(S3Config { bucket: "testbucket".into(), aws_config: aws_config.clone(), cluster_id: "123456789".into(), - }; + }); let storage = S3Backend::from_sdk_config_with_io( aws_config, diff --git a/libsql-wal/src/storage/job.rs b/libsql-wal/src/storage/job.rs index 2f4192b8df..42eaba9256 100644 --- a/libsql-wal/src/storage/job.rs +++ b/libsql-wal/src/storage/job.rs @@ -9,13 +9,13 @@ use crate::segment::Segment; /// A request, with an id #[derive(Debug)] -pub(crate) struct IndexedRequest { - pub(crate) request: StoreSegmentRequest, +pub(crate) struct IndexedRequest { + pub(crate) request: StoreSegmentRequest, pub(crate) id: u64, } -impl Deref for IndexedRequest { - type Target = StoreSegmentRequest; +impl Deref for IndexedRequest { + type Target = StoreSegmentRequest; fn deref(&self) -> &Self::Target { &self.request @@ -24,32 +24,21 @@ impl Deref for IndexedRequest { /// A storage Job to be performed #[derive(Debug)] -pub(crate) struct Job { +pub(crate) struct Job { /// Segment to store. // TODO: implement request batching (merge segment and send). - pub(crate) request: IndexedRequest, + pub(crate) request: IndexedRequest, } -// #[repr(transparent)] -// struct BytesLike(pub T); -// -// impl AsRef<[u8]> for BytesLike -// where -// T: AsBytes, -// { -// fn as_ref(&self) -> &[u8] { -// self.0.as_bytes() -// } -// } -// -impl Job +impl Job where Seg: Segment, + C: Clone, { /// Perform the job and return the JobResult. This is not allowed to panic. 
- pub(crate) async fn perform(self, backend: B, io: IO) -> JobResult + pub(crate) async fn perform(self, backend: B, io: IO) -> JobResult where - B: Backend, + B: Backend, IO: Io, { let result = self.try_perform(backend, io).await; @@ -58,13 +47,15 @@ where async fn try_perform(&self, backend: B, io: IO) -> Result where - B: Backend, + B: Backend, IO: Io, { let segment = &self.request.segment; let segment_id = io.uuid(); let tmp = io.tempfile()?; + tracing::debug!(namespace = self.request.namespace.as_str(), "sending segment to durable storage"); + let new_index = segment .compact(&tmp, segment_id) .await @@ -81,21 +72,25 @@ where .request .storage_config_override .clone() - .map(|c| c.downcast::()) - .transpose() - .map_err(|_| super::Error::InvalidConfigType)? .unwrap_or_else(|| backend.default_config()); backend.store(&config, meta, tmp, new_index).await?; + tracing::info!( + namespace = self.request.namespace.as_str(), + start_frame_no = segment.start_frame_no(), + end_frame_no = segment.last_committed(), + "stored segment" + ); + Ok(segment.last_committed()) } } #[derive(Debug)] -pub(crate) struct JobResult { +pub(crate) struct JobResult { /// The job that was performed - pub(crate) job: Job, + pub(crate) job: Job, /// The outcome of the job: the new durable index, or an error. 
pub(crate) result: Result, } @@ -421,6 +416,12 @@ mod test { { todo!() } + + fn destroy(&self, _io: &IO) -> impl std::future::Future { + async move { + todo!() + } + } } struct TestBackend; @@ -453,8 +454,8 @@ mod test { todo!() } - fn default_config(&self) -> Arc { - Arc::new(()) + fn default_config(&self) -> Self::Config { + () } async fn restore( @@ -497,7 +498,7 @@ mod test { async fn fetch_segment_data( self: Arc, - _config: Arc, + _config: Self::Config, _namespace: NamespaceName, _key: SegmentKey, ) -> Result { diff --git a/libsql-wal/src/storage/mod.rs b/libsql-wal/src/storage/mod.rs index 76347de96b..fdfe2e15e8 100644 --- a/libsql-wal/src/storage/mod.rs +++ b/libsql-wal/src/storage/mod.rs @@ -1,4 +1,3 @@ -use std::any::Any; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -11,7 +10,7 @@ use chrono::{DateTime, Utc}; use fst::Map; use hashbrown::HashMap; use libsql_sys::name::NamespaceName; -use parking_lot::Mutex; +use libsql_sys::wal::either::Either; use tempfile::{tempdir, TempDir}; use crate::io::{FileExt, Io, StdIO}; @@ -133,7 +132,7 @@ pub type OnStoreCallback = Box< pub trait Storage: Send + Sync + 'static { type Segment: Segment; - type Config; + type Config: Clone + Send; /// store the passed segment for `namespace`. This function is called in a context where /// blocking is acceptable. 
/// returns a future that resolves when the segment is stored @@ -142,20 +141,20 @@ pub trait Storage: Send + Sync + 'static { &self, namespace: &NamespaceName, seg: Self::Segment, - config_override: Option>, + config_override: Option, on_store: OnStoreCallback, ); fn durable_frame_no_sync( &self, namespace: &NamespaceName, - config_override: Option>, + config_override: Option, ) -> u64; async fn durable_frame_no( &self, namespace: &NamespaceName, - config_override: Option>, + config_override: Option, ) -> u64; async fn restore( @@ -163,29 +162,157 @@ pub trait Storage: Send + Sync + 'static { file: impl FileExt, namespace: &NamespaceName, restore_options: RestoreOptions, - config_override: Option>, + config_override: Option, ) -> Result<()>; - async fn find_segment( + fn find_segment( &self, namespace: &NamespaceName, frame_no: u64, - config_override: Option>, - ) -> Result; + config_override: Option, + ) -> impl Future> + Send; - async fn fetch_segment_index( + fn fetch_segment_index( &self, namespace: &NamespaceName, key: &SegmentKey, - config_override: Option>, - ) -> Result>>; + config_override: Option, + ) -> impl Future>>> + Send; - async fn fetch_segment_data( + fn fetch_segment_data( &self, namespace: &NamespaceName, key: &SegmentKey, - config_override: Option>, - ) -> Result>; + config_override: Option, + ) -> impl Future>> + Send; + + fn shutdown(&self) -> impl Future + Send { async { () } } +} + +/// special zip function for Either storage implementation +fn zip(x: &Either, y: Option>) -> Either<(&A, Option), (&B, Option)>{ + match (x, y) { + (Either::A(a), Some(Either::A(c))) => Either::A((a, Some(c))), + (Either::B(b), Some(Either::B(d))) => Either::B((b, Some(d))), + (Either::A(a), None) => Either::A((a, None)), + (Either::B(b), None) => Either::B((b, None)), + _ => panic!("incompatible options") + } +} + +impl Storage for Either +where A: Storage, + B: Storage, + S: Segment, +{ + type Segment = S; + type Config = Either; + + fn store( + &self, + 
namespace: &NamespaceName, + seg: Self::Segment, + config_override: Option, + on_store: OnStoreCallback, + ) { + + match zip(self, config_override) { + Either::A((s, c)) => s.store(namespace, seg, c, on_store), + Either::B((s, c)) => s.store(namespace, seg, c, on_store), + } + } + + fn durable_frame_no_sync( + &self, + namespace: &NamespaceName, + config_override: Option, + ) -> u64 { + match zip(self, config_override) { + Either::A((s, c)) => s.durable_frame_no_sync(namespace, c), + Either::B((s, c)) => s.durable_frame_no_sync(namespace, c), + } + } + + async fn durable_frame_no( + &self, + namespace: &NamespaceName, + config_override: Option, + ) -> u64 { + match zip(self, config_override) { + Either::A((s, c)) => s.durable_frame_no(namespace, c).await, + Either::B((s, c)) => s.durable_frame_no(namespace, c).await, + } + } + + async fn restore( + &self, + file: impl FileExt, + namespace: &NamespaceName, + restore_options: RestoreOptions, + config_override: Option, + ) -> Result<()> { + match zip(self, config_override) { + Either::A((s, c)) => s.restore(file, namespace, restore_options, c).await, + Either::B((s, c)) => s.restore(file, namespace, restore_options, c).await, + } + } + + fn find_segment( + &self, + namespace: &NamespaceName, + frame_no: u64, + config_override: Option, + ) -> impl Future> + Send { + async move { + match zip(self, config_override) { + Either::A((s, c)) => s.find_segment(namespace, frame_no, c).await, + Either::B((s, c)) => s.find_segment(namespace, frame_no, c).await, + } + } + } + + fn fetch_segment_index( + &self, + namespace: &NamespaceName, + key: &SegmentKey, + config_override: Option, + ) -> impl Future>>> + Send { + async move { + match zip(self, config_override) { + Either::A((s, c)) => s.fetch_segment_index(namespace, key, c).await, + Either::B((s, c)) => s.fetch_segment_index(namespace, key, c).await, + } + } + } + + fn fetch_segment_data( + &self, + namespace: &NamespaceName, + key: &SegmentKey, + config_override: Option, + ) 
-> impl Future>> + Send { + async move { + match zip(self, config_override) { + Either::A((s, c)) => { + let seg = s.fetch_segment_data(namespace, key, c).await?; + let seg = seg.remap_file_type(Either::A); + Ok(seg) + }, + Either::B((s, c)) => { + let seg = s.fetch_segment_data(namespace, key, c).await?; + let seg = seg.remap_file_type(Either::B); + Ok(seg) + }, + } + } + } + + async fn shutdown(&self) { + match self { + Either::A(a) => a.shutdown().await, + Either::B(b) => b.shutdown().await, + } + } } /// a placeholder storage that doesn't store segment @@ -200,7 +327,7 @@ impl Storage for NoStorage { &self, _namespace: &NamespaceName, _seg: Self::Segment, - _config: Option>, + _config: Option, _on_store: OnStoreCallback, ) { } @@ -208,7 +335,7 @@ impl Storage for NoStorage { async fn durable_frame_no( &self, namespace: &NamespaceName, - config: Option>, + config: Option, ) -> u64 { self.durable_frame_no_sync(namespace, config) } @@ -218,7 +345,7 @@ impl Storage for NoStorage { _file: impl FileExt, _namespace: &NamespaceName, _restore_options: RestoreOptions, - _config_override: Option>, + _config_override: Option, ) -> Result<()> { panic!("can restore from no storage") } @@ -226,7 +353,7 @@ impl Storage for NoStorage { fn durable_frame_no_sync( &self, _namespace: &NamespaceName, - _config_override: Option>, + _config_override: Option, ) -> u64 { u64::MAX } @@ -235,7 +362,7 @@ impl Storage for NoStorage { &self, _namespace: &NamespaceName, _frame_no: u64, - _config_override: Option>, + _config_override: Option, ) -> Result { unimplemented!() } @@ -244,7 +371,7 @@ impl Storage for NoStorage { &self, _namespace: &NamespaceName, _key: &SegmentKey, - _config_override: Option>, + _config_override: Option, ) -> Result>> { unimplemented!() } @@ -253,7 +380,7 @@ impl Storage for NoStorage { &self, _namespace: &NamespaceName, _key: &SegmentKey, - _config_override: Option>, + _config_override: Option, ) -> Result> { unimplemented!(); #[allow(unreachable_code)] @@ -264,7 
+391,7 @@ impl Storage for NoStorage { #[doc(hidden)] #[derive(Debug)] pub struct TestStorage { - inner: Arc>>, + inner: Arc>>, } #[derive(Debug)] @@ -318,10 +445,10 @@ impl Storage for TestStorage { &self, namespace: &NamespaceName, seg: Self::Segment, - _config: Option>, + _config: Option, on_store: OnStoreCallback, ) { - let mut inner = self.inner.lock(); + let mut inner = self.inner.lock_blocking(); if inner.store { let id = uuid::Uuid::new_v4(); let out_path = inner.dir.path().join(id.to_string()); @@ -347,7 +474,7 @@ impl Storage for TestStorage { async fn durable_frame_no( &self, namespace: &NamespaceName, - config: Option>, + config: Option, ) -> u64 { self.durable_frame_no_sync(namespace, config) } @@ -357,7 +484,7 @@ impl Storage for TestStorage { _file: impl FileExt, _namespace: &NamespaceName, _restore_options: RestoreOptions, - _config_override: Option>, + _config_override: Option, ) -> Result<()> { todo!(); } @@ -365,9 +492,9 @@ impl Storage for TestStorage { fn durable_frame_no_sync( &self, namespace: &NamespaceName, - _config_override: Option>, + _config_override: Option, ) -> u64 { - let inner = self.inner.lock(); + let inner = self.inner.lock_blocking(); if inner.store { let Some(segs) = inner.stored.get(namespace) else { return 0; @@ -382,9 +509,9 @@ impl Storage for TestStorage { &self, namespace: &NamespaceName, frame_no: u64, - _config_override: Option>, + _config_override: Option, ) -> Result { - let inner = self.inner.lock(); + let inner = self.inner.lock().await; if inner.store { if let Some(segs) = inner.stored.get(namespace) { let Some((key, _path)) = segs.iter().find(|(k, _)| k.includes(frame_no)) else { @@ -403,9 +530,9 @@ impl Storage for TestStorage { &self, namespace: &NamespaceName, key: &SegmentKey, - _config_override: Option>, + _config_override: Option, ) -> Result>> { - let inner = self.inner.lock(); + let inner = self.inner.lock().await; if inner.store { match inner.stored.get(namespace) { Some(segs) => 
Ok(segs.get(&key).unwrap().1.clone()), @@ -420,9 +547,9 @@ impl Storage for TestStorage { &self, namespace: &NamespaceName, key: &SegmentKey, - _config_override: Option>, + _config_override: Option, ) -> Result> { - let inner = self.inner.lock(); + let inner = self.inner.lock().await; if inner.store { match inner.stored.get(namespace) { Some(segs) => { @@ -438,7 +565,7 @@ impl Storage for TestStorage { } } -pub struct StoreSegmentRequest { +pub struct StoreSegmentRequest { namespace: NamespaceName, /// Path to the segment. Read-only for bottomless segment: S, @@ -447,12 +574,12 @@ pub struct StoreSegmentRequest { /// alternative configuration to use with the storage layer. /// e.g: S3 overrides - storage_config_override: Option>, + storage_config_override: Option, /// Called after the segment was stored, with the new durable index on_store_callback: OnStoreCallback, } -impl fmt::Debug for StoreSegmentRequest +impl fmt::Debug for StoreSegmentRequest where S: fmt::Debug, { diff --git a/libsql-wal/src/storage/scheduler.rs b/libsql-wal/src/storage/scheduler.rs index 8e07deaeb2..a9de9a746a 100644 --- a/libsql-wal/src/storage/scheduler.rs +++ b/libsql-wal/src/storage/scheduler.rs @@ -5,13 +5,13 @@ use super::job::{IndexedRequest, Job, JobResult}; use super::StoreSegmentRequest; use libsql_sys::name::NamespaceName; -struct NamespaceRequests { - requests: VecDeque>, +struct NamespaceRequests { + requests: VecDeque>, /// there's work in flight for this namespace in_flight: bool, } -impl Default for NamespaceRequests { +impl Default for NamespaceRequests { fn default() -> Self { Self { requests: Default::default(), @@ -28,14 +28,14 @@ impl Default for NamespaceRequests { /// processed, because only the most recent segment is checked for durability. This property /// ensures that all segments are present up to the max durable index. 
/// It is generic over C: the storage config type (for config overrides), and T, the segment type -pub(crate) struct Scheduler { +pub(crate) struct Scheduler { /// notify new durability index for namespace - requests: HashMap>, + requests: HashMap>, queue: priority_queue::PriorityQueue>, next_request_id: u64, } -impl Scheduler { +impl Scheduler { pub fn new() -> Self { Self { requests: Default::default(), @@ -46,7 +46,7 @@ impl Scheduler { /// Register a new request with the scheduler #[tracing::instrument(skip_all)] - pub fn register(&mut self, request: StoreSegmentRequest) { + pub fn register(&mut self, request: StoreSegmentRequest) { // invariant: new segment comes immediately after the latest segment for that namespace. This means: // - immediately after the last registered segment, if there is any // - immediately after the last durable index @@ -71,7 +71,7 @@ impl Scheduler { /// be scheduled, and returns description of the job to be performed. No other job for this /// namespace will be scheduled, until the `JobResult` is reported #[tracing::instrument(skip_all)] - pub fn schedule(&mut self) -> Option> { + pub fn schedule(&mut self) -> Option> { let (name, _) = self.queue.pop()?; let requests = self .requests @@ -90,7 +90,7 @@ impl Scheduler { /// Report the job result to the scheduler. 
If the job result was a success, the request as /// removed from the queue, else, the job is rescheduled #[tracing::instrument(skip_all, fields(req_id = result.job.request.id))] - pub async fn report(&mut self, result: JobResult) { + pub async fn report(&mut self, result: JobResult) { // re-schedule, or report new max durable frame_no for segment let name = result.job.request.request.namespace.clone(); let requests = self @@ -151,7 +151,7 @@ mod test { #[tokio::test] async fn schedule_simple() { - let mut scheduler = Scheduler::<()>::new(); + let mut scheduler = Scheduler::<(), ()>::new(); let ns1 = NamespaceName::from("test1"); let ns2 = NamespaceName::from("test2"); @@ -224,7 +224,7 @@ mod test { #[tokio::test] async fn job_error_reschedule() { - let mut scheduler = Scheduler::<()>::new(); + let mut scheduler = Scheduler::<(), ()>::new(); let ns1 = NamespaceName::from("test1"); let ns2 = NamespaceName::from("test2"); @@ -264,7 +264,7 @@ mod test { #[tokio::test] async fn schedule_while_in_flight() { - let mut scheduler = Scheduler::<()>::new(); + let mut scheduler = Scheduler::<(), ()>::new(); let ns1 = NamespaceName::from("test1"); diff --git a/libsql-wal/src/transaction.rs b/libsql-wal/src/transaction.rs index 723cffeae1..33ff9fcea1 100644 --- a/libsql-wal/src/transaction.rs +++ b/libsql-wal/src/transaction.rs @@ -4,7 +4,6 @@ use std::sync::Arc; use std::time::Instant; use libsql_sys::name::NamespaceName; -use parking_lot::{ArcMutexGuard, RawMutex}; use tokio::sync::mpsc; use crate::checkpointer::CheckpointMessage; @@ -31,6 +30,14 @@ impl Transaction { } } + pub fn into_write(self) -> Result, Self> { + if let Self::Write(v) = self { + Ok(v) + } else { + Err(self) + } + } + pub fn max_frame_no(&self) -> u64 { match self { Transaction::Write(w) => w.next_frame_no - 1, @@ -147,8 +154,27 @@ pub struct WriteTransaction { pub recompute_checksum: Option, } +pub struct TxGuardOwned { + _lock: async_lock::MutexGuardArc>, + inner: WriteTransaction, +} + +impl Deref for 
TxGuardOwned { + type Target = WriteTransaction; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for TxGuardOwned { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + pub struct TxGuard<'a, F> { - _lock: ArcMutexGuard>, + _lock: async_lock::MutexGuardArc>, inner: &'a mut WriteTransaction, } @@ -189,7 +215,7 @@ impl WriteTransaction { todo!("txn has already been commited"); } - let g = self.wal_lock.tx_id.lock_arc(); + let g = self.wal_lock.tx_id.lock_arc_blocking(); match *g { // we still hold the lock, we can proceed Some(id) if self.id == id => TxGuard { @@ -202,6 +228,25 @@ impl WriteTransaction { } } + pub fn into_lock_owned(self) -> TxGuardOwned { + if self.is_commited { + tracing::error!("transaction already commited"); + todo!("txn has already been commited"); + } + + let g = self.wal_lock.tx_id.lock_arc_blocking(); + match *g { + // we still hold the lock, we can proceed + Some(id) if self.id == id => TxGuardOwned { + _lock: g, + inner: self, + }, + // Somebody took the lock from us + Some(_) => todo!("lock stolen"), + None => todo!("not a transaction"), + } + } + pub fn reset(&mut self, savepoint_id: usize) { if savepoint_id >= self.savepoints.len() { unreachable!("savepoint doesn't exist"); @@ -231,7 +276,9 @@ impl WriteTransaction { let Self { wal_lock, read_tx, .. 
} = self; - let mut lock = wal_lock.tx_id.lock(); + // always acquire lock in this order: reserved, then tx_id + let mut reserved = wal_lock.reserved.lock(); + let mut lock = wal_lock.tx_id.lock_blocking(); match *lock { Some(lock_id) if lock_id == read_tx.id => { lock.take(); @@ -239,7 +286,7 @@ impl WriteTransaction { _ => (), } - if let Some(id) = *wal_lock.reserved.lock() { + if let Some(id) = *reserved { tracing::trace!("tx already reserved by {id}"); return read_tx; } @@ -252,7 +299,7 @@ impl WriteTransaction { } crossbeam::deque::Steal::Success((unparker, id)) => { tracing::trace!("waking up {id}"); - wal_lock.reserved.lock().replace(id); + reserved.replace(id); unparker.unpark(); break; } diff --git a/libsql-wal/src/wal.rs b/libsql-wal/src/wal.rs index 4cbf81b10e..079a7127b9 100644 --- a/libsql-wal/src/wal.rs +++ b/libsql-wal/src/wal.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use libsql_sys::name::NamespaceResolver; use libsql_sys::wal::{Wal, WalManager}; -use crate::io::Io; +use crate::io::{FileExt as _, Io}; use crate::registry::WalRegistry; use crate::segment::sealed::SealedSegment; use crate::shared_wal::SharedWal; diff --git a/libsql-wal/tests/flaky_fs.rs b/libsql-wal/tests/flaky_fs.rs index 9ccc48fd21..701671f03b 100644 --- a/libsql-wal/tests/flaky_fs.rs +++ b/libsql-wal/tests/flaky_fs.rs @@ -144,10 +144,6 @@ impl Io for FlakyIo { todo!() } - fn uuid(&self) -> uuid::Uuid { - todo!() - } - fn hard_link(&self, _src: &Path, _dst: &Path) -> std::io::Result<()> { todo!() } @@ -158,6 +154,14 @@ impl Io for FlakyIo { { f(&mut self.rng.lock()) } + + fn remove_file_async(&self, path: &Path) -> impl std::future::Future> + Send { + async move { + self.with_random_failure(|| { + std::fs::remove_file(path) + }) + } + } } macro_rules! 
assert_not_corrupt { diff --git a/libsql/Cargo.toml b/libsql/Cargo.toml index fa89cc68ad..3d65f71c73 100644 --- a/libsql/Cargo.toml +++ b/libsql/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql" -version = "0.5.0-alpha.2" +version = "0.5.0" edition = "2021" description = "libSQL library: the main gateway for interacting with the database" repository = "https://github.com/tursodatabase/libsql" @@ -11,7 +11,7 @@ tracing = { version = "0.1.37", default-features = false } thiserror = "1.0.40" futures = { version = "0.3.28", optional = true } -libsql-sys = { version = "0.6", path = "../libsql-sys", optional = true } +libsql-sys = { version = "0.7", path = "../libsql-sys", optional = true } libsql-hrana = { version = "0.2", path = "../libsql-hrana", optional = true } tokio = { version = "1.29.1", features = ["sync"], optional = true } tokio-util = { version = "0.7", features = ["io-util", "codec"], optional = true } @@ -20,7 +20,7 @@ hyper = { workspace = true, features = ["client", "stream"], optional = true } hyper-rustls = { version = "0.25", features = ["webpki-roots"], optional = true } base64 = { version = "0.21", optional = true } serde = { version = "1", features = ["derive"], optional = true } -serde_json = { version = "1", optional = true } +serde_json = { version = "1", features = ["float_roundtrip"], optional = true } async-trait = "0.1" bitflags = { version = "2.4.0", optional = true } tower = { workspace = true, features = ["util"], optional = true } @@ -37,10 +37,10 @@ tower-http = { version = "0.4.4", features = ["trace", "set-header", "util"], op http = { version = "0.2", optional = true } zerocopy = { version = "0.7.28", optional = true } -sqlite3-parser = { package = "libsql-sqlite3-parser", path = "../vendored/sqlite3-parser", version = "0.12", optional = true } +sqlite3-parser = { package = "libsql-sqlite3-parser", path = "../vendored/sqlite3-parser", version = "0.13", optional = true } fallible-iterator = { version = "0.3", optional = true } 
-libsql_replication = { version = "0.4", path = "../libsql-replication", optional = true } +libsql_replication = { version = "0.5", path = "../libsql-replication", optional = true } async-stream = { version = "0.3.5", optional = true } [dev-dependencies] diff --git a/libsql/src/de.rs b/libsql/src/de.rs index 63ee71f598..44f231c134 100644 --- a/libsql/src/de.rs +++ b/libsql/src/de.rs @@ -68,7 +68,7 @@ impl<'de> Deserializer<'de> for RowDeserializer<'de> { visitor.visit_map(RowMapAccess { row: self.row, - idx: 0..self.row.inner.column_count(), + idx: 0..(self.row.inner.column_count() as usize), value: None, }) } diff --git a/libsql/src/hrana/mod.rs b/libsql/src/hrana/mod.rs index 9befe549de..2bfd90c00d 100644 --- a/libsql/src/hrana/mod.rs +++ b/libsql/src/hrana/mod.rs @@ -24,7 +24,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use super::rows::{RowInner, RowsInner}; +use super::rows::{ColumnsInner, RowInner, RowsInner}; pub(crate) type Result = std::result::Result; @@ -36,9 +36,9 @@ struct Cookie { base_url: Option, } -pub trait HttpSend: Clone { - type Stream: Stream> + Unpin; - type Result: Future>; +pub trait HttpSend: Clone + Send + 'static { + type Stream: Stream> + Unpin + Send; + type Result: Future> + Send; fn http_send(&self, url: Arc, auth: Arc, body: String) -> Self::Result; /// Schedule sending a HTTP post request without waiting for the completion. 
@@ -261,7 +261,12 @@ where async fn next(&mut self) -> crate::Result> { self.next().await } +} +impl ColumnsInner for HranaRows +where + S: Stream> + Send + Sync + Unpin, +{ fn column_count(&self) -> i32 { self.column_count() } @@ -303,13 +308,6 @@ impl RowInner for Row { Ok(into_value2(v)) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.cols - .get(idx as usize) - .and_then(|c| c.name.as_ref()) - .map(|s| s.as_str()) - } - fn column_str(&self, idx: i32) -> crate::Result<&str> { if let Some(value) = self.inner.get(idx as usize) { if let proto::Value::Text { value } = value { @@ -321,6 +319,15 @@ impl RowInner for Row { Err(crate::Error::ColumnNotFound(idx)) } } +} + +impl ColumnsInner for Row { + fn column_name(&self, idx: i32) -> Option<&str> { + self.cols + .get(idx as usize) + .and_then(|c| c.name.as_ref()) + .map(|s| s.as_str()) + } fn column_type(&self, idx: i32) -> crate::Result { if let Some(value) = self.inner.get(idx as usize) { @@ -337,8 +344,8 @@ impl RowInner for Row { } } - fn column_count(&self) -> usize { - self.cols.len() + fn column_count(&self) -> i32 { + self.cols.len() as i32 } } @@ -417,7 +424,9 @@ impl RowsInner for StmtResultRows { inner: Box::new(row), })) } +} +impl ColumnsInner for StmtResultRows { fn column_count(&self) -> i32 { self.cols.len() as i32 } diff --git a/libsql/src/hrana/stream.rs b/libsql/src/hrana/stream.rs index b27e48145e..23f2bcc220 100644 --- a/libsql/src/hrana/stream.rs +++ b/libsql/src/hrana/stream.rs @@ -68,7 +68,7 @@ where auth_token, sql_id_generator: 0, baton: None, - }), + }).into(), }), } } @@ -287,9 +287,10 @@ where total_changes: AtomicU64, last_insert_rowid: AtomicI64, is_autocommit: AtomicBool, - stream: Mutex>, + stream: Arc>>, } + #[derive(Debug)] struct RawStream where @@ -401,6 +402,15 @@ where Ok(responses) } + async fn close_stream(&mut self) -> Result<()> { + self + .send_requests([ + StreamRequest::Close(CloseStreamReq {}), + ]) + .await?; + Ok(()) + } + async fn finalize(&mut self, req: 
StreamRequest) -> Result<(StreamResponse, bool)> { let [resp, get_autocommit, _] = self .send_requests([ diff --git a/libsql/src/local/impls.rs b/libsql/src/local/impls.rs index 8a9a5f440e..2338317a34 100644 --- a/libsql/src/local/impls.rs +++ b/libsql/src/local/impls.rs @@ -5,7 +5,7 @@ use crate::connection::BatchRows; use crate::{ connection::Conn, params::Params, - rows::{RowInner, RowsInner}, + rows::{ColumnsInner, RowInner, RowsInner}, statement::Stmt, transaction::Tx, Column, Connection, Result, Row, Rows, Statement, Transaction, TransactionBehavior, Value, @@ -159,7 +159,9 @@ impl RowsInner for LibsqlRows { Ok(row) } +} +impl ColumnsInner for LibsqlRows { fn column_count(&self) -> i32 { self.0.column_count() } @@ -180,20 +182,22 @@ impl RowInner for LibsqlRow { self.0.get_value(idx) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.0.column_name(idx) - } - fn column_str(&self, idx: i32) -> Result<&str> { self.0.get::<&str>(idx) } +} + +impl ColumnsInner for LibsqlRow { + fn column_name(&self, idx: i32) -> Option<&str> { + self.0.column_name(idx) + } fn column_type(&self, idx: i32) -> Result { self.0.column_type(idx).map(ValueType::from) } - fn column_count(&self) -> usize { - self.0.stmt.column_count() + fn column_count(&self) -> i32 { + self.0.stmt.column_count() as i32 } } diff --git a/libsql/src/local/rows.rs b/libsql/src/local/rows.rs index 7eb52d461b..4d4e622c75 100644 --- a/libsql/src/local/rows.rs +++ b/libsql/src/local/rows.rs @@ -1,6 +1,6 @@ use crate::local::{Connection, Statement}; use crate::params::Params; -use crate::rows::{RowInner, RowsInner}; +use crate::rows::{ColumnsInner, RowInner, RowsInner}; use crate::{errors, Error, Result}; use crate::{Value, ValueRef}; use libsql_sys::ValueType; @@ -213,7 +213,9 @@ impl RowsInner for BatchedRows { Ok(None) } } +} +impl ColumnsInner for BatchedRows { fn column_count(&self) -> i32 { self.cols.len() as i32 } @@ -244,10 +246,6 @@ impl RowInner for BatchedRow { 
.ok_or(Error::InvalidColumnIndex) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.cols.get(idx as usize).map(|c| c.0.as_str()) - } - fn column_str(&self, idx: i32) -> Result<&str> { self.row .get(idx as usize) @@ -258,9 +256,15 @@ impl RowInner for BatchedRow { .ok_or(Error::InvalidColumnType) }) } +} + +impl ColumnsInner for BatchedRow { + fn column_name(&self, idx: i32) -> Option<&str> { + self.cols.get(idx as usize).map(|c| c.0.as_str()) + } - fn column_count(&self) -> usize { - self.cols.len() + fn column_count(&self) -> i32 { + self.cols.len() as i32 } fn column_type(&self, idx: i32) -> Result { diff --git a/libsql/src/local/statement.rs b/libsql/src/local/statement.rs index 70116a152e..c28a66f18f 100644 --- a/libsql/src/local/statement.rs +++ b/libsql/src/local/statement.rs @@ -250,15 +250,15 @@ impl Statement { /// sure that current statement has already been stepped once before /// calling this method. pub fn column_names(&self) -> Vec<&str> { - let n = self.column_count(); - let mut cols = Vec::with_capacity(n); - for i in 0..n { - let s = self.column_name(i); - if let Some(s) = s { - cols.push(s); - } - } - cols + let n = self.column_count(); + let mut cols = Vec::with_capacity(n); + for i in 0..n { + let s = self.column_name(i); + if let Some(s) = s { + cols.push(s); + } + } + cols } /// Return the number of columns in the result set returned by the prepared @@ -314,12 +314,11 @@ impl Statement { /// the specified `name`. pub fn column_index(&self, name: &str) -> Result { let bytes = name.as_bytes(); - let n = self.column_count() as i32; + let n = self.column_count(); for i in 0..n { // Note: `column_name` is only fallible if `i` is out of bounds, // which we've already checked. 
let col_name = self - .inner .column_name(i) .ok_or_else(|| Error::InvalidColumnName(name.to_string()))?; if bytes.eq_ignore_ascii_case(col_name.as_bytes()) { diff --git a/libsql/src/replication/connection.rs b/libsql/src/replication/connection.rs index c82f523559..c720838798 100644 --- a/libsql/src/replication/connection.rs +++ b/libsql/src/replication/connection.rs @@ -11,7 +11,7 @@ use parking_lot::Mutex; use crate::parser; use crate::parser::StmtKind; -use crate::rows::{RowInner, RowsInner}; +use crate::rows::{ColumnsInner, RowInner, RowsInner}; use crate::statement::Stmt; use crate::transaction::Tx; use crate::{ @@ -780,7 +780,9 @@ impl RowsInner for RemoteRows { let row = RemoteRow(values, self.0.column_descriptions.clone()); Ok(Some(row).map(Box::new).map(|inner| Row { inner })) } +} +impl ColumnsInner for RemoteRows { fn column_count(&self) -> i32 { self.0.column_descriptions.len() as i32 } @@ -813,10 +815,6 @@ impl RowInner for RemoteRow { .ok_or(Error::InvalidColumnIndex) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.1.get(idx as usize).map(|s| s.name.as_str()) - } - fn column_str(&self, idx: i32) -> Result<&str> { let value = self.0.get(idx as usize).ok_or(Error::InvalidColumnIndex)?; @@ -825,6 +823,12 @@ impl RowInner for RemoteRow { _ => Err(Error::InvalidColumnType), } } +} + +impl ColumnsInner for RemoteRow { + fn column_name(&self, idx: i32) -> Option<&str> { + self.1.get(idx as usize).map(|s| s.name.as_str()) + } fn column_type(&self, idx: i32) -> Result { let col = self.1.get(idx as usize).unwrap(); @@ -835,8 +839,8 @@ impl RowInner for RemoteRow { .ok_or(Error::InvalidColumnType) } - fn column_count(&self) -> usize { - self.1.len() + fn column_count(&self) -> i32 { + self.1.len() as i32 } } diff --git a/libsql/src/replication/local_client.rs b/libsql/src/replication/local_client.rs index 2d7b940c92..d3c713f530 100644 --- a/libsql/src/replication/local_client.rs +++ b/libsql/src/replication/local_client.rs @@ -3,6 +3,7 @@ use 
std::pin::Pin; use futures::{StreamExt, TryStreamExt}; use libsql_replication::{ + rpc::replication::Frame as RpcFrame, frame::{Frame, FrameNo}, meta::WalIndexMeta, replicator::{Error, ReplicatorClient}, @@ -35,7 +36,7 @@ impl LocalClient { #[async_trait::async_trait] impl ReplicatorClient for LocalClient { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -46,7 +47,7 @@ impl ReplicatorClient for LocalClient { async fn next_frames(&mut self) -> Result { match self.frames.take() { Some(Frames::Vec(f)) => { - let iter = f.into_iter().map(Ok); + let iter = f.into_iter().map(|f| RpcFrame { data: f.bytes(), timestamp: None }).map(Ok); Ok(Box::pin(tokio_stream::iter(iter))) } Some(f @ Frames::Snapshot(_)) => { @@ -70,7 +71,8 @@ impl ReplicatorClient for LocalClient { if s.as_mut().peek().await.is_none() { next.header_mut().size_after = size_after.into(); } - yield Frame::from(next); + let frame = Frame::from(next); + yield RpcFrame { data: frame.bytes(), timestamp: None }; } }; @@ -95,8 +97,9 @@ impl ReplicatorClient for LocalClient { #[cfg(test)] mod test { - use libsql_replication::snapshot::SnapshotFile; + use libsql_replication::{frame::FrameHeader, snapshot::SnapshotFile}; use tempfile::tempdir; + use zerocopy::FromBytes; use super::*; @@ -111,7 +114,8 @@ mod test { let mut s = client.snapshot().await.unwrap(); assert!(matches!(s.next().await, Some(Ok(_)))); let last = s.next().await.unwrap().unwrap(); - assert_eq!(last.header().size_after.get(), 2); + let header: FrameHeader = FrameHeader::read_from_prefix(&last.data[..]).unwrap(); + assert_eq!(header.size_after.get(), 2); assert!(s.next().await.is_none()); } } diff --git a/libsql/src/replication/mod.rs b/libsql/src/replication/mod.rs index 69cc0b5db2..2f4e9b49c0 100644 --- a/libsql/src/replication/mod.rs +++ b/libsql/src/replication/mod.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use 
std::time::Duration; pub use libsql_replication::frame::{Frame, FrameNo}; +use libsql_replication::injector::SqliteInjector; use libsql_replication::replicator::{Either, Replicator}; pub use libsql_replication::snapshot::SnapshotFile; @@ -129,7 +130,7 @@ impl Writer { #[derive(Clone)] pub(crate) struct EmbeddedReplicator { - replicator: Arc>>>, + replicator: Arc, SqliteInjector>>>, bg_abort: Option>, last_frames_synced: Arc, } @@ -149,7 +150,7 @@ impl EmbeddedReplicator { perodic_sync: Option, ) -> Result { let replicator = Arc::new(Mutex::new( - Replicator::new( + Replicator::new_sqlite( Either::Left(client), db_path, auto_checkpoint, @@ -193,7 +194,7 @@ impl EmbeddedReplicator { encryption_config: Option, ) -> Result { let replicator = Arc::new(Mutex::new( - Replicator::new( + Replicator::new_sqlite( Either::Right(client), db_path, auto_checkpoint, diff --git a/libsql/src/replication/remote_client.rs b/libsql/src/replication/remote_client.rs index dbab056938..864392ddb5 100644 --- a/libsql/src/replication/remote_client.rs +++ b/libsql/src/replication/remote_client.rs @@ -4,12 +4,12 @@ use std::pin::Pin; use std::time::{Duration, Instant}; use bytes::Bytes; -use futures::StreamExt as _; -use libsql_replication::frame::{Frame, FrameHeader, FrameNo}; +use futures::{StreamExt as _, TryStreamExt}; +use libsql_replication::frame::{FrameHeader, FrameNo}; use libsql_replication::meta::WalIndexMeta; -use libsql_replication::replicator::{map_frame_err, Error, ReplicatorClient}; +use libsql_replication::replicator::{Error, ReplicatorClient}; use libsql_replication::rpc::replication::{ - verify_session_token, Frames, HelloRequest, HelloResponse, LogOffset, SESSION_TOKEN_KEY, + Frame as RpcFrame, verify_session_token, Frames, HelloRequest, HelloResponse, LogOffset, SESSION_TOKEN_KEY, }; use tokio_stream::Stream; use tonic::metadata::AsciiMetadataValue; @@ -119,6 +119,7 @@ impl RemoteClient { let hello_req = self.make_request(HelloRequest::new()); let log_offset_req = 
self.make_request(LogOffset { next_offset: self.next_offset(), + wal_flavor: None, }); let mut client_clone = self.remote.clone(); let hello_fut = time(async { @@ -135,7 +136,7 @@ impl RemoteClient { (hello_fut.await, None) }; self.prefetched_batch_log_entries = if let Ok(true) = hello.0 { - tracing::warn!( + tracing::debug!( "Frames prefetching failed because of new session token returned by handshake" ); None @@ -160,7 +161,7 @@ impl RemoteClient { let frames_iter = frames .into_iter() - .map(|f| Frame::try_from(&*f.data).map_err(|e| Error::Client(e.into()))); + .map(Ok); let stream = tokio_stream::iter(frames_iter); @@ -178,6 +179,7 @@ impl RemoteClient { None => { let req = self.make_request(LogOffset { next_offset: self.next_offset(), + wal_flavor: None, }); time(self.remote.replication.batch_log_entries(req)).await } @@ -189,6 +191,7 @@ impl RemoteClient { async fn do_snapshot(&mut self) -> Result<::FrameStream, Error> { let req = self.make_request(LogOffset { next_offset: self.next_offset(), + wal_flavor: None, }); let mut frames = self .remote @@ -196,7 +199,7 @@ impl RemoteClient { .snapshot(req) .await? 
.into_inner() - .map(map_frame_err) + .map_err(|e| e.into()) .peekable(); { @@ -204,7 +207,8 @@ impl RemoteClient { // the first frame is the one with the highest frame_no in the snapshot if let Some(Ok(f)) = frames.peek().await { - self.last_received = Some(f.header().frame_no.get()); + let header: FrameHeader = FrameHeader::read_from_prefix(&f.data[..]).unwrap(); + self.last_received = Some(header.frame_no.get()); } } @@ -239,7 +243,7 @@ fn maybe_log( #[async_trait::async_trait] impl ReplicatorClient for RemoteClient { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { diff --git a/libsql/src/rows.rs b/libsql/src/rows.rs index b97aeac203..a10d82b827 100644 --- a/libsql/src/rows.rs +++ b/libsql/src/rows.rs @@ -38,14 +38,8 @@ impl Column<'_> { } #[async_trait::async_trait] -pub(crate) trait RowsInner { +pub(crate) trait RowsInner: ColumnsInner { async fn next(&mut self) -> Result>; - - fn column_count(&self) -> i32; - - fn column_name(&self, idx: i32) -> Option<&str>; - - fn column_type(&self, idx: i32) -> Result; } /// A set of rows returned from a connection. @@ -131,7 +125,7 @@ impl Row { } /// Get the count of columns in this set of rows. 
- pub fn column_count(&self) -> usize { + pub fn column_count(&self) -> i32 { self.inner.column_count() } @@ -284,12 +278,15 @@ where } impl Sealed for Option {} -pub(crate) trait RowInner: fmt::Debug { - fn column_value(&self, idx: i32) -> Result; - fn column_str(&self, idx: i32) -> Result<&str>; +pub(crate) trait ColumnsInner { fn column_name(&self, idx: i32) -> Option<&str>; fn column_type(&self, idx: i32) -> Result; - fn column_count(&self) -> usize; + fn column_count(&self) -> i32; +} + +pub(crate) trait RowInner: ColumnsInner + fmt::Debug { + fn column_value(&self, idx: i32) -> Result; + fn column_str(&self, idx: i32) -> Result<&str>; } mod sealed { diff --git a/libsql/tests/integration_tests.rs b/libsql/tests/integration_tests.rs index 0f8e575949..cdb0a985c3 100644 --- a/libsql/tests/integration_tests.rs +++ b/libsql/tests/integration_tests.rs @@ -596,6 +596,26 @@ async fn debug_print_row() { ); } +#[tokio::test] +async fn fts5_invalid_tokenizer() { + let db = Database::open(":memory:").unwrap(); + let conn = db.connect().unwrap(); + assert!(conn + .execute( + "CREATE VIRTUAL TABLE t USING fts5(s, tokenize='trigram case_sensitive ')", + (), + ) + .await + .is_err()); + assert!(conn + .execute( + "CREATE VIRTUAL TABLE t USING fts5(s, tokenize='trigram remove_diacritics ')", + (), + ) + .await + .is_err()); +} + #[cfg(feature = "serde")] #[tokio::test] async fn deserialize_row() { diff --git a/vendored/rusqlite/Cargo.toml b/vendored/rusqlite/Cargo.toml index 2d332f3279..d9fbcc525e 100644 --- a/vendored/rusqlite/Cargo.toml +++ b/vendored/rusqlite/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "libsql-rusqlite" # Note: Update version in README.md when you change this. 
-version = "0.31.0" +version = "0.32.0" authors = ["The rusqlite developers"] edition = "2018" description = "Ergonomic wrapper for SQLite (libsql fork)" @@ -109,7 +109,7 @@ fallible-iterator = "0.2" fallible-streaming-iterator = "0.1" uuid = { version = "1.0", optional = true } smallvec = "1.6.1" -libsql-ffi = { version = "0.3", path = "../../libsql-ffi" } +libsql-ffi = { version = "0.4", path = "../../libsql-ffi" } [dev-dependencies] doc-comment = "0.3" diff --git a/vendored/sqlite3-parser/Cargo.toml b/vendored/sqlite3-parser/Cargo.toml index 5ed9e31f4d..0381ac1d99 100644 --- a/vendored/sqlite3-parser/Cargo.toml +++ b/vendored/sqlite3-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql-sqlite3-parser" -version = "0.12.0" +version = "0.13.0" edition = "2021" authors = ["gwenn"] description = "SQL parser (as understood by SQLite) (libsql fork)"