diff --git a/.github/templates/libsql-server-release-build-setup.yml b/.github/templates/libsql-server-release-build-setup.yml new file mode 100644 index 0000000000..997e70066d --- /dev/null +++ b/.github/templates/libsql-server-release-build-setup.yml @@ -0,0 +1,2 @@ +- name: Prepare env vars + run: echo "RUSTFLAGS=--cfg tokio_unstable" >> $GITHUB_ENV diff --git a/.github/workflows/libsql-server-release.yml b/.github/workflows/libsql-server-release.yml index 919d6fa179..db15bfc641 100644 --- a/.github/workflows/libsql-server-release.yml +++ b/.github/workflows/libsql-server-release.yml @@ -12,9 +12,8 @@ # title/body based on your changelogs. name: Release - permissions: - contents: write + "contents": "write" # This task will run whenever you push a git tag that looks like a version # like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc. @@ -38,15 +37,15 @@ permissions: # If there's a prerelease-style suffix to the version, then the release(s) # will be marked as a prerelease. on: + pull_request: push: tags: - 'libsql-server**[0-9]+.[0-9]+.[0-9]+*' - pull_request: jobs: # Run 'cargo dist plan' (or host) to determine what tasks we need to do plan: - runs-on: ubuntu-latest + runs-on: "ubuntu-20.04" outputs: val: ${{ steps.plan.outputs.manifest }} tag: ${{ !github.event.pull_request && github.ref_name || '' }} @@ -62,7 +61,12 @@ jobs: # we specify bash to get pipefail; it guards against the `curl` command # failing. otherwise `sh` won't catch that `curl` returned non-0 shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.14.1/cargo-dist-installer.sh | sh" + run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.21.0/cargo-dist-installer.sh | sh" + - name: Cache cargo-dist + uses: actions/upload-artifact@v4 + with: + name: cargo-dist-cache + path: ~/.cargo/bin/cargo-dist # sure would be cool if github gave us proper conditionals... 
# so here's a doubly-nested ternary-via-truthiness to try to provide the best possible # functionality based on whether this is a pull_request, and whether it's from a fork. @@ -111,9 +115,8 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - uses: swatinem/rust-cache@v2 - with: - key: ${{ join(matrix.targets, '-') }} + - name: "Prepare env vars" + run: "echo \"RUSTFLAGS=--cfg tokio_unstable\" >> $GITHUB_ENV" - name: Install cargo-dist run: ${{ matrix.install_dist }} # Get the dist-manifest @@ -165,9 +168,12 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: Install cargo-dist - shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.14.1/cargo-dist-installer.sh | sh" + - name: Install cached cargo-dist + uses: actions/download-artifact@v4 + with: + name: cargo-dist-cache + path: ~/.cargo/bin/ + - run: chmod +x ~/.cargo/bin/cargo-dist # Get all the local artifacts for the global tasks to use (for e.g. 
checksums) - name: Fetch local artifacts uses: actions/download-artifact@v4 @@ -211,8 +217,12 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: Install cargo-dist - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.14.1/cargo-dist-installer.sh | sh" + - name: Install cached cargo-dist + uses: actions/download-artifact@v4 + with: + name: cargo-dist-cache + path: ~/.cargo/bin/ + - run: chmod +x ~/.cargo/bin/cargo-dist # Fetch artifacts from scratch-storage - name: Fetch artifacts uses: actions/download-artifact@v4 @@ -220,7 +230,6 @@ jobs: pattern: artifacts-* path: target/distrib/ merge-multiple: true - # This is a harmless no-op for GitHub Releases, hosting for that happens in "announce" - id: host shell: bash run: | @@ -234,6 +243,28 @@ jobs: # Overwrite the previous copy name: artifacts-dist-manifest path: dist-manifest.json + # Create a GitHub Release while uploading all files to it + - name: "Download GitHub Artifacts" + uses: actions/download-artifact@v4 + with: + pattern: artifacts-* + path: artifacts + merge-multiple: true + - name: Cleanup + run: | + # Remove the granular manifests + rm -f artifacts/*-dist-manifest.json + - name: Create GitHub Release + env: + PRERELEASE_FLAG: "${{ fromJson(steps.host.outputs.manifest).announcement_is_prerelease && '--prerelease' || '' }}" + ANNOUNCEMENT_TITLE: "${{ fromJson(steps.host.outputs.manifest).announcement_title }}" + ANNOUNCEMENT_BODY: "${{ fromJson(steps.host.outputs.manifest).announcement_github_body }}" + RELEASE_COMMIT: "${{ github.sha }}" + run: | + # Write and read notes from a file to avoid quoting breaking things + echo "$ANNOUNCEMENT_BODY" > $RUNNER_TEMP/notes.txt + + gh release create "${{ needs.plan.outputs.tag }}" --target "$RELEASE_COMMIT" $PRERELEASE_FLAG --title "$ANNOUNCEMENT_TITLE" --notes-file "$RUNNER_TEMP/notes.txt" artifacts/* publish-homebrew-formula: needs: @@ -275,7 +306,6 @@ jobs: done git push - # Create a 
GitHub Release while uploading all files to it announce: needs: - plan @@ -292,21 +322,3 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: "Download GitHub Artifacts" - uses: actions/download-artifact@v4 - with: - pattern: artifacts-* - path: artifacts - merge-multiple: true - - name: Cleanup - run: | - # Remove the granular manifests - rm -f artifacts/*-dist-manifest.json - - name: Create GitHub Release - uses: ncipollo/release-action@v1 - with: - tag: ${{ needs.plan.outputs.tag }} - name: ${{ fromJson(needs.host.outputs.val).announcement_title }} - body: ${{ fromJson(needs.host.outputs.val).announcement_github_body }} - prerelease: ${{ fromJson(needs.host.outputs.val).announcement_is_prerelease }} - artifacts: "artifacts/*" diff --git a/.github/workflows/nemesis.yml b/.github/workflows/nemesis.yml index 9cfdbc39b0..090f2626b8 100644 --- a/.github/workflows/nemesis.yml +++ b/.github/workflows/nemesis.yml @@ -18,7 +18,7 @@ jobs: if: github.repository == 'tursodatabase/libsql' name: Run Nemesis Tests env: - RUSTFLAGS: -D warnings + RUSTFLAGS: -D warnings --cfg tokio_unstable steps: - uses: hecrj/setup-rust-action@v2 diff --git a/.github/workflows/publish-server.yml b/.github/workflows/publish-server.yml index 10820457b8..e1973fe47c 100644 --- a/.github/workflows/publish-server.yml +++ b/.github/workflows/publish-server.yml @@ -118,23 +118,9 @@ jobs: context: . 
platforms: ${{ env.platform }} labels: ${{ steps.meta.outputs.labels }} - outputs: type=image,name=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-debug,push-by-digest=true,name-canonical=true,push=true build-args: | BUILD_DEBUG=true - - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: digests-debug-${{ env.PLATFORM_PAIR }} - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 build-arm64: permissions: write-all diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 26eaba46cf..7036c6cf2f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -23,7 +23,7 @@ jobs: runs-on: ubuntu-latest name: Run Checks env: - RUSTFLAGS: -D warnings + RUSTFLAGS: -D warnings --cfg tokio_unstable steps: - uses: hecrj/setup-rust-action@v2 @@ -80,15 +80,15 @@ jobs: - uses: taiki-e/install-action@cargo-udeps - uses: Swatinem/rust-cache@v2 - run: cargo +nightly hack udeps -p libsql --each-feature - - run: RUSTFLAGS="-D warnings" cargo check -p libsql --no-default-features --features core - - run: RUSTFLAGS="-D warnings" cargo check -p libsql --no-default-features --features replication - - run: RUSTFLAGS="-D warnings" cargo check -p libsql --no-default-features --features remote + - run: RUSTFLAGS="-D warnings --cfg tokio_unstable" cargo check -p libsql --no-default-features --features core + - run: RUSTFLAGS="-D warnings --cfg tokio_unstable" cargo check -p libsql --no-default-features --features replication + - run: RUSTFLAGS="-D warnings --cfg tokio_unstable" cargo check -p libsql --no-default-features --features remote test: runs-on: ubuntu-latest name: Run Tests env: - RUSTFLAGS: -D warnings + RUSTFLAGS: -D warnings --cfg tokio_unstable steps: - uses: 
hecrj/setup-rust-action@v2 @@ -159,8 +159,8 @@ jobs: target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo- - - name: check libsql remote - run: cargo check -p libsql --no-default-features -F remote + - name: build libsql all features + run: cargo build -p libsql --all-features # test-rust-wasm: # runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 04ae728065..483f703ed6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3430,7 +3430,7 @@ dependencies = [ [[package]] name = "libsql" -version = "0.5.0-alpha.2" +version = "0.5.1" dependencies = [ "anyhow", "async-stream", @@ -3488,7 +3488,7 @@ dependencies = [ [[package]] name = "libsql-ffi" -version = "0.3.0" +version = "0.4.0" dependencies = [ "bindgen 0.66.1", "cc", @@ -3508,7 +3508,7 @@ dependencies = [ [[package]] name = "libsql-rusqlite" -version = "0.31.0" +version = "0.32.0" dependencies = [ "bencher", "bitflags 2.6.0", @@ -3532,7 +3532,7 @@ dependencies = [ [[package]] name = "libsql-server" -version = "0.24.18" +version = "0.24.19" dependencies = [ "aes", "anyhow", @@ -3631,7 +3631,7 @@ dependencies = [ [[package]] name = "libsql-sqlite3-parser" -version = "0.12.0" +version = "0.13.0" dependencies = [ "bitflags 2.6.0", "cc", @@ -3687,7 +3687,7 @@ dependencies = [ [[package]] name = "libsql-sys" -version = "0.6.0" +version = "0.7.0" dependencies = [ "bytes", "libsql-ffi", @@ -3702,6 +3702,7 @@ name = "libsql-wal" version = "0.1.0" dependencies = [ "arc-swap", + "async-lock 3.4.0", "async-stream", "aws-config 1.5.4", "aws-credential-types 1.2.0", @@ -3768,7 +3769,7 @@ dependencies = [ [[package]] name = "libsql_replication" -version = "0.4.0" +version = "0.5.0" dependencies = [ "aes", "arbitrary", @@ -3779,6 +3780,7 @@ dependencies = [ "cbc", "libsql-rusqlite", "libsql-sys", + "libsql-wal", "parking_lot", "prost", "prost-build", diff --git a/Cargo.toml b/Cargo.toml index 92487ecdd0..94c851721f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 
@@ codegen-units = 1 panic = "unwind" [workspace.dependencies] -rusqlite = { package = "libsql-rusqlite", path = "vendored/rusqlite", version = "0.31", default-features = false, features = [ +rusqlite = { package = "libsql-rusqlite", path = "vendored/rusqlite", version = "0.32", default-features = false, features = [ "libsql-experimental", "column_decltype", "load_extension", @@ -45,11 +45,12 @@ rusqlite = { package = "libsql-rusqlite", path = "vendored/rusqlite", version = ] } hyper = { version = "0.14" } tower = { version = "0.4.13" } +zerocopy = { version = "0.7.32", features = ["derive", "alloc"] } # Config for 'cargo dist' [workspace.metadata.dist] # The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax) -cargo-dist-version = "0.14.1" +cargo-dist-version = "0.21.0" # CI backends to support ci = "github" # The installers to generate for each app @@ -64,12 +65,16 @@ targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-da publish-jobs = ["homebrew"] # Whether cargo-dist should create a Github Release or use an existing draft create-release = true -# Publish jobs to run in CI +# Which actions to run on pull requests pr-run-mode = "plan" # A prefix git tags must include for cargo-dist to care about them tag-namespace = "libsql-server" # Whether to install an updater program install-updater = false +# additional setup steps +github-build-setup = "../templates/libsql-server-release-build-setup.yml" +# Path that installers should place binaries in +install-path = "CARGO_HOME" [workspace.metadata.dist.github-custom-runners] aarch64-apple-darwin = "macos-14" diff --git a/bindings/c/include/libsql.h b/bindings/c/include/libsql.h index 8178980466..7fdfb4b3c0 100644 --- a/bindings/c/include/libsql.h +++ b/bindings/c/include/libsql.h @@ -27,6 +27,11 @@ typedef struct libsql_stmt libsql_stmt; typedef const libsql_database *libsql_database_t; +typedef struct { + int frame_no; + int frames_synced; +} replicated; + typedef struct { 
const char *db_path; const char *primary_url; @@ -58,6 +63,8 @@ extern "C" { int libsql_sync(libsql_database_t db, const char **out_err_msg); +int libsql_sync2(libsql_database_t db, replicated *out_replicated, const char **out_err_msg); + int libsql_open_sync(const char *db_path, const char *primary_url, const char *auth_token, diff --git a/bindings/c/src/lib.rs b/bindings/c/src/lib.rs index 96e4effd2a..6cb1dc096e 100644 --- a/bindings/c/src/lib.rs +++ b/bindings/c/src/lib.rs @@ -11,7 +11,7 @@ use tokio::runtime::Runtime; use types::{ blob, libsql_connection, libsql_connection_t, libsql_database, libsql_database_t, libsql_row, libsql_row_t, libsql_rows, libsql_rows_future_t, libsql_rows_t, libsql_stmt, libsql_stmt_t, - stmt, + replicated, stmt, }; lazy_static! { @@ -46,6 +46,29 @@ pub unsafe extern "C" fn libsql_sync( } } +#[no_mangle] +pub unsafe extern "C" fn libsql_sync2( + db: libsql_database_t, + out_replicated: *mut replicated, + out_err_msg: *mut *const std::ffi::c_char, +) -> std::ffi::c_int { + let db = db.get_ref(); + match RT.block_on(db.sync()) { + Ok(replicated) => { + if !out_replicated.is_null() { + (*out_replicated).frame_no = replicated.frame_no().unwrap_or(0) as i32; + (*out_replicated).frames_synced = replicated.frames_synced() as i32; + } + + 0 + } + Err(e) => { + set_err_msg(format!("Error syncing database: {e}"), out_err_msg); + 1 + } + } +} + #[no_mangle] pub unsafe extern "C" fn libsql_open_sync( db_path: *const std::ffi::c_char, diff --git a/bindings/c/src/types.rs b/bindings/c/src/types.rs index 9f818e28d4..5d9f0b517f 100644 --- a/bindings/c/src/types.rs +++ b/bindings/c/src/types.rs @@ -115,6 +115,12 @@ impl From<&mut libsql_connection> for libsql_connection_t { } } +#[repr(C)] +pub struct replicated { + pub frame_no: std::ffi::c_int, + pub frames_synced: std::ffi::c_int, +} + pub struct stmt { pub stmt: libsql::Statement, pub params: Vec, diff --git a/bottomless/src/replicator.rs b/bottomless/src/replicator.rs index 
f2ef812f75..cd37a70165 100644 --- a/bottomless/src/replicator.rs +++ b/bottomless/src/replicator.rs @@ -17,6 +17,8 @@ use aws_sdk_s3::primitives::ByteStream; use aws_sdk_s3::{Client, Config}; use bytes::{Buf, Bytes}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; +use libsql_replication::injector::Injector as _; +use libsql_replication::rpc::replication::Frame as RpcFrame; use libsql_sys::{Cipher, EncryptionConfig}; use std::ops::Deref; use std::path::{Path, PathBuf}; @@ -1449,12 +1451,13 @@ impl Replicator { db_path: &Path, ) -> Result { let encryption_config = self.encryption_config.clone(); - let mut injector = libsql_replication::injector::Injector::new( - db_path, + let mut injector = libsql_replication::injector::SqliteInjector::new( + db_path.to_path_buf(), 4096, libsql_sys::connection::NO_AUTOCHECKPOINT, encryption_config, - )?; + ) + .await?; let prefix = format!("{}-{}/", self.db_name, generation); let mut page_buf = { let mut v = Vec::with_capacity(page_size); @@ -1552,7 +1555,11 @@ impl Replicator { }, page_buf.as_slice(), ); - injector.inject_frame(frame_to_inject)?; + let frame = RpcFrame { + data: frame_to_inject.bytes(), + timestamp: None, + }; + injector.inject_frame(frame).await?; applied_wal_frame = true; } } diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index f9d03fa508..96e2089e98 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -237,6 +237,12 @@ For example, if you have the following entries in your `/etc/hosts` file: You can access `db1` with the `http://db1.local:8080`URL and `db2` with `http://db2.local:8080`. 
The database files for the databases are stored in `/dbs/db1` and `idxType==SQLITE_IDXTYPE_PRIMARYKEY) @@ -19305,10 +19307,7 @@ struct Index { #define IsUniqueIndex(X) ((X)->onError!=OE_None) /* Return true if index X is a vector index */ -#define IsVectorIndex(X) ((X)->idxType==SQLITE_IDXTYPE_VECTOR) - -/* Return true if index X is an user defined index (APPDEF or VECTOR) */ -#define IsAppDefIndex(X) ((X)->idxType==SQLITE_IDXTYPE_APPDEF||(X)->idxType==SQLITE_IDXTYPE_VECTOR) +#define IsVectorIndex(X) ((X)->idxIsVector==1) /* The Index.aiColumn[] values are normally positive integer. But ** there are some negative values that have special meaning: @@ -85239,14 +85238,45 @@ typedef u32 VectorDims; */ #define MAX_VECTOR_SZ 65536 +/* + * on-disk binary format for vector of different types: + * 1. float32 + * [data[0] as f32] [data[1] as f32] ... [data[dims - 1] as f32] [1 as u8]? + * - last 'type'-byte is optional for float32 vectors + * + * 2. float64 + * [data[0] as f64] [data[1] as f64] ... [data[dims - 1] as f64] [2 as u8] + * - last 'type'-byte is mandatory for float64 vectors + * + * 3. float1bit + * [data[0] as u8] [data[1] as u8] ... [data[(dims + 7) / 8] as u8] [_ as u8; padding]? [trailing_bits as u8] [3 as u8] + * - every data byte (except for the last) represents exactly 8 components of the vector + * - last data byte represents [1..8] components of the vector + * - optional padding byte ensures that "trailing_bits" byte will be written at the odd blob position (0-based) + * - "trailing_bits" byte specify amount of trailing *bits* in the blob without last 'type'-byte which must be omitted + * (so, vector dimensions are equal to 8 * (blob_size - 1) - trailing_bits) + * - last 'type'-byte is mandatory for float1bit vectors + * + * 4. float8 + * [data[0] as u8] [data[1] as u8] ... 
[data[dims - 1] as u8] [_ as u8; alignment_padding]* [alpha as f32] [shift as f32] [padding as u8] [trailing_bytes as u8] [4 as u8] + * - every data byte represents single quantized vector component + * - "alignment_padding" has size from 0 to 3 bytes in order to pad content to multiple of 4 = sizeof(float) + * - "trailing_bytes" byte specify amount of bytes in the "alignment_padding" + * - last 'type'-byte is mandatory for float8 vectors +*/ + /* * Enumerate of supported vector types (0 omitted intentionally as we can use zero as "undefined" value) */ -#define VECTOR_TYPE_FLOAT32 1 -#define VECTOR_TYPE_FLOAT64 2 +#define VECTOR_TYPE_FLOAT32 1 +#define VECTOR_TYPE_FLOAT64 2 +#define VECTOR_TYPE_FLOAT1BIT 3 +#define VECTOR_TYPE_FLOAT8 4 #define VECTOR_FLAGS_STATIC 1 +#define ALIGN(n, size) (((n + size - 1) / size) * size) + /* * Object which represents a vector * data points to the memory which must be interpreted according to the vector type @@ -85261,15 +85291,20 @@ struct Vector { size_t vectorDataSize(VectorType, VectorDims); Vector *vectorAlloc(VectorType, VectorDims); void vectorFree(Vector *v); -int vectorParse(sqlite3_value *, Vector *, char **); +int vectorParseWithType(sqlite3_value *, Vector *, char **); void vectorInit(Vector *, VectorType, VectorDims, void *); /* * Dumps vector on the console (used only for debugging) */ -void vectorDump (const Vector *v); -void vectorF32Dump(const Vector *v); -void vectorF64Dump(const Vector *v); +void vectorDump (const Vector *v); +void vectorF8Dump (const Vector *v); +void vectorF32Dump (const Vector *v); +void vectorF64Dump (const Vector *v); +void vector1BitDump(const Vector *v); + +void vectorF8GetParameters(const u8 *, int, float *, float *); +void vectorF8SetParameters(u8 *, int, float, float); /* * Converts vector to the text representation and write the result to the sqlite3_context @@ -85281,28 +85316,30 @@ void vectorF64MarshalToText(sqlite3_context *, const Vector *); /* * Serializes vector to the blob in 
little-endian format according to the IEEE-754 standard */ -size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t); -size_t vectorF32SerializeToBlob(const Vector *, unsigned char *, size_t); -size_t vectorF64SerializeToBlob(const Vector *, unsigned char *, size_t); - -/* - * Deserializes vector from the blob in little-endian format according to the IEEE-754 standard -*/ -size_t vectorDeserializeFromBlob (Vector *, const unsigned char *, size_t); -size_t vectorF32DeserializeFromBlob(Vector *, const unsigned char *, size_t); -size_t vectorF64DeserializeFromBlob(Vector *, const unsigned char *, size_t); +void vectorSerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF8SerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF32SerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF64SerializeToBlob (const Vector *, unsigned char *, size_t); +void vector1BitSerializeToBlob(const Vector *, unsigned char *, size_t); /* * Calculates cosine distance between two vectors (vector must have same type and same dimensions) */ float vectorDistanceCos (const Vector *, const Vector *); +float vectorF8DistanceCos (const Vector *, const Vector *); float vectorF32DistanceCos (const Vector *, const Vector *); double vectorF64DistanceCos(const Vector *, const Vector *); +/* + * Calculates hamming distance between two 1-bit vectors (vector must have same dimensions) +*/ +int vector1BitDistanceHamming(const Vector *, const Vector *); + /* * Calculates L2 distance between two vectors (vector must have same type and same dimensions) */ float vectorDistanceL2 (const Vector *, const Vector *); +float vectorF8DistanceL2 (const Vector *, const Vector *); float vectorF32DistanceL2 (const Vector *, const Vector *); double vectorF64DistanceL2(const Vector *, const Vector *); @@ -85311,25 +85348,44 @@ double vectorF64DistanceL2(const Vector *, const Vector *); * LibSQL can append one trailing byte in the end of final blob. 
This byte will be later used to determine type of the blob * By default, blob with even length will be treated as a f32 blob */ -void vectorSerialize (sqlite3_context *, const Vector *); -void vectorF32Serialize(sqlite3_context *, const Vector *); -void vectorF64Serialize(sqlite3_context *, const Vector *); +void vectorSerializeWithMeta(sqlite3_context *, const Vector *); /* * Parses Vector content from the blob; vector type and dimensions must be filled already */ -int vectorParseSqliteBlob (sqlite3_value *, Vector *, char **); -int vectorF32ParseSqliteBlob(sqlite3_value *, Vector *, char **); -int vectorF64ParseSqliteBlob(sqlite3_value *, Vector *, char **); +int vectorParseSqliteBlobWithType(sqlite3_value *, Vector *, char **); -void vectorInitStatic(Vector *, VectorType, const unsigned char *, size_t); +void vectorF8DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vectorF32DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vectorF64DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vector1BitDeserializeFromBlob(Vector *, const unsigned char *, size_t); + +void vectorInitStatic(Vector *, VectorType, VectorDims, void *); void vectorInitFromBlob(Vector *, const unsigned char *, size_t); -void vectorF32InitFromBlob(Vector *, const unsigned char *, size_t); -void vectorF64InitFromBlob(Vector *, const unsigned char *, size_t); + +void vectorConvert(const Vector *, Vector *); /* Detect type and dimension of vector provided with first parameter of sqlite3_value * type */ int detectVectorParameters(sqlite3_value *, int, int *, int *, char **); +static inline unsigned serializeF32(unsigned char *pBuf, float value){ + u32 *p = (u32 *)&value; + pBuf[0] = *p & 0xFF; + pBuf[1] = (*p >> 8) & 0xFF; + pBuf[2] = (*p >> 16) & 0xFF; + pBuf[3] = (*p >> 24) & 0xFF; + return sizeof(float); +} + +static inline float deserializeF32(const unsigned char *pBuf){ + u32 value = 0; + value |= (u32)pBuf[0]; + value |= (u32)pBuf[1] << 8; 
+ value |= (u32)pBuf[2] << 16; + value |= (u32)pBuf[3] << 24; + return *(float *)&value; +} + #if 0 } /* end of the 'extern "C"' block */ #endif @@ -85408,10 +85464,10 @@ int nodeEdgesMetadataOffset(const DiskAnnIndex *pIndex); void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Vector *pVector); void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector); u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot); -void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector); +void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, float *distance, Vector *pVector); int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u64 nRowid); void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPruned); -void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector); +void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, float distance, Vector *pVector); void nodeBinDeleteEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iDelete); void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot); @@ -85435,43 +85491,47 @@ typedef u8 MetricType; */ /* format version which can help to upgrade vector on-disk format without breaking older version of the db */ -#define VECTOR_FORMAT_PARAM_ID 1 +#define VECTOR_FORMAT_PARAM_ID 1 /* - * 1 - initial version + * 1 - v1 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u64 unused ] [u64 edge rowid] ] ... + * 2 - v2 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u32 unused] [f32 distance] [u64 edge rowid] ] ... 
*/ -#define VECTOR_FORMAT_DEFAULT 1 +#define VECTOR_FORMAT_V1 1 +#define VECTOR_FORMAT_DEFAULT 2 /* type of the vector index */ -#define VECTOR_INDEX_TYPE_PARAM_ID 2 -#define VECTOR_INDEX_TYPE_DISKANN 1 +#define VECTOR_INDEX_TYPE_PARAM_ID 2 +#define VECTOR_INDEX_TYPE_DISKANN 1 /* type of the underlying vector for the vector index */ -#define VECTOR_TYPE_PARAM_ID 3 +#define VECTOR_TYPE_PARAM_ID 3 /* dimension of the underlying vector for the vector index */ -#define VECTOR_DIM_PARAM_ID 4 +#define VECTOR_DIM_PARAM_ID 4 /* metric type used for comparing two vectors */ -#define VECTOR_METRIC_TYPE_PARAM_ID 5 -#define VECTOR_METRIC_TYPE_COS 1 -#define VECTOR_METRIC_TYPE_L2 2 +#define VECTOR_METRIC_TYPE_PARAM_ID 5 +#define VECTOR_METRIC_TYPE_COS 1 +#define VECTOR_METRIC_TYPE_L2 2 /* block size */ -#define VECTOR_BLOCK_SIZE_PARAM_ID 6 -#define VECTOR_BLOCK_SIZE_DEFAULT 128 +#define VECTOR_BLOCK_SIZE_PARAM_ID 6 +#define VECTOR_BLOCK_SIZE_DEFAULT 128 -#define VECTOR_PRUNING_ALPHA_PARAM_ID 7 -#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2 +#define VECTOR_PRUNING_ALPHA_PARAM_ID 7 +#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2 -#define VECTOR_INSERT_L_PARAM_ID 8 -#define VECTOR_INSERT_L_DEFAULT 70 +#define VECTOR_INSERT_L_PARAM_ID 8 +#define VECTOR_INSERT_L_DEFAULT 70 -#define VECTOR_SEARCH_L_PARAM_ID 9 -#define VECTOR_SEARCH_L_DEFAULT 200 +#define VECTOR_SEARCH_L_PARAM_ID 9 +#define VECTOR_SEARCH_L_DEFAULT 200 -#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 +#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 + +#define VECTOR_COMPRESS_NEIGHBORS_PARAM_ID 11 /* total amount of vector index parameters */ -#define VECTOR_PARAM_IDS_COUNT 9 +#define VECTOR_PARAM_IDS_COUNT 11 /* * Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float) @@ -85553,7 +85613,7 @@ int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *); void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int); void vectorOutRowsFree(sqlite3 *, VectorOutRows *); 
-int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *); +int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *, const char **); int diskAnnClearIndex(sqlite3 *, const char *, const char *); int diskAnnDropIndex(sqlite3 *, const char *, const char *); int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **); @@ -123180,7 +123240,7 @@ static void SQLITE_NOINLINE deleteTable(sqlite3 *db, Table *pTable){ for(pIndex = pTable->pIndex; pIndex; pIndex=pNext){ pNext = pIndex->pNext; assert( pIndex->pSchema==pTable->pSchema - || (IsVirtual(pTable) && !IsAppDefIndex(pIndex)) ); + || (IsVirtual(pTable) && pIndex->idxType!=SQLITE_IDXTYPE_APPDEF) ); if( db->pnBytesFreed==0 && !IsVirtual(pTable) ){ char *zName = pIndex->zName; TESTONLY ( Index *pOld = ) sqlite3HashInsert( @@ -126692,13 +126752,12 @@ SQLITE_PRIVATE void sqlite3CreateIndex( goto exit_create_index; } if( vectorIdxRc >= 1 ){ - idxType = SQLITE_IDXTYPE_VECTOR; /* * SQLite can use B-Tree indices in some optimizations (like SELECT COUNT(*) can use any full B-Tree index instead of PK index) * But, SQLite pretty conservative about usage of unordered indices - that's what we need here */ pIndex->bUnordered = 1; - pIndex->idxType = idxType; + pIndex->idxIsVector = 1; } if( vectorIdxRc == 1 ){ skipRefill = 1; @@ -126746,7 +126805,7 @@ SQLITE_PRIVATE void sqlite3CreateIndex( for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){ int k; assert( IsUniqueIndex(pIdx) ); - assert( !IsAppDefIndex(pIdx) ); + assert( pIdx->idxType!=SQLITE_IDXTYPE_APPDEF ); assert( IsUniqueIndex(pIndex) ); if( pIdx->nKeyCol!=pIndex->nKeyCol ) continue; @@ -127027,7 +127086,7 @@ SQLITE_PRIVATE void sqlite3DropIndex(Parse *pParse, SrcList *pName, int ifExists pParse->checkSchema = 1; goto exit_drop_index; } - if( !IsAppDefIndex(pIndex) ){ + if( pIndex->idxType!=SQLITE_IDXTYPE_APPDEF ){ sqlite3ErrorMsg(pParse, "index 
associated with UNIQUE " "or PRIMARY KEY constraint cannot be dropped", 0); goto exit_drop_index; @@ -155952,6 +156011,10 @@ SQLITE_PRIVATE void sqlite3UpsertDoUpdate( /* #include "sqliteInt.h" */ /* #include "vdbeInt.h" */ +#ifndef SQLITE_OMIT_VECTOR +/* #include "vectorIndexInt.h" */ +#endif + #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* @@ -156229,6 +156292,27 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( if( rc!=SQLITE_OK ) goto end_of_vacuum; db->init.iDb = 0; +#ifndef SQLITE_OMIT_VECTOR + // shadow tables for vector index will be populated automatically during CREATE INDEX command + // so we must skip them at this step + if( sqlite3FindTable(db, VECTOR_INDEX_GLOBAL_META_TABLE, zDbMain) != NULL ){ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0 AND name NOT IN (SELECT name||'_shadow' FROM " VECTOR_INDEX_GLOBAL_META_TABLE ")", + zDbMain + ); + }else{ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0", + zDbMain + ); + } +#else /* Loop through the tables in the main database. For each, do ** an "INSERT INTO vacuum_db.xxx SELECT * FROM main.xxx;" to copy ** the contents to the temporary database. 
@@ -156240,6 +156324,7 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( "WHERE type='table'AND coalesce(rootpage,1)>0", zDbMain ); +#endif assert( (db->mDbFlags & DBFLAG_Vacuum)!=0 ); db->mDbFlags &= ~DBFLAG_Vacuum; if( rc!=SQLITE_OK ) goto end_of_vacuum; @@ -177876,9 +177961,6 @@ static YYACTIONTYPE yy_reduce( case 242: /* cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm indextype ON nm LP sortlist RP where_opt */ { u8 idxType = SQLITE_IDXTYPE_APPDEF; - if( yymsp[-6].minor.yy421.pUsing!=0 ){ - idxType = SQLITE_IDXTYPE_VECTOR; - } sqlite3CreateIndex(pParse, &yymsp[-8].minor.yy0, &yymsp[-7].minor.yy0, sqlite3SrcListAppend(pParse,0,&yymsp[-4].minor.yy0,0), yymsp[-2].minor.yy402, yymsp[-11].minor.yy502, &yymsp[-12].minor.yy0, yymsp[0].minor.yy590, SQLITE_SO_ASC, yymsp[-9].minor.yy502, idxType, yymsp[-6].minor.yy421.pUsing); @@ -210953,6 +211035,10 @@ size_t vectorDataSize(VectorType type, VectorDims dims){ return dims * sizeof(float); case VECTOR_TYPE_FLOAT64: return dims * sizeof(double); + case VECTOR_TYPE_FLOAT1BIT: + return (dims + 7) / 8; + case VECTOR_TYPE_FLOAT8: + return ALIGN(dims, sizeof(float)) + sizeof(float) /* alpha */ + sizeof(float) /* shift */; default: assert(0); } @@ -210984,10 +211070,11 @@ Vector *vectorAlloc(VectorType type, VectorDims dims){ ** Note that the vector object points to the blob so if ** you free the blob, the vector becomes invalid. 
**/ -void vectorInitStatic(Vector *pVector, VectorType type, const unsigned char *pBlob, size_t nBlobSize){ - pVector->type = type; +void vectorInitStatic(Vector *pVector, VectorType type, VectorDims dims, void *pBlob){ pVector->flags = VECTOR_FLAGS_STATIC; - vectorInitFromBlob(pVector, pBlob, nBlobSize); + pVector->type = type; + pVector->dims = dims; + pVector->data = pBlob; } /* @@ -211023,6 +211110,10 @@ float vectorDistanceCos(const Vector *pVector1, const Vector *pVector2){ return vectorF32DistanceCos(pVector1, pVector2); case VECTOR_TYPE_FLOAT64: return vectorF64DistanceCos(pVector1, pVector2); + case VECTOR_TYPE_FLOAT1BIT: + return vector1BitDistanceHamming(pVector1, pVector2); + case VECTOR_TYPE_FLOAT8: + return vectorF8DistanceCos(pVector1, pVector2); default: assert(0); } @@ -211036,6 +211127,8 @@ float vectorDistanceL2(const Vector *pVector1, const Vector *pVector2){ return vectorF32DistanceL2(pVector1, pVector2); case VECTOR_TYPE_FLOAT64: return vectorF64DistanceL2(pVector1, pVector2); + case VECTOR_TYPE_FLOAT8: + return vectorF8DistanceL2(pVector1, pVector2); default: assert(0); } @@ -211159,16 +211252,97 @@ static int vectorParseSqliteText( return -1; } -int vectorParseSqliteBlob( +static int vectorParseMeta(const unsigned char *pBlob, size_t nBlobSize, int *pType, int *pDims, size_t *pDataSize, char **pzErrMsg){ + int nTrailingBits; + int nTrailingBytes; + + if( nBlobSize % 2 == 0 ){ + *pType = VECTOR_TYPE_FLOAT32; + *pDims = nBlobSize / sizeof(float); + *pDataSize = nBlobSize; + return SQLITE_OK; + } + *pType = pBlob[nBlobSize - 1]; + nBlobSize--; + + if( *pType == VECTOR_TYPE_FLOAT32 ){ + if( nBlobSize % 4 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float32 vector blob length must be divisible by 4 (excluding optional 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + *pDims = nBlobSize / sizeof(float); + *pDataSize = nBlobSize; + }else if( *pType == VECTOR_TYPE_FLOAT64 ){ + if( nBlobSize % 8 != 0 ){ + *pzErrMsg = 
sqlite3_mprintf("vector: float64 vector blob length must be divisible by 8 (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + *pDims = nBlobSize / sizeof(double); + *pDataSize = nBlobSize; + }else if( *pType == VECTOR_TYPE_FLOAT1BIT ){ + if( nBlobSize == 0 || nBlobSize % 2 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float1bit vector blob length must be divisible by 2 and not be empty (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + nTrailingBits = pBlob[nBlobSize - 1]; + *pDims = nBlobSize * 8 - nTrailingBits; + *pDataSize = (*pDims + 7) / 8; + }else if( *pType == VECTOR_TYPE_FLOAT8 ){ + if( nBlobSize < 2 || nBlobSize % 2 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float8 vector blob length must be divisible by 2 and has at least 2 bytes (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + nTrailingBytes = pBlob[nBlobSize - 1]; + *pDims = (nBlobSize - 2) - sizeof(float) - sizeof(float) - nTrailingBytes; + *pDataSize = nBlobSize - 2; + }else{ + *pzErrMsg = sqlite3_mprintf("vector: unexpected binary type: %d", *pType); + return SQLITE_ERROR; + } + return SQLITE_OK; +} + +int vectorParseSqliteBlobWithType( sqlite3_value *arg, Vector *pVector, char **pzErrMsg ){ + const unsigned char *pBlob; + size_t nBlobSize, nDataSize; + int type, dims; + + assert( sqlite3_value_type(arg) == SQLITE_BLOB ); + + pBlob = sqlite3_value_blob(arg); + nBlobSize = sqlite3_value_bytes(arg); + if( vectorParseMeta(pBlob, nBlobSize, &type, &dims, &nDataSize, pzErrMsg) != SQLITE_OK ){ + return SQLITE_ERROR; + } + + if( nDataSize != vectorDataSize(pVector->type, pVector->dims) ){ + *pzErrMsg = sqlite3_mprintf( + "vector: unexpected data part size: type=%d, dims=%d, %u != %u", + pVector->type, + pVector->dims, + nDataSize, + vectorDataSize(pVector->type, pVector->dims) + ); + return SQLITE_ERROR; + } + switch (pVector->type) { case VECTOR_TYPE_FLOAT32: - return vectorF32ParseSqliteBlob(arg, pVector, pzErrMsg); + 
vectorF32DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; case VECTOR_TYPE_FLOAT64: - return vectorF64ParseSqliteBlob(arg, pVector, pzErrMsg); + vectorF64DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitDeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; + case VECTOR_TYPE_FLOAT8: + vectorF8DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; default: assert(0); } @@ -211177,32 +211351,21 @@ int vectorParseSqliteBlob( int detectBlobVectorParameters(sqlite3_value *arg, int *pType, int *pDims, char **pzErrMsg) { const u8 *pBlob; - int nBlobSize; + size_t nBlobSize, nDataSize; assert( sqlite3_value_type(arg) == SQLITE_BLOB ); pBlob = sqlite3_value_blob(arg); nBlobSize = sqlite3_value_bytes(arg); - if( nBlobSize % 2 != 0 ){ - // we have trailing byte with explicit type definition - *pType = pBlob[nBlobSize - 1]; - } else { - // else, fallback to FLOAT32 - *pType = VECTOR_TYPE_FLOAT32; - } - if( *pType == VECTOR_TYPE_FLOAT32 ){ - *pDims = nBlobSize / sizeof(float); - } else if( *pType == VECTOR_TYPE_FLOAT64 ){ - *pDims = nBlobSize / sizeof(double); - } else{ - *pzErrMsg = sqlite3_mprintf("vector: unexpected binary type: got %d, expected %d or %d", *pType, VECTOR_TYPE_FLOAT32, VECTOR_TYPE_FLOAT64); - return -1; + + if( vectorParseMeta(pBlob, nBlobSize, pType, pDims, &nDataSize, pzErrMsg) != SQLITE_OK ){ + return SQLITE_ERROR; } if( *pDims > MAX_VECTOR_SZ ){ *pzErrMsg = sqlite3_mprintf("vector: max size exceeded: %d > %d", *pDims, MAX_VECTOR_SZ); - return -1; + return SQLITE_ERROR; } - return 0; + return SQLITE_OK; } int detectTextVectorParameters(sqlite3_value *arg, int typeHint, int *pType, int *pDims, char **pzErrMsg) { @@ -211251,14 +211414,14 @@ int detectVectorParameters(sqlite3_value *arg, int typeHint, int *pType, int *pD } } -int vectorParse( +int vectorParseWithType( sqlite3_value *arg, Vector *pVector, char **pzErrMsg ){ switch( sqlite3_value_type(arg) ){ case SQLITE_BLOB: - return 
vectorParseSqliteBlob(arg, pVector, pzErrMsg); + return vectorParseSqliteBlobWithType(arg, pVector, pzErrMsg); case SQLITE_TEXT: return vectorParseSqliteText(arg, pVector, pzErrMsg); default: @@ -211275,6 +211438,12 @@ void vectorDump(const Vector *pVector){ case VECTOR_TYPE_FLOAT64: vectorF64Dump(pVector); break; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitDump(pVector); + break; + case VECTOR_TYPE_FLOAT8: + vectorF8Dump(pVector); + break; default: assert(0); } @@ -211296,56 +211465,326 @@ void vectorMarshalToText( } } -void vectorSerialize( +static int vectorMetaSize(VectorType type, VectorDims dims){ + int nDataSize; + if( type == VECTOR_TYPE_FLOAT32 ){ + return 0; + }else if( type == VECTOR_TYPE_FLOAT64 ){ + return 1; + }else if( type == VECTOR_TYPE_FLOAT1BIT ){ + nDataSize = vectorDataSize(type, dims); + // optional padding byte + "trailing-bits" byte + "vector-type" byte + return (nDataSize % 2 == 0 ? 1 : 0) + 1 + 1; + }else if( type == VECTOR_TYPE_FLOAT8 ){ + nDataSize = vectorDataSize(type, dims); + assert( nDataSize % 2 == 0 ); + /* padding byte + "trailing-bytes" byte + "vector-type" byte */ + return 1 + 1 + 1; + }else{ + assert( 0 ); + } +} + +static void vectorSerializeMeta(const Vector *pVector, size_t nDataSize, unsigned char *pBlob, size_t nBlobSize){ + if( pVector->type == VECTOR_TYPE_FLOAT32 ){ + // no meta for f32 type as this is "default" vector type + }else if( pVector->type == VECTOR_TYPE_FLOAT64 ){ + assert( nDataSize % 2 == 0 ); + assert( nBlobSize == nDataSize + 1 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT64; + }else if( pVector->type == VECTOR_TYPE_FLOAT1BIT ){ + assert( nBlobSize % 2 == 1 ); + assert( nBlobSize >= 3 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT1BIT; + pBlob[nBlobSize - 2] = 8 * (nBlobSize - 1) - pVector->dims; + if( vectorMetaSize(pVector->type, pVector->dims) == 3 ){ + pBlob[nBlobSize - 3] = 0; + } + }else if( pVector->type == VECTOR_TYPE_FLOAT8 ){ + assert( nBlobSize % 2 == 1 ); + assert( nDataSize % 2 == 0 ); + assert( 
nBlobSize == nDataSize + 3 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT8; + pBlob[nBlobSize - 2] = ALIGN(pVector->dims, sizeof(float)) - pVector->dims; + }else{ + assert( 0 ); + } +} + +void vectorSerializeWithMeta( sqlite3_context *context, const Vector *pVector ){ + unsigned char *pBlob; + size_t nBlobSize, nDataSize, nMetaSize; + + assert( pVector->dims <= MAX_VECTOR_SZ ); + + nDataSize = vectorDataSize(pVector->type, pVector->dims); + nMetaSize = vectorMetaSize(pVector->type, pVector->dims); + nBlobSize = nDataSize + nMetaSize; + if( nBlobSize == 0 ){ + sqlite3_result_zeroblob(context, 0); + return; + } + + pBlob = sqlite3_malloc64(nBlobSize); + if( pBlob == NULL ){ + sqlite3_result_error_nomem(context); + return; + } + + vectorSerializeToBlob(pVector, pBlob, nDataSize); + vectorSerializeMeta(pVector, nDataSize, pBlob, nBlobSize); + sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); +} + +void vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t nBlobSize){ switch (pVector->type) { case VECTOR_TYPE_FLOAT32: - vectorF32Serialize(context, pVector); + vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); break; case VECTOR_TYPE_FLOAT64: - vectorF64Serialize(context, pVector); + vectorF64SerializeToBlob(pVector, pBlob, nBlobSize); + break; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitSerializeToBlob(pVector, pBlob, nBlobSize); + break; + case VECTOR_TYPE_FLOAT8: + vectorF8SerializeToBlob(pVector, pBlob, nBlobSize); break; default: assert(0); } } -size_t vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - return vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); - case VECTOR_TYPE_FLOAT64: - return vectorF64SerializeToBlob(pVector, pBlob, nBlobSize); - default: - assert(0); +void vectorInitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ + pVector->data = (void*)pBlob; +} + +static void vectorConvertFromF32(const 
Vector *pFrom, Vector *pTo){ + int i; + float *src; + + u8 *dst1Bit; + double *dstF64; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT32 ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF64[i] = src[i]; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( src[i] > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); } - return 0; } -size_t vectorDeserializeFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - return vectorF32DeserializeFromBlob(pVector, pBlob, nBlobSize); - case VECTOR_TYPE_FLOAT64: - return vectorF64DeserializeFromBlob(pVector, pBlob, nBlobSize); - default: - assert(0); +static void vectorConvertFromF64(const Vector *pFrom, Vector *pTo){ + int i; + double *src; + + u8 *dst1Bit; + float *dstF32; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT64 ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF32[i] = src[i]; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( src[i] > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); } - return 0; } -void vectorInitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - vectorF32InitFromBlob(pVector, pBlob, nBlobSize); - break; - case VECTOR_TYPE_FLOAT64: - vectorF64InitFromBlob(pVector, pBlob, nBlobSize); - break; - default: - assert(0); +static void 
vectorConvertFrom1Bit(const Vector *pFrom, Vector *pTo){ + int i; + u8 *src; + + float *dstF32; + double *dstF64; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT1BIT ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + if( ((src[i / 8] >> (i & 7)) & 1) == 1 ){ + dstF32[i] = +1; + }else{ + dstF32[i] = -1; + } + } + }else if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + if( ((src[i / 8] >> (i & 7)) & 1) == 1 ){ + dstF64[i] = +1; + }else{ + dstF64[i] = -1; + } + } + }else{ + assert( 0 ); + } +} + +static void vectorConvertFromF8(const Vector *pFrom, Vector *pTo){ + int i; + u8 *src; + float alpha, shift; + + float *dstF32; + double *dstF64; + u8 *dst1Bit; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(pFrom->data, pFrom->dims, &alpha, &shift); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF32[i] = alpha * src[i] + shift; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF64[i] = alpha * src[i] + shift; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( (alpha * src[i] + shift) > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); + } +} + +static inline int clip(float f, int minF, int maxF){ + if( f < minF ){ + return minF; + }else if( f > maxF ){ + return maxF; + } + return (int)(f + 0.5); +} + +#define MINMAX(i, value, minValue, maxValue) {if(i == 0){ minValue = (value); maxValue = (value);} else { minValue = MIN(minValue, (value)); maxValue = MAX(maxValue, (value)); }} + 
+static void vectorConvertToF8(const Vector *pFrom, Vector *pTo){ + int i; + u8 *dst; + float alpha, shift; + float minF = 0, maxF = 0; + + float *srcF32; + double *srcF64; + u8 *src1Bit; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pTo->type == VECTOR_TYPE_FLOAT8 ); + + dst = pTo->data; + if( pFrom->type == VECTOR_TYPE_FLOAT32 ){ + srcF32 = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, srcF32[i], minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip((srcF32[i] - shift) / alpha, 0, 255); + } + }else if( pFrom->type == VECTOR_TYPE_FLOAT64 ){ + srcF64 = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, srcF64[i], minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip((srcF64[i] - shift) / alpha, 0, 255); + } + }else if( pFrom->type == VECTOR_TYPE_FLOAT1BIT ){ + src1Bit = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, ((src1Bit[i / 8] >> (i & 7)) & 1) ? +1 : -1, minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip(((((src1Bit[i / 8] >> (i & 7)) & 1) ? 
+1 : -1) - shift) / alpha, 0, 255); + } + }else{ + assert( 0 ); + } + vectorF8SetParameters(pTo->data, pTo->dims, alpha, shift); +} + + +void vectorConvert(const Vector *pFrom, Vector *pTo){ + assert( pFrom->dims == pTo->dims ); + + if( pFrom->type == pTo->type ){ + memcpy(pTo->data, pFrom->data, vectorDataSize(pFrom->type, pFrom->dims)); + return; + } + + if( pTo->type == VECTOR_TYPE_FLOAT8 ){ + vectorConvertToF8(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT32 ){ + vectorConvertFromF32(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT64 ){ + vectorConvertFromF64(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT1BIT ){ + vectorConvertFrom1Bit(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT8 ){ + vectorConvertFromF8(pFrom, pTo); + }else{ + assert( 0 ); } } @@ -211360,31 +211799,49 @@ static void vectorFuncHintedType( sqlite3_context *context, int argc, sqlite3_value **argv, - int typeHint + int targetType ){ char *pzErrMsg = NULL; - Vector *pVector; - int type, dims; + Vector *pVector = NULL, *pTarget = NULL; + int type, dims, typeHint = VECTOR_TYPE_FLOAT32; if( argc < 1 ){ - return; + goto out; + } + // simplification in order to support only parsing from text to f32 and f64 vectors + if( targetType == VECTOR_TYPE_FLOAT64 ){ + typeHint = targetType; } if( detectVectorParameters(argv[0], typeHint, &type, &dims, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - return; + goto out; } pVector = vectorContextAlloc(context, type, dims); - if( pVector==NULL ){ - return; + if( pVector == NULL ){ + goto out; } - if( vectorParse(argv[0], pVector, &pzErrMsg) != 0 ){ + if( vectorParseWithType(argv[0], pVector, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - goto out_free_vec; + goto out; + } + if( type == targetType ){ + vectorSerializeWithMeta(context, pVector); + }else{ + pTarget = vectorContextAlloc(context, targetType, dims); + if( pTarget == NULL ){ + 
goto out; + } + vectorConvert(pVector, pTarget); + vectorSerializeWithMeta(context, pTarget); + } +out: + if( pVector != NULL ){ + vectorFree(pVector); + } + if( pTarget != NULL ){ + vectorFree(pTarget); } - vectorSerialize(context, pVector); -out_free_vec: - vectorFree(pVector); } static void vector32Func( @@ -211402,6 +211859,22 @@ static void vector64Func( vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT64); } +static void vector8Func( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT8); +} + +static void vector1BitFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT1BIT); +} + /* ** Implementation of vector_extract(X) function. */ @@ -211411,39 +211884,51 @@ static void vectorExtractFunc( sqlite3_value **argv ){ char *pzErrMsg = NULL; - Vector *pVector; + Vector *pVector = NULL, *pTarget = NULL; unsigned i; int type, dims; if( argc < 1 ){ - return; + goto out; } if( detectVectorParameters(argv[0], 0, &type, &dims, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - return; + goto out; } pVector = vectorContextAlloc(context, type, dims); - if( pVector==NULL ){ - return; + if( pVector == NULL ){ + goto out; } - if( vectorParse(argv[0], pVector, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[0], pVector, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - goto out_free; + goto out; + } + if( pVector->type == VECTOR_TYPE_FLOAT32 || pVector->type == VECTOR_TYPE_FLOAT64 ){ + vectorMarshalToText(context, pVector); + }else{ + pTarget = vectorContextAlloc(context, VECTOR_TYPE_FLOAT32, dims); + if( pTarget == NULL ){ + goto out; + } + vectorConvert(pVector, pTarget); + vectorMarshalToText(context, pTarget); + } +out: + if( pVector != NULL ){ + vectorFree(pVector); + } + if( pTarget != NULL ){ + vectorFree(pTarget); } - 
vectorMarshalToText(context, pVector); -out_free: - vectorFree(pVector); } -/* -** Implementation of vector_distance_cos(X, Y) function. -*/ -static void vectorDistanceCosFunc( +static void vectorDistanceFunc( sqlite3_context *context, int argc, - sqlite3_value **argv + sqlite3_value **argv, + float (*vectorDistance)(const Vector *pVector1, const Vector *pVector2) ){ char *pzErrMsg = NULL; Vector *pVector1 = NULL, *pVector2 = NULL; @@ -211463,13 +211948,19 @@ static void vectorDistanceCosFunc( goto out_free; } if( type1 != type2 ){ - pzErrMsg = sqlite3_mprintf("vector_distance_cos: vectors must have the same type: %d != %d", type1, type2); + pzErrMsg = sqlite3_mprintf("vector_distance: vectors must have the same type: %d != %d", type1, type2); sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } if( dims1 != dims2 ){ - pzErrMsg = sqlite3_mprintf("vector_distance_cos: vectors must have the same length: %d != %d", dims1, dims2); + pzErrMsg = sqlite3_mprintf("vector_distance: vectors must have the same length: %d != %d", dims1, dims2); + sqlite3_result_error(context, pzErrMsg, -1); + sqlite3_free(pzErrMsg); + goto out_free; + } + if( vectorDistance == vectorDistanceL2 && type1 == VECTOR_TYPE_FLOAT1BIT ){ + pzErrMsg = sqlite3_mprintf("vector_distance: l2 distance is not supported for float1bit vectors", dims1, dims2); sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; @@ -211482,17 +211973,17 @@ static void vectorDistanceCosFunc( if( pVector2==NULL ){ goto out_free; } - if( vectorParse(argv[0], pVector1, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[0], pVector1, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } - if( vectorParse(argv[1], pVector2, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[1], pVector2, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } - sqlite3_result_double(context, 
vectorDistanceCos(pVector1, pVector2)); + sqlite3_result_double(context, vectorDistance(pVector1, pVector2)); out_free: if( pVector2 ){ vectorFree(pVector2); @@ -211502,6 +211993,20 @@ static void vectorDistanceCosFunc( } } +/* +** Implementation of vector_distance_cos(X, Y) function. +*/ +static void vectorDistanceCosFunc(sqlite3_context *context, int argc, sqlite3_value **argv){ + vectorDistanceFunc(context, argc, argv, vectorDistanceCos); +} + +/* +** Implementation of vector_distance_l2(X, Y) function. +*/ +static void vectorDistanceL2Func(sqlite3_context *context, int argc, sqlite3_value **argv){ + vectorDistanceFunc(context, argc, argv, vectorDistanceL2); +} + /* * Marker function which is used in index creation syntax: CREATE INDEX idx ON t(libsql_vector_idx(emb)); */ @@ -211518,8 +212023,11 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ FUNCTION(vector, 1, 0, 0, vector32Func), FUNCTION(vector32, 1, 0, 0, vector32Func), FUNCTION(vector64, 1, 0, 0, vector64Func), + FUNCTION(vector1bit, 1, 0, 0, vector1BitFunc), + FUNCTION(vector8, 1, 0, 0, vector8Func), FUNCTION(vector_extract, 1, 0, 0, vectorExtractFunc), FUNCTION(vector_distance_cos, 2, 0, 0, vectorDistanceCosFunc), + FUNCTION(vector_distance_l2, 2, 0, 0, vectorDistanceL2Func), FUNCTION(libsql_vector_idx, -1, 0, 0, libsqlVectorIdx), }; @@ -211585,7 +212093,7 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ /* #include "sqliteInt.h" */ /* #include "vectorIndexInt.h" */ -#define SQLITE_VECTOR_TRACE +// #define SQLITE_VECTOR_TRACE #if defined(SQLITE_DEBUG) && defined(SQLITE_VECTOR_TRACE) #define DiskAnnTrace(X) sqlite3DebugPrintf X; #else @@ -211611,9 +212119,19 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ #define VECTOR_NODE_METADATA_SIZE (sizeof(u64) + sizeof(u16)) #define VECTOR_EDGE_METADATA_SIZE (sizeof(u64) + sizeof(u64)) +typedef struct VectorPair VectorPair; typedef struct DiskAnnSearchCtx DiskAnnSearchCtx; typedef struct DiskAnnNode DiskAnnNode; +// VectorPair 
represents single vector where pNode is an exact representation and pEdge - compressed representation +// (pEdge pointer always equals to pNode if pNodeType == pEdgeType) +struct VectorPair { + int nodeType; + int edgeType; + Vector *pNode; + Vector *pEdge; +}; + // DiskAnnNode represents single node in the DiskAnn graph struct DiskAnnNode { u64 nRowid; /* node id */ @@ -211629,14 +212147,18 @@ struct DiskAnnNode { * so caller which puts nodes in the context can forget about resource managmenet (context will take care of this) */ struct DiskAnnSearchCtx { - const Vector *pQuery; /* initial query vector; user query for SELECT and row vector for INSERT */ - DiskAnnNode **aCandidates; /* array of candidates ordered by distance to the query (ascending) */ - double *aDistances; /* array of distances to the query vector */ - unsigned int nCandidates; /* current size of aCandidates/aDistances arrays */ - unsigned int maxCandidates; /* max size of aCandidates/aDistances arrays */ - DiskAnnNode *visitedList; /* list of all visited candidates (so, candidates from aCandidates array either got replaced or moved to the visited list) */ - unsigned int nUnvisited; /* amount of unvisited candidates in the aCadidates array */ - int blobMode; /* DISKANN_BLOB_READONLY if we wont modify node blobs; DISKANN_BLOB_WRITABLE - otherwise */ + VectorPair query; /* initial query vector; user query for SELECT and row vector for INSERT */ + DiskAnnNode **aCandidates; /* array of unvisited candidates ordered by distance (possibly approximate) to the query (ascending) */ + float *aDistances; /* array of distances (possible approximate) to the query vector */ + unsigned int nCandidates; /* current size of aCandidates/aDistances arrays */ + unsigned int maxCandidates; /* max size of aCandidates/aDistances arrays */ + DiskAnnNode **aTopCandidates; /* top candidates with exact distance calculated */ + float *aTopDistances; /* top candidates exact distances */ + int nTopCandidates; /* current size of 
aTopCandidates/aTopDistances arrays */ + int maxTopCandidates; /* max size of aTopCandidates/aTopDistances arrays */ + DiskAnnNode *visitedList; /* list of all visited candidates (so, candidates from aCandidates array either got replaced or moved to the visited list) */ + unsigned int nUnvisited; /* amount of unvisited candidates in the aCadidates array */ + int blobMode; /* DISKANN_BLOB_READONLY if we wont modify node blobs; DISKANN_BLOB_WRITABLE - otherwise */ }; /************************************************************************** @@ -211647,6 +212169,10 @@ static inline u16 readLE16(const unsigned char *p){ return (u16)p[0] | (u16)p[1] << 8; } +static inline u32 readLE32(const unsigned char *p){ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16 | (u32)p[3] << 24; +} + static inline u64 readLE64(const unsigned char *p){ return (u64)p[0] | (u64)p[1] << 8 @@ -211663,6 +212189,13 @@ static inline void writeLE16(unsigned char *p, u16 v){ p[1] = v >> 8; } +static inline void writeLE32(unsigned char *p, u32 v){ + p[0] = v; + p[1] = v >> 8; + p[2] = v >> 16; + p[3] = v >> 24; +} + static inline void writeLE64(unsigned char *p, u64 v){ p[0] = v; p[1] = v >> 8; @@ -211842,7 +212375,7 @@ void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Ve void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector) { assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize ); - vectorInitStatic(pVector, pIndex->nNodeVectorType, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE, pIndex->nNodeVectorSize); + vectorInitStatic(pVector, pIndex->nNodeVectorType, pIndex->nVectorDims, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE); } u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { @@ -211851,20 +212384,25 @@ u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { return readLE16(pBlobSpot->pBuffer + sizeof(u64)); } -void nodeBinEdge(const DiskAnnIndex *pIndex, 
const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector) { +void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, float *pDistance, Vector *pVector) { + u32 distance; int offset = nodeEdgesMetadataOffset(pIndex); if( pRowid != NULL ){ assert( offset + (iEdge + 1) * VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize ); *pRowid = readLE64(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u64)); } + if( pIndex->nFormatVersion != VECTOR_FORMAT_V1 && pDistance != NULL ){ + distance = readLE32(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u32)); + *pDistance = *((float*)&distance); + } if( pVector != NULL ){ assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize < offset ); vectorInitStatic( pVector, pIndex->nEdgeVectorType, - pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nNodeVectorSize, - pIndex->nEdgeVectorSize + pIndex->nVectorDims, + pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize ); } } @@ -211874,7 +212412,7 @@ int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u6 // todo: if edges will be sorted by identifiers we can use binary search here (although speed up will be visible only on pretty loaded nodes: >128 edges) for(i = 0; i < nEdges; i++){ u64 edgeId; - nodeBinEdge(pIndex, pBlobSpot, i, &edgeId, NULL); + nodeBinEdge(pIndex, pBlobSpot, i, &edgeId, NULL, NULL); if( edgeId == nRowid ){ return i; } @@ -211889,7 +212427,7 @@ void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPru } // replace edge at position iReplace or add new one if iReplace == nEdges -void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector) { +void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, 
float distance, Vector *pVector) { int nMaxEdges = nodeEdgesMaxCount(pIndex); int nEdges = nodeBinEdges(pIndex, pBlobSpot); int edgeVectorOffset, edgeMetaOffset, itemsToMove; @@ -211908,6 +212446,7 @@ void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iRe assert( edgeMetaOffset + VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize ); vectorSerializeToBlob(pVector, pBlobSpot->pBuffer + edgeVectorOffset, pIndex->nEdgeVectorSize); + writeLE32(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u32), *((u32*)&distance)); writeLE64(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u64), nRowid); writeLE16(pBlobSpot->pBuffer + sizeof(u64), nEdges); @@ -211942,6 +212481,7 @@ void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { #if defined(SQLITE_DEBUG) && defined(SQLITE_VECTOR_TRACE) int nEdges, nMaxEdges, i; u64 nRowid; + float distance = 0; Vector vector; nEdges = nodeBinEdges(pIndex, pBlobSpot); @@ -211952,8 +212492,8 @@ void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { DiskAnnTrace((" nEdges=%d, nMaxEdges=%d, vector=", nEdges, nMaxEdges)); vectorDump(&vector); for(i = 0; i < nEdges; i++){ - nodeBinEdge(pIndex, pBlobSpot, i, &nRowid, &vector); - DiskAnnTrace((" to=%lld, vector=", nRowid, nRowid)); + nodeBinEdge(pIndex, pBlobSpot, i, &nRowid, &distance, &vector); + DiskAnnTrace((" to=%lld, distance=%f, vector=", nRowid, distance)); vectorDump(&vector); } #endif @@ -211968,12 +212508,14 @@ int diskAnnCreateIndex( const char *zDbSName, const char *zIdxName, const VectorIdxKey *pKey, - VectorIdxParams *pParams + VectorIdxParams *pParams, + const char **pzErrMsg ){ int rc; - int type, dims; + int type, dims, metric, neighbours; u64 maxNeighborsParam, blockSizeBytes; char *zSql; + const char *zRowidColumnName; char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. 
index_key, index_key1, index_key2, ...) if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){ @@ -211995,24 +212537,36 @@ int diskAnnCreateIndex( } assert( 0 < dims && dims <= MAX_VECTOR_SZ ); + metric = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID); + if( metric == 0 ){ + metric = VECTOR_METRIC_TYPE_COS; + if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, metric) != 0 ){ + return SQLITE_ERROR; + } + } + neighbours = vectorIdxParamsGetU64(pParams, VECTOR_COMPRESS_NEIGHBORS_PARAM_ID); + if( neighbours == VECTOR_TYPE_FLOAT1BIT && metric != VECTOR_METRIC_TYPE_COS ){ + *pzErrMsg = "1-bit compression available only for cosine metric"; + return SQLITE_ERROR; + } + if( neighbours == 0 ){ + neighbours = type; + } + maxNeighborsParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_NEIGHBORS_PARAM_ID); if( maxNeighborsParam == 0 ){ // 3 D**(1/2) gives good recall values (90%+) // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound - maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(neighbours, dims)) + 1); } - blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(neighbours, dims)); if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){ return SQLITE_ERROR; } if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){ return SQLITE_ERROR; } - if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){ - if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, 
VECTOR_METRIC_TYPE_COS) != 0 ){ - return SQLITE_ERROR; - } - } + if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){ if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){ return SQLITE_ERROR; @@ -212041,6 +212595,7 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "index_key"; }else{ zSql = sqlite3MPrintf( db, @@ -212050,7 +212605,29 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "rowid"; + } + rc = sqlite3_exec(db, zSql, 0, 0, 0); + sqlite3DbFree(db, zSql); + if( rc != SQLITE_OK ){ + return rc; } + /* + * vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB) + * in this case, main table B-Tree takes on redundant shape where all leaf nodes has only 1 cell + * + * as we have a query which selects random row using OFFSET/LIMIT trick - we will need to read all these leaf nodes pages just to skip them + * so, in order to remove this overhead for random row selection - we creating an index with just single column used + * in this case B-Tree leafs will be full of rowids and the overhead for page reads will be very small + */ + zSql = sqlite3MPrintf( + db, + "CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)", + zDbSName, + zIdxName, + zIdxName, + zRowidColumnName + ); rc = sqlite3_exec(db, zSql, 0, 0, 0); sqlite3DbFree(db, zSql); return rc; @@ -212082,8 +212659,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid) zSql = sqlite3MPrintf( pIndex->db, - "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)", - pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow + "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)", + pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow ); if( zSql == NULL ){ 
rc = SQLITE_NOMEM_BKPT; @@ -212327,6 +212904,83 @@ static int diskAnnDeleteShadowRow(const DiskAnnIndex *pIndex, i64 nRowid){ return rc; } +/************************************************************************** +** Generic utilities +**************************************************************************/ + +int initVectorPair(int nodeType, int edgeType, int dims, VectorPair *pPair){ + pPair->nodeType = nodeType; + pPair->edgeType = edgeType; + pPair->pNode = NULL; + pPair->pEdge = NULL; + if( pPair->nodeType == pPair->edgeType ){ + return 0; + } + pPair->pEdge = vectorAlloc(edgeType, dims); + if( pPair->pEdge == NULL ){ + return SQLITE_NOMEM_BKPT; + } + return 0; +} + +void loadVectorPair(VectorPair *pPair, const Vector *pVector){ + pPair->pNode = (Vector*)pVector; + if( pPair->edgeType != pPair->nodeType ){ + vectorConvert(pPair->pNode, pPair->pEdge); + }else{ + pPair->pEdge = pPair->pNode; + } +} + +void deinitVectorPair(VectorPair *pPair) { + if( pPair->pEdge != NULL && pPair->pNode != pPair->pEdge ){ + vectorFree(pPair->pEdge); + } +} + +int distanceBufferInsertIdx(const float *aDistances, int nSize, int nMaxSize, float distance){ + int i; +#ifdef SQLITE_DEBUG + for(i = 0; i < nSize - 1; i++){ + assert(aDistances[i] <= aDistances[i + 1]); + } +#endif + for(i = 0; i < nSize; i++){ + if( distance < aDistances[i] ){ + return i; + } + } + return nSize < nMaxSize ? 
nSize : -1; +} + +void bufferInsert(u8 *aBuffer, int nSize, int nMaxSize, int iInsert, int nItemSize, const u8 *pItem, u8 *pLast) { + int itemsToMove; + + assert( nMaxSize > 0 && nItemSize > 0 ); + assert( nSize <= nMaxSize ); + assert( 0 <= iInsert && iInsert <= nSize && iInsert < nMaxSize ); + + if( nSize == nMaxSize ){ + if( pLast != NULL ){ + memcpy(pLast, aBuffer + (nSize - 1) * nItemSize, nItemSize); + } + nSize--; + } + itemsToMove = nSize - iInsert; + memmove(aBuffer + (iInsert + 1) * nItemSize, aBuffer + iInsert * nItemSize, itemsToMove * nItemSize); + memcpy(aBuffer + iInsert * nItemSize, pItem, nItemSize); +} + +void bufferDelete(u8 *aBuffer, int nSize, int iDelete, int nItemSize) { + int itemsToMove; + + assert( nItemSize > 0 ); + assert( 0 <= iDelete && iDelete < nSize ); + + itemsToMove = nSize - iDelete - 1; + memmove(aBuffer + iDelete * nItemSize, aBuffer + (iDelete + 1) * nItemSize, itemsToMove * nItemSize); +} + /************************************************************************** ** DiskANN internals **************************************************************************/ @@ -212363,26 +213017,40 @@ static void diskAnnNodeFree(DiskAnnNode *pNode){ sqlite3_free(pNode); } -static int diskAnnSearchCtxInit(DiskAnnSearchCtx *pCtx, const Vector* pQuery, unsigned int maxCandidates, int blobMode){ - pCtx->pQuery = pQuery; +static int diskAnnSearchCtxInit(const DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, const Vector* pQuery, int maxCandidates, int topCandidates, int blobMode){ + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &pCtx->query) != 0 ){ + return SQLITE_NOMEM_BKPT; + } + loadVectorPair(&pCtx->query, pQuery); + pCtx->aDistances = sqlite3_malloc(maxCandidates * sizeof(double)); pCtx->aCandidates = sqlite3_malloc(maxCandidates * sizeof(DiskAnnNode*)); pCtx->nCandidates = 0; pCtx->maxCandidates = maxCandidates; + pCtx->aTopDistances = sqlite3_malloc(topCandidates * sizeof(double)); + 
pCtx->aTopCandidates = sqlite3_malloc(topCandidates * sizeof(DiskAnnNode*)); + pCtx->nTopCandidates = 0; + pCtx->maxTopCandidates = topCandidates; pCtx->visitedList = NULL; pCtx->nUnvisited = 0; pCtx->blobMode = blobMode; - if( pCtx->aDistances == NULL || pCtx->aCandidates == NULL ){ - goto out_oom; + + if( pCtx->aDistances != NULL && pCtx->aCandidates != NULL && pCtx->aTopDistances != NULL && pCtx->aTopCandidates != NULL ){ + return SQLITE_OK; } - return SQLITE_OK; -out_oom: if( pCtx->aDistances != NULL ){ sqlite3_free(pCtx->aDistances); } if( pCtx->aCandidates != NULL ){ sqlite3_free(pCtx->aCandidates); } + if( pCtx->aTopDistances != NULL ){ + sqlite3_free(pCtx->aTopDistances); + } + if( pCtx->aTopCandidates != NULL ){ + sqlite3_free(pCtx->aTopCandidates); + } + deinitVectorPair(&pCtx->query); return SQLITE_NOMEM_BKPT; } @@ -212406,6 +213074,9 @@ static void diskAnnSearchCtxDeinit(DiskAnnSearchCtx *pCtx){ } sqlite3_free(pCtx->aCandidates); sqlite3_free(pCtx->aDistances); + sqlite3_free(pCtx->aTopCandidates); + sqlite3_free(pCtx->aTopDistances); + deinitVectorPair(&pCtx->query); } // check if we visited this node earlier @@ -212447,7 +213118,9 @@ static int diskAnnSearchCtxShouldAddCandidate(const DiskAnnIndex *pIndex, const } // mark node as visited and put it in the head of visitedList -static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNode){ +static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNode, float distance){ + int iInsert; + assert( pCtx->nUnvisited > 0 ); assert( pNode->visited == 0 ); @@ -212456,56 +213129,51 @@ static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNo pNode->pNext = pCtx->visitedList; pCtx->visitedList = pNode; + + iInsert = distanceBufferInsertIdx(pCtx->aTopDistances, pCtx->nTopCandidates, pCtx->maxTopCandidates, distance); + if( iInsert < 0 ){ + return; + } + bufferInsert((u8*)pCtx->aTopCandidates, pCtx->nTopCandidates, pCtx->maxTopCandidates, iInsert, 
sizeof(DiskAnnNode*), (u8*)&pNode, NULL); + bufferInsert((u8*)pCtx->aTopDistances, pCtx->nTopCandidates, pCtx->maxTopCandidates, iInsert, sizeof(float), (u8*)&distance, NULL); + pCtx->nTopCandidates = MIN(pCtx->nTopCandidates + 1, pCtx->maxTopCandidates); } static int diskAnnSearchCtxHasUnvisited(const DiskAnnSearchCtx *pCtx){ return pCtx->nUnvisited > 0; } -static DiskAnnNode* diskAnnSearchCtxGetCandidate(DiskAnnSearchCtx *pCtx, int i){ +static void diskAnnSearchCtxGetCandidate(DiskAnnSearchCtx *pCtx, int i, DiskAnnNode **ppNode, float *pDistance){ assert( 0 <= i && i < pCtx->nCandidates ); - return pCtx->aCandidates[i]; + *ppNode = pCtx->aCandidates[i]; + *pDistance = pCtx->aDistances[i]; } static void diskAnnSearchCtxDeleteCandidate(DiskAnnSearchCtx *pCtx, int iDelete){ int i; - assert( 0 <= iDelete && iDelete < pCtx->nCandidates ); assert( pCtx->nUnvisited > 0 ); assert( !pCtx->aCandidates[iDelete]->visited ); assert( pCtx->aCandidates[iDelete]->pBlobSpot == NULL ); diskAnnNodeFree(pCtx->aCandidates[iDelete]); + bufferDelete((u8*)pCtx->aCandidates, pCtx->nCandidates, iDelete, sizeof(DiskAnnNode*)); + bufferDelete((u8*)pCtx->aDistances, pCtx->nCandidates, iDelete, sizeof(float)); - for(i = iDelete + 1; i < pCtx->nCandidates; i++){ - pCtx->aCandidates[i - 1] = pCtx->aCandidates[i]; - pCtx->aDistances[i - 1] = pCtx->aDistances[i]; - } pCtx->nCandidates--; pCtx->nUnvisited--; } -static void diskAnnSearchCtxInsertCandidate(DiskAnnSearchCtx *pCtx, int iInsert, DiskAnnNode* pCandidate, float candidateDist){ - int i; - assert( 0 <= iInsert && iInsert <= pCtx->nCandidates && iInsert < pCtx->maxCandidates ); - if( pCtx->nCandidates < pCtx->maxCandidates ){ - pCtx->nCandidates++; - } else { - DiskAnnNode *pLast = pCtx->aCandidates[pCtx->nCandidates - 1]; - if( !pLast->visited ){ - // since pLast is not visited it should have uninitialized pBlobSpot - so it's safe to completely free the node - assert( pLast->pBlobSpot == NULL ); - pCtx->nUnvisited--; - 
diskAnnNodeFree(pLast); - } - } - // Shift the candidates to the right to make space for the new one. - for(i = pCtx->nCandidates - 1; i > iInsert; i--){ - pCtx->aCandidates[i] = pCtx->aCandidates[i - 1]; - pCtx->aDistances[i] = pCtx->aDistances[i - 1]; - } - // Insert the new candidate. - pCtx->aCandidates[iInsert] = pCandidate; - pCtx->aDistances[iInsert] = candidateDist; +static void diskAnnSearchCtxInsertCandidate(DiskAnnSearchCtx *pCtx, int iInsert, DiskAnnNode* pCandidate, float distance){ + DiskAnnNode *pLast = NULL; + bufferInsert((u8*)pCtx->aCandidates, pCtx->nCandidates, pCtx->maxCandidates, iInsert, sizeof(DiskAnnNode*), (u8*)&pCandidate, (u8*)&pLast); + bufferInsert((u8*)pCtx->aDistances, pCtx->nCandidates, pCtx->maxCandidates, iInsert, sizeof(float), (u8*)&distance, NULL); + pCtx->nCandidates = MIN(pCtx->nCandidates + 1, pCtx->maxCandidates); + if( pLast != NULL && !pLast->visited ){ + // since pLast is not visited it should have uninitialized pBlobSpot - so it's safe to completely free the node + assert( pLast->pBlobSpot == NULL ); + pCtx->nUnvisited--; + diskAnnNodeFree(pLast); + } pCtx->nUnvisited++; } @@ -212535,7 +213203,14 @@ static int diskAnnSearchCtxFindClosestCandidateIdx(const DiskAnnSearchCtx *pCtx) // return position for new edge(C) which will replace previous edge on that position or -1 if we should ignore it // we also check that no current edge(B) will "prune" new vertex: i.e. 
dist(B, C) >= (means worse than) alpha * dist(node, C) for all current edges // if any edge(B) will "prune" new edge(C) we will ignore it (return -1) -static int diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, u64 newRowid, const Vector *pNewVector) { +static int diskAnnReplaceEdgeIdx( + const DiskAnnIndex *pIndex, + BlobSpot *pNodeBlob, + u64 newRowid, + VectorPair *pNewVector, + VectorPair *pPlaceholder, + float *pNodeToNew +) { int i, nEdges, nMaxEdges, iReplace = -1; Vector nodeVector, edgeVector; float nodeToNew, nodeToReplace; @@ -212543,20 +213218,27 @@ static int diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob nEdges = nodeBinEdges(pIndex, pNodeBlob); nMaxEdges = nodeEdgesMaxCount(pIndex); nodeBinVector(pIndex, pNodeBlob, &nodeVector); - nodeToNew = diskAnnVectorDistance(pIndex, &nodeVector, pNewVector); + loadVectorPair(pPlaceholder, &nodeVector); + + // we need to evaluate potentially approximate distance here in order to correctly compare it with edge distances + nodeToNew = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, pNewVector->pEdge); + *pNodeToNew = nodeToNew; for(i = nEdges - 1; i >= 0; i--){ u64 edgeRowid; float edgeToNew, nodeToEdge; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &nodeToEdge, &edgeVector); if( edgeRowid == newRowid ){ // deletes can leave "zombie" edges in the graph and we must override them and not store duplicate edges in the node return i; } - edgeToNew = diskAnnVectorDistance(pIndex, &edgeVector, pNewVector); - nodeToEdge = diskAnnVectorDistance(pIndex, &nodeVector, &edgeVector); + if( pIndex->nFormatVersion == VECTOR_FORMAT_V1 ){ + nodeToEdge = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, &edgeVector); + } + + edgeToNew = diskAnnVectorDistance(pIndex, &edgeVector, pNewVector->pEdge); if( nodeToNew > pIndex->pruningAlpha * edgeToNew ){ return -1; } @@ -212574,12 +213256,14 @@ static int 
diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob // prune edges after we inserted new edge at position iInserted // we only need to check for edges which will be pruned by new vertex // no need to check for other pairs as we checked them on previous insertions -static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, int iInserted) { +static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, int iInserted, VectorPair *pPlaceholder) { int i, s, nEdges; - Vector nodeVector, hintVector; + Vector nodeVector, hintEdgeVector; u64 hintRowid; nodeBinVector(pIndex, pNodeBlob, &nodeVector); + loadVectorPair(pPlaceholder, &nodeVector); + nEdges = nodeBinEdges(pIndex, pNodeBlob); assert( 0 <= iInserted && iInserted < nEdges ); @@ -212589,7 +213273,7 @@ static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, i nodeBinDebug(pIndex, pNodeBlob); #endif - nodeBinEdge(pIndex, pNodeBlob, iInserted, &hintRowid, &hintVector); + nodeBinEdge(pIndex, pNodeBlob, iInserted, &hintRowid, NULL, &hintEdgeVector); // remove edges which is no longer interesting due to the addition of iInserted i = 0; @@ -212597,14 +213281,17 @@ static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, i Vector edgeVector; float nodeToEdge, hintToEdge; u64 edgeRowid; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &nodeToEdge, &edgeVector); if( hintRowid == edgeRowid ){ i++; continue; } - nodeToEdge = diskAnnVectorDistance(pIndex, &nodeVector, &edgeVector); - hintToEdge = diskAnnVectorDistance(pIndex, &hintVector, &edgeVector); + if( pIndex->nFormatVersion == VECTOR_FORMAT_V1 ){ + nodeToEdge = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, &edgeVector); + } + + hintToEdge = diskAnnVectorDistance(pIndex, &hintEdgeVector, &edgeVector); if( nodeToEdge > pIndex->pruningAlpha * hintToEdge ){ nodeBinDeleteEdge(pIndex, pNodeBlob, i); nEdges--; @@ 
-212653,7 +213340,7 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u } nodeBinVector(pIndex, start->pBlobSpot, &startVector); - startDistance = diskAnnVectorDistance(pIndex, pCtx->pQuery, &startVector); + startDistance = diskAnnVectorDistance(pIndex, pCtx->query.pNode, &startVector); if( pCtx->blobMode == DISKANN_BLOB_READONLY ){ assert( start->pBlobSpot != NULL ); @@ -212670,8 +213357,9 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u Vector vCandidate; DiskAnnNode *pCandidate; BlobSpot *pCandidateBlob; + float distance; int iCandidate = diskAnnSearchCtxFindClosestCandidateIdx(pCtx); - pCandidate = diskAnnSearchCtxGetCandidate(pCtx, iCandidate); + diskAnnSearchCtxGetCandidate(pCtx, iCandidate, &pCandidate, &distance); rc = SQLITE_OK; if( pReusableBlobSpot != NULL ){ @@ -212699,25 +213387,30 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u goto out; } - diskAnnSearchCtxMarkVisited(pCtx, pCandidate); - nVisited += 1; DiskAnnTrace(("visiting candidate(%d): id=%lld\n", nVisited, pCandidate->nRowid)); nodeBinVector(pIndex, pCandidateBlob, &vCandidate); nEdges = nodeBinEdges(pIndex, pCandidateBlob); + // if pNodeQuery != pEdgeQuery then distance from aDistances is approximate and we must recalculate it + if( pCtx->query.pNode != pCtx->query.pEdge ){ + distance = diskAnnVectorDistance(pIndex, &vCandidate, pCtx->query.pNode); + } + + diskAnnSearchCtxMarkVisited(pCtx, pCandidate, distance); + for(i = 0; i < nEdges; i++){ u64 edgeRowid; Vector edgeVector; float edgeDistance; int iInsert; DiskAnnNode *pNewCandidate; - nodeBinEdge(pIndex, pCandidateBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pCandidateBlob, i, &edgeRowid, NULL, &edgeVector); if( diskAnnSearchCtxIsVisited(pCtx, edgeRowid) || diskAnnSearchCtxHasCandidate(pCtx, edgeRowid) ){ continue; } - edgeDistance = diskAnnVectorDistance(pIndex, pCtx->pQuery, &edgeVector); + edgeDistance = 
diskAnnVectorDistance(pIndex, pCtx->query.pEdge, &edgeVector); iInsert = diskAnnSearchCtxShouldAddCandidate(pIndex, pCtx, edgeDistance); if( iInsert < 0 ){ continue; @@ -212775,12 +213468,12 @@ int diskAnnSearch( *pzErrMsg = sqlite3_mprintf("vector index(search): k must be a non-negative integer"); return SQLITE_ERROR; } - if( pIndex->nVectorDims != pVector->dims ){ + if( pVector->dims != pIndex->nVectorDims ){ *pzErrMsg = sqlite3_mprintf("vector index(search): dimensions are different: %d != %d", pVector->dims, pIndex->nVectorDims); return SQLITE_ERROR; } - if( pVector->type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(search): only f32 vectors are supported"); + if( pVector->type != pIndex->nNodeVectorType ){ + *pzErrMsg = sqlite3_mprintf("vector index(search): vector type differs from column type: %d != %d", pVector->type, pIndex->nNodeVectorType); return SQLITE_ERROR; } @@ -212794,7 +213487,7 @@ int diskAnnSearch( *pzErrMsg = sqlite3_mprintf("vector index(search): failed to select start node for search"); return rc; } - rc = diskAnnSearchCtxInit(&ctx, pVector, pIndex->searchL, DISKANN_BLOB_READONLY); + rc = diskAnnSearchCtxInit(pIndex, &ctx, pVector, pIndex->searchL, k, DISKANN_BLOB_READONLY); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to initialize search context"); goto out; @@ -212803,7 +213496,7 @@ int diskAnnSearch( if( rc != SQLITE_OK ){ goto out; } - nOutRows = MIN(k, ctx.nCandidates); + nOutRows = MIN(k, ctx.nTopCandidates); rc = vectorOutRowsAlloc(pIndex->db, pRows, nOutRows, pKey->nKeyColumns, vectorIdxKeyRowidLike(pKey)); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to allocate output rows"); @@ -212811,9 +213504,9 @@ int diskAnnSearch( } for(i = 0; i < nOutRows; i++){ if( pRows->aIntValues != NULL ){ - rc = vectorOutRowsPut(pRows, i, 0, &ctx.aCandidates[i]->nRowid, NULL); + rc = vectorOutRowsPut(pRows, i, 0, &ctx.aTopCandidates[i]->nRowid, NULL); 
}else{ - rc = diskAnnGetShadowRowKeys(pIndex, ctx.aCandidates[i]->nRowid, pKey, pRows, i); + rc = diskAnnGetShadowRowKeys(pIndex, ctx.aTopCandidates[i]->nRowid, pKey, pRows, i); } if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to put result in the output row"); @@ -212837,24 +213530,39 @@ int diskAnnInsert( BlobSpot *pBlobSpot = NULL; DiskAnnNode *pVisited; DiskAnnSearchCtx ctx; + VectorPair vInsert, vCandidate; + vInsert.pNode = NULL; vInsert.pEdge = NULL; + vCandidate.pNode = NULL; vCandidate.pEdge = NULL; if( pVectorInRow->pVector->dims != pIndex->nVectorDims ){ *pzErrMsg = sqlite3_mprintf("vector index(insert): dimensions are different: %d != %d", pVectorInRow->pVector->dims, pIndex->nVectorDims); return SQLITE_ERROR; } - if( pVectorInRow->pVector->type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(insert): only f32 vectors are supported"); + if( pVectorInRow->pVector->type != pIndex->nNodeVectorType ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): vector type differs from column type: %d != %d", pVectorInRow->pVector->type, pIndex->nNodeVectorType); return SQLITE_ERROR; } DiskAnnTrace(("diskAnnInset started\n")); - rc = diskAnnSearchCtxInit(&ctx, pVectorInRow->pVector, pIndex->insertL, DISKANN_BLOB_WRITABLE); + rc = diskAnnSearchCtxInit(pIndex, &ctx, pVectorInRow->pVector, pIndex->insertL, 1, DISKANN_BLOB_WRITABLE); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(insert): failed to initialize search context"); return rc; } + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &vInsert) != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): unable to allocate mem for node VectorPair"); + rc = SQLITE_NOMEM_BKPT; + goto out; + } + + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &vCandidate) != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): unable to allocate mem for candidate 
VectorPair"); + rc = SQLITE_NOMEM_BKPT; + goto out; + } + // note: we must select random row before we will insert new row in the shadow table rc = diskAnnSelectRandomShadowRow(pIndex, &nStartRowid); if( rc == SQLITE_DONE ){ @@ -212892,28 +213600,33 @@ int diskAnnInsert( } // first pass - add all visited nodes as a potential neighbours of new node for(pVisited = ctx.visitedList; pVisited != NULL; pVisited = pVisited->pNext){ - Vector vector; + Vector nodeVector; int iReplace; + float nodeToNew; - nodeBinVector(pIndex, pVisited->pBlobSpot, &vector); - iReplace = diskAnnReplaceEdgeIdx(pIndex, pBlobSpot, pVisited->nRowid, &vector); + nodeBinVector(pIndex, pVisited->pBlobSpot, &nodeVector); + loadVectorPair(&vCandidate, &nodeVector); + + iReplace = diskAnnReplaceEdgeIdx(pIndex, pBlobSpot, pVisited->nRowid, &vCandidate, &vInsert, &nodeToNew); if( iReplace == -1 ){ continue; } - nodeBinReplaceEdge(pIndex, pBlobSpot, iReplace, pVisited->nRowid, &vector); - diskAnnPruneEdges(pIndex, pBlobSpot, iReplace); + nodeBinReplaceEdge(pIndex, pBlobSpot, iReplace, pVisited->nRowid, nodeToNew, vCandidate.pEdge); + diskAnnPruneEdges(pIndex, pBlobSpot, iReplace, &vInsert); } // second pass - add new node as a potential neighbour of all visited nodes + loadVectorPair(&vInsert, pVectorInRow->pVector); for(pVisited = ctx.visitedList; pVisited != NULL; pVisited = pVisited->pNext){ int iReplace; + float nodeToNew; - iReplace = diskAnnReplaceEdgeIdx(pIndex, pVisited->pBlobSpot, nNewRowid, pVectorInRow->pVector); + iReplace = diskAnnReplaceEdgeIdx(pIndex, pVisited->pBlobSpot, nNewRowid, &vInsert, &vCandidate, &nodeToNew); if( iReplace == -1 ){ continue; } - nodeBinReplaceEdge(pIndex, pVisited->pBlobSpot, iReplace, nNewRowid, pVectorInRow->pVector); - diskAnnPruneEdges(pIndex, pVisited->pBlobSpot, iReplace); + nodeBinReplaceEdge(pIndex, pVisited->pBlobSpot, iReplace, nNewRowid, nodeToNew, vInsert.pEdge); + diskAnnPruneEdges(pIndex, pVisited->pBlobSpot, iReplace, &vCandidate); rc = 
blobSpotFlush(pIndex, pVisited->pBlobSpot); if( rc != SQLITE_OK ){ @@ -212924,6 +213637,8 @@ int diskAnnInsert( rc = SQLITE_OK; out: + deinitVectorPair(&vInsert); + deinitVectorPair(&vCandidate); if( rc == SQLITE_OK ){ rc = blobSpotFlush(pIndex, pBlobSpot); if( rc != SQLITE_OK ){ @@ -212958,7 +213673,12 @@ int diskAnnDelete( DiskAnnTrace(("diskAnnDelete started: rowid=%lld\n", nodeRowid)); rc = blobSpotCreate(pIndex, &pNodeBlob, nodeRowid, pIndex->nBlockSize, DISKANN_BLOB_WRITABLE); - if( rc != SQLITE_OK ){ + if( rc == DISKANN_ROW_NOT_FOUND ){ + // as we omit rows with NULL values during insert, it can be the case that there is nothing to delete in the index, while row exists in the base table + // so, we must simply silently stop delete process as there is nothing to delete from index + rc = SQLITE_OK; + goto out; + }else if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(delete): failed to create blob for node row"); goto out; } @@ -212975,7 +213695,7 @@ int diskAnnDelete( nNeighbours = nodeBinEdges(pIndex, pNodeBlob); for(i = 0; i < nNeighbours; i++){ u64 edgeRowid; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, NULL); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, NULL, NULL); rc = blobSpotReload(pIndex, pEdgeBlob, edgeRowid, pIndex->nBlockSize); if( rc == DISKANN_ROW_NOT_FOUND ){ continue; @@ -213022,6 +213742,7 @@ int diskAnnOpenIndex( ){ DiskAnnIndex *pIndex; u64 nBlockSize; + int compressNeighbours; pIndex = sqlite3DbMallocRaw(db, sizeof(DiskAnnIndex)); if( pIndex == NULL ){ return SQLITE_NOMEM; @@ -213068,11 +213789,18 @@ int diskAnnOpenIndex( pIndex->searchL = VECTOR_SEARCH_L_DEFAULT; } pIndex->nNodeVectorSize = vectorDataSize(pIndex->nNodeVectorType, pIndex->nVectorDims); - // will change in future when we will support compression of edges vectors - pIndex->nEdgeVectorType = pIndex->nNodeVectorType; - pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize; + + compressNeighbours = vectorIdxParamsGetU64(pParams, 
VECTOR_COMPRESS_NEIGHBORS_PARAM_ID); + if( compressNeighbours == 0 ){ + pIndex->nEdgeVectorType = pIndex->nNodeVectorType; + pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize; + }else{ + pIndex->nEdgeVectorType = compressNeighbours; + pIndex->nEdgeVectorSize = vectorDataSize(compressNeighbours, pIndex->nVectorDims); + } *ppIndex = pIndex; + DiskAnnTrace(("opened index %s: max edges %d\n", zIdxName, nodeEdgesMaxCount(pIndex))); return SQLITE_OK; } @@ -213091,7 +213819,7 @@ void diskAnnCloseIndex(DiskAnnIndex *pIndex){ #endif /* !defined(SQLITE_OMIT_VECTOR) */ /************** End of vectordiskann.c ***************************************/ -/************** Begin file vectorfloat32.c ***********************************/ +/************** Begin file vectorfloat1bit.c *********************************/ /* ** 2024-07-04 ** @@ -213116,7 +213844,7 @@ void diskAnnCloseIndex(DiskAnnIndex *pIndex){ ** ****************************************************************************** ** -** 32-bit floating point vector format utilities. +** 1-bit vector format utilities. */ #ifndef SQLITE_OMIT_VECTOR /* #include "sqliteInt.h" */ @@ -213129,111 +213857,188 @@ void diskAnnCloseIndex(DiskAnnIndex *pIndex){ ** Utility routines for debugging **************************************************************************/ -void vectorF32Dump(const Vector *pVec){ - float *elems = pVec->data; +void vector1BitDump(const Vector *pVec){ + u8 *elems = pVec->data; unsigned i; - assert( pVec->type == VECTOR_TYPE_FLOAT32 ); + assert( pVec->type == VECTOR_TYPE_FLOAT1BIT ); + printf("f1bit: ["); for(i = 0; i < pVec->dims; i++){ - printf("%f ", elems[i]); + printf("%s%d", i == 0 ? "" : ", ", ((elems[i / 8] >> (i & 7)) & 1) ? 
+1 : -1); } - printf("\n"); + printf("]\n"); } /************************************************************************** ** Utility routines for vector serialization and deserialization **************************************************************************/ -static inline unsigned formatF32(float value, char *pBuf, int nBufSize){ - sqlite3_snprintf(nBufSize, pBuf, "%g", (double)value); - return strlen(pBuf); -} - -static inline unsigned serializeF32(unsigned char *pBuf, float value){ - u32 *p = (u32 *)&value; - pBuf[0] = *p & 0xFF; - pBuf[1] = (*p >> 8) & 0xFF; - pBuf[2] = (*p >> 16) & 0xFF; - pBuf[3] = (*p >> 24) & 0xFF; - return sizeof(float); -} - -static inline float deserializeF32(const unsigned char *pBuf){ - u32 value = 0; - value |= (u32)pBuf[0]; - value |= (u32)pBuf[1] << 8; - value |= (u32)pBuf[2] << 16; - value |= (u32)pBuf[3] << 24; - return *(float *)&value; -} - -size_t vectorF32SerializeToBlob( +void vector1BitSerializeToBlob( const Vector *pVector, unsigned char *pBlob, size_t nBlobSize ){ - float *elems = pVector->data; - unsigned char *pPtr = pBlob; - size_t len = 0; + u8 *elems = pVector->data; + u8 *pPtr = pBlob; unsigned i; - assert( pVector->type == VECTOR_TYPE_FLOAT32 ); + assert( pVector->type == VECTOR_TYPE_FLOAT1BIT ); assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize >= pVector->dims * sizeof(float) ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + for(i = 0; i < (pVector->dims + 7) / 8; i++){ + pPtr[i] = elems[i]; + } +} + +// [sum(map(int, bin(i)[2:])) for i in range(256)] +static int BitsCount[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 
6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; - for(i = 0; i < pVector->dims; i++){ - pPtr += serializeF32(pPtr, elems[i]); +static inline int sqlite3PopCount32(u32 a){ +#if GCC_VERSION>=5004000 && !defined(__INTEL_COMPILER) + return __builtin_popcount(a); +#else + return BitsCount[a >> 24] + BitsCount[(a >> 16) & 0xff] + BitsCount[(a >> 8) & 0xff] + BitsCount[a & 0xff]; +#endif +} + +int vector1BitDistanceHamming(const Vector *v1, const Vector *v2){ + int diff = 0; + u8 *e1U8 = v1->data; + u32 *e1U32 = v1->data; + u8 *e2U8 = v2->data; + u32 *e2U32 = v2->data; + int i, len8, len32, offset8; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT1BIT ); + assert( v2->type == VECTOR_TYPE_FLOAT1BIT ); + + len8 = (v1->dims + 7) / 8; + len32 = v1->dims / 32; + offset8 = len32 * 4; + + for(i = 0; i < len32; i++){ + diff += sqlite3PopCount32(e1U32[i] ^ e2U32[i]); + } + for(i = offset8; i < len8; i++){ + diff += sqlite3PopCount32(e1U8[i] ^ e2U8[i]); } - return sizeof(float) * pVector->dims; + return diff; } -size_t vectorF32DeserializeFromBlob( +void vector1BitDeserializeFromBlob( Vector *pVector, const unsigned char *pBlob, size_t nBlobSize ){ - float *elems = pVector->data; + u8 *elems = pVector->data; + + assert( pVector->type == VECTOR_TYPE_FLOAT1BIT ); + assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy(elems, pBlob, (pVector->dims + 7) / 8); +} + +#endif /* !defined(SQLITE_OMIT_VECTOR) */ + +/************** End of vectorfloat1bit.c *************************************/ +/************** Begin 
file vectorfloat32.c ***********************************/ +/* +** 2024-07-04 +** +** Copyright 2024 the libSQL authors +** +** Permission is hereby granted, free of charge, to any person obtaining a copy of +** this software and associated documentation files (the "Software"), to deal in +** the Software without restriction, including without limitation the rights to +** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +** the Software, and to permit persons to whom the Software is furnished to do so, +** subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +****************************************************************************** +** +** 32-bit floating point vector format utilities. 
+*/ +#ifndef SQLITE_OMIT_VECTOR +/* #include "sqliteInt.h" */ + +/* #include "vectorInt.h" */ + +/* #include */ + +/************************************************************************** +** Utility routines for debugging +**************************************************************************/ + +void vectorF32Dump(const Vector *pVec){ + float *elems = pVec->data; unsigned i; - pVector->type = VECTOR_TYPE_FLOAT32; - pVector->dims = nBlobSize / sizeof(float); - assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize % 2 == 0 || pBlob[nBlobSize - 1] == VECTOR_TYPE_FLOAT32 ); + assert( pVec->type == VECTOR_TYPE_FLOAT32 ); - for(i = 0; i < pVector->dims; i++){ - elems[i] = deserializeF32(pBlob); - pBlob += sizeof(float); + printf("f32: ["); + for(i = 0; i < pVec->dims; i++){ + printf("%s%f", i == 0 ? "" : ", ", elems[i]); } - return vectorDataSize(pVector->type, pVector->dims); + printf("]\n"); } -void vectorF32Serialize( - sqlite3_context *context, - const Vector *pVector +/************************************************************************** +** Utility routines for vector serialization and deserialization +**************************************************************************/ + +static inline unsigned formatF32(float value, char *pBuf, int nBufSize){ + sqlite3_snprintf(nBufSize, pBuf, "%g", (double)value); + return strlen(pBuf); +} + +void vectorF32SerializeToBlob( + const Vector *pVector, + unsigned char *pBlob, + size_t nBlobSize ){ float *elems = pVector->data; - unsigned char *pBlob; - size_t nBlobSize; + unsigned char *pPtr = pBlob; + size_t len = 0; + unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT32 ); assert( pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); - nBlobSize = vectorDataSize(pVector->type, pVector->dims); - - if( nBlobSize == 0 ){ - sqlite3_result_zeroblob(context, 0); - return; - } - - pBlob = sqlite3_malloc64(nBlobSize); - if( pBlob == NULL ){ - 
sqlite3_result_error_nomem(context); - return; + for(i = 0; i < pVector->dims; i++){ + pPtr += serializeF32(pPtr, elems[i]); } - - vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); - sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); } #define SINGLE_FLOAT_CHAR_LIMIT 32 @@ -213309,37 +214114,22 @@ float vectorF32DistanceL2(const Vector *v1, const Vector *v2){ return sqrt(sum); } -void vectorF32InitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - pVector->dims = nBlobSize / sizeof(float); - pVector->data = (void*)pBlob; -} - -int vectorF32ParseSqliteBlob( - sqlite3_value *arg, +void vectorF32DeserializeFromBlob( Vector *pVector, - char **pzErr + const unsigned char *pBlob, + size_t nBlobSize ){ - const unsigned char *pBlob; float *elems = pVector->data; unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT32 ); assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); - assert( sqlite3_value_type(arg) == SQLITE_BLOB ); - - pBlob = sqlite3_value_blob(arg); - if( sqlite3_value_bytes(arg) < sizeof(float) * pVector->dims ){ - *pzErr = sqlite3_mprintf("invalid f32 vector: not enough bytes for all dimensions"); - goto error; - } + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ elems[i] = deserializeF32(pBlob); pBlob += sizeof(float); } - return 0; -error: - return -1; } #endif /* !defined(SQLITE_OMIT_VECTOR) */ @@ -213386,10 +214176,14 @@ int vectorF32ParseSqliteBlob( void vectorF64Dump(const Vector *pVec){ double *elems = pVec->data; unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT64 ); + + printf("f64: ["); for(i = 0; i < pVec->dims; i++){ - printf("%lf ", elems[i]); + printf("%s%lf", i == 0 ? 
"" : ", ", elems[i]); } - printf("\n"); + printf("]\n"); } /************************************************************************** @@ -213427,7 +214221,7 @@ static inline double deserializeF64(const unsigned char *pBuf){ return *(double *)&value; } -size_t vectorF64SerializeToBlob( +void vectorF64SerializeToBlob( const Vector *pVector, unsigned char *pBlob, size_t nBlobSize @@ -213438,63 +214232,11 @@ size_t vectorF64SerializeToBlob( assert( pVector->type == VECTOR_TYPE_FLOAT64 ); assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize >= pVector->dims * sizeof(double) ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for (i = 0; i < pVector->dims; i++) { pPtr += serializeF64(pPtr, elems[i]); } - return sizeof(double) * pVector->dims; -} - -size_t vectorF64DeserializeFromBlob( - Vector *pVector, - const unsigned char *pBlob, - size_t nBlobSize -){ - double *elems = pVector->data; - unsigned i; - pVector->type = VECTOR_TYPE_FLOAT64; - pVector->dims = nBlobSize / sizeof(double); - - assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize % 2 == 1 && pBlob[nBlobSize - 1] == VECTOR_TYPE_FLOAT64 ); - - for(i = 0; i < pVector->dims; i++){ - elems[i] = deserializeF64(pBlob); - pBlob += sizeof(double); - } - return vectorDataSize(pVector->type, pVector->dims); -} - -void vectorF64Serialize( - sqlite3_context *context, - const Vector *pVector -){ - double *elems = pVector->data; - unsigned char *pBlob; - size_t nBlobSize; - - assert( pVector->type == VECTOR_TYPE_FLOAT64 ); - assert( pVector->dims <= MAX_VECTOR_SZ ); - - // allocate one extra trailing byte with vector blob type metadata - nBlobSize = vectorDataSize(pVector->type, pVector->dims) + 1; - - if( nBlobSize == 0 ){ - sqlite3_result_zeroblob(context, 0); - return; - } - - pBlob = sqlite3_malloc64(nBlobSize); - if( pBlob == NULL ){ - sqlite3_result_error_nomem(context); - return; - } - - vectorF64SerializeToBlob(pVector, pBlob, nBlobSize - 1); - pBlob[nBlobSize - 1] = 
VECTOR_TYPE_FLOAT64; - - sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); } #define SINGLE_DOUBLE_CHAR_LIMIT 32 @@ -213570,42 +214312,192 @@ double vectorF64DistanceL2(const Vector *v1, const Vector *v2){ return sqrt(sum); } -void vectorF64InitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - pVector->dims = nBlobSize / sizeof(double); - pVector->data = (void*)pBlob; -} - -int vectorF64ParseSqliteBlob( - sqlite3_value *arg, +void vectorF64DeserializeFromBlob( Vector *pVector, - char **pzErr + const unsigned char *pBlob, + size_t nBlobSize ){ - const unsigned char *pBlob; double *elems = pVector->data; unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT64 ); assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); - assert( sqlite3_value_type(arg) == SQLITE_BLOB ); - - pBlob = sqlite3_value_blob(arg); - if( sqlite3_value_bytes(arg) < sizeof(double) * pVector->dims ){ - *pzErr = sqlite3_mprintf("invalid f64 vector: not enough bytes for all dimensions"); - goto error; - } + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ elems[i] = deserializeF64(pBlob); pBlob += sizeof(double); } - return 0; -error: - return -1; } #endif /* !defined(SQLITE_OMIT_VECTOR) */ /************** End of vectorfloat64.c ***************************************/ +/************** Begin file vectorfloat8.c ************************************/ +/* +** 2024-07-04 +** +** Copyright 2024 the libSQL authors +** +** Permission is hereby granted, free of charge, to any person obtaining a copy of +** this software and associated documentation files (the "Software"), to deal in +** the Software without restriction, including without limitation the rights to +** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +** the Software, and to permit persons to whom the Software is furnished to do so, +** subject to the following conditions: +** +** The above copyright 
notice and this permission notice shall be included in all +** copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +****************************************************************************** +** +** 8-bit (INT8) floating point vector format utilities. +** +** The idea is to replace vector [f_0, f_1, ... f_k] with quantized uint8 values [q_0, q_1, ..., q_k] in such a way that +** f_i = alpha * q_i + shift, when alpha and shift determined from all f_i values like that: +** alpha = (max(f) - min(f)) / 255, shift = min(f) +** +** This differs from uint8 quantization in neural-network as it usually take form of f_i = alpha * (q_i - z) conversion instead +** But, neural-network uint8 quantization is less generic and works better for distributions centered around zero (symmetric or not) +** In our implementation we want to handle more generic cases - so profits from neural-network-style quantization are not clear +*/ +#ifndef SQLITE_OMIT_VECTOR +/* #include "sqliteInt.h" */ + +/* #include "vectorInt.h" */ + +/* #include */ + +/************************************************************************** +** Utility routines for vector serialization and deserialization +**************************************************************************/ + +void vectorF8GetParameters(const u8 *pData, int dims, float *pAlpha, float *pShift){ + pData = pData + ALIGN(dims, sizeof(float)); + *pAlpha = deserializeF32(pData); + *pShift = deserializeF32(pData + sizeof(*pAlpha)); +} + +void 
vectorF8SetParameters(u8 *pData, int dims, float alpha, float shift){ + pData = pData + ALIGN(dims, sizeof(float)); + serializeF32(pData, alpha); + serializeF32(pData + sizeof(alpha), shift); +} + +void vectorF8Dump(const Vector *pVec){ + u8 *elems = pVec->data; + float alpha, shift; + unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(pVec->data, pVec->dims, &alpha, &shift); + + printf("f8: ["); + for(i = 0; i < pVec->dims; i++){ + printf("%s%f", i == 0 ? "" : ", ", (float)elems[i] * alpha + shift); + } + printf("]\n"); +} + +void vectorF8SerializeToBlob( + const Vector *pVector, + unsigned char *pBlob, + size_t nBlobSize +){ + float alpha, shift; + + assert( pVector->type == VECTOR_TYPE_FLOAT8 ); + assert( pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy(pBlob, pVector->data, pVector->dims); + + vectorF8GetParameters(pVector->data, pVector->dims, &alpha, &shift); + vectorF8SetParameters(pBlob, pVector->dims, alpha, shift); +} + +float vectorF8DistanceCos(const Vector *v1, const Vector *v2){ + int i; + float alpha1, shift1, alpha2, shift2; + u32 sum1 = 0, sum2 = 0, sumsq1 = 0, sumsq2 = 0, doti = 0; + float dot = 0, norm1 = 0, norm2 = 0; + u8 *data1 = v1->data, *data2 = v2->data; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT8 ); + assert( v2->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(v1->data, v1->dims, &alpha1, &shift1); + vectorF8GetParameters(v2->data, v2->dims, &alpha2, &shift2); + + /* + * (Ax + S)^2 = A^2 x^2 + 2AS x + S^2 -> we need to maintain 'sumsq' and 'sum' + * (A1x + S1) * (A2y + S2) = A1A2 xy + A1 S2 x + A2 S1 y + S1 S2 -> we need to maintain 'dot' and 'sum' again + */ + + for(i = 0; i < v1->dims; i++){ + sum1 += data1[i]; + sum2 += data2[i]; + sumsq1 += data1[i]*data1[i]; + sumsq2 += data2[i]*data2[i]; + doti += data1[i]*data2[i]; + } + + dot = alpha1 * alpha2 * (float)doti + alpha1 * shift2 * (float)sum1 + 
alpha2 * shift1 * (float)sum2 + shift1 * shift2 * v1->dims; + norm1 = alpha1 * alpha1 * (float)sumsq1 + 2 * alpha1 * shift1 * (float)sum1 + shift1 * shift1 * v1->dims; + norm2 = alpha2 * alpha2 * (float)sumsq2 + 2 * alpha2 * shift2 * (float)sum2 + shift2 * shift2 * v1->dims; + + return 1.0 - (dot / sqrt(norm1 * norm2)); +} + +float vectorF8DistanceL2(const Vector *v1, const Vector *v2){ + int i; + float alpha1, shift1, alpha2, shift2; + float sum = 0; + u8 *data1 = v1->data, *data2 = v2->data; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT8 ); + assert( v2->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(v1->data, v1->dims, &alpha1, &shift1); + vectorF8GetParameters(v2->data, v2->dims, &alpha2, &shift2); + + for(i = 0; i < v1->dims; i++){ + float d = (alpha1 * data1[i] + shift1) - (alpha2 * data2[i] + shift2); + sum += d*d; + } + return sqrt(sum); +} + +void vectorF8DeserializeFromBlob( + Vector *pVector, + const unsigned char *pBlob, + size_t nBlobSize +){ + float alpha, shift; + + assert( pVector->type == VECTOR_TYPE_FLOAT8 ); + assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy((u8*)pVector->data, (u8*)pBlob, ALIGN(pVector->dims, sizeof(float))); + + vectorF8GetParameters(pBlob, pVector->dims, &alpha, &shift); + vectorF8SetParameters(pVector->data, pVector->dims, alpha, shift); +} + +#endif /* !defined(SQLITE_OMIT_VECTOR) */ + +/************** End of vectorfloat8.c ****************************************/ /************** Begin file vectorIndex.c *************************************/ /* ** 2024-03-18 @@ -213658,11 +214550,6 @@ int vectorF64ParseSqliteBlob( ** VectorIdxParams utilities ****************************************************************************/ -// VACUUM creates tables and indices first and only then populate data -// we need to ignore inserts from 'INSERT INTO vacuum.t SELECT * FROM t' statements because -// all 
shadow tables will be populated by VACUUM process during regular process of table copy -#define IsVacuum(db) ((db->mDbFlags&DBFLAG_Vacuum)!=0) - void vectorIdxParamsInit(VectorIdxParams *pParams, u8 *pBinBuf, int nBinSize) { assert( nBinSize <= VECTOR_INDEX_PARAMS_BUF_SIZE ); @@ -213875,7 +214762,7 @@ int vectorInRowAlloc(sqlite3 *db, const UnpackedRecord *pRecord, VectorInRow *pV vectorInitFromBlob(pVectorInRow->pVector, sqlite3_value_blob(pVectorValue), sqlite3_value_bytes(pVectorValue)); } else if( sqlite3_value_type(pVectorValue) == SQLITE_TEXT ){ // users can put strings (e.g. '[1,2,3]') in the table and we should process them correctly - if( vectorParse(pVectorValue, pVectorInRow->pVector, pzErrMsg) != 0 ){ + if( vectorParseWithType(pVectorValue, pVectorInRow->pVector, pzErrMsg) != 0 ){ rc = SQLITE_ERROR; goto out; } @@ -213987,14 +214874,18 @@ void vectorOutRowsFree(sqlite3 *db, VectorOutRows *pRows) { */ struct VectorColumnType { const char *zName; - int nBits; + int type; }; static struct VectorColumnType VECTOR_COLUMN_TYPES[] = { - { "FLOAT32", 32 }, - { "FLOAT64", 64 }, - { "F32_BLOB", 32 }, - { "F64_BLOB", 64 } + { "FLOAT32", VECTOR_TYPE_FLOAT32 }, + { "F32_BLOB", VECTOR_TYPE_FLOAT32 }, + { "FLOAT64", VECTOR_TYPE_FLOAT64 }, + { "F64_BLOB", VECTOR_TYPE_FLOAT64 }, + { "FLOAT1BIT", VECTOR_TYPE_FLOAT1BIT }, + { "F1BIT_BLOB", VECTOR_TYPE_FLOAT1BIT }, + { "FLOAT8", VECTOR_TYPE_FLOAT8 }, + { "F8_BLOB", VECTOR_TYPE_FLOAT8 }, }; /* @@ -214010,13 +214901,16 @@ struct VectorParamName { }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, - { "max_neighbors", 
VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float1bit", VECTOR_TYPE_FLOAT1BIT }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float8", VECTOR_TYPE_FLOAT8 }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float32", VECTOR_TYPE_FLOAT32 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { @@ -214182,14 +215076,7 @@ int vectorIdxParseColumnType(const char *zType, int *pType, int *pDims, const ch } *pDims = dimensions; - if( VECTOR_COLUMN_TYPES[i].nBits == 32 ) { - *pType = VECTOR_TYPE_FLOAT32; - } else if( VECTOR_COLUMN_TYPES[i].nBits == 64 ) { - *pType = VECTOR_TYPE_FLOAT64; - } else { - *pErrMsg = "unsupported vector type"; - return -1; - } + *pType = VECTOR_COLUMN_TYPES[i].type; return 0; } *pErrMsg = "unexpected vector column type"; @@ -214381,10 +215268,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { // this is done to prevent unrecoverable situations where index were dropped but index parameters deletion failed and second attempt will fail on first step int rcIdx, rcParams; - if( IsVacuum(db) ){ - return SQLITE_OK; - } - assert( zDbSName != NULL ); rcIdx = diskAnnDropIndex(db, zDbSName, zIdxName); @@ -214395,10 +215278,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { assert( zDbSName != NULL ); - if( 
IsVacuum(db) ){ - return SQLITE_OK; - } - return diskAnnClearIndex(db, zDbSName, zIdxName); } @@ -214408,7 +215287,7 @@ int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { * this made intentionally in order to natively support upload of SQLite dumps * * dump populates tables first and create indices after - * so we must omit them because shadow tables already filled + * so we must omit index refill setp because shadow tables already filled * * 1. in case of any error :-1 returned (and pParse errMsg is populated with some error message) * 2. if vector index must not be created : 0 returned @@ -214424,11 +215303,7 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co int i, rc = SQLITE_OK; int dims, type; int hasLibsqlVectorIdxFn = 0, hasCollation = 0; - const char *pzErrMsg; - - if( IsVacuum(pParse->db) ){ - return CREATE_IGNORE; - } + const char *pzErrMsg = NULL; assert( zDbSName != NULL ); @@ -214488,11 +215363,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: must contain exactly one column wrapped into the " VECTOR_INDEX_MARKER_FUNCTION " function"); return CREATE_FAIL; } - // we are able to support this but I doubt this works for now - more polishing required to make this work - if( pIdx->pPartIdxWhere != NULL ) { - sqlite3ErrorMsg(pParse, "vector index: where condition is forbidden"); - return CREATE_FAIL; - } pArgsList = pIdx->aColExpr->a[0].pExpr->x.pList; pListItem = pArgsList->a; @@ -214517,7 +215387,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: %s: %s", pzErrMsg, zEmbeddingColumnTypeName); return CREATE_FAIL; } - // schema is locked while db is initializing and we need to just proceed here if( db->init.busy == 1 ){ return CREATE_OK; @@ -214540,9 +215409,13 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co 
sqlite3ErrorMsg(pParse, "vector index: unsupported for tables without ROWID and composite primary key"); return CREATE_FAIL; } - rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams); + rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams, &pzErrMsg); if( rc != SQLITE_OK ){ - sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann"); + if( pzErrMsg != NULL ){ + sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann: %s", pzErrMsg); + }else{ + sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann"); + } return CREATE_FAIL; } rc = insertIndexParameters(db, zDbSName, pIdx->zName, &idxParams); @@ -214582,7 +215455,6 @@ int vectorIndexSearch( VectorIdxParams idxParams; vectorIdxParamsInit(&idxParams, NULL, 0); - assert( !IsVacuum(db) ); assert( zDbSName != NULL ); if( argc != 3 ){ @@ -214594,17 +215466,14 @@ int vectorIndexSearch( rc = SQLITE_ERROR; goto out; } - if( type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(search): only f32 vectors are supported"); - rc = SQLITE_ERROR; - goto out; - } + assert( type == VECTOR_TYPE_FLOAT32 || type == VECTOR_TYPE_FLOAT64 || type == VECTOR_TYPE_FLOAT1BIT ); + pVector = vectorAlloc(type, dims); if( pVector == NULL ){ rc = SQLITE_NOMEM_BKPT; goto out; } - if( vectorParse(argv[1], pVector, pzErrMsg) != 0 ){ + if( vectorParseWithType(argv[1], pVector, pzErrMsg) != 0 ){ rc = SQLITE_ERROR; goto out; } @@ -214667,10 +215536,6 @@ int vectorIndexInsert( int rc; VectorInRow vectorInRow; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - rc = vectorInRowAlloc(pCur->db, pRecord, &vectorInRow, pzErrMsg); if( rc != SQLITE_OK ){ return rc; @@ -214690,10 +215555,6 @@ int vectorIndexDelete( ){ VectorInRow payload; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - payload.pVector = NULL; payload.nKeys = r->nField - 1; payload.pKeyValues = r->aMem + 1; @@ -259749,40 +260610,46 @@ static int fts5TriCreate( Fts5Tokenizer **ppOut ){ int rc = 
SQLITE_OK; - TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); - UNUSED_PARAM(pUnused); - if( pNew==0 ){ - rc = SQLITE_NOMEM; + TrigramTokenizer *pNew = 0; + + if( nArg%2 ){ + rc = SQLITE_ERROR; }else{ - int i; - pNew->bFold = 1; - pNew->iFoldParam = 0; - for(i=0; rc==SQLITE_OK && ibFold = 1; + pNew->iFoldParam = 0; + for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); + } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ - pNew->bFold = (zArg[0]=='0'); - } - }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ - if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; - }else{ - pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; } - }else{ - rc = SQLITE_ERROR; } - } - if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ - rc = SQLITE_ERROR; - } + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } - if( rc!=SQLITE_OK ){ - fts5TriDelete((Fts5Tokenizer*)pNew); - pNew = 0; + if( rc!=SQLITE_OK ){ + fts5TriDelete((Fts5Tokenizer*)pNew); + pNew = 0; + } } } *ppOut = (Fts5Tokenizer*)pNew; diff --git a/libsql-ffi/bundled/bindings/bindgen.rs b/libsql-ffi/bundled/bindings/bindgen.rs index 9dec505c10..cc73807f33 100644 --- a/libsql-ffi/bundled/bindings/bindgen.rs +++ b/libsql-ffi/bundled/bindings/bindgen.rs @@ -940,7 +940,7 @@ extern "C" { extern "C" { pub fn sqlite3_vmprintf( arg1: *const ::std::os::raw::c_char, - arg2: va_list, + arg2: *mut __va_list_tag, ) -> *mut ::std::os::raw::c_char; } extern "C" { @@ -956,7 +956,7 @@ extern "C" { arg1: ::std::os::raw::c_int, arg2: *mut ::std::os::raw::c_char, arg3: *const ::std::os::raw::c_char, - arg4: va_list, + arg4: *mut __va_list_tag, ) -> *mut ::std::os::raw::c_char; } extern "C" { @@ -2503,7 +2503,7 @@ extern "C" { pub fn sqlite3_str_vappendf( arg1: *mut sqlite3_str, zFormat: *const 
::std::os::raw::c_char, - arg2: va_list, + arg2: *mut __va_list_tag, ); } extern "C" { @@ -3524,4 +3524,12 @@ extern "C" { extern "C" { pub static sqlite3_wal_manager: libsql_wal_manager; } -pub type __builtin_va_list = *mut ::std::os::raw::c_char; +pub type __builtin_va_list = [__va_list_tag; 1usize]; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct __va_list_tag { + pub gp_offset: ::std::os::raw::c_uint, + pub fp_offset: ::std::os::raw::c_uint, + pub overflow_arg_area: *mut ::std::os::raw::c_void, + pub reg_save_area: *mut ::std::os::raw::c_void, +} diff --git a/libsql-ffi/bundled/src/sqlite3.c b/libsql-ffi/bundled/src/sqlite3.c index 9d51c1d645..13b2556957 100644 --- a/libsql-ffi/bundled/src/sqlite3.c +++ b/libsql-ffi/bundled/src/sqlite3.c @@ -28,6 +28,7 @@ ** README.md ** configure ** configure.ac +** ext/fts5/fts5_tokenize.c ** ext/jni/src/org/sqlite/jni/capi/CollationNeededCallback.java ** ext/jni/src/org/sqlite/jni/capi/CommitHookCallback.java ** ext/jni/src/org/sqlite/jni/capi/PreupdateHookCallback.java @@ -69,6 +70,7 @@ ** src/test2.c ** src/test3.c ** src/test8.c +** src/vacuum.c ** src/vdbe.c ** src/vdbeInt.h ** src/vdbeapi.c @@ -19264,7 +19266,8 @@ struct Index { u16 nKeyCol; /* Number of columns forming the key */ u16 nColumn; /* Number of columns stored in the index */ u8 onError; /* OE_Abort, OE_Ignore, OE_Replace, or OE_None */ - unsigned idxType:3; /* 0:Normal 1:UNIQUE, 2:PRIMARY KEY, 3:IPK, 4:VECTOR INDEX */ + unsigned idxType:2; /* 0:Normal 1:UNIQUE, 2:PRIMARY KEY, 3:IPK */ + unsigned idxIsVector:1; /* 0:Normal 1:VECTOR INDEX */ unsigned bUnordered:1; /* Use this index for == or IN queries only */ unsigned uniqNotNull:1; /* True if UNIQUE and NOT NULL for all columns */ unsigned isResized:1; /* True if resizeIndexObject() has been called */ @@ -19296,7 +19299,6 @@ struct Index { #define SQLITE_IDXTYPE_UNIQUE 1 /* Implements a UNIQUE constraint */ #define SQLITE_IDXTYPE_PRIMARYKEY 2 /* Is the PRIMARY KEY for the table */ #define 
SQLITE_IDXTYPE_IPK 3 /* INTEGER PRIMARY KEY index */ -#define SQLITE_IDXTYPE_VECTOR 4 /* libSQL vector index */ /* Return true if index X is a PRIMARY KEY index */ #define IsPrimaryKeyIndex(X) ((X)->idxType==SQLITE_IDXTYPE_PRIMARYKEY) @@ -19305,10 +19307,7 @@ struct Index { #define IsUniqueIndex(X) ((X)->onError!=OE_None) /* Return true if index X is a vector index */ -#define IsVectorIndex(X) ((X)->idxType==SQLITE_IDXTYPE_VECTOR) - -/* Return true if index X is an user defined index (APPDEF or VECTOR) */ -#define IsAppDefIndex(X) ((X)->idxType==SQLITE_IDXTYPE_APPDEF||(X)->idxType==SQLITE_IDXTYPE_VECTOR) +#define IsVectorIndex(X) ((X)->idxIsVector==1) /* The Index.aiColumn[] values are normally positive integer. But ** there are some negative values that have special meaning: @@ -85239,14 +85238,45 @@ typedef u32 VectorDims; */ #define MAX_VECTOR_SZ 65536 +/* + * on-disk binary format for vector of different types: + * 1. float32 + * [data[0] as f32] [data[1] as f32] ... [data[dims - 1] as f32] [1 as u8]? + * - last 'type'-byte is optional for float32 vectors + * + * 2. float64 + * [data[0] as f64] [data[1] as f64] ... [data[dims - 1] as f64] [2 as u8] + * - last 'type'-byte is mandatory for float64 vectors + * + * 3. float1bit + * [data[0] as u8] [data[1] as u8] ... [data[(dims + 7) / 8] as u8] [_ as u8; padding]? [trailing_bits as u8] [3 as u8] + * - every data byte (except for the last) represents exactly 8 components of the vector + * - last data byte represents [1..8] components of the vector + * - optional padding byte ensures that "trailing_bits" byte will be written at the odd blob position (0-based) + * - "trailing_bits" byte specify amount of trailing *bits* in the blob without last 'type'-byte which must be omitted + * (so, vector dimensions are equal to 8 * (blob_size - 1) - trailing_bits) + * - last 'type'-byte is mandatory for float1bit vectors + * + * 4. float8 + * [data[0] as u8] [data[1] as u8] ... 
[data[dims - 1] as u8] [_ as u8; alignment_padding]* [alpha as f32] [shift as f32] [padding as u8] [trailing_bytes as u8] [4 as u8] + * - every data byte represents single quantized vector component + * - "alignment_padding" has size from 0 to 3 bytes in order to pad content to multiple of 4 = sizeof(float) + * - "trailing_bytes" byte specify amount of bytes in the "alignment_padding" + * - last 'type'-byte is mandatory for float8 vectors +*/ + /* * Enumerate of supported vector types (0 omitted intentionally as we can use zero as "undefined" value) */ -#define VECTOR_TYPE_FLOAT32 1 -#define VECTOR_TYPE_FLOAT64 2 +#define VECTOR_TYPE_FLOAT32 1 +#define VECTOR_TYPE_FLOAT64 2 +#define VECTOR_TYPE_FLOAT1BIT 3 +#define VECTOR_TYPE_FLOAT8 4 #define VECTOR_FLAGS_STATIC 1 +#define ALIGN(n, size) (((n + size - 1) / size) * size) + /* * Object which represents a vector * data points to the memory which must be interpreted according to the vector type @@ -85261,15 +85291,20 @@ struct Vector { size_t vectorDataSize(VectorType, VectorDims); Vector *vectorAlloc(VectorType, VectorDims); void vectorFree(Vector *v); -int vectorParse(sqlite3_value *, Vector *, char **); +int vectorParseWithType(sqlite3_value *, Vector *, char **); void vectorInit(Vector *, VectorType, VectorDims, void *); /* * Dumps vector on the console (used only for debugging) */ -void vectorDump (const Vector *v); -void vectorF32Dump(const Vector *v); -void vectorF64Dump(const Vector *v); +void vectorDump (const Vector *v); +void vectorF8Dump (const Vector *v); +void vectorF32Dump (const Vector *v); +void vectorF64Dump (const Vector *v); +void vector1BitDump(const Vector *v); + +void vectorF8GetParameters(const u8 *, int, float *, float *); +void vectorF8SetParameters(u8 *, int, float, float); /* * Converts vector to the text representation and write the result to the sqlite3_context @@ -85281,28 +85316,30 @@ void vectorF64MarshalToText(sqlite3_context *, const Vector *); /* * Serializes vector to the blob in 
little-endian format according to the IEEE-754 standard */ -size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t); -size_t vectorF32SerializeToBlob(const Vector *, unsigned char *, size_t); -size_t vectorF64SerializeToBlob(const Vector *, unsigned char *, size_t); - -/* - * Deserializes vector from the blob in little-endian format according to the IEEE-754 standard -*/ -size_t vectorDeserializeFromBlob (Vector *, const unsigned char *, size_t); -size_t vectorF32DeserializeFromBlob(Vector *, const unsigned char *, size_t); -size_t vectorF64DeserializeFromBlob(Vector *, const unsigned char *, size_t); +void vectorSerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF8SerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF32SerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF64SerializeToBlob (const Vector *, unsigned char *, size_t); +void vector1BitSerializeToBlob(const Vector *, unsigned char *, size_t); /* * Calculates cosine distance between two vectors (vector must have same type and same dimensions) */ float vectorDistanceCos (const Vector *, const Vector *); +float vectorF8DistanceCos (const Vector *, const Vector *); float vectorF32DistanceCos (const Vector *, const Vector *); double vectorF64DistanceCos(const Vector *, const Vector *); +/* + * Calculates hamming distance between two 1-bit vectors (vector must have same dimensions) +*/ +int vector1BitDistanceHamming(const Vector *, const Vector *); + /* * Calculates L2 distance between two vectors (vector must have same type and same dimensions) */ float vectorDistanceL2 (const Vector *, const Vector *); +float vectorF8DistanceL2 (const Vector *, const Vector *); float vectorF32DistanceL2 (const Vector *, const Vector *); double vectorF64DistanceL2(const Vector *, const Vector *); @@ -85311,25 +85348,44 @@ double vectorF64DistanceL2(const Vector *, const Vector *); * LibSQL can append one trailing byte in the end of final blob. 
This byte will be later used to determine type of the blob * By default, blob with even length will be treated as a f32 blob */ -void vectorSerialize (sqlite3_context *, const Vector *); -void vectorF32Serialize(sqlite3_context *, const Vector *); -void vectorF64Serialize(sqlite3_context *, const Vector *); +void vectorSerializeWithMeta(sqlite3_context *, const Vector *); /* * Parses Vector content from the blob; vector type and dimensions must be filled already */ -int vectorParseSqliteBlob (sqlite3_value *, Vector *, char **); -int vectorF32ParseSqliteBlob(sqlite3_value *, Vector *, char **); -int vectorF64ParseSqliteBlob(sqlite3_value *, Vector *, char **); +int vectorParseSqliteBlobWithType(sqlite3_value *, Vector *, char **); -void vectorInitStatic(Vector *, VectorType, const unsigned char *, size_t); +void vectorF8DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vectorF32DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vectorF64DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vector1BitDeserializeFromBlob(Vector *, const unsigned char *, size_t); + +void vectorInitStatic(Vector *, VectorType, VectorDims, void *); void vectorInitFromBlob(Vector *, const unsigned char *, size_t); -void vectorF32InitFromBlob(Vector *, const unsigned char *, size_t); -void vectorF64InitFromBlob(Vector *, const unsigned char *, size_t); + +void vectorConvert(const Vector *, Vector *); /* Detect type and dimension of vector provided with first parameter of sqlite3_value * type */ int detectVectorParameters(sqlite3_value *, int, int *, int *, char **); +static inline unsigned serializeF32(unsigned char *pBuf, float value){ + u32 *p = (u32 *)&value; + pBuf[0] = *p & 0xFF; + pBuf[1] = (*p >> 8) & 0xFF; + pBuf[2] = (*p >> 16) & 0xFF; + pBuf[3] = (*p >> 24) & 0xFF; + return sizeof(float); +} + +static inline float deserializeF32(const unsigned char *pBuf){ + u32 value = 0; + value |= (u32)pBuf[0]; + value |= (u32)pBuf[1] << 8; 
+ value |= (u32)pBuf[2] << 16; + value |= (u32)pBuf[3] << 24; + return *(float *)&value; +} + #if 0 } /* end of the 'extern "C"' block */ #endif @@ -85408,10 +85464,10 @@ int nodeEdgesMetadataOffset(const DiskAnnIndex *pIndex); void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Vector *pVector); void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector); u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot); -void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector); +void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, float *distance, Vector *pVector); int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u64 nRowid); void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPruned); -void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector); +void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, float distance, Vector *pVector); void nodeBinDeleteEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iDelete); void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot); @@ -85435,43 +85491,47 @@ typedef u8 MetricType; */ /* format version which can help to upgrade vector on-disk format without breaking older version of the db */ -#define VECTOR_FORMAT_PARAM_ID 1 +#define VECTOR_FORMAT_PARAM_ID 1 /* - * 1 - initial version + * 1 - v1 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u64 unused ] [u64 edge rowid] ] ... + * 2 - v2 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u32 unused] [f32 distance] [u64 edge rowid] ] ... 
*/ -#define VECTOR_FORMAT_DEFAULT 1 +#define VECTOR_FORMAT_V1 1 +#define VECTOR_FORMAT_DEFAULT 2 /* type of the vector index */ -#define VECTOR_INDEX_TYPE_PARAM_ID 2 -#define VECTOR_INDEX_TYPE_DISKANN 1 +#define VECTOR_INDEX_TYPE_PARAM_ID 2 +#define VECTOR_INDEX_TYPE_DISKANN 1 /* type of the underlying vector for the vector index */ -#define VECTOR_TYPE_PARAM_ID 3 +#define VECTOR_TYPE_PARAM_ID 3 /* dimension of the underlying vector for the vector index */ -#define VECTOR_DIM_PARAM_ID 4 +#define VECTOR_DIM_PARAM_ID 4 /* metric type used for comparing two vectors */ -#define VECTOR_METRIC_TYPE_PARAM_ID 5 -#define VECTOR_METRIC_TYPE_COS 1 -#define VECTOR_METRIC_TYPE_L2 2 +#define VECTOR_METRIC_TYPE_PARAM_ID 5 +#define VECTOR_METRIC_TYPE_COS 1 +#define VECTOR_METRIC_TYPE_L2 2 /* block size */ -#define VECTOR_BLOCK_SIZE_PARAM_ID 6 -#define VECTOR_BLOCK_SIZE_DEFAULT 128 +#define VECTOR_BLOCK_SIZE_PARAM_ID 6 +#define VECTOR_BLOCK_SIZE_DEFAULT 128 -#define VECTOR_PRUNING_ALPHA_PARAM_ID 7 -#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2 +#define VECTOR_PRUNING_ALPHA_PARAM_ID 7 +#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2 -#define VECTOR_INSERT_L_PARAM_ID 8 -#define VECTOR_INSERT_L_DEFAULT 70 +#define VECTOR_INSERT_L_PARAM_ID 8 +#define VECTOR_INSERT_L_DEFAULT 70 -#define VECTOR_SEARCH_L_PARAM_ID 9 -#define VECTOR_SEARCH_L_DEFAULT 200 +#define VECTOR_SEARCH_L_PARAM_ID 9 +#define VECTOR_SEARCH_L_DEFAULT 200 -#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 +#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 + +#define VECTOR_COMPRESS_NEIGHBORS_PARAM_ID 11 /* total amount of vector index parameters */ -#define VECTOR_PARAM_IDS_COUNT 9 +#define VECTOR_PARAM_IDS_COUNT 11 /* * Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float) @@ -85553,7 +85613,7 @@ int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *); void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int); void vectorOutRowsFree(sqlite3 *, VectorOutRows *); 
-int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *); +int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *, const char **); int diskAnnClearIndex(sqlite3 *, const char *, const char *); int diskAnnDropIndex(sqlite3 *, const char *, const char *); int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **); @@ -123180,7 +123240,7 @@ static void SQLITE_NOINLINE deleteTable(sqlite3 *db, Table *pTable){ for(pIndex = pTable->pIndex; pIndex; pIndex=pNext){ pNext = pIndex->pNext; assert( pIndex->pSchema==pTable->pSchema - || (IsVirtual(pTable) && !IsAppDefIndex(pIndex)) ); + || (IsVirtual(pTable) && pIndex->idxType!=SQLITE_IDXTYPE_APPDEF) ); if( db->pnBytesFreed==0 && !IsVirtual(pTable) ){ char *zName = pIndex->zName; TESTONLY ( Index *pOld = ) sqlite3HashInsert( @@ -126692,13 +126752,12 @@ SQLITE_PRIVATE void sqlite3CreateIndex( goto exit_create_index; } if( vectorIdxRc >= 1 ){ - idxType = SQLITE_IDXTYPE_VECTOR; /* * SQLite can use B-Tree indices in some optimizations (like SELECT COUNT(*) can use any full B-Tree index instead of PK index) * But, SQLite pretty conservative about usage of unordered indices - that's what we need here */ pIndex->bUnordered = 1; - pIndex->idxType = idxType; + pIndex->idxIsVector = 1; } if( vectorIdxRc == 1 ){ skipRefill = 1; @@ -126746,7 +126805,7 @@ SQLITE_PRIVATE void sqlite3CreateIndex( for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){ int k; assert( IsUniqueIndex(pIdx) ); - assert( !IsAppDefIndex(pIdx) ); + assert( pIdx->idxType!=SQLITE_IDXTYPE_APPDEF ); assert( IsUniqueIndex(pIndex) ); if( pIdx->nKeyCol!=pIndex->nKeyCol ) continue; @@ -127027,7 +127086,7 @@ SQLITE_PRIVATE void sqlite3DropIndex(Parse *pParse, SrcList *pName, int ifExists pParse->checkSchema = 1; goto exit_drop_index; } - if( !IsAppDefIndex(pIndex) ){ + if( pIndex->idxType!=SQLITE_IDXTYPE_APPDEF ){ sqlite3ErrorMsg(pParse, "index 
associated with UNIQUE " "or PRIMARY KEY constraint cannot be dropped", 0); goto exit_drop_index; @@ -155952,6 +156011,10 @@ SQLITE_PRIVATE void sqlite3UpsertDoUpdate( /* #include "sqliteInt.h" */ /* #include "vdbeInt.h" */ +#ifndef SQLITE_OMIT_VECTOR +/* #include "vectorIndexInt.h" */ +#endif + #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* @@ -156229,6 +156292,27 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( if( rc!=SQLITE_OK ) goto end_of_vacuum; db->init.iDb = 0; +#ifndef SQLITE_OMIT_VECTOR + // shadow tables for vector index will be populated automatically during CREATE INDEX command + // so we must skip them at this step + if( sqlite3FindTable(db, VECTOR_INDEX_GLOBAL_META_TABLE, zDbMain) != NULL ){ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0 AND name NOT IN (SELECT name||'_shadow' FROM " VECTOR_INDEX_GLOBAL_META_TABLE ")", + zDbMain + ); + }else{ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0", + zDbMain + ); + } +#else /* Loop through the tables in the main database. For each, do ** an "INSERT INTO vacuum_db.xxx SELECT * FROM main.xxx;" to copy ** the contents to the temporary database. 
@@ -156240,6 +156324,7 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( "WHERE type='table'AND coalesce(rootpage,1)>0", zDbMain ); +#endif assert( (db->mDbFlags & DBFLAG_Vacuum)!=0 ); db->mDbFlags &= ~DBFLAG_Vacuum; if( rc!=SQLITE_OK ) goto end_of_vacuum; @@ -177876,9 +177961,6 @@ static YYACTIONTYPE yy_reduce( case 242: /* cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm indextype ON nm LP sortlist RP where_opt */ { u8 idxType = SQLITE_IDXTYPE_APPDEF; - if( yymsp[-6].minor.yy421.pUsing!=0 ){ - idxType = SQLITE_IDXTYPE_VECTOR; - } sqlite3CreateIndex(pParse, &yymsp[-8].minor.yy0, &yymsp[-7].minor.yy0, sqlite3SrcListAppend(pParse,0,&yymsp[-4].minor.yy0,0), yymsp[-2].minor.yy402, yymsp[-11].minor.yy502, &yymsp[-12].minor.yy0, yymsp[0].minor.yy590, SQLITE_SO_ASC, yymsp[-9].minor.yy502, idxType, yymsp[-6].minor.yy421.pUsing); @@ -210953,6 +211035,10 @@ size_t vectorDataSize(VectorType type, VectorDims dims){ return dims * sizeof(float); case VECTOR_TYPE_FLOAT64: return dims * sizeof(double); + case VECTOR_TYPE_FLOAT1BIT: + return (dims + 7) / 8; + case VECTOR_TYPE_FLOAT8: + return ALIGN(dims, sizeof(float)) + sizeof(float) /* alpha */ + sizeof(float) /* shift */; default: assert(0); } @@ -210984,10 +211070,11 @@ Vector *vectorAlloc(VectorType type, VectorDims dims){ ** Note that the vector object points to the blob so if ** you free the blob, the vector becomes invalid. 
**/ -void vectorInitStatic(Vector *pVector, VectorType type, const unsigned char *pBlob, size_t nBlobSize){ - pVector->type = type; +void vectorInitStatic(Vector *pVector, VectorType type, VectorDims dims, void *pBlob){ pVector->flags = VECTOR_FLAGS_STATIC; - vectorInitFromBlob(pVector, pBlob, nBlobSize); + pVector->type = type; + pVector->dims = dims; + pVector->data = pBlob; } /* @@ -211023,6 +211110,10 @@ float vectorDistanceCos(const Vector *pVector1, const Vector *pVector2){ return vectorF32DistanceCos(pVector1, pVector2); case VECTOR_TYPE_FLOAT64: return vectorF64DistanceCos(pVector1, pVector2); + case VECTOR_TYPE_FLOAT1BIT: + return vector1BitDistanceHamming(pVector1, pVector2); + case VECTOR_TYPE_FLOAT8: + return vectorF8DistanceCos(pVector1, pVector2); default: assert(0); } @@ -211036,6 +211127,8 @@ float vectorDistanceL2(const Vector *pVector1, const Vector *pVector2){ return vectorF32DistanceL2(pVector1, pVector2); case VECTOR_TYPE_FLOAT64: return vectorF64DistanceL2(pVector1, pVector2); + case VECTOR_TYPE_FLOAT8: + return vectorF8DistanceL2(pVector1, pVector2); default: assert(0); } @@ -211159,16 +211252,97 @@ static int vectorParseSqliteText( return -1; } -int vectorParseSqliteBlob( +static int vectorParseMeta(const unsigned char *pBlob, size_t nBlobSize, int *pType, int *pDims, size_t *pDataSize, char **pzErrMsg){ + int nTrailingBits; + int nTrailingBytes; + + if( nBlobSize % 2 == 0 ){ + *pType = VECTOR_TYPE_FLOAT32; + *pDims = nBlobSize / sizeof(float); + *pDataSize = nBlobSize; + return SQLITE_OK; + } + *pType = pBlob[nBlobSize - 1]; + nBlobSize--; + + if( *pType == VECTOR_TYPE_FLOAT32 ){ + if( nBlobSize % 4 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float32 vector blob length must be divisible by 4 (excluding optional 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + *pDims = nBlobSize / sizeof(float); + *pDataSize = nBlobSize; + }else if( *pType == VECTOR_TYPE_FLOAT64 ){ + if( nBlobSize % 8 != 0 ){ + *pzErrMsg = 
sqlite3_mprintf("vector: float64 vector blob length must be divisible by 8 (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + *pDims = nBlobSize / sizeof(double); + *pDataSize = nBlobSize; + }else if( *pType == VECTOR_TYPE_FLOAT1BIT ){ + if( nBlobSize == 0 || nBlobSize % 2 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float1bit vector blob length must be divisible by 2 and not be empty (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + nTrailingBits = pBlob[nBlobSize - 1]; + *pDims = nBlobSize * 8 - nTrailingBits; + *pDataSize = (*pDims + 7) / 8; + }else if( *pType == VECTOR_TYPE_FLOAT8 ){ + if( nBlobSize < 2 || nBlobSize % 2 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float8 vector blob length must be divisible by 2 and has at least 2 bytes (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + nTrailingBytes = pBlob[nBlobSize - 1]; + *pDims = (nBlobSize - 2) - sizeof(float) - sizeof(float) - nTrailingBytes; + *pDataSize = nBlobSize - 2; + }else{ + *pzErrMsg = sqlite3_mprintf("vector: unexpected binary type: %d", *pType); + return SQLITE_ERROR; + } + return SQLITE_OK; +} + +int vectorParseSqliteBlobWithType( sqlite3_value *arg, Vector *pVector, char **pzErrMsg ){ + const unsigned char *pBlob; + size_t nBlobSize, nDataSize; + int type, dims; + + assert( sqlite3_value_type(arg) == SQLITE_BLOB ); + + pBlob = sqlite3_value_blob(arg); + nBlobSize = sqlite3_value_bytes(arg); + if( vectorParseMeta(pBlob, nBlobSize, &type, &dims, &nDataSize, pzErrMsg) != SQLITE_OK ){ + return SQLITE_ERROR; + } + + if( nDataSize != vectorDataSize(pVector->type, pVector->dims) ){ + *pzErrMsg = sqlite3_mprintf( + "vector: unexpected data part size: type=%d, dims=%d, %u != %u", + pVector->type, + pVector->dims, + nDataSize, + vectorDataSize(pVector->type, pVector->dims) + ); + return SQLITE_ERROR; + } + switch (pVector->type) { case VECTOR_TYPE_FLOAT32: - return vectorF32ParseSqliteBlob(arg, pVector, pzErrMsg); + 
vectorF32DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; case VECTOR_TYPE_FLOAT64: - return vectorF64ParseSqliteBlob(arg, pVector, pzErrMsg); + vectorF64DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitDeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; + case VECTOR_TYPE_FLOAT8: + vectorF8DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; default: assert(0); } @@ -211177,32 +211351,21 @@ int vectorParseSqliteBlob( int detectBlobVectorParameters(sqlite3_value *arg, int *pType, int *pDims, char **pzErrMsg) { const u8 *pBlob; - int nBlobSize; + size_t nBlobSize, nDataSize; assert( sqlite3_value_type(arg) == SQLITE_BLOB ); pBlob = sqlite3_value_blob(arg); nBlobSize = sqlite3_value_bytes(arg); - if( nBlobSize % 2 != 0 ){ - // we have trailing byte with explicit type definition - *pType = pBlob[nBlobSize - 1]; - } else { - // else, fallback to FLOAT32 - *pType = VECTOR_TYPE_FLOAT32; - } - if( *pType == VECTOR_TYPE_FLOAT32 ){ - *pDims = nBlobSize / sizeof(float); - } else if( *pType == VECTOR_TYPE_FLOAT64 ){ - *pDims = nBlobSize / sizeof(double); - } else{ - *pzErrMsg = sqlite3_mprintf("vector: unexpected binary type: got %d, expected %d or %d", *pType, VECTOR_TYPE_FLOAT32, VECTOR_TYPE_FLOAT64); - return -1; + + if( vectorParseMeta(pBlob, nBlobSize, pType, pDims, &nDataSize, pzErrMsg) != SQLITE_OK ){ + return SQLITE_ERROR; } if( *pDims > MAX_VECTOR_SZ ){ *pzErrMsg = sqlite3_mprintf("vector: max size exceeded: %d > %d", *pDims, MAX_VECTOR_SZ); - return -1; + return SQLITE_ERROR; } - return 0; + return SQLITE_OK; } int detectTextVectorParameters(sqlite3_value *arg, int typeHint, int *pType, int *pDims, char **pzErrMsg) { @@ -211251,14 +211414,14 @@ int detectVectorParameters(sqlite3_value *arg, int typeHint, int *pType, int *pD } } -int vectorParse( +int vectorParseWithType( sqlite3_value *arg, Vector *pVector, char **pzErrMsg ){ switch( sqlite3_value_type(arg) ){ case SQLITE_BLOB: - return 
vectorParseSqliteBlob(arg, pVector, pzErrMsg); + return vectorParseSqliteBlobWithType(arg, pVector, pzErrMsg); case SQLITE_TEXT: return vectorParseSqliteText(arg, pVector, pzErrMsg); default: @@ -211275,6 +211438,12 @@ void vectorDump(const Vector *pVector){ case VECTOR_TYPE_FLOAT64: vectorF64Dump(pVector); break; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitDump(pVector); + break; + case VECTOR_TYPE_FLOAT8: + vectorF8Dump(pVector); + break; default: assert(0); } @@ -211296,56 +211465,326 @@ void vectorMarshalToText( } } -void vectorSerialize( +static int vectorMetaSize(VectorType type, VectorDims dims){ + int nDataSize; + if( type == VECTOR_TYPE_FLOAT32 ){ + return 0; + }else if( type == VECTOR_TYPE_FLOAT64 ){ + return 1; + }else if( type == VECTOR_TYPE_FLOAT1BIT ){ + nDataSize = vectorDataSize(type, dims); + // optional padding byte + "trailing-bits" byte + "vector-type" byte + return (nDataSize % 2 == 0 ? 1 : 0) + 1 + 1; + }else if( type == VECTOR_TYPE_FLOAT8 ){ + nDataSize = vectorDataSize(type, dims); + assert( nDataSize % 2 == 0 ); + /* padding byte + "trailing-bytes" byte + "vector-type" byte */ + return 1 + 1 + 1; + }else{ + assert( 0 ); + } +} + +static void vectorSerializeMeta(const Vector *pVector, size_t nDataSize, unsigned char *pBlob, size_t nBlobSize){ + if( pVector->type == VECTOR_TYPE_FLOAT32 ){ + // no meta for f32 type as this is "default" vector type + }else if( pVector->type == VECTOR_TYPE_FLOAT64 ){ + assert( nDataSize % 2 == 0 ); + assert( nBlobSize == nDataSize + 1 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT64; + }else if( pVector->type == VECTOR_TYPE_FLOAT1BIT ){ + assert( nBlobSize % 2 == 1 ); + assert( nBlobSize >= 3 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT1BIT; + pBlob[nBlobSize - 2] = 8 * (nBlobSize - 1) - pVector->dims; + if( vectorMetaSize(pVector->type, pVector->dims) == 3 ){ + pBlob[nBlobSize - 3] = 0; + } + }else if( pVector->type == VECTOR_TYPE_FLOAT8 ){ + assert( nBlobSize % 2 == 1 ); + assert( nDataSize % 2 == 0 ); + assert( 
nBlobSize == nDataSize + 3 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT8; + pBlob[nBlobSize - 2] = ALIGN(pVector->dims, sizeof(float)) - pVector->dims; + }else{ + assert( 0 ); + } +} + +void vectorSerializeWithMeta( sqlite3_context *context, const Vector *pVector ){ + unsigned char *pBlob; + size_t nBlobSize, nDataSize, nMetaSize; + + assert( pVector->dims <= MAX_VECTOR_SZ ); + + nDataSize = vectorDataSize(pVector->type, pVector->dims); + nMetaSize = vectorMetaSize(pVector->type, pVector->dims); + nBlobSize = nDataSize + nMetaSize; + if( nBlobSize == 0 ){ + sqlite3_result_zeroblob(context, 0); + return; + } + + pBlob = sqlite3_malloc64(nBlobSize); + if( pBlob == NULL ){ + sqlite3_result_error_nomem(context); + return; + } + + vectorSerializeToBlob(pVector, pBlob, nDataSize); + vectorSerializeMeta(pVector, nDataSize, pBlob, nBlobSize); + sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); +} + +void vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t nBlobSize){ switch (pVector->type) { case VECTOR_TYPE_FLOAT32: - vectorF32Serialize(context, pVector); + vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); break; case VECTOR_TYPE_FLOAT64: - vectorF64Serialize(context, pVector); + vectorF64SerializeToBlob(pVector, pBlob, nBlobSize); + break; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitSerializeToBlob(pVector, pBlob, nBlobSize); + break; + case VECTOR_TYPE_FLOAT8: + vectorF8SerializeToBlob(pVector, pBlob, nBlobSize); break; default: assert(0); } } -size_t vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - return vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); - case VECTOR_TYPE_FLOAT64: - return vectorF64SerializeToBlob(pVector, pBlob, nBlobSize); - default: - assert(0); +void vectorInitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ + pVector->data = (void*)pBlob; +} + +static void vectorConvertFromF32(const 
Vector *pFrom, Vector *pTo){ + int i; + float *src; + + u8 *dst1Bit; + double *dstF64; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT32 ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF64[i] = src[i]; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( src[i] > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); } - return 0; } -size_t vectorDeserializeFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - return vectorF32DeserializeFromBlob(pVector, pBlob, nBlobSize); - case VECTOR_TYPE_FLOAT64: - return vectorF64DeserializeFromBlob(pVector, pBlob, nBlobSize); - default: - assert(0); +static void vectorConvertFromF64(const Vector *pFrom, Vector *pTo){ + int i; + double *src; + + u8 *dst1Bit; + float *dstF32; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT64 ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF32[i] = src[i]; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( src[i] > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); } - return 0; } -void vectorInitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - vectorF32InitFromBlob(pVector, pBlob, nBlobSize); - break; - case VECTOR_TYPE_FLOAT64: - vectorF64InitFromBlob(pVector, pBlob, nBlobSize); - break; - default: - assert(0); +static void 
vectorConvertFrom1Bit(const Vector *pFrom, Vector *pTo){ + int i; + u8 *src; + + float *dstF32; + double *dstF64; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT1BIT ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + if( ((src[i / 8] >> (i & 7)) & 1) == 1 ){ + dstF32[i] = +1; + }else{ + dstF32[i] = -1; + } + } + }else if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + if( ((src[i / 8] >> (i & 7)) & 1) == 1 ){ + dstF64[i] = +1; + }else{ + dstF64[i] = -1; + } + } + }else{ + assert( 0 ); + } +} + +static void vectorConvertFromF8(const Vector *pFrom, Vector *pTo){ + int i; + u8 *src; + float alpha, shift; + + float *dstF32; + double *dstF64; + u8 *dst1Bit; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(pFrom->data, pFrom->dims, &alpha, &shift); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF32[i] = alpha * src[i] + shift; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF64[i] = alpha * src[i] + shift; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( (alpha * src[i] + shift) > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); + } +} + +static inline int clip(float f, int minF, int maxF){ + if( f < minF ){ + return minF; + }else if( f > maxF ){ + return maxF; + } + return (int)(f + 0.5); +} + +#define MINMAX(i, value, minValue, maxValue) {if(i == 0){ minValue = (value); maxValue = (value);} else { minValue = MIN(minValue, (value)); maxValue = MAX(maxValue, (value)); }} + 
+static void vectorConvertToF8(const Vector *pFrom, Vector *pTo){ + int i; + u8 *dst; + float alpha, shift; + float minF = 0, maxF = 0; + + float *srcF32; + double *srcF64; + u8 *src1Bit; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pTo->type == VECTOR_TYPE_FLOAT8 ); + + dst = pTo->data; + if( pFrom->type == VECTOR_TYPE_FLOAT32 ){ + srcF32 = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, srcF32[i], minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip((srcF32[i] - shift) / alpha, 0, 255); + } + }else if( pFrom->type == VECTOR_TYPE_FLOAT64 ){ + srcF64 = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, srcF64[i], minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip((srcF64[i] - shift) / alpha, 0, 255); + } + }else if( pFrom->type == VECTOR_TYPE_FLOAT1BIT ){ + src1Bit = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, ((src1Bit[i / 8] >> (i & 7)) & 1) ? +1 : -1, minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip(((((src1Bit[i / 8] >> (i & 7)) & 1) ? 
+1 : -1) - shift) / alpha, 0, 255); + } + }else{ + assert( 0 ); + } + vectorF8SetParameters(pTo->data, pTo->dims, alpha, shift); +} + + +void vectorConvert(const Vector *pFrom, Vector *pTo){ + assert( pFrom->dims == pTo->dims ); + + if( pFrom->type == pTo->type ){ + memcpy(pTo->data, pFrom->data, vectorDataSize(pFrom->type, pFrom->dims)); + return; + } + + if( pTo->type == VECTOR_TYPE_FLOAT8 ){ + vectorConvertToF8(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT32 ){ + vectorConvertFromF32(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT64 ){ + vectorConvertFromF64(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT1BIT ){ + vectorConvertFrom1Bit(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT8 ){ + vectorConvertFromF8(pFrom, pTo); + }else{ + assert( 0 ); } } @@ -211360,31 +211799,49 @@ static void vectorFuncHintedType( sqlite3_context *context, int argc, sqlite3_value **argv, - int typeHint + int targetType ){ char *pzErrMsg = NULL; - Vector *pVector; - int type, dims; + Vector *pVector = NULL, *pTarget = NULL; + int type, dims, typeHint = VECTOR_TYPE_FLOAT32; if( argc < 1 ){ - return; + goto out; + } + // simplification in order to support only parsing from text to f32 and f64 vectors + if( targetType == VECTOR_TYPE_FLOAT64 ){ + typeHint = targetType; } if( detectVectorParameters(argv[0], typeHint, &type, &dims, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - return; + goto out; } pVector = vectorContextAlloc(context, type, dims); - if( pVector==NULL ){ - return; + if( pVector == NULL ){ + goto out; } - if( vectorParse(argv[0], pVector, &pzErrMsg) != 0 ){ + if( vectorParseWithType(argv[0], pVector, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - goto out_free_vec; + goto out; + } + if( type == targetType ){ + vectorSerializeWithMeta(context, pVector); + }else{ + pTarget = vectorContextAlloc(context, targetType, dims); + if( pTarget == NULL ){ + 
goto out; + } + vectorConvert(pVector, pTarget); + vectorSerializeWithMeta(context, pTarget); + } +out: + if( pVector != NULL ){ + vectorFree(pVector); + } + if( pTarget != NULL ){ + vectorFree(pTarget); } - vectorSerialize(context, pVector); -out_free_vec: - vectorFree(pVector); } static void vector32Func( @@ -211402,6 +211859,22 @@ static void vector64Func( vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT64); } +static void vector8Func( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT8); +} + +static void vector1BitFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT1BIT); +} + /* ** Implementation of vector_extract(X) function. */ @@ -211411,39 +211884,51 @@ static void vectorExtractFunc( sqlite3_value **argv ){ char *pzErrMsg = NULL; - Vector *pVector; + Vector *pVector = NULL, *pTarget = NULL; unsigned i; int type, dims; if( argc < 1 ){ - return; + goto out; } if( detectVectorParameters(argv[0], 0, &type, &dims, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - return; + goto out; } pVector = vectorContextAlloc(context, type, dims); - if( pVector==NULL ){ - return; + if( pVector == NULL ){ + goto out; } - if( vectorParse(argv[0], pVector, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[0], pVector, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - goto out_free; + goto out; + } + if( pVector->type == VECTOR_TYPE_FLOAT32 || pVector->type == VECTOR_TYPE_FLOAT64 ){ + vectorMarshalToText(context, pVector); + }else{ + pTarget = vectorContextAlloc(context, VECTOR_TYPE_FLOAT32, dims); + if( pTarget == NULL ){ + goto out; + } + vectorConvert(pVector, pTarget); + vectorMarshalToText(context, pTarget); + } +out: + if( pVector != NULL ){ + vectorFree(pVector); + } + if( pTarget != NULL ){ + vectorFree(pTarget); } - 
vectorMarshalToText(context, pVector); -out_free: - vectorFree(pVector); } -/* -** Implementation of vector_distance_cos(X, Y) function. -*/ -static void vectorDistanceCosFunc( +static void vectorDistanceFunc( sqlite3_context *context, int argc, - sqlite3_value **argv + sqlite3_value **argv, + float (*vectorDistance)(const Vector *pVector1, const Vector *pVector2) ){ char *pzErrMsg = NULL; Vector *pVector1 = NULL, *pVector2 = NULL; @@ -211463,13 +211948,19 @@ static void vectorDistanceCosFunc( goto out_free; } if( type1 != type2 ){ - pzErrMsg = sqlite3_mprintf("vector_distance_cos: vectors must have the same type: %d != %d", type1, type2); + pzErrMsg = sqlite3_mprintf("vector_distance: vectors must have the same type: %d != %d", type1, type2); sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } if( dims1 != dims2 ){ - pzErrMsg = sqlite3_mprintf("vector_distance_cos: vectors must have the same length: %d != %d", dims1, dims2); + pzErrMsg = sqlite3_mprintf("vector_distance: vectors must have the same length: %d != %d", dims1, dims2); + sqlite3_result_error(context, pzErrMsg, -1); + sqlite3_free(pzErrMsg); + goto out_free; + } + if( vectorDistance == vectorDistanceL2 && type1 == VECTOR_TYPE_FLOAT1BIT ){ + pzErrMsg = sqlite3_mprintf("vector_distance: l2 distance is not supported for float1bit vectors", dims1, dims2); sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; @@ -211482,17 +211973,17 @@ static void vectorDistanceCosFunc( if( pVector2==NULL ){ goto out_free; } - if( vectorParse(argv[0], pVector1, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[0], pVector1, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } - if( vectorParse(argv[1], pVector2, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[1], pVector2, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } - sqlite3_result_double(context, 
vectorDistanceCos(pVector1, pVector2)); + sqlite3_result_double(context, vectorDistance(pVector1, pVector2)); out_free: if( pVector2 ){ vectorFree(pVector2); @@ -211502,6 +211993,20 @@ static void vectorDistanceCosFunc( } } +/* +** Implementation of vector_distance_cos(X, Y) function. +*/ +static void vectorDistanceCosFunc(sqlite3_context *context, int argc, sqlite3_value **argv){ + vectorDistanceFunc(context, argc, argv, vectorDistanceCos); +} + +/* +** Implementation of vector_distance_l2(X, Y) function. +*/ +static void vectorDistanceL2Func(sqlite3_context *context, int argc, sqlite3_value **argv){ + vectorDistanceFunc(context, argc, argv, vectorDistanceL2); +} + /* * Marker function which is used in index creation syntax: CREATE INDEX idx ON t(libsql_vector_idx(emb)); */ @@ -211518,8 +212023,11 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ FUNCTION(vector, 1, 0, 0, vector32Func), FUNCTION(vector32, 1, 0, 0, vector32Func), FUNCTION(vector64, 1, 0, 0, vector64Func), + FUNCTION(vector1bit, 1, 0, 0, vector1BitFunc), + FUNCTION(vector8, 1, 0, 0, vector8Func), FUNCTION(vector_extract, 1, 0, 0, vectorExtractFunc), FUNCTION(vector_distance_cos, 2, 0, 0, vectorDistanceCosFunc), + FUNCTION(vector_distance_l2, 2, 0, 0, vectorDistanceL2Func), FUNCTION(libsql_vector_idx, -1, 0, 0, libsqlVectorIdx), }; @@ -211585,7 +212093,7 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ /* #include "sqliteInt.h" */ /* #include "vectorIndexInt.h" */ -#define SQLITE_VECTOR_TRACE +// #define SQLITE_VECTOR_TRACE #if defined(SQLITE_DEBUG) && defined(SQLITE_VECTOR_TRACE) #define DiskAnnTrace(X) sqlite3DebugPrintf X; #else @@ -211611,9 +212119,19 @@ SQLITE_PRIVATE void sqlite3RegisterVectorFunctions(void){ #define VECTOR_NODE_METADATA_SIZE (sizeof(u64) + sizeof(u16)) #define VECTOR_EDGE_METADATA_SIZE (sizeof(u64) + sizeof(u64)) +typedef struct VectorPair VectorPair; typedef struct DiskAnnSearchCtx DiskAnnSearchCtx; typedef struct DiskAnnNode DiskAnnNode; +// VectorPair 
represents single vector where pNode is an exact representation and pEdge - compressed representation +// (pEdge pointer always equals to pNode if pNodeType == pEdgeType) +struct VectorPair { + int nodeType; + int edgeType; + Vector *pNode; + Vector *pEdge; +}; + // DiskAnnNode represents single node in the DiskAnn graph struct DiskAnnNode { u64 nRowid; /* node id */ @@ -211629,14 +212147,18 @@ struct DiskAnnNode { * so caller which puts nodes in the context can forget about resource managmenet (context will take care of this) */ struct DiskAnnSearchCtx { - const Vector *pQuery; /* initial query vector; user query for SELECT and row vector for INSERT */ - DiskAnnNode **aCandidates; /* array of candidates ordered by distance to the query (ascending) */ - double *aDistances; /* array of distances to the query vector */ - unsigned int nCandidates; /* current size of aCandidates/aDistances arrays */ - unsigned int maxCandidates; /* max size of aCandidates/aDistances arrays */ - DiskAnnNode *visitedList; /* list of all visited candidates (so, candidates from aCandidates array either got replaced or moved to the visited list) */ - unsigned int nUnvisited; /* amount of unvisited candidates in the aCadidates array */ - int blobMode; /* DISKANN_BLOB_READONLY if we wont modify node blobs; DISKANN_BLOB_WRITABLE - otherwise */ + VectorPair query; /* initial query vector; user query for SELECT and row vector for INSERT */ + DiskAnnNode **aCandidates; /* array of unvisited candidates ordered by distance (possibly approximate) to the query (ascending) */ + float *aDistances; /* array of distances (possible approximate) to the query vector */ + unsigned int nCandidates; /* current size of aCandidates/aDistances arrays */ + unsigned int maxCandidates; /* max size of aCandidates/aDistances arrays */ + DiskAnnNode **aTopCandidates; /* top candidates with exact distance calculated */ + float *aTopDistances; /* top candidates exact distances */ + int nTopCandidates; /* current size of 
aTopCandidates/aTopDistances arrays */ + int maxTopCandidates; /* max size of aTopCandidates/aTopDistances arrays */ + DiskAnnNode *visitedList; /* list of all visited candidates (so, candidates from aCandidates array either got replaced or moved to the visited list) */ + unsigned int nUnvisited; /* amount of unvisited candidates in the aCadidates array */ + int blobMode; /* DISKANN_BLOB_READONLY if we wont modify node blobs; DISKANN_BLOB_WRITABLE - otherwise */ }; /************************************************************************** @@ -211647,6 +212169,10 @@ static inline u16 readLE16(const unsigned char *p){ return (u16)p[0] | (u16)p[1] << 8; } +static inline u32 readLE32(const unsigned char *p){ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16 | (u32)p[3] << 24; +} + static inline u64 readLE64(const unsigned char *p){ return (u64)p[0] | (u64)p[1] << 8 @@ -211663,6 +212189,13 @@ static inline void writeLE16(unsigned char *p, u16 v){ p[1] = v >> 8; } +static inline void writeLE32(unsigned char *p, u32 v){ + p[0] = v; + p[1] = v >> 8; + p[2] = v >> 16; + p[3] = v >> 24; +} + static inline void writeLE64(unsigned char *p, u64 v){ p[0] = v; p[1] = v >> 8; @@ -211842,7 +212375,7 @@ void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Ve void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector) { assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize ); - vectorInitStatic(pVector, pIndex->nNodeVectorType, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE, pIndex->nNodeVectorSize); + vectorInitStatic(pVector, pIndex->nNodeVectorType, pIndex->nVectorDims, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE); } u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { @@ -211851,20 +212384,25 @@ u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { return readLE16(pBlobSpot->pBuffer + sizeof(u64)); } -void nodeBinEdge(const DiskAnnIndex *pIndex, 
const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector) { +void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, float *pDistance, Vector *pVector) { + u32 distance; int offset = nodeEdgesMetadataOffset(pIndex); if( pRowid != NULL ){ assert( offset + (iEdge + 1) * VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize ); *pRowid = readLE64(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u64)); } + if( pIndex->nFormatVersion != VECTOR_FORMAT_V1 && pDistance != NULL ){ + distance = readLE32(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u32)); + *pDistance = *((float*)&distance); + } if( pVector != NULL ){ assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize < offset ); vectorInitStatic( pVector, pIndex->nEdgeVectorType, - pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nNodeVectorSize, - pIndex->nEdgeVectorSize + pIndex->nVectorDims, + pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize ); } } @@ -211874,7 +212412,7 @@ int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u6 // todo: if edges will be sorted by identifiers we can use binary search here (although speed up will be visible only on pretty loaded nodes: >128 edges) for(i = 0; i < nEdges; i++){ u64 edgeId; - nodeBinEdge(pIndex, pBlobSpot, i, &edgeId, NULL); + nodeBinEdge(pIndex, pBlobSpot, i, &edgeId, NULL, NULL); if( edgeId == nRowid ){ return i; } @@ -211889,7 +212427,7 @@ void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPru } // replace edge at position iReplace or add new one if iReplace == nEdges -void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector) { +void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, 
float distance, Vector *pVector) { int nMaxEdges = nodeEdgesMaxCount(pIndex); int nEdges = nodeBinEdges(pIndex, pBlobSpot); int edgeVectorOffset, edgeMetaOffset, itemsToMove; @@ -211908,6 +212446,7 @@ void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iRe assert( edgeMetaOffset + VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize ); vectorSerializeToBlob(pVector, pBlobSpot->pBuffer + edgeVectorOffset, pIndex->nEdgeVectorSize); + writeLE32(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u32), *((u32*)&distance)); writeLE64(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u64), nRowid); writeLE16(pBlobSpot->pBuffer + sizeof(u64), nEdges); @@ -211942,6 +212481,7 @@ void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { #if defined(SQLITE_DEBUG) && defined(SQLITE_VECTOR_TRACE) int nEdges, nMaxEdges, i; u64 nRowid; + float distance = 0; Vector vector; nEdges = nodeBinEdges(pIndex, pBlobSpot); @@ -211952,8 +212492,8 @@ void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { DiskAnnTrace((" nEdges=%d, nMaxEdges=%d, vector=", nEdges, nMaxEdges)); vectorDump(&vector); for(i = 0; i < nEdges; i++){ - nodeBinEdge(pIndex, pBlobSpot, i, &nRowid, &vector); - DiskAnnTrace((" to=%lld, vector=", nRowid, nRowid)); + nodeBinEdge(pIndex, pBlobSpot, i, &nRowid, &distance, &vector); + DiskAnnTrace((" to=%lld, distance=%f, vector=", nRowid, distance)); vectorDump(&vector); } #endif @@ -211968,12 +212508,14 @@ int diskAnnCreateIndex( const char *zDbSName, const char *zIdxName, const VectorIdxKey *pKey, - VectorIdxParams *pParams + VectorIdxParams *pParams, + const char **pzErrMsg ){ int rc; - int type, dims; + int type, dims, metric, neighbours; u64 maxNeighborsParam, blockSizeBytes; char *zSql; + const char *zRowidColumnName; char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. 
index_key, index_key1, index_key2, ...) if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){ @@ -211995,24 +212537,36 @@ int diskAnnCreateIndex( } assert( 0 < dims && dims <= MAX_VECTOR_SZ ); + metric = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID); + if( metric == 0 ){ + metric = VECTOR_METRIC_TYPE_COS; + if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, metric) != 0 ){ + return SQLITE_ERROR; + } + } + neighbours = vectorIdxParamsGetU64(pParams, VECTOR_COMPRESS_NEIGHBORS_PARAM_ID); + if( neighbours == VECTOR_TYPE_FLOAT1BIT && metric != VECTOR_METRIC_TYPE_COS ){ + *pzErrMsg = "1-bit compression available only for cosine metric"; + return SQLITE_ERROR; + } + if( neighbours == 0 ){ + neighbours = type; + } + maxNeighborsParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_NEIGHBORS_PARAM_ID); if( maxNeighborsParam == 0 ){ // 3 D**(1/2) gives good recall values (90%+) // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound - maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(neighbours, dims)) + 1); } - blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(neighbours, dims)); if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){ return SQLITE_ERROR; } if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){ return SQLITE_ERROR; } - if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){ - if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, 
VECTOR_METRIC_TYPE_COS) != 0 ){ - return SQLITE_ERROR; - } - } + if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){ if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){ return SQLITE_ERROR; @@ -212041,6 +212595,7 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "index_key"; }else{ zSql = sqlite3MPrintf( db, @@ -212050,7 +212605,29 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "rowid"; + } + rc = sqlite3_exec(db, zSql, 0, 0, 0); + sqlite3DbFree(db, zSql); + if( rc != SQLITE_OK ){ + return rc; } + /* + * vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB) + * in this case, main table B-Tree takes on redundant shape where all leaf nodes has only 1 cell + * + * as we have a query which selects random row using OFFSET/LIMIT trick - we will need to read all these leaf nodes pages just to skip them + * so, in order to remove this overhead for random row selection - we creating an index with just single column used + * in this case B-Tree leafs will be full of rowids and the overhead for page reads will be very small + */ + zSql = sqlite3MPrintf( + db, + "CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)", + zDbSName, + zIdxName, + zIdxName, + zRowidColumnName + ); rc = sqlite3_exec(db, zSql, 0, 0, 0); sqlite3DbFree(db, zSql); return rc; @@ -212082,8 +212659,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid) zSql = sqlite3MPrintf( pIndex->db, - "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)", - pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow + "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)", + pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow ); if( zSql == NULL ){ 
rc = SQLITE_NOMEM_BKPT; @@ -212327,6 +212904,83 @@ static int diskAnnDeleteShadowRow(const DiskAnnIndex *pIndex, i64 nRowid){ return rc; } +/************************************************************************** +** Generic utilities +**************************************************************************/ + +int initVectorPair(int nodeType, int edgeType, int dims, VectorPair *pPair){ + pPair->nodeType = nodeType; + pPair->edgeType = edgeType; + pPair->pNode = NULL; + pPair->pEdge = NULL; + if( pPair->nodeType == pPair->edgeType ){ + return 0; + } + pPair->pEdge = vectorAlloc(edgeType, dims); + if( pPair->pEdge == NULL ){ + return SQLITE_NOMEM_BKPT; + } + return 0; +} + +void loadVectorPair(VectorPair *pPair, const Vector *pVector){ + pPair->pNode = (Vector*)pVector; + if( pPair->edgeType != pPair->nodeType ){ + vectorConvert(pPair->pNode, pPair->pEdge); + }else{ + pPair->pEdge = pPair->pNode; + } +} + +void deinitVectorPair(VectorPair *pPair) { + if( pPair->pEdge != NULL && pPair->pNode != pPair->pEdge ){ + vectorFree(pPair->pEdge); + } +} + +int distanceBufferInsertIdx(const float *aDistances, int nSize, int nMaxSize, float distance){ + int i; +#ifdef SQLITE_DEBUG + for(i = 0; i < nSize - 1; i++){ + assert(aDistances[i] <= aDistances[i + 1]); + } +#endif + for(i = 0; i < nSize; i++){ + if( distance < aDistances[i] ){ + return i; + } + } + return nSize < nMaxSize ? 
nSize : -1; +} + +void bufferInsert(u8 *aBuffer, int nSize, int nMaxSize, int iInsert, int nItemSize, const u8 *pItem, u8 *pLast) { + int itemsToMove; + + assert( nMaxSize > 0 && nItemSize > 0 ); + assert( nSize <= nMaxSize ); + assert( 0 <= iInsert && iInsert <= nSize && iInsert < nMaxSize ); + + if( nSize == nMaxSize ){ + if( pLast != NULL ){ + memcpy(pLast, aBuffer + (nSize - 1) * nItemSize, nItemSize); + } + nSize--; + } + itemsToMove = nSize - iInsert; + memmove(aBuffer + (iInsert + 1) * nItemSize, aBuffer + iInsert * nItemSize, itemsToMove * nItemSize); + memcpy(aBuffer + iInsert * nItemSize, pItem, nItemSize); +} + +void bufferDelete(u8 *aBuffer, int nSize, int iDelete, int nItemSize) { + int itemsToMove; + + assert( nItemSize > 0 ); + assert( 0 <= iDelete && iDelete < nSize ); + + itemsToMove = nSize - iDelete - 1; + memmove(aBuffer + iDelete * nItemSize, aBuffer + (iDelete + 1) * nItemSize, itemsToMove * nItemSize); +} + /************************************************************************** ** DiskANN internals **************************************************************************/ @@ -212363,26 +213017,40 @@ static void diskAnnNodeFree(DiskAnnNode *pNode){ sqlite3_free(pNode); } -static int diskAnnSearchCtxInit(DiskAnnSearchCtx *pCtx, const Vector* pQuery, unsigned int maxCandidates, int blobMode){ - pCtx->pQuery = pQuery; +static int diskAnnSearchCtxInit(const DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, const Vector* pQuery, int maxCandidates, int topCandidates, int blobMode){ + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &pCtx->query) != 0 ){ + return SQLITE_NOMEM_BKPT; + } + loadVectorPair(&pCtx->query, pQuery); + pCtx->aDistances = sqlite3_malloc(maxCandidates * sizeof(double)); pCtx->aCandidates = sqlite3_malloc(maxCandidates * sizeof(DiskAnnNode*)); pCtx->nCandidates = 0; pCtx->maxCandidates = maxCandidates; + pCtx->aTopDistances = sqlite3_malloc(topCandidates * sizeof(double)); + 
pCtx->aTopCandidates = sqlite3_malloc(topCandidates * sizeof(DiskAnnNode*)); + pCtx->nTopCandidates = 0; + pCtx->maxTopCandidates = topCandidates; pCtx->visitedList = NULL; pCtx->nUnvisited = 0; pCtx->blobMode = blobMode; - if( pCtx->aDistances == NULL || pCtx->aCandidates == NULL ){ - goto out_oom; + + if( pCtx->aDistances != NULL && pCtx->aCandidates != NULL && pCtx->aTopDistances != NULL && pCtx->aTopCandidates != NULL ){ + return SQLITE_OK; } - return SQLITE_OK; -out_oom: if( pCtx->aDistances != NULL ){ sqlite3_free(pCtx->aDistances); } if( pCtx->aCandidates != NULL ){ sqlite3_free(pCtx->aCandidates); } + if( pCtx->aTopDistances != NULL ){ + sqlite3_free(pCtx->aTopDistances); + } + if( pCtx->aTopCandidates != NULL ){ + sqlite3_free(pCtx->aTopCandidates); + } + deinitVectorPair(&pCtx->query); return SQLITE_NOMEM_BKPT; } @@ -212406,6 +213074,9 @@ static void diskAnnSearchCtxDeinit(DiskAnnSearchCtx *pCtx){ } sqlite3_free(pCtx->aCandidates); sqlite3_free(pCtx->aDistances); + sqlite3_free(pCtx->aTopCandidates); + sqlite3_free(pCtx->aTopDistances); + deinitVectorPair(&pCtx->query); } // check if we visited this node earlier @@ -212447,7 +213118,9 @@ static int diskAnnSearchCtxShouldAddCandidate(const DiskAnnIndex *pIndex, const } // mark node as visited and put it in the head of visitedList -static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNode){ +static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNode, float distance){ + int iInsert; + assert( pCtx->nUnvisited > 0 ); assert( pNode->visited == 0 ); @@ -212456,56 +213129,51 @@ static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNo pNode->pNext = pCtx->visitedList; pCtx->visitedList = pNode; + + iInsert = distanceBufferInsertIdx(pCtx->aTopDistances, pCtx->nTopCandidates, pCtx->maxTopCandidates, distance); + if( iInsert < 0 ){ + return; + } + bufferInsert((u8*)pCtx->aTopCandidates, pCtx->nTopCandidates, pCtx->maxTopCandidates, iInsert, 
sizeof(DiskAnnNode*), (u8*)&pNode, NULL); + bufferInsert((u8*)pCtx->aTopDistances, pCtx->nTopCandidates, pCtx->maxTopCandidates, iInsert, sizeof(float), (u8*)&distance, NULL); + pCtx->nTopCandidates = MIN(pCtx->nTopCandidates + 1, pCtx->maxTopCandidates); } static int diskAnnSearchCtxHasUnvisited(const DiskAnnSearchCtx *pCtx){ return pCtx->nUnvisited > 0; } -static DiskAnnNode* diskAnnSearchCtxGetCandidate(DiskAnnSearchCtx *pCtx, int i){ +static void diskAnnSearchCtxGetCandidate(DiskAnnSearchCtx *pCtx, int i, DiskAnnNode **ppNode, float *pDistance){ assert( 0 <= i && i < pCtx->nCandidates ); - return pCtx->aCandidates[i]; + *ppNode = pCtx->aCandidates[i]; + *pDistance = pCtx->aDistances[i]; } static void diskAnnSearchCtxDeleteCandidate(DiskAnnSearchCtx *pCtx, int iDelete){ int i; - assert( 0 <= iDelete && iDelete < pCtx->nCandidates ); assert( pCtx->nUnvisited > 0 ); assert( !pCtx->aCandidates[iDelete]->visited ); assert( pCtx->aCandidates[iDelete]->pBlobSpot == NULL ); diskAnnNodeFree(pCtx->aCandidates[iDelete]); + bufferDelete((u8*)pCtx->aCandidates, pCtx->nCandidates, iDelete, sizeof(DiskAnnNode*)); + bufferDelete((u8*)pCtx->aDistances, pCtx->nCandidates, iDelete, sizeof(float)); - for(i = iDelete + 1; i < pCtx->nCandidates; i++){ - pCtx->aCandidates[i - 1] = pCtx->aCandidates[i]; - pCtx->aDistances[i - 1] = pCtx->aDistances[i]; - } pCtx->nCandidates--; pCtx->nUnvisited--; } -static void diskAnnSearchCtxInsertCandidate(DiskAnnSearchCtx *pCtx, int iInsert, DiskAnnNode* pCandidate, float candidateDist){ - int i; - assert( 0 <= iInsert && iInsert <= pCtx->nCandidates && iInsert < pCtx->maxCandidates ); - if( pCtx->nCandidates < pCtx->maxCandidates ){ - pCtx->nCandidates++; - } else { - DiskAnnNode *pLast = pCtx->aCandidates[pCtx->nCandidates - 1]; - if( !pLast->visited ){ - // since pLast is not visited it should have uninitialized pBlobSpot - so it's safe to completely free the node - assert( pLast->pBlobSpot == NULL ); - pCtx->nUnvisited--; - 
diskAnnNodeFree(pLast); - } - } - // Shift the candidates to the right to make space for the new one. - for(i = pCtx->nCandidates - 1; i > iInsert; i--){ - pCtx->aCandidates[i] = pCtx->aCandidates[i - 1]; - pCtx->aDistances[i] = pCtx->aDistances[i - 1]; - } - // Insert the new candidate. - pCtx->aCandidates[iInsert] = pCandidate; - pCtx->aDistances[iInsert] = candidateDist; +static void diskAnnSearchCtxInsertCandidate(DiskAnnSearchCtx *pCtx, int iInsert, DiskAnnNode* pCandidate, float distance){ + DiskAnnNode *pLast = NULL; + bufferInsert((u8*)pCtx->aCandidates, pCtx->nCandidates, pCtx->maxCandidates, iInsert, sizeof(DiskAnnNode*), (u8*)&pCandidate, (u8*)&pLast); + bufferInsert((u8*)pCtx->aDistances, pCtx->nCandidates, pCtx->maxCandidates, iInsert, sizeof(float), (u8*)&distance, NULL); + pCtx->nCandidates = MIN(pCtx->nCandidates + 1, pCtx->maxCandidates); + if( pLast != NULL && !pLast->visited ){ + // since pLast is not visited it should have uninitialized pBlobSpot - so it's safe to completely free the node + assert( pLast->pBlobSpot == NULL ); + pCtx->nUnvisited--; + diskAnnNodeFree(pLast); + } pCtx->nUnvisited++; } @@ -212535,7 +213203,14 @@ static int diskAnnSearchCtxFindClosestCandidateIdx(const DiskAnnSearchCtx *pCtx) // return position for new edge(C) which will replace previous edge on that position or -1 if we should ignore it // we also check that no current edge(B) will "prune" new vertex: i.e. 
dist(B, C) >= (means worse than) alpha * dist(node, C) for all current edges // if any edge(B) will "prune" new edge(C) we will ignore it (return -1) -static int diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, u64 newRowid, const Vector *pNewVector) { +static int diskAnnReplaceEdgeIdx( + const DiskAnnIndex *pIndex, + BlobSpot *pNodeBlob, + u64 newRowid, + VectorPair *pNewVector, + VectorPair *pPlaceholder, + float *pNodeToNew +) { int i, nEdges, nMaxEdges, iReplace = -1; Vector nodeVector, edgeVector; float nodeToNew, nodeToReplace; @@ -212543,20 +213218,27 @@ static int diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob nEdges = nodeBinEdges(pIndex, pNodeBlob); nMaxEdges = nodeEdgesMaxCount(pIndex); nodeBinVector(pIndex, pNodeBlob, &nodeVector); - nodeToNew = diskAnnVectorDistance(pIndex, &nodeVector, pNewVector); + loadVectorPair(pPlaceholder, &nodeVector); + + // we need to evaluate potentially approximate distance here in order to correctly compare it with edge distances + nodeToNew = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, pNewVector->pEdge); + *pNodeToNew = nodeToNew; for(i = nEdges - 1; i >= 0; i--){ u64 edgeRowid; float edgeToNew, nodeToEdge; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &nodeToEdge, &edgeVector); if( edgeRowid == newRowid ){ // deletes can leave "zombie" edges in the graph and we must override them and not store duplicate edges in the node return i; } - edgeToNew = diskAnnVectorDistance(pIndex, &edgeVector, pNewVector); - nodeToEdge = diskAnnVectorDistance(pIndex, &nodeVector, &edgeVector); + if( pIndex->nFormatVersion == VECTOR_FORMAT_V1 ){ + nodeToEdge = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, &edgeVector); + } + + edgeToNew = diskAnnVectorDistance(pIndex, &edgeVector, pNewVector->pEdge); if( nodeToNew > pIndex->pruningAlpha * edgeToNew ){ return -1; } @@ -212574,12 +213256,14 @@ static int 
diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob // prune edges after we inserted new edge at position iInserted // we only need to check for edges which will be pruned by new vertex // no need to check for other pairs as we checked them on previous insertions -static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, int iInserted) { +static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, int iInserted, VectorPair *pPlaceholder) { int i, s, nEdges; - Vector nodeVector, hintVector; + Vector nodeVector, hintEdgeVector; u64 hintRowid; nodeBinVector(pIndex, pNodeBlob, &nodeVector); + loadVectorPair(pPlaceholder, &nodeVector); + nEdges = nodeBinEdges(pIndex, pNodeBlob); assert( 0 <= iInserted && iInserted < nEdges ); @@ -212589,7 +213273,7 @@ static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, i nodeBinDebug(pIndex, pNodeBlob); #endif - nodeBinEdge(pIndex, pNodeBlob, iInserted, &hintRowid, &hintVector); + nodeBinEdge(pIndex, pNodeBlob, iInserted, &hintRowid, NULL, &hintEdgeVector); // remove edges which is no longer interesting due to the addition of iInserted i = 0; @@ -212597,14 +213281,17 @@ static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, i Vector edgeVector; float nodeToEdge, hintToEdge; u64 edgeRowid; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &nodeToEdge, &edgeVector); if( hintRowid == edgeRowid ){ i++; continue; } - nodeToEdge = diskAnnVectorDistance(pIndex, &nodeVector, &edgeVector); - hintToEdge = diskAnnVectorDistance(pIndex, &hintVector, &edgeVector); + if( pIndex->nFormatVersion == VECTOR_FORMAT_V1 ){ + nodeToEdge = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, &edgeVector); + } + + hintToEdge = diskAnnVectorDistance(pIndex, &hintEdgeVector, &edgeVector); if( nodeToEdge > pIndex->pruningAlpha * hintToEdge ){ nodeBinDeleteEdge(pIndex, pNodeBlob, i); nEdges--; @@ 
-212653,7 +213340,7 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u } nodeBinVector(pIndex, start->pBlobSpot, &startVector); - startDistance = diskAnnVectorDistance(pIndex, pCtx->pQuery, &startVector); + startDistance = diskAnnVectorDistance(pIndex, pCtx->query.pNode, &startVector); if( pCtx->blobMode == DISKANN_BLOB_READONLY ){ assert( start->pBlobSpot != NULL ); @@ -212670,8 +213357,9 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u Vector vCandidate; DiskAnnNode *pCandidate; BlobSpot *pCandidateBlob; + float distance; int iCandidate = diskAnnSearchCtxFindClosestCandidateIdx(pCtx); - pCandidate = diskAnnSearchCtxGetCandidate(pCtx, iCandidate); + diskAnnSearchCtxGetCandidate(pCtx, iCandidate, &pCandidate, &distance); rc = SQLITE_OK; if( pReusableBlobSpot != NULL ){ @@ -212699,25 +213387,30 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u goto out; } - diskAnnSearchCtxMarkVisited(pCtx, pCandidate); - nVisited += 1; DiskAnnTrace(("visiting candidate(%d): id=%lld\n", nVisited, pCandidate->nRowid)); nodeBinVector(pIndex, pCandidateBlob, &vCandidate); nEdges = nodeBinEdges(pIndex, pCandidateBlob); + // if pNodeQuery != pEdgeQuery then distance from aDistances is approximate and we must recalculate it + if( pCtx->query.pNode != pCtx->query.pEdge ){ + distance = diskAnnVectorDistance(pIndex, &vCandidate, pCtx->query.pNode); + } + + diskAnnSearchCtxMarkVisited(pCtx, pCandidate, distance); + for(i = 0; i < nEdges; i++){ u64 edgeRowid; Vector edgeVector; float edgeDistance; int iInsert; DiskAnnNode *pNewCandidate; - nodeBinEdge(pIndex, pCandidateBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pCandidateBlob, i, &edgeRowid, NULL, &edgeVector); if( diskAnnSearchCtxIsVisited(pCtx, edgeRowid) || diskAnnSearchCtxHasCandidate(pCtx, edgeRowid) ){ continue; } - edgeDistance = diskAnnVectorDistance(pIndex, pCtx->pQuery, &edgeVector); + edgeDistance = 
diskAnnVectorDistance(pIndex, pCtx->query.pEdge, &edgeVector); iInsert = diskAnnSearchCtxShouldAddCandidate(pIndex, pCtx, edgeDistance); if( iInsert < 0 ){ continue; @@ -212775,12 +213468,12 @@ int diskAnnSearch( *pzErrMsg = sqlite3_mprintf("vector index(search): k must be a non-negative integer"); return SQLITE_ERROR; } - if( pIndex->nVectorDims != pVector->dims ){ + if( pVector->dims != pIndex->nVectorDims ){ *pzErrMsg = sqlite3_mprintf("vector index(search): dimensions are different: %d != %d", pVector->dims, pIndex->nVectorDims); return SQLITE_ERROR; } - if( pVector->type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(search): only f32 vectors are supported"); + if( pVector->type != pIndex->nNodeVectorType ){ + *pzErrMsg = sqlite3_mprintf("vector index(search): vector type differs from column type: %d != %d", pVector->type, pIndex->nNodeVectorType); return SQLITE_ERROR; } @@ -212794,7 +213487,7 @@ int diskAnnSearch( *pzErrMsg = sqlite3_mprintf("vector index(search): failed to select start node for search"); return rc; } - rc = diskAnnSearchCtxInit(&ctx, pVector, pIndex->searchL, DISKANN_BLOB_READONLY); + rc = diskAnnSearchCtxInit(pIndex, &ctx, pVector, pIndex->searchL, k, DISKANN_BLOB_READONLY); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to initialize search context"); goto out; @@ -212803,7 +213496,7 @@ int diskAnnSearch( if( rc != SQLITE_OK ){ goto out; } - nOutRows = MIN(k, ctx.nCandidates); + nOutRows = MIN(k, ctx.nTopCandidates); rc = vectorOutRowsAlloc(pIndex->db, pRows, nOutRows, pKey->nKeyColumns, vectorIdxKeyRowidLike(pKey)); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to allocate output rows"); @@ -212811,9 +213504,9 @@ int diskAnnSearch( } for(i = 0; i < nOutRows; i++){ if( pRows->aIntValues != NULL ){ - rc = vectorOutRowsPut(pRows, i, 0, &ctx.aCandidates[i]->nRowid, NULL); + rc = vectorOutRowsPut(pRows, i, 0, &ctx.aTopCandidates[i]->nRowid, NULL); 
}else{ - rc = diskAnnGetShadowRowKeys(pIndex, ctx.aCandidates[i]->nRowid, pKey, pRows, i); + rc = diskAnnGetShadowRowKeys(pIndex, ctx.aTopCandidates[i]->nRowid, pKey, pRows, i); } if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to put result in the output row"); @@ -212837,24 +213530,39 @@ int diskAnnInsert( BlobSpot *pBlobSpot = NULL; DiskAnnNode *pVisited; DiskAnnSearchCtx ctx; + VectorPair vInsert, vCandidate; + vInsert.pNode = NULL; vInsert.pEdge = NULL; + vCandidate.pNode = NULL; vCandidate.pEdge = NULL; if( pVectorInRow->pVector->dims != pIndex->nVectorDims ){ *pzErrMsg = sqlite3_mprintf("vector index(insert): dimensions are different: %d != %d", pVectorInRow->pVector->dims, pIndex->nVectorDims); return SQLITE_ERROR; } - if( pVectorInRow->pVector->type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(insert): only f32 vectors are supported"); + if( pVectorInRow->pVector->type != pIndex->nNodeVectorType ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): vector type differs from column type: %d != %d", pVectorInRow->pVector->type, pIndex->nNodeVectorType); return SQLITE_ERROR; } DiskAnnTrace(("diskAnnInset started\n")); - rc = diskAnnSearchCtxInit(&ctx, pVectorInRow->pVector, pIndex->insertL, DISKANN_BLOB_WRITABLE); + rc = diskAnnSearchCtxInit(pIndex, &ctx, pVectorInRow->pVector, pIndex->insertL, 1, DISKANN_BLOB_WRITABLE); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(insert): failed to initialize search context"); return rc; } + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &vInsert) != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): unable to allocate mem for node VectorPair"); + rc = SQLITE_NOMEM_BKPT; + goto out; + } + + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &vCandidate) != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): unable to allocate mem for candidate 
VectorPair"); + rc = SQLITE_NOMEM_BKPT; + goto out; + } + // note: we must select random row before we will insert new row in the shadow table rc = diskAnnSelectRandomShadowRow(pIndex, &nStartRowid); if( rc == SQLITE_DONE ){ @@ -212892,28 +213600,33 @@ int diskAnnInsert( } // first pass - add all visited nodes as a potential neighbours of new node for(pVisited = ctx.visitedList; pVisited != NULL; pVisited = pVisited->pNext){ - Vector vector; + Vector nodeVector; int iReplace; + float nodeToNew; - nodeBinVector(pIndex, pVisited->pBlobSpot, &vector); - iReplace = diskAnnReplaceEdgeIdx(pIndex, pBlobSpot, pVisited->nRowid, &vector); + nodeBinVector(pIndex, pVisited->pBlobSpot, &nodeVector); + loadVectorPair(&vCandidate, &nodeVector); + + iReplace = diskAnnReplaceEdgeIdx(pIndex, pBlobSpot, pVisited->nRowid, &vCandidate, &vInsert, &nodeToNew); if( iReplace == -1 ){ continue; } - nodeBinReplaceEdge(pIndex, pBlobSpot, iReplace, pVisited->nRowid, &vector); - diskAnnPruneEdges(pIndex, pBlobSpot, iReplace); + nodeBinReplaceEdge(pIndex, pBlobSpot, iReplace, pVisited->nRowid, nodeToNew, vCandidate.pEdge); + diskAnnPruneEdges(pIndex, pBlobSpot, iReplace, &vInsert); } // second pass - add new node as a potential neighbour of all visited nodes + loadVectorPair(&vInsert, pVectorInRow->pVector); for(pVisited = ctx.visitedList; pVisited != NULL; pVisited = pVisited->pNext){ int iReplace; + float nodeToNew; - iReplace = diskAnnReplaceEdgeIdx(pIndex, pVisited->pBlobSpot, nNewRowid, pVectorInRow->pVector); + iReplace = diskAnnReplaceEdgeIdx(pIndex, pVisited->pBlobSpot, nNewRowid, &vInsert, &vCandidate, &nodeToNew); if( iReplace == -1 ){ continue; } - nodeBinReplaceEdge(pIndex, pVisited->pBlobSpot, iReplace, nNewRowid, pVectorInRow->pVector); - diskAnnPruneEdges(pIndex, pVisited->pBlobSpot, iReplace); + nodeBinReplaceEdge(pIndex, pVisited->pBlobSpot, iReplace, nNewRowid, nodeToNew, vInsert.pEdge); + diskAnnPruneEdges(pIndex, pVisited->pBlobSpot, iReplace, &vCandidate); rc = 
blobSpotFlush(pIndex, pVisited->pBlobSpot); if( rc != SQLITE_OK ){ @@ -212924,6 +213637,8 @@ int diskAnnInsert( rc = SQLITE_OK; out: + deinitVectorPair(&vInsert); + deinitVectorPair(&vCandidate); if( rc == SQLITE_OK ){ rc = blobSpotFlush(pIndex, pBlobSpot); if( rc != SQLITE_OK ){ @@ -212958,7 +213673,12 @@ int diskAnnDelete( DiskAnnTrace(("diskAnnDelete started: rowid=%lld\n", nodeRowid)); rc = blobSpotCreate(pIndex, &pNodeBlob, nodeRowid, pIndex->nBlockSize, DISKANN_BLOB_WRITABLE); - if( rc != SQLITE_OK ){ + if( rc == DISKANN_ROW_NOT_FOUND ){ + // as we omit rows with NULL values during insert, it can be the case that there is nothing to delete in the index, while row exists in the base table + // so, we must simply silently stop delete process as there is nothing to delete from index + rc = SQLITE_OK; + goto out; + }else if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(delete): failed to create blob for node row"); goto out; } @@ -212975,7 +213695,7 @@ int diskAnnDelete( nNeighbours = nodeBinEdges(pIndex, pNodeBlob); for(i = 0; i < nNeighbours; i++){ u64 edgeRowid; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, NULL); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, NULL, NULL); rc = blobSpotReload(pIndex, pEdgeBlob, edgeRowid, pIndex->nBlockSize); if( rc == DISKANN_ROW_NOT_FOUND ){ continue; @@ -213022,6 +213742,7 @@ int diskAnnOpenIndex( ){ DiskAnnIndex *pIndex; u64 nBlockSize; + int compressNeighbours; pIndex = sqlite3DbMallocRaw(db, sizeof(DiskAnnIndex)); if( pIndex == NULL ){ return SQLITE_NOMEM; @@ -213068,11 +213789,18 @@ int diskAnnOpenIndex( pIndex->searchL = VECTOR_SEARCH_L_DEFAULT; } pIndex->nNodeVectorSize = vectorDataSize(pIndex->nNodeVectorType, pIndex->nVectorDims); - // will change in future when we will support compression of edges vectors - pIndex->nEdgeVectorType = pIndex->nNodeVectorType; - pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize; + + compressNeighbours = vectorIdxParamsGetU64(pParams, 
VECTOR_COMPRESS_NEIGHBORS_PARAM_ID); + if( compressNeighbours == 0 ){ + pIndex->nEdgeVectorType = pIndex->nNodeVectorType; + pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize; + }else{ + pIndex->nEdgeVectorType = compressNeighbours; + pIndex->nEdgeVectorSize = vectorDataSize(compressNeighbours, pIndex->nVectorDims); + } *ppIndex = pIndex; + DiskAnnTrace(("opened index %s: max edges %d\n", zIdxName, nodeEdgesMaxCount(pIndex))); return SQLITE_OK; } @@ -213091,7 +213819,7 @@ void diskAnnCloseIndex(DiskAnnIndex *pIndex){ #endif /* !defined(SQLITE_OMIT_VECTOR) */ /************** End of vectordiskann.c ***************************************/ -/************** Begin file vectorfloat32.c ***********************************/ +/************** Begin file vectorfloat1bit.c *********************************/ /* ** 2024-07-04 ** @@ -213116,7 +213844,7 @@ void diskAnnCloseIndex(DiskAnnIndex *pIndex){ ** ****************************************************************************** ** -** 32-bit floating point vector format utilities. +** 1-bit vector format utilities. */ #ifndef SQLITE_OMIT_VECTOR /* #include "sqliteInt.h" */ @@ -213129,111 +213857,188 @@ void diskAnnCloseIndex(DiskAnnIndex *pIndex){ ** Utility routines for debugging **************************************************************************/ -void vectorF32Dump(const Vector *pVec){ - float *elems = pVec->data; +void vector1BitDump(const Vector *pVec){ + u8 *elems = pVec->data; unsigned i; - assert( pVec->type == VECTOR_TYPE_FLOAT32 ); + assert( pVec->type == VECTOR_TYPE_FLOAT1BIT ); + printf("f1bit: ["); for(i = 0; i < pVec->dims; i++){ - printf("%f ", elems[i]); + printf("%s%d", i == 0 ? "" : ", ", ((elems[i / 8] >> (i & 7)) & 1) ? 
+1 : -1); } - printf("\n"); + printf("]\n"); } /************************************************************************** ** Utility routines for vector serialization and deserialization **************************************************************************/ -static inline unsigned formatF32(float value, char *pBuf, int nBufSize){ - sqlite3_snprintf(nBufSize, pBuf, "%g", (double)value); - return strlen(pBuf); -} - -static inline unsigned serializeF32(unsigned char *pBuf, float value){ - u32 *p = (u32 *)&value; - pBuf[0] = *p & 0xFF; - pBuf[1] = (*p >> 8) & 0xFF; - pBuf[2] = (*p >> 16) & 0xFF; - pBuf[3] = (*p >> 24) & 0xFF; - return sizeof(float); -} - -static inline float deserializeF32(const unsigned char *pBuf){ - u32 value = 0; - value |= (u32)pBuf[0]; - value |= (u32)pBuf[1] << 8; - value |= (u32)pBuf[2] << 16; - value |= (u32)pBuf[3] << 24; - return *(float *)&value; -} - -size_t vectorF32SerializeToBlob( +void vector1BitSerializeToBlob( const Vector *pVector, unsigned char *pBlob, size_t nBlobSize ){ - float *elems = pVector->data; - unsigned char *pPtr = pBlob; - size_t len = 0; + u8 *elems = pVector->data; + u8 *pPtr = pBlob; unsigned i; - assert( pVector->type == VECTOR_TYPE_FLOAT32 ); + assert( pVector->type == VECTOR_TYPE_FLOAT1BIT ); assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize >= pVector->dims * sizeof(float) ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + for(i = 0; i < (pVector->dims + 7) / 8; i++){ + pPtr[i] = elems[i]; + } +} + +// [sum(map(int, bin(i)[2:])) for i in range(256)] +static int BitsCount[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 
6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; - for(i = 0; i < pVector->dims; i++){ - pPtr += serializeF32(pPtr, elems[i]); +static inline int sqlite3PopCount32(u32 a){ +#if GCC_VERSION>=5004000 && !defined(__INTEL_COMPILER) + return __builtin_popcount(a); +#else + return BitsCount[a >> 24] + BitsCount[(a >> 16) & 0xff] + BitsCount[(a >> 8) & 0xff] + BitsCount[a & 0xff]; +#endif +} + +int vector1BitDistanceHamming(const Vector *v1, const Vector *v2){ + int diff = 0; + u8 *e1U8 = v1->data; + u32 *e1U32 = v1->data; + u8 *e2U8 = v2->data; + u32 *e2U32 = v2->data; + int i, len8, len32, offset8; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT1BIT ); + assert( v2->type == VECTOR_TYPE_FLOAT1BIT ); + + len8 = (v1->dims + 7) / 8; + len32 = v1->dims / 32; + offset8 = len32 * 4; + + for(i = 0; i < len32; i++){ + diff += sqlite3PopCount32(e1U32[i] ^ e2U32[i]); + } + for(i = offset8; i < len8; i++){ + diff += sqlite3PopCount32(e1U8[i] ^ e2U8[i]); } - return sizeof(float) * pVector->dims; + return diff; } -size_t vectorF32DeserializeFromBlob( +void vector1BitDeserializeFromBlob( Vector *pVector, const unsigned char *pBlob, size_t nBlobSize ){ - float *elems = pVector->data; + u8 *elems = pVector->data; + + assert( pVector->type == VECTOR_TYPE_FLOAT1BIT ); + assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy(elems, pBlob, (pVector->dims + 7) / 8); +} + +#endif /* !defined(SQLITE_OMIT_VECTOR) */ + +/************** End of vectorfloat1bit.c *************************************/ +/************** Begin 
file vectorfloat32.c ***********************************/ +/* +** 2024-07-04 +** +** Copyright 2024 the libSQL authors +** +** Permission is hereby granted, free of charge, to any person obtaining a copy of +** this software and associated documentation files (the "Software"), to deal in +** the Software without restriction, including without limitation the rights to +** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +** the Software, and to permit persons to whom the Software is furnished to do so, +** subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +****************************************************************************** +** +** 32-bit floating point vector format utilities. 
+*/ +#ifndef SQLITE_OMIT_VECTOR +/* #include "sqliteInt.h" */ + +/* #include "vectorInt.h" */ + +/* #include */ + +/************************************************************************** +** Utility routines for debugging +**************************************************************************/ + +void vectorF32Dump(const Vector *pVec){ + float *elems = pVec->data; unsigned i; - pVector->type = VECTOR_TYPE_FLOAT32; - pVector->dims = nBlobSize / sizeof(float); - assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize % 2 == 0 || pBlob[nBlobSize - 1] == VECTOR_TYPE_FLOAT32 ); + assert( pVec->type == VECTOR_TYPE_FLOAT32 ); - for(i = 0; i < pVector->dims; i++){ - elems[i] = deserializeF32(pBlob); - pBlob += sizeof(float); + printf("f32: ["); + for(i = 0; i < pVec->dims; i++){ + printf("%s%f", i == 0 ? "" : ", ", elems[i]); } - return vectorDataSize(pVector->type, pVector->dims); + printf("]\n"); } -void vectorF32Serialize( - sqlite3_context *context, - const Vector *pVector +/************************************************************************** +** Utility routines for vector serialization and deserialization +**************************************************************************/ + +static inline unsigned formatF32(float value, char *pBuf, int nBufSize){ + sqlite3_snprintf(nBufSize, pBuf, "%g", (double)value); + return strlen(pBuf); +} + +void vectorF32SerializeToBlob( + const Vector *pVector, + unsigned char *pBlob, + size_t nBlobSize ){ float *elems = pVector->data; - unsigned char *pBlob; - size_t nBlobSize; + unsigned char *pPtr = pBlob; + size_t len = 0; + unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT32 ); assert( pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); - nBlobSize = vectorDataSize(pVector->type, pVector->dims); - - if( nBlobSize == 0 ){ - sqlite3_result_zeroblob(context, 0); - return; - } - - pBlob = sqlite3_malloc64(nBlobSize); - if( pBlob == NULL ){ - 
sqlite3_result_error_nomem(context); - return; + for(i = 0; i < pVector->dims; i++){ + pPtr += serializeF32(pPtr, elems[i]); } - - vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); - sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); } #define SINGLE_FLOAT_CHAR_LIMIT 32 @@ -213309,37 +214114,22 @@ float vectorF32DistanceL2(const Vector *v1, const Vector *v2){ return sqrt(sum); } -void vectorF32InitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - pVector->dims = nBlobSize / sizeof(float); - pVector->data = (void*)pBlob; -} - -int vectorF32ParseSqliteBlob( - sqlite3_value *arg, +void vectorF32DeserializeFromBlob( Vector *pVector, - char **pzErr + const unsigned char *pBlob, + size_t nBlobSize ){ - const unsigned char *pBlob; float *elems = pVector->data; unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT32 ); assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); - assert( sqlite3_value_type(arg) == SQLITE_BLOB ); - - pBlob = sqlite3_value_blob(arg); - if( sqlite3_value_bytes(arg) < sizeof(float) * pVector->dims ){ - *pzErr = sqlite3_mprintf("invalid f32 vector: not enough bytes for all dimensions"); - goto error; - } + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ elems[i] = deserializeF32(pBlob); pBlob += sizeof(float); } - return 0; -error: - return -1; } #endif /* !defined(SQLITE_OMIT_VECTOR) */ @@ -213386,10 +214176,14 @@ int vectorF32ParseSqliteBlob( void vectorF64Dump(const Vector *pVec){ double *elems = pVec->data; unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT64 ); + + printf("f64: ["); for(i = 0; i < pVec->dims; i++){ - printf("%lf ", elems[i]); + printf("%s%lf", i == 0 ? 
"" : ", ", elems[i]); } - printf("\n"); + printf("]\n"); } /************************************************************************** @@ -213427,7 +214221,7 @@ static inline double deserializeF64(const unsigned char *pBuf){ return *(double *)&value; } -size_t vectorF64SerializeToBlob( +void vectorF64SerializeToBlob( const Vector *pVector, unsigned char *pBlob, size_t nBlobSize @@ -213438,63 +214232,11 @@ size_t vectorF64SerializeToBlob( assert( pVector->type == VECTOR_TYPE_FLOAT64 ); assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize >= pVector->dims * sizeof(double) ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for (i = 0; i < pVector->dims; i++) { pPtr += serializeF64(pPtr, elems[i]); } - return sizeof(double) * pVector->dims; -} - -size_t vectorF64DeserializeFromBlob( - Vector *pVector, - const unsigned char *pBlob, - size_t nBlobSize -){ - double *elems = pVector->data; - unsigned i; - pVector->type = VECTOR_TYPE_FLOAT64; - pVector->dims = nBlobSize / sizeof(double); - - assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize % 2 == 1 && pBlob[nBlobSize - 1] == VECTOR_TYPE_FLOAT64 ); - - for(i = 0; i < pVector->dims; i++){ - elems[i] = deserializeF64(pBlob); - pBlob += sizeof(double); - } - return vectorDataSize(pVector->type, pVector->dims); -} - -void vectorF64Serialize( - sqlite3_context *context, - const Vector *pVector -){ - double *elems = pVector->data; - unsigned char *pBlob; - size_t nBlobSize; - - assert( pVector->type == VECTOR_TYPE_FLOAT64 ); - assert( pVector->dims <= MAX_VECTOR_SZ ); - - // allocate one extra trailing byte with vector blob type metadata - nBlobSize = vectorDataSize(pVector->type, pVector->dims) + 1; - - if( nBlobSize == 0 ){ - sqlite3_result_zeroblob(context, 0); - return; - } - - pBlob = sqlite3_malloc64(nBlobSize); - if( pBlob == NULL ){ - sqlite3_result_error_nomem(context); - return; - } - - vectorF64SerializeToBlob(pVector, pBlob, nBlobSize - 1); - pBlob[nBlobSize - 1] = 
VECTOR_TYPE_FLOAT64; - - sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); } #define SINGLE_DOUBLE_CHAR_LIMIT 32 @@ -213570,42 +214312,192 @@ double vectorF64DistanceL2(const Vector *v1, const Vector *v2){ return sqrt(sum); } -void vectorF64InitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - pVector->dims = nBlobSize / sizeof(double); - pVector->data = (void*)pBlob; -} - -int vectorF64ParseSqliteBlob( - sqlite3_value *arg, +void vectorF64DeserializeFromBlob( Vector *pVector, - char **pzErr + const unsigned char *pBlob, + size_t nBlobSize ){ - const unsigned char *pBlob; double *elems = pVector->data; unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT64 ); assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); - assert( sqlite3_value_type(arg) == SQLITE_BLOB ); - - pBlob = sqlite3_value_blob(arg); - if( sqlite3_value_bytes(arg) < sizeof(double) * pVector->dims ){ - *pzErr = sqlite3_mprintf("invalid f64 vector: not enough bytes for all dimensions"); - goto error; - } + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ elems[i] = deserializeF64(pBlob); pBlob += sizeof(double); } - return 0; -error: - return -1; } #endif /* !defined(SQLITE_OMIT_VECTOR) */ /************** End of vectorfloat64.c ***************************************/ +/************** Begin file vectorfloat8.c ************************************/ +/* +** 2024-07-04 +** +** Copyright 2024 the libSQL authors +** +** Permission is hereby granted, free of charge, to any person obtaining a copy of +** this software and associated documentation files (the "Software"), to deal in +** the Software without restriction, including without limitation the rights to +** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +** the Software, and to permit persons to whom the Software is furnished to do so, +** subject to the following conditions: +** +** The above copyright 
notice and this permission notice shall be included in all +** copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +****************************************************************************** +** +** 8-bit (INT8) floating point vector format utilities. +** +** The idea is to replace vector [f_0, f_1, ... f_k] with quantized uint8 values [q_0, q_1, ..., q_k] in such a way that +** f_i = alpha * q_i + shift, when alpha and shift determined from all f_i values like that: +** alpha = (max(f) - min(f)) / 255, shift = min(f) +** +** This differs from uint8 quantization in neural-network as it usually take form of f_i = alpha * (q_i - z) conversion instead +** But, neural-network uint8 quantization is less generic and works better for distributions centered around zero (symmetric or not) +** In our implementation we want to handle more generic cases - so profits from neural-network-style quantization are not clear +*/ +#ifndef SQLITE_OMIT_VECTOR +/* #include "sqliteInt.h" */ + +/* #include "vectorInt.h" */ + +/* #include */ + +/************************************************************************** +** Utility routines for vector serialization and deserialization +**************************************************************************/ + +void vectorF8GetParameters(const u8 *pData, int dims, float *pAlpha, float *pShift){ + pData = pData + ALIGN(dims, sizeof(float)); + *pAlpha = deserializeF32(pData); + *pShift = deserializeF32(pData + sizeof(*pAlpha)); +} + +void 
vectorF8SetParameters(u8 *pData, int dims, float alpha, float shift){ + pData = pData + ALIGN(dims, sizeof(float)); + serializeF32(pData, alpha); + serializeF32(pData + sizeof(alpha), shift); +} + +void vectorF8Dump(const Vector *pVec){ + u8 *elems = pVec->data; + float alpha, shift; + unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(pVec->data, pVec->dims, &alpha, &shift); + + printf("f8: ["); + for(i = 0; i < pVec->dims; i++){ + printf("%s%f", i == 0 ? "" : ", ", (float)elems[i] * alpha + shift); + } + printf("]\n"); +} + +void vectorF8SerializeToBlob( + const Vector *pVector, + unsigned char *pBlob, + size_t nBlobSize +){ + float alpha, shift; + + assert( pVector->type == VECTOR_TYPE_FLOAT8 ); + assert( pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy(pBlob, pVector->data, pVector->dims); + + vectorF8GetParameters(pVector->data, pVector->dims, &alpha, &shift); + vectorF8SetParameters(pBlob, pVector->dims, alpha, shift); +} + +float vectorF8DistanceCos(const Vector *v1, const Vector *v2){ + int i; + float alpha1, shift1, alpha2, shift2; + u32 sum1 = 0, sum2 = 0, sumsq1 = 0, sumsq2 = 0, doti = 0; + float dot = 0, norm1 = 0, norm2 = 0; + u8 *data1 = v1->data, *data2 = v2->data; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT8 ); + assert( v2->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(v1->data, v1->dims, &alpha1, &shift1); + vectorF8GetParameters(v2->data, v2->dims, &alpha2, &shift2); + + /* + * (Ax + S)^2 = A^2 x^2 + 2AS x + S^2 -> we need to maintain 'sumsq' and 'sum' + * (A1x + S1) * (A2y + S2) = A1A2 xy + A1 S2 x + A2 S1 y + S1 S2 -> we need to maintain 'dot' and 'sum' again + */ + + for(i = 0; i < v1->dims; i++){ + sum1 += data1[i]; + sum2 += data2[i]; + sumsq1 += data1[i]*data1[i]; + sumsq2 += data2[i]*data2[i]; + doti += data1[i]*data2[i]; + } + + dot = alpha1 * alpha2 * (float)doti + alpha1 * shift2 * (float)sum1 + 
alpha2 * shift1 * (float)sum2 + shift1 * shift2 * v1->dims; + norm1 = alpha1 * alpha1 * (float)sumsq1 + 2 * alpha1 * shift1 * (float)sum1 + shift1 * shift1 * v1->dims; + norm2 = alpha2 * alpha2 * (float)sumsq2 + 2 * alpha2 * shift2 * (float)sum2 + shift2 * shift2 * v1->dims; + + return 1.0 - (dot / sqrt(norm1 * norm2)); +} + +float vectorF8DistanceL2(const Vector *v1, const Vector *v2){ + int i; + float alpha1, shift1, alpha2, shift2; + float sum = 0; + u8 *data1 = v1->data, *data2 = v2->data; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT8 ); + assert( v2->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(v1->data, v1->dims, &alpha1, &shift1); + vectorF8GetParameters(v2->data, v2->dims, &alpha2, &shift2); + + for(i = 0; i < v1->dims; i++){ + float d = (alpha1 * data1[i] + shift1) - (alpha2 * data2[i] + shift2); + sum += d*d; + } + return sqrt(sum); +} + +void vectorF8DeserializeFromBlob( + Vector *pVector, + const unsigned char *pBlob, + size_t nBlobSize +){ + float alpha, shift; + + assert( pVector->type == VECTOR_TYPE_FLOAT8 ); + assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy((u8*)pVector->data, (u8*)pBlob, ALIGN(pVector->dims, sizeof(float))); + + vectorF8GetParameters(pBlob, pVector->dims, &alpha, &shift); + vectorF8SetParameters(pVector->data, pVector->dims, alpha, shift); +} + +#endif /* !defined(SQLITE_OMIT_VECTOR) */ + +/************** End of vectorfloat8.c ****************************************/ /************** Begin file vectorIndex.c *************************************/ /* ** 2024-03-18 @@ -213658,11 +214550,6 @@ int vectorF64ParseSqliteBlob( ** VectorIdxParams utilities ****************************************************************************/ -// VACUUM creates tables and indices first and only then populate data -// we need to ignore inserts from 'INSERT INTO vacuum.t SELECT * FROM t' statements because -// all 
shadow tables will be populated by VACUUM process during regular process of table copy -#define IsVacuum(db) ((db->mDbFlags&DBFLAG_Vacuum)!=0) - void vectorIdxParamsInit(VectorIdxParams *pParams, u8 *pBinBuf, int nBinSize) { assert( nBinSize <= VECTOR_INDEX_PARAMS_BUF_SIZE ); @@ -213875,7 +214762,7 @@ int vectorInRowAlloc(sqlite3 *db, const UnpackedRecord *pRecord, VectorInRow *pV vectorInitFromBlob(pVectorInRow->pVector, sqlite3_value_blob(pVectorValue), sqlite3_value_bytes(pVectorValue)); } else if( sqlite3_value_type(pVectorValue) == SQLITE_TEXT ){ // users can put strings (e.g. '[1,2,3]') in the table and we should process them correctly - if( vectorParse(pVectorValue, pVectorInRow->pVector, pzErrMsg) != 0 ){ + if( vectorParseWithType(pVectorValue, pVectorInRow->pVector, pzErrMsg) != 0 ){ rc = SQLITE_ERROR; goto out; } @@ -213987,14 +214874,18 @@ void vectorOutRowsFree(sqlite3 *db, VectorOutRows *pRows) { */ struct VectorColumnType { const char *zName; - int nBits; + int type; }; static struct VectorColumnType VECTOR_COLUMN_TYPES[] = { - { "FLOAT32", 32 }, - { "FLOAT64", 64 }, - { "F32_BLOB", 32 }, - { "F64_BLOB", 64 } + { "FLOAT32", VECTOR_TYPE_FLOAT32 }, + { "F32_BLOB", VECTOR_TYPE_FLOAT32 }, + { "FLOAT64", VECTOR_TYPE_FLOAT64 }, + { "F64_BLOB", VECTOR_TYPE_FLOAT64 }, + { "FLOAT1BIT", VECTOR_TYPE_FLOAT1BIT }, + { "F1BIT_BLOB", VECTOR_TYPE_FLOAT1BIT }, + { "FLOAT8", VECTOR_TYPE_FLOAT8 }, + { "F8_BLOB", VECTOR_TYPE_FLOAT8 }, }; /* @@ -214010,13 +214901,16 @@ struct VectorParamName { }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, - { "max_neighbors", 
VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float1bit", VECTOR_TYPE_FLOAT1BIT }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float8", VECTOR_TYPE_FLOAT8 }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float32", VECTOR_TYPE_FLOAT32 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { @@ -214182,14 +215076,7 @@ int vectorIdxParseColumnType(const char *zType, int *pType, int *pDims, const ch } *pDims = dimensions; - if( VECTOR_COLUMN_TYPES[i].nBits == 32 ) { - *pType = VECTOR_TYPE_FLOAT32; - } else if( VECTOR_COLUMN_TYPES[i].nBits == 64 ) { - *pType = VECTOR_TYPE_FLOAT64; - } else { - *pErrMsg = "unsupported vector type"; - return -1; - } + *pType = VECTOR_COLUMN_TYPES[i].type; return 0; } *pErrMsg = "unexpected vector column type"; @@ -214381,10 +215268,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { // this is done to prevent unrecoverable situations where index were dropped but index parameters deletion failed and second attempt will fail on first step int rcIdx, rcParams; - if( IsVacuum(db) ){ - return SQLITE_OK; - } - assert( zDbSName != NULL ); rcIdx = diskAnnDropIndex(db, zDbSName, zIdxName); @@ -214395,10 +215278,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { assert( zDbSName != NULL ); - if( 
IsVacuum(db) ){ - return SQLITE_OK; - } - return diskAnnClearIndex(db, zDbSName, zIdxName); } @@ -214408,7 +215287,7 @@ int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { * this made intentionally in order to natively support upload of SQLite dumps * * dump populates tables first and create indices after - * so we must omit them because shadow tables already filled + * so we must omit index refill setp because shadow tables already filled * * 1. in case of any error :-1 returned (and pParse errMsg is populated with some error message) * 2. if vector index must not be created : 0 returned @@ -214424,11 +215303,7 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co int i, rc = SQLITE_OK; int dims, type; int hasLibsqlVectorIdxFn = 0, hasCollation = 0; - const char *pzErrMsg; - - if( IsVacuum(pParse->db) ){ - return CREATE_IGNORE; - } + const char *pzErrMsg = NULL; assert( zDbSName != NULL ); @@ -214488,11 +215363,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: must contain exactly one column wrapped into the " VECTOR_INDEX_MARKER_FUNCTION " function"); return CREATE_FAIL; } - // we are able to support this but I doubt this works for now - more polishing required to make this work - if( pIdx->pPartIdxWhere != NULL ) { - sqlite3ErrorMsg(pParse, "vector index: where condition is forbidden"); - return CREATE_FAIL; - } pArgsList = pIdx->aColExpr->a[0].pExpr->x.pList; pListItem = pArgsList->a; @@ -214517,7 +215387,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: %s: %s", pzErrMsg, zEmbeddingColumnTypeName); return CREATE_FAIL; } - // schema is locked while db is initializing and we need to just proceed here if( db->init.busy == 1 ){ return CREATE_OK; @@ -214540,9 +215409,13 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co 
sqlite3ErrorMsg(pParse, "vector index: unsupported for tables without ROWID and composite primary key"); return CREATE_FAIL; } - rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams); + rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams, &pzErrMsg); if( rc != SQLITE_OK ){ - sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann"); + if( pzErrMsg != NULL ){ + sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann: %s", pzErrMsg); + }else{ + sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann"); + } return CREATE_FAIL; } rc = insertIndexParameters(db, zDbSName, pIdx->zName, &idxParams); @@ -214582,7 +215455,6 @@ int vectorIndexSearch( VectorIdxParams idxParams; vectorIdxParamsInit(&idxParams, NULL, 0); - assert( !IsVacuum(db) ); assert( zDbSName != NULL ); if( argc != 3 ){ @@ -214594,17 +215466,14 @@ int vectorIndexSearch( rc = SQLITE_ERROR; goto out; } - if( type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(search): only f32 vectors are supported"); - rc = SQLITE_ERROR; - goto out; - } + assert( type == VECTOR_TYPE_FLOAT32 || type == VECTOR_TYPE_FLOAT64 || type == VECTOR_TYPE_FLOAT1BIT ); + pVector = vectorAlloc(type, dims); if( pVector == NULL ){ rc = SQLITE_NOMEM_BKPT; goto out; } - if( vectorParse(argv[1], pVector, pzErrMsg) != 0 ){ + if( vectorParseWithType(argv[1], pVector, pzErrMsg) != 0 ){ rc = SQLITE_ERROR; goto out; } @@ -214667,10 +215536,6 @@ int vectorIndexInsert( int rc; VectorInRow vectorInRow; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - rc = vectorInRowAlloc(pCur->db, pRecord, &vectorInRow, pzErrMsg); if( rc != SQLITE_OK ){ return rc; @@ -214690,10 +215555,6 @@ int vectorIndexDelete( ){ VectorInRow payload; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - payload.pVector = NULL; payload.nKeys = r->nField - 1; payload.pKeyValues = r->aMem + 1; @@ -259749,40 +260610,46 @@ static int fts5TriCreate( Fts5Tokenizer **ppOut ){ int rc = 
SQLITE_OK; - TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); - UNUSED_PARAM(pUnused); - if( pNew==0 ){ - rc = SQLITE_NOMEM; + TrigramTokenizer *pNew = 0; + + if( nArg%2 ){ + rc = SQLITE_ERROR; }else{ - int i; - pNew->bFold = 1; - pNew->iFoldParam = 0; - for(i=0; rc==SQLITE_OK && ibFold = 1; + pNew->iFoldParam = 0; + for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); + } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ - pNew->bFold = (zArg[0]=='0'); - } - }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ - if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; - }else{ - pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; } - }else{ - rc = SQLITE_ERROR; } - } - if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ - rc = SQLITE_ERROR; - } + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } - if( rc!=SQLITE_OK ){ - fts5TriDelete((Fts5Tokenizer*)pNew); - pNew = 0; + if( rc!=SQLITE_OK ){ + fts5TriDelete((Fts5Tokenizer*)pNew); + pNew = 0; + } } } *ppOut = (Fts5Tokenizer*)pNew; diff --git a/libsql-replication/Cargo.toml b/libsql-replication/Cargo.toml index 56f00d7a7d..068e23a652 100644 --- a/libsql-replication/Cargo.toml +++ b/libsql-replication/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql_replication" -version = "0.4.0" +version = "0.5.0" edition = "2021" description = "libSQL replication protocol" repository = "https://github.com/tursodatabase/libsql" @@ -11,7 +11,8 @@ license = "MIT" [dependencies] tonic = { version = "0.11", features = ["tls"] } prost = "0.12" -libsql-sys = { version = "0.6", path = "../libsql-sys", default-features = false, features = ["wal", "rusqlite", "api"] } +libsql-sys = { version = "0.7", path = "../libsql-sys", default-features = false, features = ["wal", "rusqlite", "api"] } +libsql-wal = { path 
= "../libsql-wal/", optional = true } rusqlite = { workspace = true } parking_lot = "0.12.1" bytes = { version = "1.5.0", features = ["serde"] } @@ -37,3 +38,4 @@ tonic-build = "0.11" [features] encryption = ["libsql-sys/encryption"] +libsql_wal = ["dep:libsql-wal"] diff --git a/libsql-replication/proto/replication_log.proto b/libsql-replication/proto/replication_log.proto index 6208874609..b358232705 100644 --- a/libsql-replication/proto/replication_log.proto +++ b/libsql-replication/proto/replication_log.proto @@ -5,6 +5,12 @@ import "metadata.proto"; message LogOffset { uint64 next_offset = 1; + enum WalFlavor { + Sqlite = 0; + Libsql = 1; + } + // the type of wal frames that the client is expecting + optional WalFlavor wal_flavor = 2; } message HelloRequest { diff --git a/libsql-replication/src/frame.rs b/libsql-replication/src/frame.rs index a6a2854e52..55b5b778b5 100644 --- a/libsql-replication/src/frame.rs +++ b/libsql-replication/src/frame.rs @@ -13,7 +13,6 @@ use crate::LIBSQL_PAGE_SIZE; pub type FrameNo = u64; /// The file header for the WAL log. All fields are represented in little-endian ordering. -/// See `encode` and `decode` for actual layout. // repr C for stable sizing #[repr(C)] #[derive(Debug, Clone, Copy, zerocopy::FromZeroes, zerocopy::FromBytes, zerocopy::AsBytes)] @@ -22,7 +21,7 @@ pub struct FrameHeader { pub frame_no: lu64, /// Rolling checksum of all the previous frames, including this one. pub checksum: lu64, - /// page number, if frame_type is FrameType::Page + /// page number pub page_no: lu32, /// Size of the database (in page) after committing the transaction. 
This is passed from sqlite, /// and serves as commit transaction boundary diff --git a/libsql-replication/src/generated/wal_log.rs b/libsql-replication/src/generated/wal_log.rs index 2d7330e732..a34d5e59dd 100644 --- a/libsql-replication/src/generated/wal_log.rs +++ b/libsql-replication/src/generated/wal_log.rs @@ -4,6 +4,48 @@ pub struct LogOffset { #[prost(uint64, tag = "1")] pub next_offset: u64, + /// the type of wal frames that the client is expecting + #[prost(enumeration = "log_offset::WalFlavor", optional, tag = "2")] + pub wal_flavor: ::core::option::Option, +} +/// Nested message and enum types in `LogOffset`. +pub mod log_offset { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum WalFlavor { + Sqlite = 0, + Libsql = 1, + } + impl WalFlavor { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + WalFlavor::Sqlite => "Sqlite", + WalFlavor::Libsql => "Libsql", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "Sqlite" => Some(Self::Sqlite), + "Libsql" => Some(Self::Libsql), + _ => None, + } + } + } } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/libsql-replication/src/injector/error.rs b/libsql-replication/src/injector/error.rs index 14899089ea..ac8f1be711 100644 --- a/libsql-replication/src/injector/error.rs +++ b/libsql-replication/src/injector/error.rs @@ -1,9 +1,12 @@ +pub type Result = std::result::Result; +pub type BoxError = Box; + #[derive(Debug, thiserror::Error)] pub enum Error { #[error("IO error: {0}")] Io(#[from] std::io::Error), #[error("SQLite error: {0}")] Sqlite(#[from] rusqlite::Error), - #[error("A fatal error occured injecting frames")] - FatalInjectError, + #[error("A fatal error occured injecting frames: {0}")] + FatalInjectError(BoxError), } diff --git a/libsql-replication/src/injector/libsql_injector.rs b/libsql-replication/src/injector/libsql_injector.rs new file mode 100644 index 0000000000..f867a29245 --- /dev/null +++ b/libsql-replication/src/injector/libsql_injector.rs @@ -0,0 +1,45 @@ +use std::mem::size_of; + +use libsql_wal::io::StdIO; +use libsql_wal::replication::injector::Injector; +use libsql_wal::segment::Frame as WalFrame; +use zerocopy::{AsBytes, FromZeroes}; + +use crate::frame::FrameNo; +use crate::rpc::replication::Frame as RpcFrame; + +use super::error::{Error, Result}; + +pub struct LibsqlInjector { + injector: Injector, +} + +impl super::Injector for LibsqlInjector { + async fn inject_frame(&mut self, frame: RpcFrame) -> Result> { + // this is a bit annoying be we want to read the frame, and it has to be aligned, so we + // must copy it... + // FIXME: optimize this. 
+ let mut wal_frame = WalFrame::new_box_zeroed(); + if frame.data.len() != size_of::() { + todo!("invalid frame"); + } + wal_frame.as_bytes_mut().copy_from_slice(&frame.data[..]); + Ok(self + .injector + .insert_frame(wal_frame) + .await + .map_err(|e| Error::FatalInjectError(e.into()))?) + } + + async fn rollback(&mut self) { + self.injector.rollback(); + } + + async fn flush(&mut self) -> Result> { + self.injector + .flush(None) + .await + .map_err(|e| Error::FatalInjectError(e.into()))?; + Ok(None) + } +} diff --git a/libsql-replication/src/injector/mod.rs b/libsql-replication/src/injector/mod.rs index 80443964fe..b139f07cc9 100644 --- a/libsql-replication/src/injector/mod.rs +++ b/libsql-replication/src/injector/mod.rs @@ -1,299 +1,32 @@ -use std::path::Path; -use std::sync::Arc; -use std::{collections::VecDeque, path::PathBuf}; +use std::future::Future; -use parking_lot::Mutex; -use rusqlite::OpenFlags; +use super::rpc::replication::Frame as RpcFrame; +#[cfg(feature = "libsql_wal")] +pub use libsql_injector::LibsqlInjector; +pub use sqlite_injector::SqliteInjector; -use crate::frame::{Frame, FrameNo}; +use crate::frame::FrameNo; pub use error::Error; - -use self::injector_wal::{ - InjectorWal, InjectorWalManager, LIBSQL_INJECT_FATAL, LIBSQL_INJECT_OK, LIBSQL_INJECT_OK_TXN, -}; +use error::Result; mod error; -mod headers; -mod injector_wal; - -#[derive(Debug)] -pub enum InjectError {} - -pub type FrameBuffer = Arc>>; - -pub struct Injector { - /// The injector is in a transaction state - is_txn: bool, - /// Buffer for holding current transaction frames - buffer: FrameBuffer, - /// Maximum capacity of the frame buffer - capacity: usize, - /// Injector connection - // connection must be dropped before the hook context - connection: Arc>>, - biggest_uncommitted_seen: FrameNo, - - // Connection config items used to recreate the injection connection - path: PathBuf, - encryption_config: Option, - auto_checkpoint: u32, -} - -/// Methods from this trait are called 
before and after performing a frame injection. -/// This trait trait is used to record the last committed frame_no to the log. -/// The implementer can persist the pre and post commit frame no, and compare them in the event of -/// a crash; if the pre and post commit frame_no don't match, then the log may be corrupted. -impl Injector { - pub fn new( - path: impl AsRef, - capacity: usize, - auto_checkpoint: u32, - encryption_config: Option, - ) -> Result { - let path = path.as_ref().to_path_buf(); - - let buffer = FrameBuffer::default(); - let wal_manager = InjectorWalManager::new(buffer.clone()); - let connection = libsql_sys::Connection::open( - &path, - OpenFlags::SQLITE_OPEN_READ_WRITE - | OpenFlags::SQLITE_OPEN_CREATE - | OpenFlags::SQLITE_OPEN_URI - | OpenFlags::SQLITE_OPEN_NO_MUTEX, - wal_manager, - auto_checkpoint, - encryption_config.clone(), - )?; - - Ok(Self { - is_txn: false, - buffer, - capacity, - connection: Arc::new(Mutex::new(connection)), - biggest_uncommitted_seen: 0, - - path, - encryption_config, - auto_checkpoint, - }) - } - - /// Inject a frame into the log. If this was a commit frame, returns Ok(Some(FrameNo)). - pub fn inject_frame(&mut self, frame: Frame) -> Result, Error> { - let frame_close_txn = frame.header().size_after.get() != 0; - self.buffer.lock().push_back(frame); - if frame_close_txn || self.buffer.lock().len() >= self.capacity { - return self.flush(); - } +#[cfg(feature = "libsql_wal")] +mod libsql_injector; +mod sqlite_injector; - Ok(None) - } +pub trait Injector { + /// Inject a singular frame. + fn inject_frame( + &mut self, + frame: RpcFrame, + ) -> impl Future>> + Send; - pub fn rollback(&mut self) { - let conn = self.connection.lock(); - let mut rollback = conn.prepare_cached("ROLLBACK").unwrap(); - let _ = rollback.execute(()); - self.is_txn = false; - } + /// Discard any uncommintted frames. + fn rollback(&mut self) -> impl Future + Send; /// Flush the buffer to libsql WAL. 
/// Trigger a dummy write, and flush the cache to trigger a call to xFrame. The buffer's frame /// are then injected into the wal. - pub fn flush(&mut self) -> Result, Error> { - match self.try_flush() { - Err(e) => { - // something went wrong, rollback the connection to make sure we can retry in a - // clean state - self.biggest_uncommitted_seen = 0; - self.rollback(); - Err(e) - } - Ok(ret) => Ok(ret), - } - } - - fn try_flush(&mut self) -> Result, Error> { - if !self.is_txn { - self.begin_txn()?; - } - - let lock = self.buffer.lock(); - // the frames in the buffer are either monotonically increasing (log) or decreasing - // (snapshot). Either way, we want to find the biggest frameno we're about to commit, and - // that is either the front or the back of the buffer - let last_frame_no = match lock.back().zip(lock.front()) { - Some((b, f)) => f.header().frame_no.get().max(b.header().frame_no.get()), - None => { - tracing::trace!("nothing to inject"); - return Ok(None); - } - }; - - self.biggest_uncommitted_seen = self.biggest_uncommitted_seen.max(last_frame_no); - - drop(lock); - - let connection = self.connection.lock(); - // use prepare cached to avoid parsing the same statement over and over again. - let mut stmt = - connection.prepare_cached("INSERT INTO libsql_temp_injection VALUES (42)")?; - - // We execute the statement, and then force a call to xframe if necesacary. If the execute - // succeeds, then xframe wasn't called, in this case, we call cache_flush, and then process - // the error. - // It is unexpected that execute flushes, but it is possible, so we handle that case. 
- match stmt.execute(()).and_then(|_| connection.cache_flush()) { - Ok(_) => panic!("replication hook was not called"), - Err(e) => { - if let Some(e) = e.sqlite_error() { - if e.extended_code == LIBSQL_INJECT_OK { - // refresh schema - connection.pragma_update(None, "writable_schema", "reset")?; - let mut rollback = connection.prepare_cached("ROLLBACK")?; - let _ = rollback.execute(()); - self.is_txn = false; - assert!(self.buffer.lock().is_empty()); - let commit_frame_no = self.biggest_uncommitted_seen; - self.biggest_uncommitted_seen = 0; - return Ok(Some(commit_frame_no)); - } else if e.extended_code == LIBSQL_INJECT_OK_TXN { - self.is_txn = true; - assert!(self.buffer.lock().is_empty()); - return Ok(None); - } else if e.extended_code == LIBSQL_INJECT_FATAL { - return Err(Error::FatalInjectError); - } - } - - Err(Error::FatalInjectError) - } - } - } - - fn begin_txn(&mut self) -> Result<(), Error> { - let mut conn = self.connection.lock(); - - { - let wal_manager = InjectorWalManager::new(self.buffer.clone()); - let new_conn = libsql_sys::Connection::open( - &self.path, - OpenFlags::SQLITE_OPEN_READ_WRITE - | OpenFlags::SQLITE_OPEN_CREATE - | OpenFlags::SQLITE_OPEN_URI - | OpenFlags::SQLITE_OPEN_NO_MUTEX, - wal_manager, - self.auto_checkpoint, - self.encryption_config.clone(), - )?; - - let _ = std::mem::replace(&mut *conn, new_conn); - } - - conn.pragma_update(None, "writable_schema", "true")?; - - let mut stmt = conn.prepare_cached("BEGIN IMMEDIATE")?; - stmt.execute(())?; - // we create a dummy table. This table MUST not be persisted, otherwise the replica schema - // would differ with the primary's. 
- let mut stmt = - conn.prepare_cached("CREATE TABLE IF NOT EXISTS libsql_temp_injection (x)")?; - stmt.execute(())?; - - Ok(()) - } - - pub fn clear_buffer(&mut self) { - self.buffer.lock().clear() - } - - #[cfg(test)] - pub fn is_txn(&self) -> bool { - self.is_txn - } -} - -#[cfg(test)] -mod test { - use crate::frame::FrameBorrowed; - use std::mem::size_of; - - use super::*; - /// this this is generated by creating a table test, inserting 5 rows into it, and then - /// truncating the wal file of it's header. - const WAL: &[u8] = include_bytes!("../../assets/test/test_wallog"); - - fn wal_log() -> impl Iterator { - WAL.chunks(size_of::()) - .map(|b| Frame::try_from(b).unwrap()) - } - - #[test] - fn test_simple_inject_frames() { - let temp = tempfile::tempdir().unwrap(); - - let mut injector = Injector::new(temp.path().join("data"), 10, 10000, None).unwrap(); - let log = wal_log(); - for frame in log { - injector.inject_frame(frame).unwrap(); - } - - let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); - - conn.query_row("SELECT COUNT(*) FROM test", (), |row| { - assert_eq!(row.get::<_, usize>(0).unwrap(), 5); - Ok(()) - }) - .unwrap(); - } - - #[test] - fn test_inject_frames_split_txn() { - let temp = tempfile::tempdir().unwrap(); - - // inject one frame at a time - let mut injector = Injector::new(temp.path().join("data"), 1, 10000, None).unwrap(); - let log = wal_log(); - for frame in log { - injector.inject_frame(frame).unwrap(); - } - - let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); - - conn.query_row("SELECT COUNT(*) FROM test", (), |row| { - assert_eq!(row.get::<_, usize>(0).unwrap(), 5); - Ok(()) - }) - .unwrap(); - } - - #[test] - fn test_inject_partial_txn_isolated() { - let temp = tempfile::tempdir().unwrap(); - - // inject one frame at a time - let mut injector = Injector::new(temp.path().join("data"), 10, 1000, None).unwrap(); - let mut frames = wal_log(); - - assert!(injector - 
.inject_frame(frames.next().unwrap()) - .unwrap() - .is_none()); - let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); - assert!(conn - .query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) - .is_err()); - - while injector - .inject_frame(frames.next().unwrap()) - .unwrap() - .is_none() - {} - - // reset schema - conn.pragma_update(None, "writable_schema", "reset") - .unwrap(); - conn.query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) - .unwrap(); - } + fn flush(&mut self) -> impl Future>> + Send; } diff --git a/libsql-replication/src/injector/headers.rs b/libsql-replication/src/injector/sqlite_injector/headers.rs similarity index 100% rename from libsql-replication/src/injector/headers.rs rename to libsql-replication/src/injector/sqlite_injector/headers.rs diff --git a/libsql-replication/src/injector/injector_wal.rs b/libsql-replication/src/injector/sqlite_injector/injector_wal.rs similarity index 100% rename from libsql-replication/src/injector/injector_wal.rs rename to libsql-replication/src/injector/sqlite_injector/injector_wal.rs diff --git a/libsql-replication/src/injector/sqlite_injector/mod.rs b/libsql-replication/src/injector/sqlite_injector/mod.rs new file mode 100644 index 0000000000..f6ce2aa89f --- /dev/null +++ b/libsql-replication/src/injector/sqlite_injector/mod.rs @@ -0,0 +1,348 @@ +use std::path::Path; +use std::sync::Arc; +use std::{collections::VecDeque, path::PathBuf}; + +use parking_lot::Mutex; +use rusqlite::OpenFlags; +use tokio::task::spawn_blocking; + +use crate::frame::{Frame, FrameNo}; +use crate::rpc::replication::Frame as RpcFrame; + +use self::injector_wal::{ + InjectorWal, InjectorWalManager, LIBSQL_INJECT_FATAL, LIBSQL_INJECT_OK, LIBSQL_INJECT_OK_TXN, +}; + +use super::error::Result; +use super::{Error, Injector}; + +mod headers; +mod injector_wal; + +pub type FrameBuffer = Arc>>; + +pub struct SqliteInjector { + pub(in super::super) inner: Arc>, +} + +impl Injector for SqliteInjector { + async fn 
inject_frame(&mut self, frame: RpcFrame) -> Result> { + let inner = self.inner.clone(); + let frame = + Frame::try_from(&frame.data[..]).map_err(|e| Error::FatalInjectError(e.into()))?; + spawn_blocking(move || inner.lock().inject_frame(frame)) + .await + .unwrap() + } + + async fn rollback(&mut self) { + let inner = self.inner.clone(); + spawn_blocking(move || inner.lock().rollback()) + .await + .unwrap(); + } + + async fn flush(&mut self) -> Result> { + let inner = self.inner.clone(); + spawn_blocking(move || inner.lock().flush()).await.unwrap() + } +} + +impl SqliteInjector { + pub async fn new( + path: PathBuf, + capacity: usize, + auto_checkpoint: u32, + encryption_config: Option, + ) -> super::Result { + let inner = spawn_blocking(move || { + SqliteInjectorInner::new(path, capacity, auto_checkpoint, encryption_config) + }) + .await + .unwrap()?; + + Ok(Self { + inner: Arc::new(Mutex::new(inner)), + }) + } +} + +pub(in super::super) struct SqliteInjectorInner { + /// The injector is in a transaction state + is_txn: bool, + /// Buffer for holding current transaction frames + buffer: FrameBuffer, + /// Maximum capacity of the frame buffer + capacity: usize, + /// Injector connection + // connection must be dropped before the hook context + connection: Arc>>, + biggest_uncommitted_seen: FrameNo, + + // Connection config items used to recreate the injection connection + path: PathBuf, + encryption_config: Option, + auto_checkpoint: u32, +} + +/// Methods from this trait are called before and after performing a frame injection. +/// This trait trait is used to record the last committed frame_no to the log. +/// The implementer can persist the pre and post commit frame no, and compare them in the event of +/// a crash; if the pre and post commit frame_no don't match, then the log may be corrupted. 
+impl SqliteInjectorInner { + fn new( + path: impl AsRef, + capacity: usize, + auto_checkpoint: u32, + encryption_config: Option, + ) -> Result { + let path = path.as_ref().to_path_buf(); + + let buffer = FrameBuffer::default(); + let wal_manager = InjectorWalManager::new(buffer.clone()); + let connection = libsql_sys::Connection::open( + &path, + OpenFlags::SQLITE_OPEN_READ_WRITE + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX, + wal_manager, + auto_checkpoint, + encryption_config.clone(), + )?; + + Ok(Self { + is_txn: false, + buffer, + capacity, + connection: Arc::new(Mutex::new(connection)), + biggest_uncommitted_seen: 0, + + path, + encryption_config, + auto_checkpoint, + }) + } + + /// Inject a frame into the log. If this was a commit frame, returns Ok(Some(FrameNo)). + pub fn inject_frame(&mut self, frame: Frame) -> Result, Error> { + let frame_close_txn = frame.header().size_after.get() != 0; + self.buffer.lock().push_back(frame); + if frame_close_txn || self.buffer.lock().len() >= self.capacity { + return self.flush(); + } + + Ok(None) + } + + pub fn rollback(&mut self) { + self.clear_buffer(); + let conn = self.connection.lock(); + let mut rollback = conn.prepare_cached("ROLLBACK").unwrap(); + let _ = rollback.execute(()); + self.is_txn = false; + } + + /// Flush the buffer to libsql WAL. + /// Trigger a dummy write, and flush the cache to trigger a call to xFrame. The buffer's frame + /// are then injected into the wal. 
+ pub fn flush(&mut self) -> Result, Error> { + match self.try_flush() { + Err(e) => { + // something went wrong, rollback the connection to make sure we can retry in a + // clean state + self.biggest_uncommitted_seen = 0; + self.rollback(); + Err(e) + } + Ok(ret) => Ok(ret), + } + } + + fn try_flush(&mut self) -> Result, Error> { + if !self.is_txn { + self.begin_txn()?; + } + + let lock = self.buffer.lock(); + // the frames in the buffer are either monotonically increasing (log) or decreasing + // (snapshot). Either way, we want to find the biggest frameno we're about to commit, and + // that is either the front or the back of the buffer + let last_frame_no = match lock.back().zip(lock.front()) { + Some((b, f)) => f.header().frame_no.get().max(b.header().frame_no.get()), + None => { + tracing::trace!("nothing to inject"); + return Ok(None); + } + }; + + self.biggest_uncommitted_seen = self.biggest_uncommitted_seen.max(last_frame_no); + + drop(lock); + + let connection = self.connection.lock(); + // use prepare cached to avoid parsing the same statement over and over again. + let mut stmt = + connection.prepare_cached("INSERT INTO libsql_temp_injection VALUES (42)")?; + + // We execute the statement, and then force a call to xframe if necesacary. If the execute + // succeeds, then xframe wasn't called, in this case, we call cache_flush, and then process + // the error. + // It is unexpected that execute flushes, but it is possible, so we handle that case. 
+ match stmt.execute(()).and_then(|_| connection.cache_flush()) { + Ok(_) => panic!("replication hook was not called"), + Err(e) => { + if let Some(err) = e.sqlite_error() { + if err.extended_code == LIBSQL_INJECT_OK { + // refresh schema + connection.pragma_update(None, "writable_schema", "reset")?; + let mut rollback = connection.prepare_cached("ROLLBACK")?; + let _ = rollback.execute(()); + self.is_txn = false; + assert!(self.buffer.lock().is_empty()); + let commit_frame_no = self.biggest_uncommitted_seen; + self.biggest_uncommitted_seen = 0; + return Ok(Some(commit_frame_no)); + } else if err.extended_code == LIBSQL_INJECT_OK_TXN { + self.is_txn = true; + assert!(self.buffer.lock().is_empty()); + return Ok(None); + } else if err.extended_code == LIBSQL_INJECT_FATAL { + return Err(Error::FatalInjectError(e.into())); + } + } + + Err(Error::FatalInjectError(e.into())) + } + } + } + + fn begin_txn(&mut self) -> Result<(), Error> { + let mut conn = self.connection.lock(); + + { + let wal_manager = InjectorWalManager::new(self.buffer.clone()); + let new_conn = libsql_sys::Connection::open( + &self.path, + OpenFlags::SQLITE_OPEN_READ_WRITE + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX, + wal_manager, + self.auto_checkpoint, + self.encryption_config.clone(), + )?; + + let _ = std::mem::replace(&mut *conn, new_conn); + } + + conn.pragma_update(None, "writable_schema", "true")?; + + let mut stmt = conn.prepare_cached("BEGIN IMMEDIATE")?; + stmt.execute(())?; + // we create a dummy table. This table MUST not be persisted, otherwise the replica schema + // would differ with the primary's. 
+ let mut stmt = + conn.prepare_cached("CREATE TABLE IF NOT EXISTS libsql_temp_injection (x)")?; + stmt.execute(())?; + + Ok(()) + } + + pub fn clear_buffer(&mut self) { + self.buffer.lock().clear() + } + + #[cfg(test)] + pub fn is_txn(&self) -> bool { + self.is_txn + } +} + +#[cfg(test)] +mod test { + use crate::frame::FrameBorrowed; + use std::mem::size_of; + + use super::*; + /// this this is generated by creating a table test, inserting 5 rows into it, and then + /// truncating the wal file of it's header. + const WAL: &[u8] = include_bytes!("../../../assets/test/test_wallog"); + + fn wal_log() -> impl Iterator { + WAL.chunks(size_of::()) + .map(|b| Frame::try_from(b).unwrap()) + } + + #[test] + fn test_simple_inject_frames() { + let temp = tempfile::tempdir().unwrap(); + + let mut injector = + SqliteInjectorInner::new(temp.path().join("data"), 10, 10000, None).unwrap(); + let log = wal_log(); + for frame in log { + injector.inject_frame(frame).unwrap(); + } + + let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); + + conn.query_row("SELECT COUNT(*) FROM test", (), |row| { + assert_eq!(row.get::<_, usize>(0).unwrap(), 5); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_inject_frames_split_txn() { + let temp = tempfile::tempdir().unwrap(); + + // inject one frame at a time + let mut injector = + SqliteInjectorInner::new(temp.path().join("data"), 1, 10000, None).unwrap(); + let log = wal_log(); + for frame in log { + injector.inject_frame(frame).unwrap(); + } + + let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); + + conn.query_row("SELECT COUNT(*) FROM test", (), |row| { + assert_eq!(row.get::<_, usize>(0).unwrap(), 5); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_inject_partial_txn_isolated() { + let temp = tempfile::tempdir().unwrap(); + + // inject one frame at a time + let mut injector = + SqliteInjectorInner::new(temp.path().join("data"), 10, 1000, None).unwrap(); + let mut frames = wal_log(); + + 
assert!(injector + .inject_frame(frames.next().unwrap()) + .unwrap() + .is_none()); + let conn = rusqlite::Connection::open(temp.path().join("data")).unwrap(); + assert!(conn + .query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) + .is_err()); + + while injector + .inject_frame(frames.next().unwrap()) + .unwrap() + .is_none() + {} + + // reset schema + conn.pragma_update(None, "writable_schema", "reset") + .unwrap(); + conn.query_row("SELECT COUNT(*) FROM test", (), |_| Ok(())) + .unwrap(); + } +} diff --git a/libsql-replication/src/replicator.rs b/libsql-replication/src/replicator.rs index bc1eada7f8..38cdbf6e7c 100644 --- a/libsql-replication/src/replicator.rs +++ b/libsql-replication/src/replicator.rs @@ -1,14 +1,11 @@ use std::path::PathBuf; -use std::sync::Arc; -use parking_lot::Mutex; -use tokio::task::spawn_blocking; use tokio::time::Duration; use tokio_stream::{Stream, StreamExt}; use tonic::{Code, Status}; use crate::frame::{Frame, FrameNo}; -use crate::injector::Injector; +use crate::injector::{Injector, SqliteInjector}; use crate::rpc::replication::{ Frame as RpcFrame, NAMESPACE_DOESNT_EXIST, NEED_SNAPSHOT_ERROR_MSG, NO_HELLO_ERROR_MSG, }; @@ -66,7 +63,7 @@ impl From for Error { #[async_trait::async_trait] pub trait ReplicatorClient { - type FrameStream: Stream> + Unpin + Send; + type FrameStream: Stream> + Unpin + Send; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error>; @@ -137,9 +134,9 @@ where /// The `Replicator`'s duty is to download frames from the primary, and pass them to the injector at /// transaction boundaries. 
-pub struct Replicator { +pub struct Replicator { client: C, - injector: Arc>, + injector: I, state: ReplicatorState, frames_synced: usize, } @@ -154,33 +151,41 @@ enum ReplicatorState { Exit, } -impl Replicator { +impl Replicator +where + C: ReplicatorClient, +{ /// Creates a replicator for the db file pointed at by `db_path` - pub async fn new( + pub async fn new_sqlite( client: C, db_path: PathBuf, auto_checkpoint: u32, encryption_config: Option, ) -> Result { - let injector = { - let db_path = db_path.clone(); - spawn_blocking(move || { - Injector::new( - db_path, - INJECTOR_BUFFER_CAPACITY, - auto_checkpoint, - encryption_config, - ) - }) - .await?? - }; + let injector = SqliteInjector::new( + db_path.clone(), + INJECTOR_BUFFER_CAPACITY, + auto_checkpoint, + encryption_config, + ) + .await?; + + Ok(Self::new(client, injector)) + } +} - Ok(Self { +impl Replicator +where + C: ReplicatorClient, + I: Injector, +{ + pub fn new(client: C, injector: I) -> Self { + Self { client, - injector: Arc::new(Mutex::new(injector)), + injector, state: ReplicatorState::NeedHandshake, frames_synced: 0, - }) + } } /// for a handshake on next call to replicate. @@ -250,7 +255,7 @@ impl Replicator { // in case of error we rollback the current injector transaction, and start over. 
if ret.is_err() { self.client.rollback(); - self.injector.lock().rollback(); + self.injector.rollback().await; } self.state = match ret { @@ -293,7 +298,8 @@ impl Replicator { } async fn load_snapshot(&mut self) -> Result<(), Error> { - self.injector.lock().clear_buffer(); + self.client.rollback(); + self.injector.rollback().await; loop { match self.client.snapshot().await { Ok(mut stream) => { @@ -312,29 +318,25 @@ impl Replicator { } } - async fn inject_frame(&mut self, frame: Frame) -> Result<(), Error> { + async fn inject_frame(&mut self, frame: RpcFrame) -> Result<(), Error> { self.frames_synced += 1; - let injector = self.injector.clone(); - match spawn_blocking(move || injector.lock().inject_frame(frame)).await? { - Ok(Some(commit_fno)) => { + match self.injector.inject_frame(frame).await? { + Some(commit_fno) => { self.client.commit_frame_no(commit_fno).await?; } - Ok(None) => (), - Err(e) => Err(e)?, + None => (), } Ok(()) } pub async fn flush(&mut self) -> Result<(), Error> { - let injector = self.injector.clone(); - match spawn_blocking(move || injector.lock().flush()).await? { - Ok(Some(commit_fno)) => { + match self.injector.flush().await? 
{ + Some(commit_fno) => { self.client.commit_frame_no(commit_fno).await?; } - Ok(None) => (), - Err(e) => Err(e)?, + None => (), } Ok(()) @@ -358,6 +360,7 @@ mod test { use async_stream::stream; use crate::frame::{FrameBorrowed, FrameMut}; + use crate::rpc::replication::Frame as RpcFrame; use super::*; @@ -368,7 +371,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -395,7 +399,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); @@ -412,7 +416,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -438,7 +443,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -454,7 +459,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -482,7 +488,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we 
assume that we already received the handshake and the handshake is not valid anymore @@ -498,7 +504,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -526,7 +533,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -542,7 +549,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -568,7 +576,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -584,7 +592,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -610,7 +619,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); replicator.state = ReplicatorState::NeedSnapshot; @@ -625,7 +634,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for 
Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -653,7 +663,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); // we assume that we already received the handshake and the handshake is not valid anymore @@ -670,7 +680,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -696,7 +707,7 @@ mod test { fn rollback(&mut self) {} } - let mut replicator = Replicator::new(Client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(Client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); replicator.state = ReplicatorState::NeedHandshake; @@ -738,7 +749,8 @@ mod test { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = + Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -750,15 +762,26 @@ mod test { let frames = self .frames .iter() + .map(|f| RpcFrame { + data: f.bytes(), + timestamp: None, + }) .take(2) - .cloned() .map(Ok) .chain(Some(Err(Error::Client("some client error".into())))) .collect::>(); Ok(Box::pin(tokio_stream::iter(frames))) } else { - let stream = tokio_stream::iter(self.frames.clone().into_iter().map(Ok)); - Ok(Box::pin(stream)) + let iter = self + .frames + .iter() + .map(|f| RpcFrame { + data: f.bytes(), + timestamp: None, + }) + .map(Ok) + .collect::>(); + Ok(Box::pin(tokio_stream::iter(iter))) } } /// Return a snapshot for the current 
replication index. Called after next_frame has returned a @@ -784,7 +807,7 @@ mod test { committed_frame_no: None, }; - let mut replicator = Replicator::new(client, tmp.path().to_path_buf(), 10000, None) + let mut replicator = Replicator::new_sqlite(client, tmp.path().to_path_buf(), 10000, None) .await .unwrap(); @@ -795,7 +818,7 @@ mod test { replicator.try_replicate_step().await.unwrap_err(), Error::Client(_) )); - assert!(!replicator.injector.lock().is_txn()); + assert!(!replicator.injector.inner.lock().is_txn()); assert!(replicator.client_mut().committed_frame_no.is_none()); assert_eq!(replicator.state, ReplicatorState::NeedHandshake); @@ -805,7 +828,7 @@ mod test { replicator.client_mut().should_error = false; replicator.try_replicate_step().await.unwrap(); - assert!(!replicator.injector.lock().is_txn()); + assert!(!replicator.injector.inner.lock().is_txn()); assert_eq!(replicator.state, ReplicatorState::Exit); assert_eq!(replicator.client_mut().committed_frame_no, Some(6)); } diff --git a/libsql-replication/src/rpc.rs b/libsql-replication/src/rpc.rs index ebc92cf10c..a538bc4c28 100644 --- a/libsql-replication/src/rpc.rs +++ b/libsql-replication/src/rpc.rs @@ -25,6 +25,7 @@ pub mod replication { #![allow(clippy::all)] use uuid::Uuid; + include!("generated/wal_log.rs"); pub const NO_HELLO_ERROR_MSG: &str = "NO_HELLO"; diff --git a/libsql-server/Cargo.toml b/libsql-server/Cargo.toml index 6763c02dfb..7dd5c965a2 100644 --- a/libsql-server/Cargo.toml +++ b/libsql-server/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "libsql-server" -version = "0.24.18" +version = "0.24.19" edition = "2021" default-run = "sqld" +repository = "https://github.com/tursodatabase/libsql" [[bin]] name = "sqld" @@ -83,7 +84,7 @@ url = { version = "2.3", features = ["serde"] } uuid = { version = "1.3", features = ["v4", "serde", "v7"] } aes = { version = "0.8.3", optional = true } cbc = { version = "0.1.2", optional = true } -zerocopy = { version = "0.7.28", features = ["derive", "alloc"] } 
+zerocopy = { workspace = true } hashbrown = { version = "0.14.3", features = ["serde"] } hdrhistogram = "7.5.4" crossbeam = "0.8.4" diff --git a/libsql-server/src/connection/libsql.rs b/libsql-server/src/connection/libsql.rs index aadff6190b..2e9183567a 100644 --- a/libsql-server/src/connection/libsql.rs +++ b/libsql-server/src/connection/libsql.rs @@ -15,7 +15,9 @@ use tokio::sync::watch; use tokio::time::{Duration, Instant}; use crate::error::Error; -use crate::metrics::{DESCRIBE_COUNT, PROGRAM_EXEC_COUNT, VACUUM_COUNT, WAL_CHECKPOINT_COUNT}; +use crate::metrics::{ + DESCRIBE_COUNT, PROGRAM_EXEC_COUNT, QUERY_CANCELED, VACUUM_COUNT, WAL_CHECKPOINT_COUNT, +}; use crate::namespace::broadcasters::BroadcasterHandle; use crate::namespace::meta_store::MetaStoreHandle; use crate::namespace::ResolveNamespacePathFn; @@ -391,14 +393,45 @@ where ctx: RequestContext, builder: B, ) -> Result<(B, Program)> { + struct Bomb { + canceled: Arc, + defused: bool, + } + + impl Drop for Bomb { + fn drop(&mut self) { + if !self.defused { + tracing::trace!("cancelling request"); + self.canceled.store(true, Ordering::Relaxed); + } + } + } + + let canceled = { + let cancelled = self.inner.lock().canceled.clone(); + cancelled.store(false, Ordering::Relaxed); + cancelled + }; + PROGRAM_EXEC_COUNT.increment(1); - check_program_auth(&ctx, &pgm, &self.inner.lock().config_store.get())?; + let config = self.inner.lock().config_store.get(); + check_program_auth(&ctx, &pgm, &config).await?; + + // create the bomb right before spawning the blocking task. 
+ let mut bomb = Bomb { + canceled, + defused: false, + }; let conn = self.inner.clone(); - BLOCKING_RT + let ret = BLOCKING_RT .spawn_blocking(move || Connection::run(conn, pgm, builder)) .await - .unwrap() + .unwrap(); + + bomb.defused = true; + + ret } } @@ -413,6 +446,7 @@ pub(super) struct Connection { forced_rollback: bool, broadcaster: BroadcasterHandle, hooked: bool, + canceled: Arc, } fn update_stats( @@ -475,6 +509,20 @@ impl Connection { ); } + let canceled = Arc::new(AtomicBool::new(false)); + + conn.progress_handler(100, { + let canceled = canceled.clone(); + Some(move || { + let canceled = canceled.load(Ordering::Relaxed); + if canceled { + QUERY_CANCELED.increment(1); + tracing::trace!("request canceled"); + } + canceled + }) + }); + let this = Self { conn, stats, @@ -486,6 +534,7 @@ impl Connection { forced_rollback: false, broadcaster, hooked: false, + canceled, }; for ext in extensions.iter() { @@ -795,6 +844,7 @@ mod test { forced_rollback: false, broadcaster: Default::default(), hooked: false, + canceled: Arc::new(false.into()), }; let conn = Arc::new(Mutex::new(conn)); diff --git a/libsql-server/src/connection/program.rs b/libsql-server/src/connection/program.rs index f128c7538f..29ef408b68 100644 --- a/libsql-server/src/connection/program.rs +++ b/libsql-server/src/connection/program.rs @@ -341,7 +341,7 @@ fn value_size(val: &rusqlite::types::ValueRef) -> usize { } } -pub fn check_program_auth( +pub async fn check_program_auth( ctx: &RequestContext, pgm: &Program, config: &DatabaseConfig, @@ -363,7 +363,7 @@ pub fn check_program_auth( } StmtKind::Attach(ref ns) => { ctx.auth.has_right(ns, Permission::AttachRead)?; - if !ctx.meta_store.handle(ns.clone()).get().allow_attach { + if !ctx.meta_store.handle(ns.clone()).await.get().allow_attach { return Err(Error::NotAuthorized(format!( "Namespace `{ns}` doesn't allow attach" ))); diff --git a/libsql-server/src/database/schema.rs b/libsql-server/src/database/schema.rs index 0b9674bd60..4195f4603e 
100644 --- a/libsql-server/src/database/schema.rs +++ b/libsql-server/src/database/schema.rs @@ -50,7 +50,7 @@ impl crate::connection::Connection for SchemaConnection { res } else { - check_program_auth(&ctx, &migration, &self.config.get())?; + check_program_auth(&ctx, &migration, &self.config.get()).await?; let connection = self.connection.clone(); validate_migration(&mut migration)?; let migration = Arc::new(migration); diff --git a/libsql-server/src/error.rs b/libsql-server/src/error.rs index 371630abdf..9cd0b81485 100644 --- a/libsql-server/src/error.rs +++ b/libsql-server/src/error.rs @@ -4,7 +4,7 @@ use tonic::metadata::errors::InvalidMetadataValueBytes; use crate::{ auth::AuthError, - namespace::{ForkError, NamespaceName}, + namespace::{configurator::fork::ForkError, NamespaceName}, query_result_builder::QueryResultBuilderError, }; diff --git a/libsql-server/src/http/admin/mod.rs b/libsql-server/src/http/admin/mod.rs index 683d67995e..908b66e3f2 100644 --- a/libsql-server/src/http/admin/mod.rs +++ b/libsql-server/src/http/admin/mod.rs @@ -93,9 +93,29 @@ where tokio::task::spawn(async move { loop { - tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let runtime = tokio::runtime::Handle::current(); + let metrics = runtime.metrics(); + crate::metrics::TOKIO_RUNTIME_BLOCKING_QUEUE_DEPTH + .set(metrics.blocking_queue_depth() as f64); + crate::metrics::TOKIO_RUNTIME_INJECTION_QUEUE_DEPTH + .set(metrics.injection_queue_depth() as f64); + crate::metrics::TOKIO_RUNTIME_NUM_BLOCKING_THREADS + .set(metrics.num_blocking_threads() as f64); + crate::metrics::TOKIO_RUNTIME_NUM_IDLE_BLOCKING_THREADS + .set(metrics.num_idle_blocking_threads() as f64); + crate::metrics::TOKIO_RUNTIME_NUM_WORKERS.set(metrics.num_workers() as f64); + + crate::metrics::TOKIO_RUNTIME_IO_DRIVER_FD_DEREGISTERED_COUNT + .absolute(metrics.io_driver_fd_deregistered_count() as u64); + crate::metrics::TOKIO_RUNTIME_IO_DRIVER_FD_REGISTERED_COUNT + 
.absolute(metrics.io_driver_fd_registered_count() as u64); + crate::metrics::TOKIO_RUNTIME_IO_DRIVER_READY_COUNT + .absolute(metrics.io_driver_ready_count() as u64); + crate::metrics::TOKIO_RUNTIME_REMOTE_SCHEDULE_COUNT + .absolute(metrics.remote_schedule_count() as u64); crate::metrics::SERVER_COUNT.set(1.0); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; } }); @@ -331,7 +351,7 @@ async fn handle_create_namespace( )); } // TODO: move this check into meta store - if !app_state.namespaces.exists(&ns) { + if !app_state.namespaces.exists(&ns).await { return Err(Error::NamespaceDoesntExist(ns.to_string())); } diff --git a/libsql-server/src/lib.rs b/libsql-server/src/lib.rs index 5404a11108..4b97b442f5 100644 --- a/libsql-server/src/lib.rs +++ b/libsql-server/src/lib.rs @@ -4,7 +4,6 @@ use std::alloc::Layout; use std::ffi::c_void; use std::mem::{align_of, size_of}; use std::path::{Path, PathBuf}; -use std::pin::Pin; use std::str::FromStr; use std::sync::{Arc, Weak}; @@ -29,10 +28,11 @@ use auth::Auth; use config::{ AdminApiConfig, DbConfig, HeartbeatConfig, RpcClientConfig, RpcServerConfig, UserApiConfig, }; -use futures::future::ready; +use futures::future::{pending, ready}; use futures::Future; use http::user::UserApi; use hyper::client::HttpConnector; +use hyper::Uri; use hyper_rustls::HttpsConnector; #[cfg(feature = "durable-wal")] use libsql_storage::{DurableWalManager, LockManager}; @@ -46,7 +46,7 @@ use libsql_wal::registry::WalRegistry; use libsql_wal::storage::NoStorage; use libsql_wal::wal::LibsqlWalManager; use namespace::meta_store::MetaStoreHandle; -use namespace::{NamespaceConfig, NamespaceName}; +use namespace::NamespaceName; use net::Connector; use once_cell::sync::Lazy; use rusqlite::ffi::SQLITE_CONFIG_MALLOC; @@ -55,14 +55,20 @@ use tokio::runtime::Runtime; use tokio::sync::{mpsc, Notify, Semaphore}; use tokio::task::JoinSet; use tokio::time::Duration; +use tonic::transport::Channel; use url::Url; use 
utils::services::idle_shutdown::IdleShutdownKicker; use self::config::MetaStoreConfig; use self::connection::connection_manager::InnerWalManager; +use self::namespace::configurator::{ + BaseNamespaceConfig, NamespaceConfigurators, PrimaryConfigurator, PrimaryExtraConfig, + ReplicaConfigurator, SchemaConfigurator, +}; use self::namespace::NamespaceStore; use self::net::AddrIncoming; use self::replication::script_backup_manager::{CommandHandler, ScriptBackupManager}; +use self::schema::SchedulerHandle; pub mod auth; mod broadcaster; @@ -336,7 +342,8 @@ where config.heartbeat_url.as_deref().unwrap_or(""), config.heartbeat_period, ); - join_set.spawn({ + + self.spawn_until_shutdown_on(join_set, { let heartbeat_auth = config.heartbeat_auth.clone(); let heartbeat_period = config.heartbeat_period; let heartbeat_url = if let Some(url) = &config.heartbeat_url { @@ -423,60 +430,47 @@ where let extensions = self.db_config.validate_extensions()?; let user_auth_strategy = self.user_api_config.auth_strategy.clone(); - let service_shutdown = Arc::new(Notify::new()); - let db_kind = if self.rpc_client_config.is_some() { - DatabaseKind::Replica - } else { - DatabaseKind::Primary - }; - let scripted_backup = match self.db_config.snapshot_exec { Some(ref command) => { let (scripted_backup, script_backup_task) = ScriptBackupManager::new(&self.path, CommandHandler::new(command.to_string())) .await?; - join_set.spawn(script_backup_task.run()); + self.spawn_until_shutdown_on(&mut join_set, script_backup_task.run()); Some(scripted_backup) } None => None, }; - let (channel, uri) = match self.rpc_client_config { - Some(ref config) => { - let (channel, uri) = config.configure().await?; - (Some(channel), Some(uri)) - } - None => (None, None), + let db_kind = match self.rpc_client_config { + Some(_) => DatabaseKind::Replica, + _ => DatabaseKind::Primary, }; + let client_config = self.get_client_config().await?; let (scheduler_sender, scheduler_receiver) = mpsc::channel(128); - let 
(stats_sender, stats_receiver) = mpsc::channel(1024); - // chose the wal backend - let (make_wal_manager, registry_shutdown) = self.configure_wal_manager(&mut join_set)?; - - let ns_config = NamespaceConfig { - db_kind, + let base_config = BaseNamespaceConfig { base_path: self.path.clone(), - max_log_size: self.db_config.max_log_size, - max_log_duration: self.db_config.max_log_duration.map(Duration::from_secs_f32), - bottomless_replication: self.db_config.bottomless_replication.clone(), extensions, - stats_sender: stats_sender.clone(), + stats_sender, max_response_size: self.db_config.max_response_size, max_total_response_size: self.db_config.max_total_response_size, - checkpoint_interval: self.db_config.checkpoint_interval, - encryption_config: self.db_config.encryption_config.clone(), max_concurrent_connections: Arc::new(Semaphore::new(self.max_concurrent_connections)), - scripted_backup, max_concurrent_requests: self.db_config.max_concurrent_requests, - channel: channel.clone(), - uri: uri.clone(), - migration_scheduler: scheduler_sender.into(), - make_wal_manager, + encryption_config: self.db_config.encryption_config.clone(), }; + let configurators = self + .make_configurators( + base_config, + client_config.clone(), + &mut join_set, + scheduler_sender.into(), + scripted_backup, + ) + .await?; + let (metastore_conn_maker, meta_store_wal_manager) = metastore_connection_maker(self.meta_store_config.bottomless.clone(), &self.path) .await?; @@ -488,36 +482,19 @@ where meta_store_wal_manager, ) .await?; + let namespace_store: NamespaceStore = NamespaceStore::new( db_kind.is_replica(), self.db_config.snapshot_at_shutdown, self.max_active_namespaces, - ns_config, meta_store, + configurators, + db_kind, ) .await?; - let meta_conn = metastore_conn_maker()?; - let scheduler = Scheduler::new(namespace_store.clone(), meta_conn).await?; - - join_set.spawn(async move { - scheduler.run(scheduler_receiver).await; - Ok(()) - }); - self.spawn_monitoring_tasks(&mut join_set, 
stats_receiver)?; - // eagerly load the default namespace when namespaces are disabled - if self.disable_namespaces && db_kind.is_primary() { - namespace_store - .create( - NamespaceName::default(), - namespace::RestoreOption::Latest, - Default::default(), - ) - .await?; - } - // if namespaces are enabled, then bottomless must have set DB ID if !self.disable_namespaces { if let Some(bottomless) = &self.db_config.bottomless_replication { @@ -532,7 +509,7 @@ where let proxy_service = ProxyService::new(namespace_store.clone(), None, self.disable_namespaces); // Garbage collect proxy clients every 30 seconds - join_set.spawn({ + self.spawn_until_shutdown_on(&mut join_set, { let clients = proxy_service.clients(); async move { loop { @@ -541,21 +518,44 @@ where } } }); - join_set.spawn(run_rpc_server( - proxy_service, - config.acceptor, - config.tls_config, - idle_shutdown_kicker.clone(), - namespace_store.clone(), - self.disable_namespaces, - )); + + self.spawn_until_shutdown_on( + &mut join_set, + run_rpc_server( + proxy_service, + config.acceptor, + config.tls_config, + idle_shutdown_kicker.clone(), + namespace_store.clone(), + self.disable_namespaces, + ), + ); } let shutdown_timeout = self.shutdown_timeout.clone(); let shutdown = self.shutdown.clone(); + let service_shutdown = Arc::new(Notify::new()); // setup user-facing rpc services match db_kind { DatabaseKind::Primary => { + // The migration scheduler is only useful on the primary + let meta_conn = metastore_conn_maker()?; + let scheduler = Scheduler::new(namespace_store.clone(), meta_conn).await?; + self.spawn_until_shutdown_on(&mut join_set, async move { + scheduler.run(scheduler_receiver).await; + Ok(()) + }); + + if self.disable_namespaces { + namespace_store + .create( + NamespaceName::default(), + namespace::RestoreOption::Latest, + Default::default(), + ) + .await?; + } + let replication_svc = ReplicationLogService::new( namespace_store.clone(), idle_shutdown_kicker.clone(), @@ -571,7 +571,7 @@ where ); 
// Garbage collect proxy clients every 30 seconds - join_set.spawn({ + self.spawn_until_shutdown_on(&mut join_set, { let clients = proxy_svc.clients(); async move { loop { @@ -592,11 +592,11 @@ where .configure(&mut join_set); } DatabaseKind::Replica => { - let replication_svc = - ReplicationLogProxyService::new(channel.clone().unwrap(), uri.clone().unwrap()); + let (channel, uri) = client_config.clone().unwrap(); + let replication_svc = ReplicationLogProxyService::new(channel.clone(), uri.clone()); let proxy_svc = ReplicaProxyService::new( - channel.clone().unwrap(), - uri.clone().unwrap(), + channel, + uri, namespace_store.clone(), user_auth_strategy.clone(), self.disable_namespaces, @@ -620,7 +620,6 @@ where join_set.shutdown().await; service_shutdown.notify_waiters(); namespace_store.shutdown().await?; - registry_shutdown.await?; Ok::<_, crate::Error>(()) }; @@ -649,21 +648,15 @@ where Ok(()) } - fn setup_shutdown(&self) -> Option { - let shutdown_notify = self.shutdown.clone(); - self.idle_shutdown_timeout.map(|d| { - IdleShutdownKicker::new(d, self.initial_idle_shutdown_timeout, shutdown_notify) - }) - } - - fn configure_wal_manager( + async fn make_configurators( &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, join_set: &mut JoinSet>, - ) -> anyhow::Result<( - Arc InnerWalManager + Sync + Send + 'static>, - Pin> + Send + Sync + 'static>>, - )> { - let wal_path = self.path.join("wals"); + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result { + let wal_path = base_config.base_path.join("wals"); let enable_libsql_wal_test = { let is_primary = self.rpc_server_config.is_some(); let is_libsql_wal_test = std::env::var("LIBSQL_WAL_TEST").is_ok(); @@ -686,6 +679,52 @@ where } } + match self.use_custom_wal { + Some(CustomWAL::LibsqlWal) => self.libsql_wal_configurators( + base_config, + client_config, + join_set, + migration_scheduler_handle, + scripted_backup, + wal_path, + ), + 
#[cfg(feature = "durable-wal")] + Some(CustomWAL::DurableWal) => self.durable_wal_configurators( + base_config, + client_config, + migration_scheduler_handle, + scripted_backup, + ), + None => { + self.legacy_configurators( + base_config, + client_config, + migration_scheduler_handle, + scripted_backup, + ) + .await + } + } + } + + fn libsql_wal_configurators( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + join_set: &mut JoinSet>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + wal_path: PathBuf, + ) -> anyhow::Result { + tracing::info!("using libsql wal"); + let (sender, receiver) = tokio::sync::mpsc::channel(64); + let registry = Arc::new(WalRegistry::new(wal_path, NoStorage, sender)?); + let checkpointer = LibsqlCheckpointer::new(registry.clone(), receiver, 8); + self.spawn_until_shutdown_on(join_set, async move { + checkpointer.run().await; + Ok(()) + }); + let namespace_resolver = |path: &Path| { NamespaceName::from_string( path.parent() @@ -699,49 +738,282 @@ where .unwrap() .into() }; + let wal = LibsqlWalManager::new(registry.clone(), Arc::new(namespace_resolver)); - match self.use_custom_wal { - Some(CustomWAL::LibsqlWal) => { - let (sender, receiver) = tokio::sync::mpsc::channel(64); - let registry = Arc::new(WalRegistry::new(wal_path, NoStorage, sender)?); - let checkpointer = LibsqlCheckpointer::new(registry.clone(), receiver, 8); - join_set.spawn(async move { - checkpointer.run().await; - Ok(()) - }); + self.spawn_until_shutdown_with_teardown(join_set, pending(), async move { + registry.shutdown().await?; + Ok(()) + }); - let wal = LibsqlWalManager::new(registry.clone(), Arc::new(namespace_resolver)); - let shutdown_notify = self.shutdown.clone(); - let shutdown_fut = Box::pin(async move { - shutdown_notify.notified().await; - registry.shutdown().await?; - Ok(()) - }); + let make_wal_manager = Arc::new(move || EitherWAL::B(wal.clone())); + // let mut configurators = 
NamespaceConfigurators::empty(); + + // match client_config { + // Some(_) => todo!("configure replica"), + // // configure primary + // None => self.configure_primary_common( + // base_config, + // &mut configurators, + // make_wal_manager, + // migration_scheduler_handle, + // scripted_backup, + // ), + // } + + self.configurators_common( + base_config, + client_config, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + ) + } - tracing::info!("using libsql wal"); - Ok((Arc::new(move || EitherWAL::B(wal.clone())), shutdown_fut)) - } - #[cfg(feature = "durable-wal")] - Some(CustomWAL::DurableWal) => { - tracing::info!("using durable wal"); - let lock_manager = Arc::new(std::sync::Mutex::new(LockManager::new())); - let wal = DurableWalManager::new( - lock_manager, - namespace_resolver, - self.storage_server_address.clone(), - ); - Ok(( - Arc::new(move || EitherWAL::C(wal.clone())), - Box::pin(ready(Ok(()))), - )) + #[cfg(feature = "durable-wal")] + fn durable_wal_configurators( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result { + tracing::info!("using durable wal"); + let lock_manager = Arc::new(std::sync::Mutex::new(LockManager::new())); + let namespace_resolver = |path: &Path| { + NamespaceName::from_string( + path.parent() + .unwrap() + .file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(), + ) + .unwrap() + .into() + }; + let wal = DurableWalManager::new( + lock_manager, + namespace_resolver, + self.storage_server_address.clone(), + ); + let make_wal_manager = Arc::new(move || EitherWAL::C(wal.clone())); + self.configurators_common( + base_config, + client_config, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + ) + } + + fn spawn_until_shutdown_on(&self, join_set: &mut JoinSet>, fut: F) + where + F: Future> + Send + 'static, + { + self.spawn_until_shutdown_with_teardown(join_set, 
fut, ready(Ok(()))) + } + + /// run the passed future until shutdown is called, then call the passed teardown future + fn spawn_until_shutdown_with_teardown( + &self, + join_set: &mut JoinSet>, + fut: F, + teardown: T, + ) where + F: Future> + Send + 'static, + T: Future> + Send + 'static, + { + let shutdown = self.shutdown.clone(); + join_set.spawn(async move { + tokio::select! { + _ = shutdown.notified() => { + teardown.await + }, + ret = fut => ret } - None => { - tracing::info!("using sqlite3 wal"); - Ok(( - Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - Box::pin(ready(Ok(()))), - )) + }); + } + + async fn legacy_configurators( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result { + let make_wal_manager = Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())); + self.configurators_common( + base_config, + client_config, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + ) + } + + fn configurators_common( + &self, + base_config: BaseNamespaceConfig, + client_config: Option<(Channel, Uri)>, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) -> anyhow::Result { + let mut configurators = NamespaceConfigurators::empty(); + match client_config { + // replica mode + Some((channel, uri)) => { + let replica_configurator = + ReplicaConfigurator::new(base_config, channel, uri, make_wal_manager); + configurators.with_replica(replica_configurator); } + // primary mode + None => self.configure_primary_common( + base_config, + &mut configurators, + make_wal_manager, + migration_scheduler_handle, + scripted_backup, + ), + } + + Ok(configurators) + } + + fn configure_primary_common( + &self, + base_config: BaseNamespaceConfig, + configurators: &mut NamespaceConfigurators, + make_wal_manager: Arc InnerWalManager + Sync + Send + 
'static>, + migration_scheduler_handle: SchedulerHandle, + scripted_backup: Option, + ) { + let primary_config = PrimaryExtraConfig { + max_log_size: self.db_config.max_log_size, + max_log_duration: self.db_config.max_log_duration.map(Duration::from_secs_f32), + bottomless_replication: self.db_config.bottomless_replication.clone(), + scripted_backup, + checkpoint_interval: self.db_config.checkpoint_interval, + }; + + let primary_configurator = PrimaryConfigurator::new( + base_config.clone(), + primary_config.clone(), + make_wal_manager.clone(), + ); + + let schema_configurator = SchemaConfigurator::new( + base_config.clone(), + primary_config, + make_wal_manager.clone(), + migration_scheduler_handle, + ); + + configurators.with_schema(schema_configurator); + configurators.with_primary(primary_configurator); + } + + fn setup_shutdown(&self) -> Option { + let shutdown_notify = self.shutdown.clone(); + self.idle_shutdown_timeout.map(|d| { + IdleShutdownKicker::new(d, self.initial_idle_shutdown_timeout, shutdown_notify) + }) + } + + // fn configure_wal_manager( + // &self, + // join_set: &mut JoinSet>, + // ) -> anyhow::Result<( + // Arc InnerWalManager + Sync + Send + 'static>, + // Pin> + Send + Sync + 'static>>, + // )> { + // let wal_path = self.path.join("wals"); + // let enable_libsql_wal_test = { + // let is_primary = self.rpc_server_config.is_some(); + // let is_libsql_wal_test = std::env::var("LIBSQL_WAL_TEST").is_ok(); + // is_primary && is_libsql_wal_test + // }; + // let use_libsql_wal = + // self.use_custom_wal == Some(CustomWAL::LibsqlWal) || enable_libsql_wal_test; + // if !use_libsql_wal { + // if wal_path.try_exists()? 
{ + // anyhow::bail!("database was previously setup to use libsql-wal"); + // } + // } + // + // if self.use_custom_wal.is_some() { + // if self.db_config.bottomless_replication.is_some() { + // anyhow::bail!("bottomless not supported with custom WAL"); + // } + // if self.rpc_client_config.is_some() { + // anyhow::bail!("custom WAL not supported in replica mode"); + // } + // } + // + // let namespace_resolver = |path: &Path| { + // NamespaceName::from_string( + // path.parent() + // .unwrap() + // .file_name() + // .unwrap() + // .to_str() + // .unwrap() + // .to_string(), + // ) + // .unwrap() + // .into() + // }; + // + // match self.use_custom_wal { + // Some(CustomWAL::LibsqlWal) => { + // let (sender, receiver) = tokio::sync::mpsc::channel(64); + // let registry = Arc::new(WalRegistry::new(wal_path, NoStorage, sender)?); + // let checkpointer = LibsqlCheckpointer::new(registry.clone(), receiver, 8); + // join_set.spawn(async move { + // checkpointer.run().await; + // Ok(()) + // }); + // + // let wal = LibsqlWalManager::new(registry.clone(), Arc::new(namespace_resolver)); + // let shutdown_notify = self.shutdown.clone(); + // let shutdown_fut = Box::pin(async move { + // shutdown_notify.notified().await; + // registry.shutdown().await?; + // Ok(()) + // }); + // + // tracing::info!("using libsql wal"); + // Ok((Arc::new(move || EitherWAL::B(wal.clone())), shutdown_fut)) + // } + // #[cfg(feature = "durable-wal")] + // Some(CustomWAL::DurableWal) => { + // tracing::info!("using durable wal"); + // let lock_manager = Arc::new(std::sync::Mutex::new(LockManager::new())); + // let wal = DurableWalManager::new( + // lock_manager, + // namespace_resolver, + // self.storage_server_address.clone(), + // ); + // Ok(( + // Arc::new(move || EitherWAL::C(wal.clone())), + // Box::pin(ready(Ok(()))), + // )) + // } + // None => { + // tracing::info!("using sqlite3 wal"); + // Ok(( + // Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), + // Box::pin(ready(Ok(()))), + 
// )) + // } + // } + // } + + async fn get_client_config(&self) -> anyhow::Result> { + match self.rpc_client_config { + Some(ref config) => Ok(Some(config.configure().await?)), + None => Ok(None), } } } diff --git a/libsql-server/src/metrics.rs b/libsql-server/src/metrics.rs index a71b5ca979..bb9c049fa7 100644 --- a/libsql-server/src/metrics.rs +++ b/libsql-server/src/metrics.rs @@ -153,3 +153,62 @@ pub static LISTEN_EVENTS_DROPPED: Lazy = Lazy::new(|| { describe_counter!(NAME, "Number of listen events dropped"); register_counter!(NAME) }); +pub static QUERY_CANCELED: Lazy = Lazy::new(|| { + const NAME: &str = "libsql_server_query_canceled"; + describe_counter!(NAME, "Number of canceled queries"); + register_counter!(NAME) +}); + +pub static TOKIO_RUNTIME_BLOCKING_QUEUE_DEPTH: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_blocking_queue_depth"; + describe_gauge!(NAME, "tokio runtime blocking_queue_depth"); + register_gauge!(NAME) +}); + +pub static TOKIO_RUNTIME_INJECTION_QUEUE_DEPTH: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_injection_queue_depth"; + describe_gauge!(NAME, "tokio runtime injection_queue_depth"); + register_gauge!(NAME) +}); + +pub static TOKIO_RUNTIME_NUM_BLOCKING_THREADS: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_num_blocking_threads"; + describe_gauge!(NAME, "tokio runtime num_blocking_threads"); + register_gauge!(NAME) +}); + +pub static TOKIO_RUNTIME_NUM_IDLE_BLOCKING_THREADS: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_num_idle_blocking_threads"; + describe_gauge!(NAME, "tokio runtime num_idle_blocking_threads"); + register_gauge!(NAME) +}); + +pub static TOKIO_RUNTIME_NUM_WORKERS: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_num_workers"; + describe_gauge!(NAME, "tokio runtime num_workers"); + register_gauge!(NAME) +}); + +pub static TOKIO_RUNTIME_IO_DRIVER_FD_DEREGISTERED_COUNT: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_io_driver_fd_deregistered_count"; + 
describe_counter!(NAME, "tokio runtime io_driver_fd_deregistered_count"); + register_counter!(NAME) +}); + +pub static TOKIO_RUNTIME_IO_DRIVER_FD_REGISTERED_COUNT: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_io_driver_fd_registered_count"; + describe_counter!(NAME, "tokio runtime io_driver_fd_registered_count"); + register_counter!(NAME) +}); + +pub static TOKIO_RUNTIME_IO_DRIVER_READY_COUNT: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_io_driver_ready_count"; + describe_counter!(NAME, "tokio runtime io_driver_ready_count"); + register_counter!(NAME) +}); + +pub static TOKIO_RUNTIME_REMOTE_SCHEDULE_COUNT: Lazy = Lazy::new(|| { + const NAME: &str = "tokio_runtime_remote_schedule_count"; + describe_gauge!(NAME, "tokio runtime remote_schedule_count"); + register_counter!(NAME) +}); diff --git a/libsql-server/src/namespace/fork.rs b/libsql-server/src/namespace/configurator/fork.rs similarity index 75% rename from libsql-server/src/namespace/fork.rs rename to libsql-server/src/namespace/configurator/fork.rs index dfa053b43d..03f2ac03d8 100644 --- a/libsql-server/src/namespace/fork.rs +++ b/libsql-server/src/namespace/configurator/fork.rs @@ -12,17 +12,71 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio::time::Duration; use tokio_stream::StreamExt; -use crate::namespace::ResolveNamespacePathFn; +use crate::database::Database; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{Namespace, NamespaceBottomlessDbId}; use crate::replication::primary::frame_stream::FrameStream; use crate::replication::{LogReadError, ReplicationLogger}; use crate::{BLOCKING_RT, LIBSQL_PAGE_SIZE}; -use super::broadcasters::BroadcasterHandle; -use super::meta_store::MetaStoreHandle; -use super::{Namespace, NamespaceConfig, NamespaceName, NamespaceStore, RestoreOption}; +use super::helpers::make_bottomless_options; +use super::{NamespaceName, NamespaceStore, PrimaryExtraConfig, RestoreOption}; type Result = crate::Result; +pub(super) 
async fn fork( + from_ns: &Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + primary_config: &PrimaryExtraConfig, + base_path: Arc, +) -> crate::Result { + let from_config = from_config.get(); + let bottomless_db_id = NamespaceBottomlessDbId::from_config(&from_config); + let restore_to = if let Some(timestamp) = timestamp { + if let Some(ref options) = primary_config.bottomless_replication { + Some(PointInTimeRestore { + timestamp, + replicator_options: make_bottomless_options( + options, + bottomless_db_id.clone(), + from_ns.name().clone(), + ), + }) + } else { + return Err(crate::Error::Fork(ForkError::BackupServiceNotConfigured)); + } + } else { + None + }; + + let logger = match &from_ns.db { + Database::Primary(db) => db.wal_wrapper.wrapper().logger(), + Database::Schema(db) => db.wal_wrapper.wrapper().logger(), + _ => { + return Err(crate::Error::Fork(ForkError::Internal(anyhow::Error::msg( + "Invalid source database type for fork", + )))); + } + }; + + let fork_task = ForkTask { + base_path, + to_namespace: to_ns.clone(), + logger, + restore_to, + to_config, + store, + }; + + let ns = fork_task.fork().await?; + + Ok(ns) +} + #[derive(Debug, thiserror::Error)] pub enum ForkError { #[error("internal error: {0}")] @@ -54,16 +108,13 @@ async fn write_frame(frame: &FrameBorrowed, temp_file: &mut tokio::fs::File) -> Ok(()) } -pub struct ForkTask<'a> { +pub struct ForkTask { pub base_path: Arc, pub logger: Arc, pub to_namespace: NamespaceName, pub to_config: MetaStoreHandle, pub restore_to: Option, - pub ns_config: &'a NamespaceConfig, - pub resolve_attach: ResolveNamespacePathFn, pub store: NamespaceStore, - pub broadcaster: BroadcasterHandle, } pub struct PointInTimeRestore { @@ -71,7 +122,7 @@ pub struct PointInTimeRestore { pub replicator_options: bottomless::replicator::Options, } -impl<'a> ForkTask<'a> { +impl ForkTask { pub async fn fork(self) -> Result { let 
base_path = self.base_path.clone(); let dest_namespace = self.to_namespace.clone(); @@ -105,18 +156,10 @@ impl<'a> ForkTask<'a> { let dest_path = self.base_path.join("dbs").join(self.to_namespace.as_str()); tokio::fs::rename(temp_dir.path(), dest_path).await?; - Namespace::from_config( - self.ns_config, - self.to_config.clone(), - RestoreOption::Latest, - &self.to_namespace, - Box::new(|_op| {}), - self.resolve_attach.clone(), - self.store.clone(), - self.broadcaster, - ) - .await - .map_err(|e| ForkError::CreateNamespace(Box::new(e))) + self.store + .make_namespace(&self.to_namespace, self.to_config, RestoreOption::Latest) + .await + .map_err(|e| ForkError::CreateNamespace(Box::new(e))) } /// Restores the database state from a local log file. diff --git a/libsql-server/src/namespace/configurator/helpers.rs b/libsql-server/src/namespace/configurator/helpers.rs new file mode 100644 index 0000000000..355b1b1472 --- /dev/null +++ b/libsql-server/src/namespace/configurator/helpers.rs @@ -0,0 +1,460 @@ +use std::path::{Path, PathBuf}; +use std::sync::Weak; +use std::sync::{atomic::AtomicBool, Arc}; +use std::time::Duration; + +use anyhow::Context as _; +use bottomless::replicator::Options; +use bytes::Bytes; +use enclose::enclose; +use futures::Stream; +use libsql_sys::wal::Sqlite3WalManager; +use libsql_sys::EncryptionConfig; +use tokio::io::AsyncBufReadExt as _; +use tokio::sync::watch; +use tokio::task::JoinSet; +use tokio_util::io::StreamReader; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::libsql::{open_conn, MakeLibSqlConn}; +use crate::connection::{Connection as _, MakeConnection as _}; +use crate::database::{PrimaryConnection, PrimaryConnectionMaker}; +use crate::error::LoadDumpError; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::replication_wal::{make_replication_wal_wrapper, 
ReplicationWalWrapper}; +use crate::namespace::{ + NamespaceBottomlessDbId, NamespaceBottomlessDbIdInit, NamespaceName, ResolveNamespacePathFn, + RestoreOption, +}; +use crate::replication::{FrameNo, ReplicationLogger}; +use crate::stats::Stats; +use crate::{StatsSender, BLOCKING_RT, DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT}; + +use super::{BaseNamespaceConfig, PrimaryExtraConfig}; + +const WASM_TABLE_CREATE: &str = + "CREATE TABLE libsql_wasm_func_table (name text PRIMARY KEY, body text) WITHOUT ROWID;"; + +#[tracing::instrument(skip_all)] +pub(super) async fn make_primary_connection_maker( + primary_config: &PrimaryExtraConfig, + base_config: &BaseNamespaceConfig, + meta_store_handle: &MetaStoreHandle, + db_path: &Path, + name: &NamespaceName, + restore_option: RestoreOption, + block_writes: Arc, + join_set: &mut JoinSet>, + resolve_attach_path: ResolveNamespacePathFn, + broadcaster: BroadcasterHandle, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + encryption_config: Option, +) -> crate::Result<(PrimaryConnectionMaker, ReplicationWalWrapper, Arc)> { + let db_config = meta_store_handle.get(); + let bottomless_db_id = NamespaceBottomlessDbId::from_config(&db_config); + // FIXME: figure how to to it per-db + let mut is_dirty = { + let sentinel_path = db_path.join(".sentinel"); + if sentinel_path.try_exists()? { + true + } else { + tokio::fs::File::create(&sentinel_path).await?; + false + } + }; + + // FIXME: due to a bug in logger::checkpoint_db we call regular checkpointing code + // instead of our virtual WAL one. It's a bit tangled to fix right now, because + // we need WAL context for checkpointing, and WAL context needs the ReplicationLogger... + // So instead we checkpoint early, *before* bottomless gets initialized. That way + // we're sure bottomless won't try to back up any existing WAL frames and will instead + // treat the existing db file as the source of truth. 
+ + let bottomless_replicator = match primary_config.bottomless_replication { + Some(ref options) => { + tracing::debug!("Checkpointing before initializing bottomless"); + crate::replication::primary::logger::checkpoint_db(&db_path.join("data"))?; + tracing::debug!("Checkpointed before initializing bottomless"); + let options = make_bottomless_options(options, bottomless_db_id, name.clone()); + let (replicator, did_recover) = + init_bottomless_replicator(db_path.join("data"), options, &restore_option).await?; + tracing::debug!("Completed init of bottomless replicator"); + is_dirty |= did_recover; + Some(replicator) + } + None => None, + }; + + tracing::debug!("Checking fresh db"); + let is_fresh_db = check_fresh_db(&db_path)?; + // switch frame-count checkpoint to time-based one + let auto_checkpoint = if primary_config.checkpoint_interval.is_some() { + 0 + } else { + DEFAULT_AUTO_CHECKPOINT + }; + + let logger = Arc::new(ReplicationLogger::open( + &db_path, + primary_config.max_log_size, + primary_config.max_log_duration, + is_dirty, + auto_checkpoint, + primary_config.scripted_backup.clone(), + name.clone(), + encryption_config.clone(), + )?); + + tracing::debug!("sending stats"); + + let stats = make_stats( + &db_path, + join_set, + meta_store_handle.clone(), + base_config.stats_sender.clone(), + name.clone(), + logger.new_frame_notifier.subscribe(), + base_config.encryption_config.clone(), + ) + .await?; + + tracing::debug!("Making replication wal wrapper"); + let wal_wrapper = make_replication_wal_wrapper(bottomless_replicator, logger.clone()); + + tracing::debug!("Opening libsql connection"); + + let connection_maker = MakeLibSqlConn::new( + db_path.to_path_buf(), + wal_wrapper.clone(), + stats.clone(), + broadcaster, + meta_store_handle.clone(), + base_config.extensions.clone(), + base_config.max_response_size, + base_config.max_total_response_size, + auto_checkpoint, + logger.new_frame_notifier.subscribe(), + encryption_config, + block_writes, + 
resolve_attach_path, + make_wal_manager.clone(), + ) + .await? + .throttled( + base_config.max_concurrent_connections.clone(), + Some(DB_CREATE_TIMEOUT), + base_config.max_total_response_size, + base_config.max_concurrent_requests, + ); + + tracing::debug!("Completed opening libsql connection"); + + // this must happen after we create the connection maker. The connection maker old on a + // connection to ensure that no other connection is closing while we try to open the dump. + // that would cause a SQLITE_LOCKED error. + match restore_option { + RestoreOption::Dump(_) if !is_fresh_db => { + Err(LoadDumpError::LoadDumpExistingDb)?; + } + RestoreOption::Dump(dump) => { + let conn = connection_maker.create().await?; + tracing::debug!("Loading dump"); + load_dump(dump, conn).await?; + tracing::debug!("Done loading dump"); + } + _ => { /* other cases were already handled when creating bottomless */ } + } + + join_set.spawn(run_periodic_compactions(logger.clone())); + + tracing::debug!("Done making primary connection"); + + Ok((connection_maker, wal_wrapper, stats)) +} + +pub(super) fn make_bottomless_options( + options: &Options, + namespace_db_id: NamespaceBottomlessDbId, + name: NamespaceName, +) -> Options { + let mut options = options.clone(); + let mut db_id = match namespace_db_id { + NamespaceBottomlessDbId::Namespace(id) => id, + // FIXME(marin): I don't like that, if bottomless is enabled, proper config must be passed. + NamespaceBottomlessDbId::NotProvided => options.db_id.unwrap_or_default(), + }; + + db_id = format!("ns-{db_id}:{name}"); + options.db_id = Some(db_id); + options +} + +async fn init_bottomless_replicator( + path: impl AsRef, + options: bottomless::replicator::Options, + restore_option: &RestoreOption, +) -> anyhow::Result<(bottomless::replicator::Replicator, bool)> { + tracing::debug!("Initializing bottomless replication"); + let path = path + .as_ref() + .to_str() + .ok_or_else(|| anyhow::anyhow!("Invalid db path"))? 
+ .to_owned(); + let mut replicator = bottomless::replicator::Replicator::with_options(path, options).await?; + + let (generation, timestamp) = match restore_option { + RestoreOption::Latest | RestoreOption::Dump(_) => (None, None), + RestoreOption::Generation(generation) => (Some(*generation), None), + RestoreOption::PointInTime(timestamp) => (None, Some(*timestamp)), + }; + + let (action, did_recover) = replicator.restore(generation, timestamp).await?; + match action { + bottomless::replicator::RestoreAction::SnapshotMainDbFile => { + replicator.new_generation().await; + if let Some(_handle) = replicator.snapshot_main_db_file(true).await? { + tracing::trace!("got snapshot handle after restore with generation upgrade"); + } + // Restoration process only leaves the local WAL file if it was + // detected to be newer than its remote counterpart. + replicator.maybe_replicate_wal().await? + } + bottomless::replicator::RestoreAction::ReuseGeneration(gen) => { + replicator.set_generation(gen); + } + } + + Ok((replicator, did_recover)) +} + +async fn run_periodic_compactions(logger: Arc) -> anyhow::Result<()> { + // calling `ReplicationLogger::maybe_compact()` is cheap if the compaction does not actually + // take place, so we can afford to poll it very often for simplicity + let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(1000)); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + + loop { + interval.tick().await; + let handle = BLOCKING_RT.spawn_blocking(enclose! 
{(logger) move || { + logger.maybe_compact() + }}); + handle + .await + .expect("Compaction task crashed") + .context("Compaction failed")?; + } +} + +async fn load_dump(dump: S, conn: PrimaryConnection) -> crate::Result<(), LoadDumpError> +where + S: Stream> + Unpin, +{ + let mut reader = tokio::io::BufReader::new(StreamReader::new(dump)); + let mut curr = String::new(); + let mut line = String::new(); + let mut skipped_wasm_table = false; + let mut n_stmt = 0; + let mut line_id = 0; + + while let Ok(n) = reader.read_line(&mut curr).await { + line_id += 1; + if n == 0 { + break; + } + let trimmed = curr.trim(); + if trimmed.is_empty() || trimmed.starts_with("--") { + curr.clear(); + continue; + } + // FIXME: it's well known bug that comment ending with semicolon will be handled incorrectly by currend dump processing code + let statement_end = trimmed.ends_with(';'); + + // we want to concat original(non-trimmed) lines as trimming will join all them in one + // single-line statement which is incorrect if comments in the end are present + line.push_str(&curr); + curr.clear(); + + // This is a hack to ignore the libsql_wasm_func_table table because it is already created + // by the system. 
+ if !skipped_wasm_table && line.trim() == WASM_TABLE_CREATE { + skipped_wasm_table = true; + line.clear(); + continue; + } + + if statement_end { + n_stmt += 1; + // dump must be performd within a txn + if n_stmt > 2 && conn.is_autocommit().await.unwrap() { + return Err(LoadDumpError::NoTxn); + } + + line = tokio::task::spawn_blocking({ + let conn = conn.clone(); + move || -> crate::Result { + conn.with_raw(|conn| conn.execute(&line, ())).map_err(|e| { + LoadDumpError::Internal(format!("line: {}, error: {}", line_id, e)) + })?; + Ok(line) + } + }) + .await??; + line.clear(); + } else { + line.push(' '); + } + } + tracing::debug!("loaded {} lines from dump", line_id); + + if !conn.is_autocommit().await.unwrap() { + tokio::task::spawn_blocking({ + let conn = conn.clone(); + move || -> crate::Result<(), LoadDumpError> { + conn.with_raw(|conn| conn.execute("rollback", ()))?; + Ok(()) + } + }) + .await??; + return Err(LoadDumpError::NoCommit); + } + + Ok(()) +} + +fn check_fresh_db(path: &Path) -> crate::Result { + let is_fresh = !path.join("wallog").try_exists()?; + Ok(is_fresh) +} + +pub(super) async fn make_stats( + db_path: &Path, + join_set: &mut JoinSet>, + meta_store_handle: MetaStoreHandle, + stats_sender: StatsSender, + name: NamespaceName, + mut current_frame_no: watch::Receiver>, + encryption_config: Option, +) -> anyhow::Result> { + tracing::debug!("creating stats type"); + let stats = Stats::new(name.clone(), db_path, join_set).await?; + + // the storage monitor is optional, so we ignore the error here. 
+ tracing::debug!("stats created, sending stats"); + let _ = stats_sender + .send((name.clone(), meta_store_handle, Arc::downgrade(&stats))) + .await; + + join_set.spawn({ + let stats = stats.clone(); + // initialize the current_frame_no value + current_frame_no + .borrow_and_update() + .map(|fno| stats.set_current_frame_no(fno)); + async move { + while current_frame_no.changed().await.is_ok() { + current_frame_no + .borrow_and_update() + .map(|fno| stats.set_current_frame_no(fno)); + } + Ok(()) + } + }); + + join_set.spawn(run_storage_monitor( + db_path.into(), + Arc::downgrade(&stats), + encryption_config, + )); + + tracing::debug!("done sending stats, and creating bg tasks"); + + Ok(stats) +} + +// Periodically check the storage used by the database and save it in the Stats structure. +// TODO: Once we have a separate fiber that does WAL checkpoints, running this routine +// right after checkpointing is exactly where it should be done. +async fn run_storage_monitor( + db_path: PathBuf, + stats: Weak, + encryption_config: Option, +) -> anyhow::Result<()> { + // on initialization, the database file doesn't exist yet, so we wait a bit for it to be + // created + tokio::time::sleep(Duration::from_secs(1)).await; + + let duration = tokio::time::Duration::from_secs(60); + let db_path: Arc = db_path.into(); + loop { + let db_path = db_path.clone(); + let Some(stats) = stats.upgrade() else { + return Ok(()); + }; + + let encryption_config = encryption_config.clone(); + let _ = tokio::task::spawn_blocking(move || { + // because closing the last connection interferes with opening a new one, we lazily + // initialize a connection here, and keep it alive for the entirety of the program. If we + // fail to open it, we wait for `duration` and try again later. 
+ match open_conn(&db_path, Sqlite3WalManager::new(), Some(rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY), encryption_config) { + Ok(mut conn) => { + if let Ok(tx) = conn.transaction() { + let page_count = tx.query_row("pragma page_count;", [], |row| { row.get::(0) }); + let freelist_count = tx.query_row("pragma freelist_count;", [], |row| { row.get::(0) }); + if let (Ok(page_count), Ok(freelist_count)) = (page_count, freelist_count) { + let storage_bytes_used = (page_count - freelist_count) * 4096; + stats.set_storage_bytes_used(storage_bytes_used); + } + } + }, + Err(e) => { + tracing::warn!("failed to open connection for storager monitor: {e}, trying again in {duration:?}"); + }, + } + }).await; + + tokio::time::sleep(duration).await; + } +} + +pub(super) async fn cleanup_primary( + base: &BaseNamespaceConfig, + primary_config: &PrimaryExtraConfig, + namespace: &NamespaceName, + db_config: &DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, +) -> crate::Result<()> { + let ns_path = base.base_path.join("dbs").join(namespace.as_str()); + if let Some(ref options) = primary_config.bottomless_replication { + let bottomless_db_id = match bottomless_db_id_init { + NamespaceBottomlessDbIdInit::Provided(db_id) => db_id, + NamespaceBottomlessDbIdInit::FetchFromConfig => { + NamespaceBottomlessDbId::from_config(db_config) + } + }; + let options = make_bottomless_options(options, bottomless_db_id, namespace.clone()); + let replicator = bottomless::replicator::Replicator::with_options( + ns_path.join("data").to_str().unwrap(), + options, + ) + .await?; + if prune_all { + let delete_all = replicator.delete_all(None).await?; + // perform hard deletion in the background + tokio::spawn(delete_all.commit()); + } else { + // for soft delete make sure that local db is fully backed up + replicator.savepoint().confirmed().await?; + } + } + + if ns_path.try_exists()? 
{ + tracing::debug!("removing database directory: {}", ns_path.display()); + tokio::fs::remove_dir_all(ns_path).await?; + } + + Ok(()) +} diff --git a/libsql-server/src/namespace/configurator/libsql_wal_replica.rs b/libsql-server/src/namespace/configurator/libsql_wal_replica.rs new file mode 100644 index 0000000000..6ab6cc52ef --- /dev/null +++ b/libsql-server/src/namespace/configurator/libsql_wal_replica.rs @@ -0,0 +1,139 @@ +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + +use chrono::prelude::NaiveDateTime; +use hyper::Uri; +use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; +use libsql_wal::io::StdIO; +use libsql_wal::registry::WalRegistry; +use libsql_wal::storage::NoStorage; +use tokio::task::JoinSet; +use tonic::transport::Channel; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::write_proxy::MakeWriteProxyConn; +use crate::connection::MakeConnection; +use crate::database::{Database, ReplicaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::configurator::helpers::make_stats; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, + ResolveNamespacePathFn, RestoreOption, +}; +use crate::DEFAULT_AUTO_CHECKPOINT; + +use super::{BaseNamespaceConfig, ConfigureNamespace}; + +pub struct LibsqlWalReplicaConfigurator { + base: BaseNamespaceConfig, + registry: Arc>, + uri: Uri, + channel: Channel, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +} + +impl ConfigureNamespace for LibsqlWalReplicaConfigurator { + fn setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> 
{ + todo!() + // Box::pin(async move { + // tracing::debug!("creating replica namespace"); + // let db_path = self.base.base_path.join("dbs").join(name.as_str()); + // let channel = self.channel.clone(); + // let uri = self.uri.clone(); + // + // let rpc_client = ReplicationLogClient::with_origin(channel.clone(), uri.clone()); + // // TODO! setup replication + // + // let mut join_set = JoinSet::new(); + // let namespace = name.clone(); + // + // let stats = make_stats( + // &db_path, + // &mut join_set, + // db_config.clone(), + // self.base.stats_sender.clone(), + // name.clone(), + // applied_frame_no_receiver.clone(), + // ) + // .await?; + // + // let connection_maker = MakeWriteProxyConn::new( + // db_path.clone(), + // self.base.extensions.clone(), + // channel.clone(), + // uri.clone(), + // stats.clone(), + // broadcaster, + // db_config.clone(), + // applied_frame_no_receiver, + // self.base.max_response_size, + // self.base.max_total_response_size, + // primary_current_replication_index, + // None, + // resolve_attach_path, + // self.make_wal_manager.clone(), + // ) + // .await? + // .throttled( + // self.base.max_concurrent_connections.clone(), + // Some(DB_CREATE_TIMEOUT), + // self.base.max_total_response_size, + // self.base.max_concurrent_requests, + // ); + // + // Ok(Namespace { + // tasks: join_set, + // db: Database::Replica(ReplicaDatabase { + // connection_maker: Arc::new(connection_maker), + // }), + // name: name.clone(), + // stats, + // db_config_store: db_config, + // path: db_path.into(), + // }) + // }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + _db_config: &DatabaseConfig, + _prune_all: bool, + _bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let ns_path = self.base.base_path.join("dbs").join(namespace.as_str()); + if ns_path.try_exists()? 
{ + tracing::debug!("removing database directory: {}", ns_path.display()); + tokio::fs::remove_dir_all(ns_path).await?; + } + Ok(()) + }) + } + + fn fork<'a>( + &'a self, + _from_ns: &'a Namespace, + _from_config: MetaStoreHandle, + _to_ns: NamespaceName, + _to_config: MetaStoreHandle, + _timestamp: Option, + _store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + Box::pin(std::future::ready(Err(crate::Error::Fork( + super::fork::ForkError::ForkReplica, + )))) + } +} diff --git a/libsql-server/src/namespace/configurator/mod.rs b/libsql-server/src/namespace/configurator/mod.rs new file mode 100644 index 0000000000..b96d5a3824 --- /dev/null +++ b/libsql-server/src/namespace/configurator/mod.rs @@ -0,0 +1,140 @@ +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use chrono::NaiveDateTime; +use futures::Future; +use libsql_sys::EncryptionConfig; +use tokio::sync::Semaphore; + +use crate::connection::config::DatabaseConfig; +use crate::replication::script_backup_manager::ScriptBackupManager; +use crate::StatsSender; + +use super::broadcasters::BroadcasterHandle; +use super::meta_store::MetaStoreHandle; +use super::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, + ResolveNamespacePathFn, RestoreOption, +}; + +pub mod fork; +mod helpers; +// mod libsql_wal_replica; +mod primary; +mod replica; +mod schema; + +pub use primary::PrimaryConfigurator; +pub use replica::ReplicaConfigurator; +pub use schema::SchemaConfigurator; + +#[derive(Clone, Debug)] +pub struct BaseNamespaceConfig { + pub(crate) base_path: Arc, + pub(crate) extensions: Arc<[PathBuf]>, + pub(crate) stats_sender: StatsSender, + pub(crate) max_response_size: u64, + pub(crate) max_total_response_size: u64, + pub(crate) max_concurrent_connections: Arc, + pub(crate) max_concurrent_requests: u64, + pub(crate) encryption_config: Option, +} + +#[derive(Clone)] +pub struct PrimaryExtraConfig { + pub(crate) max_log_size: u64, + 
pub(crate) max_log_duration: Option, + pub(crate) bottomless_replication: Option, + pub(crate) scripted_backup: Option, + pub(crate) checkpoint_interval: Option, +} + +pub type DynConfigurator = dyn ConfigureNamespace + Send + Sync + 'static; + +pub(crate) struct NamespaceConfigurators { + replica_configurator: Option>, + primary_configurator: Option>, + schema_configurator: Option>, +} + +impl Default for NamespaceConfigurators { + fn default() -> Self { + Self::empty() + } +} + +impl NamespaceConfigurators { + pub fn empty() -> Self { + Self { + replica_configurator: None, + primary_configurator: None, + schema_configurator: None, + } + } + + pub fn with_primary( + &mut self, + c: impl ConfigureNamespace + Send + Sync + 'static, + ) -> &mut Self { + self.primary_configurator = Some(Box::new(c)); + self + } + + pub fn with_replica( + &mut self, + c: impl ConfigureNamespace + Send + Sync + 'static, + ) -> &mut Self { + self.replica_configurator = Some(Box::new(c)); + self + } + + pub fn with_schema(&mut self, c: impl ConfigureNamespace + Send + Sync + 'static) -> &mut Self { + self.schema_configurator = Some(Box::new(c)); + self + } + + pub fn configure_schema(&self) -> crate::Result<&DynConfigurator> { + self.schema_configurator.as_deref().ok_or_else(|| todo!()) + } + + pub fn configure_primary(&self) -> crate::Result<&DynConfigurator> { + self.primary_configurator.as_deref().ok_or_else(|| todo!()) + } + + pub fn configure_replica(&self) -> crate::Result<&DynConfigurator> { + self.replica_configurator.as_deref().ok_or_else(|| todo!()) + } +} + +pub trait ConfigureNamespace { + fn setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>>; + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + 
bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>>; + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + ) -> Pin> + Send + 'a>>; +} diff --git a/libsql-server/src/namespace/configurator/primary.rs b/libsql-server/src/namespace/configurator/primary.rs new file mode 100644 index 0000000000..03cdd2fd7b --- /dev/null +++ b/libsql-server/src/namespace/configurator/primary.rs @@ -0,0 +1,202 @@ +use std::path::Path; +use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use futures::prelude::Future; +use libsql_sys::EncryptionConfig; +use tokio::task::JoinSet; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::MakeConnection; +use crate::database::{Database, PrimaryDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::configurator::helpers::make_primary_connection_maker; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceBottomlessDbIdInit, NamespaceName, NamespaceStore, ResetCb, + ResolveNamespacePathFn, RestoreOption, +}; +use crate::run_periodic_checkpoint; +use crate::schema::{has_pending_migration_task, setup_migration_table}; + +use super::helpers::cleanup_primary; +use super::{BaseNamespaceConfig, ConfigureNamespace, PrimaryExtraConfig}; + +pub struct PrimaryConfigurator { + base: BaseNamespaceConfig, + primary_config: PrimaryExtraConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +} + +impl PrimaryConfigurator { + pub fn new( + base: BaseNamespaceConfig, + primary_config: PrimaryExtraConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + ) -> Self { + Self { + base, + primary_config, + make_wal_manager, + } + } + + #[tracing::instrument(skip_all, 
fields(namespace))] + async fn try_new_primary( + &self, + namespace: NamespaceName, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + resolve_attach_path: ResolveNamespacePathFn, + db_path: Arc, + broadcaster: BroadcasterHandle, + encryption_config: Option, + ) -> crate::Result { + let mut join_set = JoinSet::new(); + + tokio::fs::create_dir_all(&db_path).await?; + + let block_writes = Arc::new(AtomicBool::new(false)); + let (connection_maker, wal_wrapper, stats) = make_primary_connection_maker( + &self.primary_config, + &self.base, + &meta_store_handle, + &db_path, + &namespace, + restore_option, + block_writes.clone(), + &mut join_set, + resolve_attach_path, + broadcaster, + self.make_wal_manager.clone(), + encryption_config, + ) + .await?; + let connection_maker = Arc::new(connection_maker); + + if meta_store_handle.get().shared_schema_name.is_some() { + let block_writes = block_writes.clone(); + let conn = connection_maker.create().await?; + tokio::task::spawn_blocking(move || { + conn.with_raw(|conn| -> crate::Result<()> { + setup_migration_table(conn)?; + if has_pending_migration_task(conn)? 
{ + block_writes.store(true, Ordering::SeqCst); + } + Ok(()) + }) + }) + .await + .unwrap()?; + } + + if let Some(checkpoint_interval) = self.primary_config.checkpoint_interval { + join_set.spawn(run_periodic_checkpoint( + connection_maker.clone(), + checkpoint_interval, + namespace.clone(), + )); + } + + tracing::debug!("Done making new primary"); + + Ok(Namespace { + tasks: join_set, + db: Database::Primary(PrimaryDatabase { + wal_wrapper, + connection_maker, + block_writes, + }), + name: namespace, + stats, + db_config_store: meta_store_handle, + path: db_path.into(), + }) + } +} + +impl ConfigureNamespace for PrimaryConfigurator { + fn setup<'a>( + &'a self, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + _reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + _store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let db_path: Arc = self.base.base_path.join("dbs").join(name.as_str()).into(); + let fresh_namespace = !db_path.try_exists()?; + // FIXME: make that truly atomic. explore the idea of using temp directories, and it's implications + match self + .try_new_primary( + name.clone(), + meta_store_handle, + restore_option, + resolve_attach_path, + db_path.clone(), + broadcaster, + self.base.encryption_config.clone(), + ) + .await + { + Ok(this) => Ok(this), + Err(e) if fresh_namespace => { + tracing::error!( + "an error occured while deleting creating namespace, cleaning..." 
+ ); + if let Err(e) = tokio::fs::remove_dir_all(&db_path).await { + tracing::error!("failed to remove dirty namespace directory: {e}") + } + Err(e) + } + Err(e) => Err(e), + } + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + cleanup_primary( + &self.base, + &self.primary_config, + namespace, + db_config, + prune_all, + bottomless_db_id_init, + ) + .await + }) + } + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + Box::pin(super::fork::fork( + from_ns, + from_config, + to_ns, + to_config, + timestamp, + store, + &self.primary_config, + self.base.base_path.clone(), + )) + } +} diff --git a/libsql-server/src/namespace/configurator/replica.rs b/libsql-server/src/namespace/configurator/replica.rs new file mode 100644 index 0000000000..7832d30ef8 --- /dev/null +++ b/libsql-server/src/namespace/configurator/replica.rs @@ -0,0 +1,245 @@ +use std::pin::Pin; +use std::sync::Arc; + +use futures::Future; +use hyper::Uri; +use libsql_replication::rpc::replication::log_offset::WalFlavor; +use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; +use tokio::task::JoinSet; +use tonic::transport::Channel; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::connection::write_proxy::MakeWriteProxyConn; +use crate::connection::MakeConnection; +use crate::database::{Database, ReplicaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::configurator::helpers::make_stats; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{Namespace, NamespaceBottomlessDbIdInit, RestoreOption}; +use 
crate::namespace::{NamespaceName, NamespaceStore, ResetCb, ResetOp, ResolveNamespacePathFn}; +use crate::{DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT}; + +use super::{BaseNamespaceConfig, ConfigureNamespace}; + +pub struct ReplicaConfigurator { + base: BaseNamespaceConfig, + channel: Channel, + uri: Uri, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, +} + +impl ReplicaConfigurator { + pub fn new( + base: BaseNamespaceConfig, + channel: Channel, + uri: Uri, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + ) -> Self { + Self { + base, + channel, + uri, + make_wal_manager, + } + } +} + +impl ConfigureNamespace for ReplicaConfigurator { + fn setup<'a>( + &'a self, + meta_store_handle: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + tracing::debug!("creating replica namespace"); + let db_path = self.base.base_path.join("dbs").join(name.as_str()); + let channel = self.channel.clone(); + let uri = self.uri.clone(); + + let rpc_client = ReplicationLogClient::with_origin(channel.clone(), uri.clone()); + let client = crate::replication::replicator_client::Client::new( + name.clone(), + rpc_client, + &db_path, + meta_store_handle.clone(), + store.clone(), + WalFlavor::Sqlite, + ) + .await?; + let applied_frame_no_receiver = client.current_frame_no_notifier.subscribe(); + let mut replicator = libsql_replication::replicator::Replicator::new_sqlite( + client, + db_path.join("data"), + DEFAULT_AUTO_CHECKPOINT, + None, + ) + .await?; + + tracing::debug!("try perform handshake"); + // force a handshake now, to retrieve the primary's current replication index + match replicator.try_perform_handshake().await { + Err(libsql_replication::replicator::Error::Meta( + libsql_replication::meta::Error::LogIncompatible, + )) => { + tracing::error!( + 
"trying to replicate incompatible logs, reseting replica and nuking db dir" + ); + std::fs::remove_dir_all(&db_path).unwrap(); + return self + .setup( + meta_store_handle, + restore_option, + name, + reset, + resolve_attach_path, + store, + broadcaster, + ) + .await; + } + Err(e) => Err(e)?, + Ok(_) => (), + } + + tracing::debug!("done performing handshake"); + + let primary_current_replicatio_index = + replicator.client_mut().primary_replication_index; + + let mut join_set = JoinSet::new(); + let namespace = name.clone(); + join_set.spawn(async move { + use libsql_replication::replicator::Error; + loop { + match replicator.run().await { + err @ Error::Fatal(_) => Err(err)?, + err @ Error::NamespaceDoesntExist => { + tracing::error!("namespace {namespace} doesn't exist, destroying..."); + (reset)(ResetOp::Destroy(namespace.clone())); + Err(err)?; + } + e @ Error::Injector(_) => { + tracing::error!("potential corruption detected while replicating, reseting replica: {e}"); + (reset)(ResetOp::Reset(namespace.clone())); + Err(e)?; + }, + Error::Meta(err) => { + use libsql_replication::meta::Error; + match err { + Error::LogIncompatible => { + tracing::error!("trying to replicate incompatible logs, reseting replica"); + (reset)(ResetOp::Reset(namespace.clone())); + Err(err)?; + } + Error::InvalidMetaFile + | Error::Io(_) + | Error::InvalidLogId + | Error::FailedToCommit(_) + | Error::InvalidReplicationPath + | Error::RequiresCleanDatabase => { + // We retry from last frame index? + tracing::warn!("non-fatal replication error, retrying from last commit index: {err}"); + }, + } + } + e @ (Error::Internal(_) + | Error::Client(_) + | Error::PrimaryHandshakeTimeout + | Error::NeedSnapshot) => { + tracing::warn!("non-fatal replication error, retrying from last commit index: {e}"); + }, + Error::NoHandshake => { + // not strictly necessary, but in case the handshake error goes uncaught, + // we reset the client state. 
+ replicator.client_mut().reset_token(); + } + Error::SnapshotPending => unreachable!(), + } + } + }); + + let stats = make_stats( + &db_path, + &mut join_set, + meta_store_handle.clone(), + self.base.stats_sender.clone(), + name.clone(), + applied_frame_no_receiver.clone(), + self.base.encryption_config.clone(), + ) + .await?; + + let connection_maker = MakeWriteProxyConn::new( + db_path.clone(), + self.base.extensions.clone(), + channel.clone(), + uri.clone(), + stats.clone(), + broadcaster, + meta_store_handle.clone(), + applied_frame_no_receiver, + self.base.max_response_size, + self.base.max_total_response_size, + primary_current_replicatio_index, + None, + resolve_attach_path, + self.make_wal_manager.clone(), + ) + .await? + .throttled( + self.base.max_concurrent_connections.clone(), + Some(DB_CREATE_TIMEOUT), + self.base.max_total_response_size, + self.base.max_concurrent_requests, + ); + + Ok(Namespace { + tasks: join_set, + db: Database::Replica(ReplicaDatabase { + connection_maker: Arc::new(connection_maker), + }), + name: name.clone(), + stats, + db_config_store: meta_store_handle, + path: db_path.into(), + }) + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + _db_config: &DatabaseConfig, + _prune_all: bool, + _bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let ns_path = self.base.base_path.join("dbs").join(namespace.as_str()); + if ns_path.try_exists()? 
{ + tracing::debug!("removing database directory: {}", ns_path.display()); + tokio::fs::remove_dir_all(ns_path).await?; + } + Ok(()) + }) + } + + fn fork<'a>( + &'a self, + _from_ns: &'a Namespace, + _from_config: MetaStoreHandle, + _to_ns: NamespaceName, + _to_config: MetaStoreHandle, + _timestamp: Option, + _store: NamespaceStore, + ) -> Pin> + Send + 'a>> { + Box::pin(std::future::ready(Err(crate::Error::Fork( + super::fork::ForkError::ForkReplica, + )))) + } +} diff --git a/libsql-server/src/namespace/configurator/schema.rs b/libsql-server/src/namespace/configurator/schema.rs new file mode 100644 index 0000000000..f95c8abf51 --- /dev/null +++ b/libsql-server/src/namespace/configurator/schema.rs @@ -0,0 +1,132 @@ +use std::sync::{atomic::AtomicBool, Arc}; + +use futures::prelude::Future; +use tokio::task::JoinSet; + +use crate::connection::config::DatabaseConfig; +use crate::connection::connection_manager::InnerWalManager; +use crate::database::{Database, SchemaDatabase}; +use crate::namespace::broadcasters::BroadcasterHandle; +use crate::namespace::meta_store::MetaStoreHandle; +use crate::namespace::{ + Namespace, NamespaceName, NamespaceStore, ResetCb, ResolveNamespacePathFn, RestoreOption, +}; +use crate::schema::SchedulerHandle; + +use super::helpers::{cleanup_primary, make_primary_connection_maker}; +use super::{BaseNamespaceConfig, ConfigureNamespace, PrimaryExtraConfig}; + +pub struct SchemaConfigurator { + base: BaseNamespaceConfig, + primary_config: PrimaryExtraConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler: SchedulerHandle, +} + +impl SchemaConfigurator { + pub fn new( + base: BaseNamespaceConfig, + primary_config: PrimaryExtraConfig, + make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, + migration_scheduler: SchedulerHandle, + ) -> Self { + Self { + base, + primary_config, + make_wal_manager, + migration_scheduler, + } + } +} + +impl ConfigureNamespace for SchemaConfigurator { + fn 
setup<'a>( + &'a self, + db_config: MetaStoreHandle, + restore_option: RestoreOption, + name: &'a NamespaceName, + _reset: ResetCb, + resolve_attach_path: ResolveNamespacePathFn, + _store: NamespaceStore, + broadcaster: BroadcasterHandle, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + let mut join_set = JoinSet::new(); + let db_path = self.base.base_path.join("dbs").join(name.as_str()); + + tokio::fs::create_dir_all(&db_path).await?; + + let (connection_maker, wal_manager, stats) = make_primary_connection_maker( + &self.primary_config, + &self.base, + &db_config, + &db_path, + &name, + restore_option, + Arc::new(AtomicBool::new(false)), // this is always false for schema + &mut join_set, + resolve_attach_path, + broadcaster, + self.make_wal_manager.clone(), + self.base.encryption_config.clone(), + ) + .await?; + + Ok(Namespace { + db: Database::Schema(SchemaDatabase::new( + self.migration_scheduler.clone(), + name.clone(), + connection_maker, + wal_manager, + db_config.clone(), + )), + name: name.clone(), + tasks: join_set, + stats, + db_config_store: db_config.clone(), + path: db_path.into(), + }) + }) + } + + fn cleanup<'a>( + &'a self, + namespace: &'a NamespaceName, + db_config: &'a DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: crate::namespace::NamespaceBottomlessDbIdInit, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + cleanup_primary( + &self.base, + &self.primary_config, + namespace, + db_config, + prune_all, + bottomless_db_id_init, + ) + .await + }) + } + + fn fork<'a>( + &'a self, + from_ns: &'a Namespace, + from_config: MetaStoreHandle, + to_ns: NamespaceName, + to_config: MetaStoreHandle, + timestamp: Option, + store: NamespaceStore, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(super::fork::fork( + from_ns, + from_config, + to_ns, + to_config, + timestamp, + store, + &self.primary_config, + self.base.base_path.clone(), + )) + } +} diff --git a/libsql-server/src/namespace/meta_store.rs 
b/libsql-server/src/namespace/meta_store.rs index 599dab9360..283eaf36eb 100644 --- a/libsql-server/src/namespace/meta_store.rs +++ b/libsql-server/src/namespace/meta_store.rs @@ -70,8 +70,8 @@ struct MetaStoreInner { // TODO(lucio): Use a concurrent hashmap so we don't block connection creation // when we are updating the config. The config si already synced via the watch // channel. - configs: Mutex>>, - conn: Mutex, + configs: tokio::sync::Mutex>>, + conn: tokio::sync::Mutex, wal_manager: MetaStoreWalManager, } @@ -313,7 +313,7 @@ fn process(msg: ChangeMsg, inner: Arc) { } else { Ok(()) }; - let mut configs = inner.configs.lock(); + let mut configs = inner.configs.blocking_lock(); if let Some(config_watch) = configs.get_mut(&namespace) { let new_version = config_watch.borrow().version.wrapping_add(1); @@ -330,7 +330,7 @@ fn process(msg: ChangeMsg, inner: Arc) { let _ = ret_chan.send(ret); } else { let ret = if flush { - let mut configs = inner.configs.lock(); + let mut configs = inner.configs.blocking_lock(); if let Some(config_watch) = configs.get_mut(&namespace) { let config = config_watch.subscribe().borrow().clone(); try_process(&inner, &namespace, &config.config) @@ -351,7 +351,7 @@ fn try_process( ) -> Result<()> { let config_encoded = metadata::DatabaseConfig::from(&*config).encode_to_vec(); - let mut conn = inner.conn.lock(); + let mut conn = inner.conn.blocking_lock(); if let Some(schema) = config.shared_schema_name.as_ref() { let tx = conn.transaction()?; if let Some(ref schema) = config.shared_schema_name { @@ -470,11 +470,11 @@ impl MetaStore { Ok(Self { changes_tx, inner }) } - pub fn handle(&self, namespace: NamespaceName) -> MetaStoreHandle { + pub async fn handle(&self, namespace: NamespaceName) -> MetaStoreHandle { tracing::debug!("getting meta store handle"); let change_tx = self.changes_tx.clone(); - let mut configs = self.inner.configs.lock(); + let mut configs = self.inner.configs.lock().await; let sender = 
configs.entry(namespace.clone()).or_insert_with(|| { // TODO(lucio): if no entry exists we need to ensure we send the update to // the bg channel. @@ -495,11 +495,18 @@ impl MetaStore { pub fn remove(&self, namespace: NamespaceName) -> Result>> { tracing::debug!("removing namespace `{}` from meta store", namespace); - let mut configs = self.inner.configs.lock(); + // "configs" lock can be used in both async and sync contexts while "conn" lock always used + // in blocking context + // + // so, we better to acquire "conn" lock first in order to prevent situation when "configs" + // lock is taken but "conn" lock is not free (so, we potentially will block async tasks for + // indefinite amount of time while "conn" lock will be acquired by other thread) + let mut conn = self.inner.conn.blocking_lock(); + + let mut configs = self.inner.configs.blocking_lock(); let r = if let Some(sender) = configs.get(&namespace) { tracing::debug!("removed namespace `{}` from meta store", namespace); let config = sender.borrow().clone(); - let mut conn = self.inner.conn.lock(); let tx = conn.transaction()?; if config.config.is_shared_schema { if crate::schema::db::schema_has_linked_dbs(&tx, &namespace)? { @@ -535,8 +542,8 @@ impl MetaStore { // TODO: we need to either make sure that the metastore is restored // before we start accepting connections or we need to contact bottomless // here to check if a namespace exists. Preferably the former. 
- pub fn exists(&self, namespace: &NamespaceName) -> bool { - self.inner.configs.lock().contains_key(namespace) + pub async fn exists(&self, namespace: &NamespaceName) -> bool { + self.inner.configs.lock().await.contains_key(namespace) } pub(crate) async fn shutdown(&self) -> crate::Result<()> { @@ -559,7 +566,7 @@ impl MetaStore { ) -> crate::Result { let inner = self.inner.clone(); let summary = tokio::task::spawn_blocking(move || { - let mut conn = inner.conn.lock(); + let mut conn = inner.conn.blocking_lock(); crate::schema::get_migrations_summary(&mut conn, schema) }) .await @@ -574,7 +581,7 @@ impl MetaStore { ) -> crate::Result> { let inner = self.inner.clone(); let details = tokio::task::spawn_blocking(move || { - let mut conn = inner.conn.lock(); + let mut conn = inner.conn.blocking_lock(); crate::schema::get_migration_details(&mut conn, schema, job_id) }) .await diff --git a/libsql-server/src/namespace/mod.rs b/libsql-server/src/namespace/mod.rs index 6e48e7f1d8..2a2e3eb211 100644 --- a/libsql-server/src/namespace/mod.rs +++ b/libsql-server/src/namespace/mod.rs @@ -1,62 +1,31 @@ -pub mod broadcasters; -mod fork; -pub mod meta_store; -mod name; -pub mod replication_wal; -mod schema_lock; -mod store; - -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Weak}; +use std::path::Path; +use std::sync::Arc; -use anyhow::{Context as _, Error}; -use bottomless::replicator::Options; -use broadcasters::BroadcasterHandle; +use anyhow::Context as _; use bytes::Bytes; use chrono::NaiveDateTime; -use enclose::enclose; use futures_core::{Future, Stream}; -use hyper::Uri; -use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; -use libsql_sys::wal::Sqlite3WalManager; -use libsql_sys::EncryptionConfig; -use tokio::io::AsyncBufReadExt; -use tokio::sync::{watch, Semaphore}; use tokio::task::JoinSet; -use tokio::time::Duration; -use tokio_util::io::StreamReader; -use tonic::transport::Channel; 
use uuid::Uuid; use crate::auth::parse_jwt_keys; use crate::connection::config::DatabaseConfig; -use crate::connection::connection_manager::InnerWalManager; -use crate::connection::libsql::{open_conn, MakeLibSqlConn}; -use crate::connection::write_proxy::MakeWriteProxyConn; -use crate::connection::Connection; -use crate::connection::MakeConnection; -use crate::database::{ - Database, DatabaseKind, PrimaryConnection, PrimaryConnectionMaker, PrimaryDatabase, - ReplicaDatabase, SchemaDatabase, -}; -use crate::error::LoadDumpError; -use crate::replication::script_backup_manager::ScriptBackupManager; -use crate::replication::{FrameNo, ReplicationLogger}; -use crate::schema::{has_pending_migration_task, setup_migration_table, SchedulerHandle}; +use crate::connection::Connection as _; +use crate::database::Database; use crate::stats::Stats; -use crate::{ - run_periodic_checkpoint, StatsSender, BLOCKING_RT, DB_CREATE_TIMEOUT, DEFAULT_AUTO_CHECKPOINT, -}; -pub use fork::ForkError; - -use self::fork::{ForkTask, PointInTimeRestore}; use self::meta_store::MetaStoreHandle; pub use self::name::NamespaceName; -use self::replication_wal::{make_replication_wal_wrapper, ReplicationWalWrapper}; pub use self::store::NamespaceStore; +pub mod broadcasters; +pub(crate) mod configurator; +pub mod meta_store; +mod name; +pub mod replication_wal; +mod schema_lock; +mod store; + pub type ResetCb = Box; pub type ResolveNamespacePathFn = Arc crate::Result> + Sync + Send + 'static>; @@ -100,103 +69,10 @@ pub struct Namespace { } impl Namespace { - async fn from_config( - ns_config: &NamespaceConfig, - db_config: MetaStoreHandle, - restore_option: RestoreOption, - name: &NamespaceName, - reset: ResetCb, - resolve_attach_path: ResolveNamespacePathFn, - store: NamespaceStore, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - match ns_config.db_kind { - DatabaseKind::Primary if db_config.get().is_shared_schema => { - Self::new_schema( - ns_config, - name.clone(), - db_config, - 
restore_option, - resolve_attach_path, - broadcaster, - ) - .await - } - DatabaseKind::Primary => { - Self::new_primary( - ns_config, - name.clone(), - db_config, - restore_option, - resolve_attach_path, - broadcaster, - ) - .await - } - DatabaseKind::Replica => { - Self::new_replica( - ns_config, - name.clone(), - db_config, - reset, - resolve_attach_path, - store, - broadcaster, - ) - .await - } - } - } - pub(crate) fn name(&self) -> &NamespaceName { &self.name } - /// completely remove resources associated with the namespace - pub(crate) async fn cleanup( - ns_config: &NamespaceConfig, - name: &NamespaceName, - db_config: &DatabaseConfig, - prune_all: bool, - bottomless_db_id_init: NamespaceBottomlessDbIdInit, - ) -> crate::Result<()> { - let ns_path = ns_config.base_path.join("dbs").join(name.as_str()); - match ns_config.db_kind { - DatabaseKind::Primary => { - if let Some(ref options) = ns_config.bottomless_replication { - let bottomless_db_id = match bottomless_db_id_init { - NamespaceBottomlessDbIdInit::Provided(db_id) => db_id, - NamespaceBottomlessDbIdInit::FetchFromConfig => { - NamespaceBottomlessDbId::from_config(&db_config) - } - }; - let options = make_bottomless_options(options, bottomless_db_id, name.clone()); - let replicator = bottomless::replicator::Replicator::with_options( - ns_path.join("data").to_str().unwrap(), - options, - ) - .await?; - if prune_all { - let delete_all = replicator.delete_all(None).await?; - // perform hard deletion in the background - tokio::spawn(delete_all.commit()); - } else { - // for soft delete make sure that local db is fully backed up - replicator.savepoint().confirmed().await?; - } - } - } - DatabaseKind::Replica => (), - } - - if ns_path.try_exists()? 
{ - tracing::debug!("removing database directory: {}", ns_path.display()); - tokio::fs::remove_dir_all(ns_path).await?; - } - - Ok(()) - } - async fn destroy(mut self) -> anyhow::Result<()> { self.tasks.shutdown().await; self.db.destroy(); @@ -246,605 +122,11 @@ impl Namespace { pub fn config_changed(&self) -> impl Future { self.db_config_store.changed() } - - async fn new_primary( - config: &NamespaceConfig, - name: NamespaceName, - meta_store_handle: MetaStoreHandle, - restore_option: RestoreOption, - resolve_attach_path: ResolveNamespacePathFn, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let db_path: Arc = config.base_path.join("dbs").join(name.as_str()).into(); - let fresh_namespace = !db_path.try_exists()?; - // FIXME: make that truly atomic. explore the idea of using temp directories, and it's implications - match Self::try_new_primary( - config, - name.clone(), - meta_store_handle, - restore_option, - resolve_attach_path, - db_path.clone(), - broadcaster, - ) - .await - { - Ok(this) => Ok(this), - Err(e) if fresh_namespace => { - tracing::error!("an error occured while deleting creating namespace, cleaning..."); - if let Err(e) = tokio::fs::remove_dir_all(&db_path).await { - tracing::error!("failed to remove dirty namespace directory: {e}") - } - Err(e) - } - Err(e) => Err(e), - } - } - - #[tracing::instrument(skip_all)] - async fn make_primary_connection_maker( - ns_config: &NamespaceConfig, - meta_store_handle: &MetaStoreHandle, - db_path: &Path, - name: &NamespaceName, - restore_option: RestoreOption, - block_writes: Arc, - join_set: &mut JoinSet>, - resolve_attach_path: ResolveNamespacePathFn, - broadcaster: BroadcasterHandle, - ) -> crate::Result<(PrimaryConnectionMaker, ReplicationWalWrapper, Arc)> { - let db_config = meta_store_handle.get(); - let bottomless_db_id = NamespaceBottomlessDbId::from_config(&db_config); - // FIXME: figure how to to it per-db - let mut is_dirty = { - let sentinel_path = db_path.join(".sentinel"); - if 
sentinel_path.try_exists()? { - true - } else { - tokio::fs::File::create(&sentinel_path).await?; - false - } - }; - - // FIXME: due to a bug in logger::checkpoint_db we call regular checkpointing code - // instead of our virtual WAL one. It's a bit tangled to fix right now, because - // we need WAL context for checkpointing, and WAL context needs the ReplicationLogger... - // So instead we checkpoint early, *before* bottomless gets initialized. That way - // we're sure bottomless won't try to back up any existing WAL frames and will instead - // treat the existing db file as the source of truth. - - let bottomless_replicator = match ns_config.bottomless_replication { - Some(ref options) => { - tracing::debug!("Checkpointing before initializing bottomless"); - crate::replication::primary::logger::checkpoint_db(&db_path.join("data"))?; - tracing::debug!("Checkpointed before initializing bottomless"); - let options = make_bottomless_options(options, bottomless_db_id, name.clone()); - let (replicator, did_recover) = - init_bottomless_replicator(db_path.join("data"), options, &restore_option) - .await?; - tracing::debug!("Completed init of bottomless replicator"); - is_dirty |= did_recover; - Some(replicator) - } - None => None, - }; - - tracing::debug!("Checking fresh db"); - let is_fresh_db = check_fresh_db(&db_path)?; - // switch frame-count checkpoint to time-based one - let auto_checkpoint = if ns_config.checkpoint_interval.is_some() { - 0 - } else { - DEFAULT_AUTO_CHECKPOINT - }; - - let logger = Arc::new(ReplicationLogger::open( - &db_path, - ns_config.max_log_size, - ns_config.max_log_duration, - is_dirty, - auto_checkpoint, - ns_config.scripted_backup.clone(), - name.clone(), - ns_config.encryption_config.clone(), - )?); - - tracing::debug!("sending stats"); - - let stats = make_stats( - &db_path, - join_set, - meta_store_handle.clone(), - ns_config.stats_sender.clone(), - name.clone(), - logger.new_frame_notifier.subscribe(), - 
ns_config.encryption_config.clone(), - ) - .await?; - - tracing::debug!("Making replication wal wrapper"); - let wal_wrapper = make_replication_wal_wrapper(bottomless_replicator, logger.clone()); - - tracing::debug!("Opening libsql connection"); - - let connection_maker = MakeLibSqlConn::new( - db_path.to_path_buf(), - wal_wrapper.clone(), - stats.clone(), - broadcaster, - meta_store_handle.clone(), - ns_config.extensions.clone(), - ns_config.max_response_size, - ns_config.max_total_response_size, - auto_checkpoint, - logger.new_frame_notifier.subscribe(), - ns_config.encryption_config.clone(), - block_writes, - resolve_attach_path, - ns_config.make_wal_manager.clone(), - ) - .await? - .throttled( - ns_config.max_concurrent_connections.clone(), - Some(DB_CREATE_TIMEOUT), - ns_config.max_total_response_size, - ns_config.max_concurrent_requests, - ); - - tracing::debug!("Completed opening libsql connection"); - - // this must happen after we create the connection maker. The connection maker old on a - // connection to ensure that no other connection is closing while we try to open the dump. - // that would cause a SQLITE_LOCKED error. 
- match restore_option { - RestoreOption::Dump(_) if !is_fresh_db => { - Err(LoadDumpError::LoadDumpExistingDb)?; - } - RestoreOption::Dump(dump) => { - let conn = connection_maker.create().await?; - tracing::debug!("Loading dump"); - load_dump(dump, conn).await?; - tracing::debug!("Done loading dump"); - } - _ => { /* other cases were already handled when creating bottomless */ } - } - - join_set.spawn(run_periodic_compactions(logger.clone())); - - tracing::debug!("Done making primary connection"); - - Ok((connection_maker, wal_wrapper, stats)) - } - - #[tracing::instrument(skip_all, fields(namespace))] - async fn try_new_primary( - ns_config: &NamespaceConfig, - namespace: NamespaceName, - meta_store_handle: MetaStoreHandle, - restore_option: RestoreOption, - resolve_attach_path: ResolveNamespacePathFn, - db_path: Arc, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let mut join_set = JoinSet::new(); - - tokio::fs::create_dir_all(&db_path).await?; - - let block_writes = Arc::new(AtomicBool::new(false)); - let (connection_maker, wal_wrapper, stats) = Self::make_primary_connection_maker( - ns_config, - &meta_store_handle, - &db_path, - &namespace, - restore_option, - block_writes.clone(), - &mut join_set, - resolve_attach_path, - broadcaster, - ) - .await?; - let connection_maker = Arc::new(connection_maker); - - if meta_store_handle.get().shared_schema_name.is_some() { - let block_writes = block_writes.clone(); - let conn = connection_maker.create().await?; - tokio::task::spawn_blocking(move || { - conn.with_raw(|conn| -> crate::Result<()> { - setup_migration_table(conn)?; - if has_pending_migration_task(conn)? 
{ - block_writes.store(true, Ordering::SeqCst); - } - Ok(()) - }) - }) - .await - .unwrap()?; - } - - if let Some(checkpoint_interval) = ns_config.checkpoint_interval { - join_set.spawn(run_periodic_checkpoint( - connection_maker.clone(), - checkpoint_interval, - namespace.clone(), - )); - } - - tracing::debug!("Done making new primary"); - - Ok(Self { - tasks: join_set, - db: Database::Primary(PrimaryDatabase { - wal_wrapper, - connection_maker, - block_writes, - }), - name: namespace, - stats, - db_config_store: meta_store_handle, - path: db_path.into(), - }) - } - - #[tracing::instrument(skip_all, fields(name))] - #[async_recursion::async_recursion] - async fn new_replica( - config: &NamespaceConfig, - name: NamespaceName, - meta_store_handle: MetaStoreHandle, - reset: ResetCb, - resolve_attach_path: ResolveNamespacePathFn, - store: NamespaceStore, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - tracing::debug!("creating replica namespace"); - let db_path = config.base_path.join("dbs").join(name.as_str()); - let channel = config.channel.clone().expect("bad replica config"); - let uri = config.uri.clone().expect("bad replica config"); - - let rpc_client = ReplicationLogClient::with_origin(channel.clone(), uri.clone()); - let client = crate::replication::replicator_client::Client::new( - name.clone(), - rpc_client, - &db_path, - meta_store_handle.clone(), - store.clone(), - ) - .await?; - let applied_frame_no_receiver = client.current_frame_no_notifier.subscribe(); - let mut replicator = libsql_replication::replicator::Replicator::new( - client, - db_path.join("data"), - DEFAULT_AUTO_CHECKPOINT, - config.encryption_config.clone(), - ) - .await?; - - tracing::debug!("try perform handshake"); - // force a handshake now, to retrieve the primary's current replication index - match replicator.try_perform_handshake().await { - Err(libsql_replication::replicator::Error::Meta( - libsql_replication::meta::Error::LogIncompatible, - )) => { - tracing::error!( - 
"trying to replicate incompatible logs, reseting replica and nuking db dir" - ); - std::fs::remove_dir_all(&db_path).unwrap(); - return Self::new_replica( - config, - name, - meta_store_handle, - reset, - resolve_attach_path, - store, - broadcaster, - ) - .await; - } - Err(e) => Err(e)?, - Ok(_) => (), - } - - tracing::debug!("done performing handshake"); - - let primary_current_replicatio_index = replicator.client_mut().primary_replication_index; - - let mut join_set = JoinSet::new(); - let namespace = name.clone(); - join_set.spawn(async move { - use libsql_replication::replicator::Error; - loop { - match replicator.run().await { - err @ Error::Fatal(_) => Err(err)?, - err @ Error::NamespaceDoesntExist => { - tracing::error!("namespace {namespace} doesn't exist, destroying..."); - (reset)(ResetOp::Destroy(namespace.clone())); - Err(err)?; - } - e @ Error::Injector(_) => { - tracing::error!("potential corruption detected while replicating, reseting replica: {e}"); - (reset)(ResetOp::Reset(namespace.clone())); - Err(e)?; - }, - Error::Meta(err) => { - use libsql_replication::meta::Error; - match err { - Error::LogIncompatible => { - tracing::error!("trying to replicate incompatible logs, reseting replica"); - (reset)(ResetOp::Reset(namespace.clone())); - Err(err)?; - } - Error::InvalidMetaFile - | Error::Io(_) - | Error::InvalidLogId - | Error::FailedToCommit(_) - | Error::InvalidReplicationPath - | Error::RequiresCleanDatabase => { - // We retry from last frame index? - tracing::warn!("non-fatal replication error, retrying from last commit index: {err}"); - }, - } - } - e @ (Error::Internal(_) - | Error::Client(_) - | Error::PrimaryHandshakeTimeout - | Error::NeedSnapshot) => { - tracing::warn!("non-fatal replication error, retrying from last commit index: {e}"); - }, - Error::NoHandshake => { - // not strictly necessary, but in case the handshake error goes uncaught, - // we reset the client state. 
- replicator.client_mut().reset_token(); - } - Error::SnapshotPending => unreachable!(), - } - } - }); - - let stats = make_stats( - &db_path, - &mut join_set, - meta_store_handle.clone(), - config.stats_sender.clone(), - name.clone(), - applied_frame_no_receiver.clone(), - config.encryption_config.clone(), - ) - .await?; - - let connection_maker = MakeWriteProxyConn::new( - db_path.clone(), - config.extensions.clone(), - channel.clone(), - uri.clone(), - stats.clone(), - broadcaster, - meta_store_handle.clone(), - applied_frame_no_receiver, - config.max_response_size, - config.max_total_response_size, - primary_current_replicatio_index, - config.encryption_config.clone(), - resolve_attach_path, - config.make_wal_manager.clone(), - ) - .await? - .throttled( - config.max_concurrent_connections.clone(), - Some(DB_CREATE_TIMEOUT), - config.max_total_response_size, - config.max_concurrent_requests, - ); - - Ok(Self { - tasks: join_set, - db: Database::Replica(ReplicaDatabase { - connection_maker: Arc::new(connection_maker), - }), - name, - stats, - db_config_store: meta_store_handle, - path: db_path.into(), - }) - } - - async fn fork( - ns_config: &NamespaceConfig, - from_ns: &Namespace, - from_config: MetaStoreHandle, - to_ns: NamespaceName, - to_config: MetaStoreHandle, - timestamp: Option, - resolve_attach: ResolveNamespacePathFn, - store: NamespaceStore, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let from_config = from_config.get(); - match ns_config.db_kind { - DatabaseKind::Primary => { - let bottomless_db_id = NamespaceBottomlessDbId::from_config(&from_config); - let restore_to = if let Some(timestamp) = timestamp { - if let Some(ref options) = ns_config.bottomless_replication { - Some(PointInTimeRestore { - timestamp, - replicator_options: make_bottomless_options( - options, - bottomless_db_id.clone(), - from_ns.name().clone(), - ), - }) - } else { - return Err(crate::Error::Fork(ForkError::BackupServiceNotConfigured)); - } - } else { - None - 
}; - - let logger = match &from_ns.db { - Database::Primary(db) => db.wal_wrapper.wrapper().logger(), - Database::Schema(db) => db.wal_wrapper.wrapper().logger(), - _ => { - return Err(crate::Error::Fork(ForkError::Internal(Error::msg( - "Invalid source database type for fork", - )))); - } - }; - - let fork_task = ForkTask { - base_path: ns_config.base_path.clone(), - to_namespace: to_ns.clone(), - logger, - restore_to, - to_config, - ns_config, - resolve_attach, - store, - broadcaster: broadcaster.handle(to_ns), - }; - - let ns = fork_task.fork().await?; - Ok(ns) - } - DatabaseKind::Replica => Err(ForkError::ForkReplica.into()), - } - } - - async fn new_schema( - ns_config: &NamespaceConfig, - name: NamespaceName, - meta_store_handle: MetaStoreHandle, - restore_option: RestoreOption, - resolve_attach_path: ResolveNamespacePathFn, - broadcaster: BroadcasterHandle, - ) -> crate::Result { - let mut join_set = JoinSet::new(); - let db_path = ns_config.base_path.join("dbs").join(name.as_str()); - - tokio::fs::create_dir_all(&db_path).await?; - - let (connection_maker, wal_manager, stats) = Self::make_primary_connection_maker( - ns_config, - &meta_store_handle, - &db_path, - &name, - restore_option, - Arc::new(AtomicBool::new(false)), // this is always false for schema - &mut join_set, - resolve_attach_path, - broadcaster, - ) - .await?; - - Ok(Namespace { - db: Database::Schema(SchemaDatabase::new( - ns_config.migration_scheduler.clone(), - name.clone(), - connection_maker, - wal_manager, - meta_store_handle.clone(), - )), - name, - tasks: join_set, - stats, - db_config_store: meta_store_handle, - path: db_path.into(), - }) - } -} - -pub struct NamespaceConfig { - /// Default database kind the store should be Creating - pub(crate) db_kind: DatabaseKind, - // Common config - pub(crate) base_path: Arc, - pub(crate) max_log_size: u64, - pub(crate) max_log_duration: Option, - pub(crate) extensions: Arc<[PathBuf]>, - pub(crate) stats_sender: StatsSender, - pub(crate) 
max_response_size: u64, - pub(crate) max_total_response_size: u64, - pub(crate) checkpoint_interval: Option, - pub(crate) max_concurrent_connections: Arc, - pub(crate) max_concurrent_requests: u64, - pub(crate) encryption_config: Option, - - // Replica specific config - /// grpc channel for replica - pub channel: Option, - /// grpc uri - pub uri: Option, - - // primary only config - pub(crate) bottomless_replication: Option, - pub(crate) scripted_backup: Option, - pub(crate) migration_scheduler: SchedulerHandle, - pub(crate) make_wal_manager: Arc InnerWalManager + Sync + Send + 'static>, } pub type DumpStream = Box> + Send + Sync + 'static + Unpin>; -fn make_bottomless_options( - options: &Options, - namespace_db_id: NamespaceBottomlessDbId, - name: NamespaceName, -) -> Options { - let mut options = options.clone(); - let mut db_id = match namespace_db_id { - NamespaceBottomlessDbId::Namespace(id) => id, - // FIXME(marin): I don't like that, if bottomless is enabled, proper config must be passed. - NamespaceBottomlessDbId::NotProvided => options.db_id.unwrap_or_default(), - }; - - db_id = format!("ns-{db_id}:{name}"); - options.db_id = Some(db_id); - options -} - -async fn make_stats( - db_path: &Path, - join_set: &mut JoinSet>, - meta_store_handle: MetaStoreHandle, - stats_sender: StatsSender, - name: NamespaceName, - mut current_frame_no: watch::Receiver>, - encryption_config: Option, -) -> anyhow::Result> { - tracing::debug!("creating stats type"); - let stats = Stats::new(name.clone(), db_path, join_set).await?; - - // the storage monitor is optional, so we ignore the error here. 
- tracing::debug!("stats created, sending stats"); - let _ = stats_sender - .send((name.clone(), meta_store_handle, Arc::downgrade(&stats))) - .await; - - join_set.spawn({ - let stats = stats.clone(); - // initialize the current_frame_no value - current_frame_no - .borrow_and_update() - .map(|fno| stats.set_current_frame_no(fno)); - async move { - while current_frame_no.changed().await.is_ok() { - current_frame_no - .borrow_and_update() - .map(|fno| stats.set_current_frame_no(fno)); - } - Ok(()) - } - }); - - join_set.spawn(run_storage_monitor( - db_path.into(), - Arc::downgrade(&stats), - encryption_config, - )); - - tracing::debug!("done sending stats, and creating bg tasks"); - - Ok(stats) -} - #[derive(Default)] pub enum RestoreOption { /// Restore database state from the most recent version found in a backup. @@ -858,189 +140,3 @@ pub enum RestoreOption { /// Granularity depends of how frequently WAL log pages are being snapshotted. PointInTime(NaiveDateTime), } - -const WASM_TABLE_CREATE: &str = - "CREATE TABLE libsql_wasm_func_table (name text PRIMARY KEY, body text) WITHOUT ROWID;"; - -async fn load_dump(dump: S, conn: PrimaryConnection) -> crate::Result<(), LoadDumpError> -where - S: Stream> + Unpin, -{ - let mut reader = tokio::io::BufReader::new(StreamReader::new(dump)); - let mut curr = String::new(); - let mut line = String::new(); - let mut skipped_wasm_table = false; - let mut n_stmt = 0; - let mut line_id = 0; - - while let Ok(n) = reader.read_line(&mut curr).await { - line_id += 1; - if n == 0 { - break; - } - let trimmed = curr.trim(); - if trimmed.is_empty() || trimmed.starts_with("--") { - curr.clear(); - continue; - } - // FIXME: it's well known bug that comment ending with semicolon will be handled incorrectly by currend dump processing code - let statement_end = trimmed.ends_with(';'); - - // we want to concat original(non-trimmed) lines as trimming will join all them in one - // single-line statement which is incorrect if comments in the end 
are present - line.push_str(&curr); - curr.clear(); - - // This is a hack to ignore the libsql_wasm_func_table table because it is already created - // by the system. - if !skipped_wasm_table && line.trim() == WASM_TABLE_CREATE { - skipped_wasm_table = true; - line.clear(); - continue; - } - - if statement_end { - n_stmt += 1; - // dump must be performd within a txn - if n_stmt > 2 && conn.is_autocommit().await.unwrap() { - return Err(LoadDumpError::NoTxn); - } - - line = tokio::task::spawn_blocking({ - let conn = conn.clone(); - move || -> crate::Result { - conn.with_raw(|conn| conn.execute(&line, ())).map_err(|e| { - LoadDumpError::Internal(format!("line: {}, error: {}", line_id, e)) - })?; - Ok(line) - } - }) - .await??; - line.clear(); - } else { - line.push(' '); - } - } - tracing::debug!("loaded {} lines from dump", line_id); - - if !conn.is_autocommit().await.unwrap() { - tokio::task::spawn_blocking({ - let conn = conn.clone(); - move || -> crate::Result<(), LoadDumpError> { - conn.with_raw(|conn| conn.execute("rollback", ()))?; - Ok(()) - } - }) - .await??; - return Err(LoadDumpError::NoCommit); - } - - Ok(()) -} - -pub async fn init_bottomless_replicator( - path: impl AsRef, - options: bottomless::replicator::Options, - restore_option: &RestoreOption, -) -> anyhow::Result<(bottomless::replicator::Replicator, bool)> { - tracing::debug!("Initializing bottomless replication"); - let path = path - .as_ref() - .to_str() - .ok_or_else(|| anyhow::anyhow!("Invalid db path"))? 
- .to_owned(); - let mut replicator = bottomless::replicator::Replicator::with_options(path, options).await?; - - let (generation, timestamp) = match restore_option { - RestoreOption::Latest | RestoreOption::Dump(_) => (None, None), - RestoreOption::Generation(generation) => (Some(*generation), None), - RestoreOption::PointInTime(timestamp) => (None, Some(*timestamp)), - }; - - let (action, did_recover) = replicator.restore(generation, timestamp).await?; - match action { - bottomless::replicator::RestoreAction::SnapshotMainDbFile => { - replicator.new_generation().await; - if let Some(_handle) = replicator.snapshot_main_db_file(true).await? { - tracing::trace!("got snapshot handle after restore with generation upgrade"); - } - // Restoration process only leaves the local WAL file if it was - // detected to be newer than its remote counterpart. - replicator.maybe_replicate_wal().await? - } - bottomless::replicator::RestoreAction::ReuseGeneration(gen) => { - replicator.set_generation(gen); - } - } - - Ok((replicator, did_recover)) -} - -async fn run_periodic_compactions(logger: Arc) -> anyhow::Result<()> { - // calling `ReplicationLogger::maybe_compact()` is cheap if the compaction does not actually - // take place, so we can afford to poll it very often for simplicity - let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(1000)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); - - loop { - interval.tick().await; - let handle = BLOCKING_RT.spawn_blocking(enclose! {(logger) move || { - logger.maybe_compact() - }}); - handle - .await - .expect("Compaction task crashed") - .context("Compaction failed")?; - } -} - -fn check_fresh_db(path: &Path) -> crate::Result { - let is_fresh = !path.join("wallog").try_exists()?; - Ok(is_fresh) -} - -// Periodically check the storage used by the database and save it in the Stats structure. 
-// TODO: Once we have a separate fiber that does WAL checkpoints, running this routine -// right after checkpointing is exactly where it should be done. -async fn run_storage_monitor( - db_path: PathBuf, - stats: Weak, - encryption_config: Option, -) -> anyhow::Result<()> { - // on initialization, the database file doesn't exist yet, so we wait a bit for it to be - // created - tokio::time::sleep(Duration::from_secs(1)).await; - - let duration = tokio::time::Duration::from_secs(60); - let db_path: Arc = db_path.into(); - loop { - let db_path = db_path.clone(); - let Some(stats) = stats.upgrade() else { - return Ok(()); - }; - - let encryption_config = encryption_config.clone(); - let _ = tokio::task::spawn_blocking(move || { - // because closing the last connection interferes with opening a new one, we lazily - // initialize a connection here, and keep it alive for the entirety of the program. If we - // fail to open it, we wait for `duration` and try again later. - match open_conn(&db_path, Sqlite3WalManager::new(), Some(rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY), encryption_config) { - Ok(mut conn) => { - if let Ok(tx) = conn.transaction() { - let page_count = tx.query_row("pragma page_count;", [], |row| { row.get::(0) }); - let freelist_count = tx.query_row("pragma freelist_count;", [], |row| { row.get::(0) }); - if let (Ok(page_count), Ok(freelist_count)) = (page_count, freelist_count) { - let storage_bytes_used = (page_count - freelist_count) * 4096; - stats.set_storage_bytes_used(storage_bytes_used); - } - } - }, - Err(e) => { - tracing::warn!("failed to open connection for storager monitor: {e}, trying again in {duration:?}"); - }, - } - }).await; - - tokio::time::sleep(duration).await; - } -} diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index e0147fc2e8..f9f614fc77 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -13,15 +13,17 @@ use 
tokio_stream::wrappers::BroadcastStream; use crate::auth::Authenticated; use crate::broadcaster::BroadcastMsg; use crate::connection::config::DatabaseConfig; +use crate::database::DatabaseKind; use crate::error::Error; use crate::metrics::NAMESPACE_LOAD_LATENCY; use crate::namespace::{NamespaceBottomlessDbId, NamespaceBottomlessDbIdInit, NamespaceName}; use crate::stats::Stats; use super::broadcasters::{BroadcasterHandle, BroadcasterRegistry}; +use super::configurator::{DynConfigurator, NamespaceConfigurators}; use super::meta_store::{MetaStore, MetaStoreHandle}; use super::schema_lock::SchemaLocksRegistry; -use super::{Namespace, NamespaceConfig, ResetCb, ResetOp, ResolveNamespacePathFn, RestoreOption}; +use super::{Namespace, ResetCb, ResetOp, ResolveNamespacePathFn, RestoreOption}; type NamespaceEntry = Arc>>; @@ -44,18 +46,20 @@ pub struct NamespaceStoreInner { allow_lazy_creation: bool, has_shutdown: AtomicBool, snapshot_at_shutdown: bool, - pub config: NamespaceConfig, schema_locks: SchemaLocksRegistry, broadcasters: BroadcasterRegistry, + configurators: NamespaceConfigurators, + db_kind: DatabaseKind, } impl NamespaceStore { - pub async fn new( + pub(crate) async fn new( allow_lazy_creation: bool, snapshot_at_shutdown: bool, max_active_namespaces: usize, - config: NamespaceConfig, metadata: MetaStore, + configurators: NamespaceConfigurators, + db_kind: DatabaseKind, ) -> crate::Result { tracing::trace!("Max active namespaces: {max_active_namespaces}"); let store = Cache::::builder() @@ -87,15 +91,16 @@ impl NamespaceStore { allow_lazy_creation, has_shutdown: AtomicBool::new(false), snapshot_at_shutdown, - config, schema_locks: Default::default(), broadcasters: Default::default(), + configurators, + db_kind, }), }) } - pub fn exists(&self, namespace: &NamespaceName) -> bool { - self.inner.metadata.exists(namespace) + pub async fn exists(&self, namespace: &NamespaceName) -> bool { + self.inner.metadata.exists(namespace).await } pub async fn destroy(&self, 
namespace: NamespaceName, prune_all: bool) -> crate::Result<()> { @@ -127,14 +132,8 @@ impl NamespaceStore { } } - Namespace::cleanup( - &self.inner.config, - &namespace, - &db_config, - prune_all, - bottomless_db_id_init, - ) - .await?; + self.cleanup(&namespace, &db_config, prune_all, bottomless_db_id_init) + .await?; tracing::info!("destroyed namespace: {namespace}"); @@ -174,27 +173,18 @@ impl NamespaceStore { ns.destroy().await?; } - let handle = self.inner.metadata.handle(namespace.clone()); + let db_config = self.inner.metadata.handle(namespace.clone()).await; // destroy on-disk database - Namespace::cleanup( - &self.inner.config, + self.cleanup( &namespace, - &handle.get(), + &db_config.get(), false, NamespaceBottomlessDbIdInit::FetchFromConfig, ) .await?; - let ns = Namespace::from_config( - &self.inner.config, - handle, - restore_option, - &namespace, - self.make_reset_cb(), - self.resolve_attach_fn(), - self.clone(), - self.broadcaster(namespace.clone()), - ) - .await?; + let ns = self + .make_namespace(&namespace, db_config, restore_option) + .await?; lock.replace(ns); @@ -236,7 +226,7 @@ impl NamespaceStore { } // check that the source namespace exists - if !self.inner.metadata.exists(&from) { + if !self.inner.metadata.exists(&from).await { return Err(crate::error::Error::NamespaceDoesntExist(from.to_string())); } @@ -251,11 +241,11 @@ impl NamespaceStore { } // FIXME: we could potentially delete the namespace while trying to fork it - if !self.inner.metadata.exists(&from) { + if !self.inner.metadata.exists(&from).await { return Err(crate::Error::NamespaceDoesntExist(from.to_string())); } - let from_config = self.inner.metadata.handle(from.clone()); + let from_config = self.inner.metadata.handle(from.clone()).await; let from_entry = self .load_namespace(&from, from_config.clone(), RestoreOption::Latest) .await?; @@ -290,22 +280,21 @@ impl NamespaceStore { should_delete: true, }; - let handle = self.inner.metadata.handle(to.clone()); + let handle = 
self.inner.metadata.handle(to.clone()).await; handle .store_and_maybe_flush(Some(to_config.into()), false) .await?; - let to_ns = Namespace::fork( - &self.inner.config, - from_ns, - from_config, - to.clone(), - handle.clone(), - timestamp, - self.resolve_attach_fn(), - self.clone(), - self.broadcaster(to), - ) - .await?; + let to_ns = self + .get_configurator(&from_config.get()) + .fork( + from_ns, + from_config, + to.clone(), + handle.clone(), + timestamp, + self.clone(), + ) + .await?; to_lock.replace(to_ns); handle.flush().await?; @@ -339,7 +328,7 @@ impl NamespaceStore { Fun: FnOnce(&Namespace) -> R, { if namespace != NamespaceName::default() - && !self.inner.metadata.exists(&namespace) + && !self.inner.metadata.exists(&namespace).await && !self.inner.allow_lazy_creation { return Err(Error::NamespaceDoesntExist(namespace.to_string())); @@ -357,7 +346,7 @@ impl NamespaceStore { } }; - let handle = self.inner.metadata.handle(namespace.to_owned()); + let handle = self.inner.metadata.handle(namespace.to_owned()).await; f(self .load_namespace(&namespace, handle, RestoreOption::Latest) .await?) 
@@ -378,30 +367,39 @@ impl NamespaceStore { .clone() } + pub(crate) async fn make_namespace( + &self, + namespace: &NamespaceName, + config: MetaStoreHandle, + restore_option: RestoreOption, + ) -> crate::Result { + let ns = self + .get_configurator(&config.get()) + .setup( + config, + restore_option, + namespace, + self.make_reset_cb(), + self.resolve_attach_fn(), + self.clone(), + self.broadcaster(namespace.clone()), + ) + .await?; + + Ok(ns) + } + async fn load_namespace( &self, namespace: &NamespaceName, db_config: MetaStoreHandle, restore_option: RestoreOption, ) -> crate::Result { - let init = { - let namespace = namespace.clone(); - async move { - let ns = Namespace::from_config( - &self.inner.config, - db_config, - restore_option, - &namespace, - self.make_reset_cb(), - self.resolve_attach_fn(), - self.clone(), - self.broadcaster(namespace.clone()), - ) + let init = async { + let ns = self + .make_namespace(namespace, db_config, restore_option) .await?; - tracing::info!("loaded namespace: `{namespace}`"); - - Ok(Some(ns)) - } + Ok(Some(ns)) }; let before_load = Instant::now(); @@ -442,12 +440,12 @@ impl NamespaceStore { // FIXME: move the default namespace check out of this function. 
if self.inner.allow_lazy_creation || namespace == NamespaceName::default() { tracing::trace!("auto-creating the namespace"); - } else if self.inner.metadata.exists(&namespace) { + } else if self.inner.metadata.exists(&namespace).await { return Err(Error::NamespaceAlreadyExist(namespace.to_string())); } let db_config = Arc::new(db_config); - let handle = self.inner.metadata.handle(namespace.clone()); + let handle = self.inner.metadata.handle(namespace.clone()).await; tracing::debug!("storing db config"); handle.store(db_config).await?; tracing::debug!("completed storing db config, loading namespace"); @@ -516,4 +514,26 @@ impl NamespaceStore { pub(crate) fn schema_locks(&self) -> &SchemaLocksRegistry { &self.inner.schema_locks } + + fn get_configurator(&self, db_config: &DatabaseConfig) -> &DynConfigurator { + match self.inner.db_kind { + DatabaseKind::Primary if db_config.is_shared_schema => { + self.inner.configurators.configure_schema().unwrap() + } + DatabaseKind::Primary => self.inner.configurators.configure_primary().unwrap(), + DatabaseKind::Replica => self.inner.configurators.configure_replica().unwrap(), + } + } + + async fn cleanup( + &self, + namespace: &NamespaceName, + db_config: &DatabaseConfig, + prune_all: bool, + bottomless_db_id_init: NamespaceBottomlessDbIdInit, + ) -> crate::Result<()> { + self.get_configurator(db_config) + .cleanup(namespace, db_config, prune_all, bottomless_db_id_init) + .await + } } diff --git a/libsql-server/src/replication/replicator_client.rs b/libsql-server/src/replication/replicator_client.rs index 4d12ff7f83..753baac996 100644 --- a/libsql-server/src/replication/replicator_client.rs +++ b/libsql-server/src/replication/replicator_client.rs @@ -4,15 +4,17 @@ use std::pin::Pin; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::TryStreamExt; -use libsql_replication::frame::Frame; use libsql_replication::meta::WalIndexMeta; -use libsql_replication::replicator::{map_frame_err, Error, ReplicatorClient}; +use 
libsql_replication::replicator::{Error, ReplicatorClient}; +use libsql_replication::rpc::replication::log_offset::WalFlavor; use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient; use libsql_replication::rpc::replication::{ - verify_session_token, HelloRequest, LogOffset, NAMESPACE_METADATA_KEY, SESSION_TOKEN_KEY, + verify_session_token, Frame as RpcFrame, HelloRequest, LogOffset, NAMESPACE_METADATA_KEY, + SESSION_TOKEN_KEY, }; use tokio::sync::watch; -use tokio_stream::{Stream, StreamExt}; +use tokio_stream::Stream; + use tonic::metadata::{AsciiMetadataValue, BinaryMetadataValue}; use tonic::transport::Channel; use tonic::{Code, Request, Status}; @@ -35,6 +37,7 @@ pub struct Client { // the primary current replication index, as reported by the last handshake pub primary_replication_index: Option, store: NamespaceStore, + wal_flavor: WalFlavor, } impl Client { @@ -44,6 +47,7 @@ impl Client { path: &Path, meta_store_handle: MetaStoreHandle, store: NamespaceStore, + wal_flavor: WalFlavor, ) -> crate::Result { let (current_frame_no_notifier, _) = watch::channel(None); let meta = WalIndexMeta::open(path).await?; @@ -57,6 +61,7 @@ impl Client { meta_store_handle, primary_replication_index: None, store, + wal_flavor, }) } @@ -91,7 +96,7 @@ impl Client { #[async_trait::async_trait] impl ReplicatorClient for Client { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = Pin> + Send + 'static>>; #[tracing::instrument(skip(self))] async fn handshake(&mut self) -> Result<(), Error> { @@ -138,6 +143,7 @@ impl ReplicatorClient for Client { async fn next_frames(&mut self) -> Result { let offset = LogOffset { next_offset: self.next_frame_no(), + wal_flavor: Some(self.wal_flavor.into()), }; let req = self.make_request(offset); let stream = self @@ -165,7 +171,7 @@ impl ReplicatorClient for Client { None => REPLICATION_LATENCY_CACHE_MISS.increment(1), } }) - .map(map_frame_err); + .map_err(Into::into); Ok(Box::pin(stream)) } @@ -173,11 
+179,12 @@ impl ReplicatorClient for Client { async fn snapshot(&mut self) -> Result { let offset = LogOffset { next_offset: self.next_frame_no(), + wal_flavor: Some(self.wal_flavor.into()), }; let req = self.make_request(offset); match self.client.snapshot(req).await { Ok(resp) => { - let stream = resp.into_inner().map(map_frame_err); + let stream = resp.into_inner().map_err(Into::into); Ok(Box::pin(stream)) } Err(e) if e.code() == Code::Unavailable => Err(Error::SnapshotPending), diff --git a/libsql-server/src/rpc/replication_log.rs b/libsql-server/src/rpc/replication_log.rs index c0b216739e..1ef306daf1 100644 --- a/libsql-server/src/rpc/replication_log.rs +++ b/libsql-server/src/rpc/replication_log.rs @@ -8,6 +8,7 @@ use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures_core::Future; pub use libsql_replication::rpc::replication as rpc; +use libsql_replication::rpc::replication::log_offset::WalFlavor; use libsql_replication::rpc::replication::replication_log_server::ReplicationLog; use libsql_replication::rpc::replication::{ Frame, Frames, HelloRequest, HelloResponse, LogOffset, NAMESPACE_DOESNT_EXIST, @@ -259,6 +260,9 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { + if let WalFlavor::Libsql = req.get_ref().wal_flavor() { + return Err(Status::invalid_argument("libsql wal not supported")); + } let namespace = super::extract_namespace(self.disable_namespaces, &req)?; self.authenticate(&req, namespace.clone()).await?; @@ -304,6 +308,9 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { + if let WalFlavor::Libsql = req.get_ref().wal_flavor() { + return Err(Status::invalid_argument("libsql wal not supported")); + } let namespace = super::extract_namespace(self.disable_namespaces, &req)?; self.authenticate(&req, namespace.clone()).await?; @@ -354,7 +361,6 @@ impl ReplicationLog for ReplicationLogService { guard.insert((replica_addr, 
namespace.clone())); } } - let (logger, config, version, _, _) = self.logger_from_namespace(namespace, &req, false).await?; @@ -376,7 +382,12 @@ impl ReplicationLog for ReplicationLogService { &self, req: tonic::Request, ) -> Result, Status> { + if let WalFlavor::Libsql = req.get_ref().wal_flavor() { + return Err(Status::invalid_argument("libsql wal not supported")); + } + let namespace = super::extract_namespace(self.disable_namespaces, &req)?; + self.authenticate(&req, namespace.clone()).await?; let (logger, _, _, stats, _) = self.logger_from_namespace(namespace, &req, true).await?; diff --git a/libsql-server/src/schema/db.rs b/libsql-server/src/schema/db.rs index 7efb2efaa8..f644b2420f 100644 --- a/libsql-server/src/schema/db.rs +++ b/libsql-server/src/schema/db.rs @@ -482,6 +482,7 @@ mod test { async fn register_schema(meta_store: &MetaStore, schema: &'static str) { meta_store .handle(schema.into()) + .await .store(DatabaseConfig { is_shared_schema: true, ..Default::default() @@ -497,6 +498,7 @@ mod test { ) -> crate::Result<()> { meta_store .handle(name.into()) + .await .store(DatabaseConfig { shared_schema_name: Some(schema.into()), ..Default::default() @@ -561,6 +563,7 @@ mod test { // necessary checks beforehand, and return a nice error message. 
assert!(meta_store .handle("ns1".into()) + .await .store(DatabaseConfig { shared_schema_name: Some("schema1".into()), ..Default::default() diff --git a/libsql-server/src/schema/scheduler.rs b/libsql-server/src/schema/scheduler.rs index 17fdfb3143..01a3d795d8 100644 --- a/libsql-server/src/schema/scheduler.rs +++ b/libsql-server/src/schema/scheduler.rs @@ -808,8 +808,12 @@ mod test { use crate::connection::config::DatabaseConfig; use crate::database::DatabaseKind; + use crate::namespace::configurator::{ + BaseNamespaceConfig, NamespaceConfigurators, PrimaryConfigurator, PrimaryExtraConfig, + SchemaConfigurator, + }; use crate::namespace::meta_store::{metastore_connection_maker, MetaStore}; - use crate::namespace::{NamespaceConfig, RestoreOption}; + use crate::namespace::RestoreOption; use crate::schema::SchedulerHandle; use super::super::migration::has_pending_migration_task; @@ -826,9 +830,10 @@ mod test { .unwrap(); let (sender, mut receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let mut scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); @@ -902,27 +907,42 @@ mod test { assert!(!block_write.load(std::sync::atomic::Ordering::Relaxed)); } - fn make_config(migration_scheduler: SchedulerHandle, path: &Path) -> NamespaceConfig { - NamespaceConfig { - db_kind: DatabaseKind::Primary, + fn make_config(migration_scheduler: SchedulerHandle, path: &Path) -> NamespaceConfigurators { + let mut configurators = NamespaceConfigurators::empty(); + let base_config = BaseNamespaceConfig { base_path: path.to_path_buf().into(), - max_log_size: 1000000000, - max_log_duration: None, extensions: Arc::new([]), stats_sender: tokio::sync::mpsc::channel(1).0, max_response_size: 100000000000000, 
max_total_response_size: 100000000000, - checkpoint_interval: None, max_concurrent_connections: Arc::new(Semaphore::new(10)), max_concurrent_requests: 10000, encryption_config: None, - channel: None, - uri: None, + }; + + let primary_config = PrimaryExtraConfig { + max_log_size: 1000000000, + max_log_duration: None, bottomless_replication: None, scripted_backup: None, + checkpoint_interval: None, + }; + + let make_wal_manager = Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())); + + configurators.with_schema(SchemaConfigurator::new( + base_config.clone(), + primary_config.clone(), + make_wal_manager.clone(), migration_scheduler, - make_wal_manager: Arc::new(|| EitherWAL::A(Sqlite3WalManager::default())), - } + )); + configurators.with_primary(PrimaryConfigurator::new( + base_config, + primary_config, + make_wal_manager.clone(), + )); + + configurators } #[tokio::test] @@ -936,9 +956,10 @@ mod test { .unwrap(); let (sender, mut receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let mut scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); @@ -1012,9 +1033,10 @@ mod test { .unwrap(); let (sender, _receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); store .with("ns".into(), |ns| { @@ -1039,9 +1061,10 @@ mod test { .unwrap(); let (sender, mut receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + 
NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let mut scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); @@ -1112,9 +1135,10 @@ mod test { .unwrap(); let (sender, _receiver) = mpsc::channel(100); let config = make_config(sender.clone().into(), tmp.path()); - let store = NamespaceStore::new(false, false, 10, config, meta_store) - .await - .unwrap(); + let store = + NamespaceStore::new(false, false, 10, meta_store, config, DatabaseKind::Primary) + .await + .unwrap(); let scheduler = Scheduler::new(store.clone(), maker().unwrap()) .await .unwrap(); diff --git a/libsql-server/tests/embedded_replica/mod.rs b/libsql-server/tests/embedded_replica/mod.rs index 78a46ad996..2d5f8c0de0 100644 --- a/libsql-server/tests/embedded_replica/mod.rs +++ b/libsql-server/tests/embedded_replica/mod.rs @@ -179,6 +179,8 @@ fn execute_batch() { conn.execute("CREATE TABLE user (id INTEGER NOT NULL PRIMARY KEY)", ()) .await?; + assert_eq!(db.max_write_replication_index(), Some(1)); + let n = db.sync().await?.frame_no(); assert_eq!(n, Some(1)); @@ -231,6 +233,7 @@ fn stream() { conn.execute("CREATE TABLE user (id INTEGER NOT NULL PRIMARY KEY)", ()) .await?; + assert_eq!(db.max_write_replication_index(), Some(1)); let n = db.sync().await?.frame_no(); assert_eq!(n, Some(1)); @@ -244,8 +247,10 @@ fn stream() { ", ) .await?; + let replication_index = db.max_write_replication_index(); - db.sync().await.unwrap(); + let synced_replication_index = db.sync().await.unwrap().frame_no(); + assert_eq!(synced_replication_index, replication_index); let rows = conn.query("select * from user", ()).await.unwrap(); diff --git a/libsql-sqlite3/Makefile.in b/libsql-sqlite3/Makefile.in index 4520dda0d2..7316257fa4 100644 --- a/libsql-sqlite3/Makefile.in +++ b/libsql-sqlite3/Makefile.in @@ -195,7 +195,7 @@ LIBOBJS0 = alter.lo analyze.lo attach.lo auth.lo \ sqlite3session.lo select.lo sqlite3rbu.lo status.lo stmt.lo \ table.lo 
threads.lo tokenize.lo treeview.lo trigger.lo \ update.lo userauth.lo upsert.lo util.lo vacuum.lo \ - vector.lo vectorfloat32.lo vectorfloat64.lo \ + vector.lo vectorfloat32.lo vectorfloat64.lo vectorfloat1bit.lo vectorfloat8.lo \ vectorIndex.lo vectordiskann.lo vectorvtab.lo \ vdbe.lo vdbeapi.lo vdbeaux.lo vdbeblob.lo vdbemem.lo vdbesort.lo \ vdbetrace.lo vdbevtab.lo \ @@ -303,8 +303,10 @@ SRC = \ $(TOP)/src/vacuum.c \ $(TOP)/src/vector.c \ $(TOP)/src/vectorInt.h \ + $(TOP)/src/vectorfloat1bit.c \ $(TOP)/src/vectorfloat32.c \ $(TOP)/src/vectorfloat64.c \ + $(TOP)/src/vectorfloat8.c \ $(TOP)/src/vectorIndexInt.h \ $(TOP)/src/vectorIndex.c \ $(TOP)/src/vectordiskann.c \ @@ -1138,12 +1140,18 @@ vacuum.lo: $(TOP)/src/vacuum.c $(HDR) vector.lo: $(TOP)/src/vector.c $(HDR) $(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vector.c +vectorfloat1bit.lo: $(TOP)/src/vectorfloat1bit.c $(HDR) + $(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorfloat1bit.c + vectorfloat32.lo: $(TOP)/src/vectorfloat32.c $(HDR) $(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorfloat32.c vectorfloat64.lo: $(TOP)/src/vectorfloat64.c $(HDR) $(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorfloat64.c +vectorfloat8.lo: $(TOP)/src/vectorfloat8.c $(HDR) + $(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorfloat8.c + vectorIndex.lo: $(TOP)/src/vectorIndex.c $(HDR) $(LTCOMPILE) $(TEMP_STORE) -c $(TOP)/src/vectorIndex.c diff --git a/libsql-sqlite3/benchmark/workload.py b/libsql-sqlite3/benchmark/workload.py index 2d413531fa..728e375933 100644 --- a/libsql-sqlite3/benchmark/workload.py +++ b/libsql-sqlite3/benchmark/workload.py @@ -10,10 +10,10 @@ def recall_uniform(dim, n, q): print(f'CREATE TABLE queries ( emb FLOAT32({dim}) );') print(f'BEGIN TRANSACTION;') for i in range(n): - vector = f"[{','.join(map(str, np.random.uniform(size=dim)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=dim)))}]" print(f'INSERT INTO data VALUES ({i}, vector(\'{vector}\'));') for i in range(q): - vector = f"[{','.join(map(str, 
np.random.uniform(size=dim)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=dim)))}]" print(f'INSERT INTO queries VALUES (vector(\'{vector}\'));') print(f'COMMIT;') print('---insert everything') @@ -29,7 +29,7 @@ def recall_normal(dim, n, q): vector = f"[{','.join(map(str, np.random.uniform(size=64)))}]" print(f'INSERT INTO data VALUES ({i}, \'{vector}\');') for i in range(q): - vector = f"[{','.join(map(str, np.random.uniform(size=64)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=64)))}]" print(f'INSERT INTO queries VALUES (\'{vector}\');') print(f'COMMIT;') print('---insert everything') @@ -40,7 +40,7 @@ def no_vectors(n, q): print('PRAGMA journal_mode=WAL;') print(f'CREATE TABLE x ( id INTEGER PRIMARY KEY, value TEXT );') for i in range(n): - vector = f"[{','.join(map(str, np.random.uniform(size=64)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=64)))}]" print(f'INSERT INTO x VALUES ({i}, \'{vector}\');') print('---inserts') for i in range(q): @@ -54,11 +54,11 @@ def bruteforce(dim, n, q): print('PRAGMA journal_mode=WAL;') print(f'CREATE TABLE x ( id INTEGER PRIMARY KEY, embedding FLOAT32({dim}) );') for i in range(n): - vector = f"[{','.join(map(str, np.random.uniform(size=dim)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=dim)))}]" print(f'INSERT INTO x VALUES ({i}, vector(\'{vector}\'));') print('---inserts') for i in range(q): - vector = f"[{','.join(map(str, np.random.uniform(size=dim)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=dim)))}]" print(f'SELECT id FROM x ORDER BY vector_distance_cos(embedding, vector(\'{vector}\')) LIMIT 1;') print('---search') @@ -68,13 +68,13 @@ def diskann(dim, n, q): q = int(q) print('PRAGMA journal_mode=WAL;') print(f'CREATE TABLE x ( id INTEGER PRIMARY KEY, embedding FLOAT32({dim}) );') - print(f'CREATE INDEX x_idx ON x( libsql_vector_idx(embedding) );') + print(f"CREATE INDEX x_idx ON x( libsql_vector_idx(embedding) 
);") for i in range(n): - vector = f"[{','.join(map(str, np.random.uniform(size=dim)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=dim)))}]" print(f'INSERT INTO x VALUES ({i}, vector(\'{vector}\'));') print('---inserts') for i in range(q): - vector = f"[{','.join(map(str, np.random.uniform(size=dim)))}]" + vector = f"[{','.join(map(str, np.random.uniform(-1, 1, size=dim)))}]" print(f'SELECT id FROM vector_top_k(\'x_idx\', vector(\'{vector}\'), 1);') print('---search') diff --git a/libsql-sqlite3/ext/fts5/fts5_tokenize.c b/libsql-sqlite3/ext/fts5/fts5_tokenize.c index f12056170f..7e239b6ca5 100644 --- a/libsql-sqlite3/ext/fts5/fts5_tokenize.c +++ b/libsql-sqlite3/ext/fts5/fts5_tokenize.c @@ -1290,40 +1290,46 @@ static int fts5TriCreate( Fts5Tokenizer **ppOut ){ int rc = SQLITE_OK; - TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew)); - UNUSED_PARAM(pUnused); - if( pNew==0 ){ - rc = SQLITE_NOMEM; + TrigramTokenizer *pNew = 0; + + if( nArg%2 ){ + rc = SQLITE_ERROR; }else{ - int i; - pNew->bFold = 1; - pNew->iFoldParam = 0; - for(i=0; rc==SQLITE_OK && ibFold = 1; + pNew->iFoldParam = 0; + for(i=0; rc==SQLITE_OK && ibFold = (zArg[0]=='0'); + } + }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ + if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ + rc = SQLITE_ERROR; + }else{ + pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0; + } }else{ - pNew->bFold = (zArg[0]=='0'); - } - }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ - if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){ rc = SQLITE_ERROR; - }else{ - pNew->iFoldParam = (zArg[0]!='0') ? 
2 : 0; } - }else{ - rc = SQLITE_ERROR; } - } - if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ - rc = SQLITE_ERROR; - } + if( pNew->iFoldParam!=0 && pNew->bFold==0 ){ + rc = SQLITE_ERROR; + } - if( rc!=SQLITE_OK ){ - fts5TriDelete((Fts5Tokenizer*)pNew); - pNew = 0; + if( rc!=SQLITE_OK ){ + fts5TriDelete((Fts5Tokenizer*)pNew); + pNew = 0; + } } } *ppOut = (Fts5Tokenizer*)pNew; diff --git a/libsql-sqlite3/src/build.c b/libsql-sqlite3/src/build.c index 4396f2fae7..d6faa6b071 100644 --- a/libsql-sqlite3/src/build.c +++ b/libsql-sqlite3/src/build.c @@ -833,7 +833,7 @@ static void SQLITE_NOINLINE deleteTable(sqlite3 *db, Table *pTable){ for(pIndex = pTable->pIndex; pIndex; pIndex=pNext){ pNext = pIndex->pNext; assert( pIndex->pSchema==pTable->pSchema - || (IsVirtual(pTable) && !IsAppDefIndex(pIndex)) ); + || (IsVirtual(pTable) && pIndex->idxType!=SQLITE_IDXTYPE_APPDEF) ); if( db->pnBytesFreed==0 && !IsVirtual(pTable) ){ char *zName = pIndex->zName; TESTONLY ( Index *pOld = ) sqlite3HashInsert( @@ -4345,13 +4345,12 @@ void sqlite3CreateIndex( goto exit_create_index; } if( vectorIdxRc >= 1 ){ - idxType = SQLITE_IDXTYPE_VECTOR; /* * SQLite can use B-Tree indices in some optimizations (like SELECT COUNT(*) can use any full B-Tree index instead of PK index) * But, SQLite pretty conservative about usage of unordered indices - that's what we need here */ pIndex->bUnordered = 1; - pIndex->idxType = idxType; + pIndex->idxIsVector = 1; } if( vectorIdxRc == 1 ){ skipRefill = 1; @@ -4399,7 +4398,7 @@ void sqlite3CreateIndex( for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){ int k; assert( IsUniqueIndex(pIdx) ); - assert( !IsAppDefIndex(pIdx) ); + assert( pIdx->idxType!=SQLITE_IDXTYPE_APPDEF ); assert( IsUniqueIndex(pIndex) ); if( pIdx->nKeyCol!=pIndex->nKeyCol ) continue; @@ -4680,7 +4679,7 @@ void sqlite3DropIndex(Parse *pParse, SrcList *pName, int ifExists){ pParse->checkSchema = 1; goto exit_drop_index; } - if( !IsAppDefIndex(pIndex) ){ + if( pIndex->idxType!=SQLITE_IDXTYPE_APPDEF ){ 
sqlite3ErrorMsg(pParse, "index associated with UNIQUE " "or PRIMARY KEY constraint cannot be dropped", 0); goto exit_drop_index; diff --git a/libsql-sqlite3/src/parse.y b/libsql-sqlite3/src/parse.y index 41e08ad6d6..f866ec5d2c 100644 --- a/libsql-sqlite3/src/parse.y +++ b/libsql-sqlite3/src/parse.y @@ -1451,9 +1451,6 @@ paren_exprlist(A) ::= LP exprlist(X) RP. {A = X;} cmd ::= createkw(S) uniqueflag(U) INDEX ifnotexists(NE) nm(X) dbnm(D) indextype(T) ON nm(Y) LP sortlist(Z) RP where_opt(W). { u8 idxType = SQLITE_IDXTYPE_APPDEF; - if( T.pUsing!=0 ){ - idxType = SQLITE_IDXTYPE_VECTOR; - } sqlite3CreateIndex(pParse, &X, &D, sqlite3SrcListAppend(pParse,0,&Y,0), Z, U, &S, W, SQLITE_SO_ASC, NE, idxType, T.pUsing); diff --git a/libsql-sqlite3/src/sqliteInt.h b/libsql-sqlite3/src/sqliteInt.h index e2fd32d3c4..0a9dd98d66 100644 --- a/libsql-sqlite3/src/sqliteInt.h +++ b/libsql-sqlite3/src/sqliteInt.h @@ -2799,7 +2799,8 @@ struct Index { u16 nKeyCol; /* Number of columns forming the key */ u16 nColumn; /* Number of columns stored in the index */ u8 onError; /* OE_Abort, OE_Ignore, OE_Replace, or OE_None */ - unsigned idxType:3; /* 0:Normal 1:UNIQUE, 2:PRIMARY KEY, 3:IPK, 4:VECTOR INDEX */ + unsigned idxType:2; /* 0:Normal 1:UNIQUE, 2:PRIMARY KEY, 3:IPK */ + unsigned idxIsVector:1; /* 0:Normal 1:VECTOR INDEX */ unsigned bUnordered:1; /* Use this index for == or IN queries only */ unsigned uniqNotNull:1; /* True if UNIQUE and NOT NULL for all columns */ unsigned isResized:1; /* True if resizeIndexObject() has been called */ @@ -2831,7 +2832,6 @@ struct Index { #define SQLITE_IDXTYPE_UNIQUE 1 /* Implements a UNIQUE constraint */ #define SQLITE_IDXTYPE_PRIMARYKEY 2 /* Is the PRIMARY KEY for the table */ #define SQLITE_IDXTYPE_IPK 3 /* INTEGER PRIMARY KEY index */ -#define SQLITE_IDXTYPE_VECTOR 4 /* libSQL vector index */ /* Return true if index X is a PRIMARY KEY index */ #define IsPrimaryKeyIndex(X) ((X)->idxType==SQLITE_IDXTYPE_PRIMARYKEY) @@ -2840,10 +2840,7 @@ struct Index { 
#define IsUniqueIndex(X) ((X)->onError!=OE_None) /* Return true if index X is a vector index */ -#define IsVectorIndex(X) ((X)->idxType==SQLITE_IDXTYPE_VECTOR) - -/* Return true if index X is an user defined index (APPDEF or VECTOR) */ -#define IsAppDefIndex(X) ((X)->idxType==SQLITE_IDXTYPE_APPDEF||(X)->idxType==SQLITE_IDXTYPE_VECTOR) +#define IsVectorIndex(X) ((X)->idxIsVector==1) /* The Index.aiColumn[] values are normally positive integer. But ** there are some negative values that have special meaning: diff --git a/libsql-sqlite3/src/vacuum.c b/libsql-sqlite3/src/vacuum.c index c0ae4bc1e1..f8e848aca6 100644 --- a/libsql-sqlite3/src/vacuum.c +++ b/libsql-sqlite3/src/vacuum.c @@ -17,6 +17,10 @@ #include "sqliteInt.h" #include "vdbeInt.h" +#ifndef SQLITE_OMIT_VECTOR +#include "vectorIndexInt.h" +#endif + #if !defined(SQLITE_OMIT_VACUUM) && !defined(SQLITE_OMIT_ATTACH) /* @@ -294,6 +298,27 @@ SQLITE_NOINLINE int sqlite3RunVacuum( if( rc!=SQLITE_OK ) goto end_of_vacuum; db->init.iDb = 0; +#ifndef SQLITE_OMIT_VECTOR + // shadow tables for vector index will be populated automatically during CREATE INDEX command + // so we must skip them at this step + if( sqlite3FindTable(db, VECTOR_INDEX_GLOBAL_META_TABLE, zDbMain) != NULL ){ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0 AND name NOT IN (SELECT name||'_shadow' FROM " VECTOR_INDEX_GLOBAL_META_TABLE ")", + zDbMain + ); + }else{ + rc = execSqlF(db, pzErrMsg, + "SELECT'INSERT INTO vacuum_db.'||quote(name)" + "||' SELECT*FROM\"%w\".'||quote(name)" + "FROM vacuum_db.sqlite_schema " + "WHERE type='table'AND coalesce(rootpage,1)>0", + zDbMain + ); + } +#else /* Loop through the tables in the main database. For each, do ** an "INSERT INTO vacuum_db.xxx SELECT * FROM main.xxx;" to copy ** the contents to the temporary database. 
@@ -305,6 +330,7 @@ SQLITE_NOINLINE int sqlite3RunVacuum( "WHERE type='table'AND coalesce(rootpage,1)>0", zDbMain ); +#endif assert( (db->mDbFlags & DBFLAG_Vacuum)!=0 ); db->mDbFlags &= ~DBFLAG_Vacuum; if( rc!=SQLITE_OK ) goto end_of_vacuum; diff --git a/libsql-sqlite3/src/vector.c b/libsql-sqlite3/src/vector.c index d32819cd00..8ec485c472 100644 --- a/libsql-sqlite3/src/vector.c +++ b/libsql-sqlite3/src/vector.c @@ -41,6 +41,10 @@ size_t vectorDataSize(VectorType type, VectorDims dims){ return dims * sizeof(float); case VECTOR_TYPE_FLOAT64: return dims * sizeof(double); + case VECTOR_TYPE_FLOAT1BIT: + return (dims + 7) / 8; + case VECTOR_TYPE_FLOAT8: + return ALIGN(dims, sizeof(float)) + sizeof(float) /* alpha */ + sizeof(float) /* shift */; default: assert(0); } @@ -72,10 +76,11 @@ Vector *vectorAlloc(VectorType type, VectorDims dims){ ** Note that the vector object points to the blob so if ** you free the blob, the vector becomes invalid. **/ -void vectorInitStatic(Vector *pVector, VectorType type, const unsigned char *pBlob, size_t nBlobSize){ - pVector->type = type; +void vectorInitStatic(Vector *pVector, VectorType type, VectorDims dims, void *pBlob){ pVector->flags = VECTOR_FLAGS_STATIC; - vectorInitFromBlob(pVector, pBlob, nBlobSize); + pVector->type = type; + pVector->dims = dims; + pVector->data = pBlob; } /* @@ -111,6 +116,10 @@ float vectorDistanceCos(const Vector *pVector1, const Vector *pVector2){ return vectorF32DistanceCos(pVector1, pVector2); case VECTOR_TYPE_FLOAT64: return vectorF64DistanceCos(pVector1, pVector2); + case VECTOR_TYPE_FLOAT1BIT: + return vector1BitDistanceHamming(pVector1, pVector2); + case VECTOR_TYPE_FLOAT8: + return vectorF8DistanceCos(pVector1, pVector2); default: assert(0); } @@ -124,6 +133,8 @@ float vectorDistanceL2(const Vector *pVector1, const Vector *pVector2){ return vectorF32DistanceL2(pVector1, pVector2); case VECTOR_TYPE_FLOAT64: return vectorF64DistanceL2(pVector1, pVector2); + case VECTOR_TYPE_FLOAT8: + return 
vectorF8DistanceL2(pVector1, pVector2); default: assert(0); } @@ -247,16 +258,97 @@ static int vectorParseSqliteText( return -1; } -int vectorParseSqliteBlob( +static int vectorParseMeta(const unsigned char *pBlob, size_t nBlobSize, int *pType, int *pDims, size_t *pDataSize, char **pzErrMsg){ + int nTrailingBits; + int nTrailingBytes; + + if( nBlobSize % 2 == 0 ){ + *pType = VECTOR_TYPE_FLOAT32; + *pDims = nBlobSize / sizeof(float); + *pDataSize = nBlobSize; + return SQLITE_OK; + } + *pType = pBlob[nBlobSize - 1]; + nBlobSize--; + + if( *pType == VECTOR_TYPE_FLOAT32 ){ + if( nBlobSize % 4 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float32 vector blob length must be divisible by 4 (excluding optional 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + *pDims = nBlobSize / sizeof(float); + *pDataSize = nBlobSize; + }else if( *pType == VECTOR_TYPE_FLOAT64 ){ + if( nBlobSize % 8 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float64 vector blob length must be divisible by 8 (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + *pDims = nBlobSize / sizeof(double); + *pDataSize = nBlobSize; + }else if( *pType == VECTOR_TYPE_FLOAT1BIT ){ + if( nBlobSize == 0 || nBlobSize % 2 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float1bit vector blob length must be divisible by 2 and not be empty (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + nTrailingBits = pBlob[nBlobSize - 1]; + *pDims = nBlobSize * 8 - nTrailingBits; + *pDataSize = (*pDims + 7) / 8; + }else if( *pType == VECTOR_TYPE_FLOAT8 ){ + if( nBlobSize < 2 || nBlobSize % 2 != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector: float8 vector blob length must be divisible by 2 and has at least 2 bytes (excluding 'type'-byte): length=%d", nBlobSize); + return SQLITE_ERROR; + } + nTrailingBytes = pBlob[nBlobSize - 1]; + *pDims = (nBlobSize - 2) - sizeof(float) - sizeof(float) - nTrailingBytes; + *pDataSize = nBlobSize - 2; + }else{ + *pzErrMsg = 
sqlite3_mprintf("vector: unexpected binary type: %d", *pType); + return SQLITE_ERROR; + } + return SQLITE_OK; +} + +int vectorParseSqliteBlobWithType( sqlite3_value *arg, Vector *pVector, char **pzErrMsg ){ + const unsigned char *pBlob; + size_t nBlobSize, nDataSize; + int type, dims; + + assert( sqlite3_value_type(arg) == SQLITE_BLOB ); + + pBlob = sqlite3_value_blob(arg); + nBlobSize = sqlite3_value_bytes(arg); + if( vectorParseMeta(pBlob, nBlobSize, &type, &dims, &nDataSize, pzErrMsg) != SQLITE_OK ){ + return SQLITE_ERROR; + } + + if( nDataSize != vectorDataSize(pVector->type, pVector->dims) ){ + *pzErrMsg = sqlite3_mprintf( + "vector: unexpected data part size: type=%d, dims=%d, %u != %u", + pVector->type, + pVector->dims, + nDataSize, + vectorDataSize(pVector->type, pVector->dims) + ); + return SQLITE_ERROR; + } + switch (pVector->type) { case VECTOR_TYPE_FLOAT32: - return vectorF32ParseSqliteBlob(arg, pVector, pzErrMsg); + vectorF32DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; case VECTOR_TYPE_FLOAT64: - return vectorF64ParseSqliteBlob(arg, pVector, pzErrMsg); + vectorF64DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitDeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; + case VECTOR_TYPE_FLOAT8: + vectorF8DeserializeFromBlob(pVector, pBlob, nDataSize); + return 0; default: assert(0); } @@ -265,32 +357,21 @@ int vectorParseSqliteBlob( int detectBlobVectorParameters(sqlite3_value *arg, int *pType, int *pDims, char **pzErrMsg) { const u8 *pBlob; - int nBlobSize; + size_t nBlobSize, nDataSize; assert( sqlite3_value_type(arg) == SQLITE_BLOB ); pBlob = sqlite3_value_blob(arg); nBlobSize = sqlite3_value_bytes(arg); - if( nBlobSize % 2 != 0 ){ - // we have trailing byte with explicit type definition - *pType = pBlob[nBlobSize - 1]; - } else { - // else, fallback to FLOAT32 - *pType = VECTOR_TYPE_FLOAT32; - } - if( *pType == VECTOR_TYPE_FLOAT32 ){ - *pDims = nBlobSize / sizeof(float); - } else 
if( *pType == VECTOR_TYPE_FLOAT64 ){ - *pDims = nBlobSize / sizeof(double); - } else{ - *pzErrMsg = sqlite3_mprintf("vector: unexpected binary type: got %d, expected %d or %d", *pType, VECTOR_TYPE_FLOAT32, VECTOR_TYPE_FLOAT64); - return -1; + + if( vectorParseMeta(pBlob, nBlobSize, pType, pDims, &nDataSize, pzErrMsg) != SQLITE_OK ){ + return SQLITE_ERROR; } if( *pDims > MAX_VECTOR_SZ ){ *pzErrMsg = sqlite3_mprintf("vector: max size exceeded: %d > %d", *pDims, MAX_VECTOR_SZ); - return -1; + return SQLITE_ERROR; } - return 0; + return SQLITE_OK; } int detectTextVectorParameters(sqlite3_value *arg, int typeHint, int *pType, int *pDims, char **pzErrMsg) { @@ -339,14 +420,14 @@ int detectVectorParameters(sqlite3_value *arg, int typeHint, int *pType, int *pD } } -int vectorParse( +int vectorParseWithType( sqlite3_value *arg, Vector *pVector, char **pzErrMsg ){ switch( sqlite3_value_type(arg) ){ case SQLITE_BLOB: - return vectorParseSqliteBlob(arg, pVector, pzErrMsg); + return vectorParseSqliteBlobWithType(arg, pVector, pzErrMsg); case SQLITE_TEXT: return vectorParseSqliteText(arg, pVector, pzErrMsg); default: @@ -363,6 +444,12 @@ void vectorDump(const Vector *pVector){ case VECTOR_TYPE_FLOAT64: vectorF64Dump(pVector); break; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitDump(pVector); + break; + case VECTOR_TYPE_FLOAT8: + vectorF8Dump(pVector); + break; default: assert(0); } @@ -384,56 +471,326 @@ void vectorMarshalToText( } } -void vectorSerialize( +static int vectorMetaSize(VectorType type, VectorDims dims){ + int nDataSize; + if( type == VECTOR_TYPE_FLOAT32 ){ + return 0; + }else if( type == VECTOR_TYPE_FLOAT64 ){ + return 1; + }else if( type == VECTOR_TYPE_FLOAT1BIT ){ + nDataSize = vectorDataSize(type, dims); + // optional padding byte + "trailing-bits" byte + "vector-type" byte + return (nDataSize % 2 == 0 ? 
1 : 0) + 1 + 1; + }else if( type == VECTOR_TYPE_FLOAT8 ){ + nDataSize = vectorDataSize(type, dims); + assert( nDataSize % 2 == 0 ); + /* padding byte + "trailing-bytes" byte + "vector-type" byte */ + return 1 + 1 + 1; + }else{ + assert( 0 ); + } +} + +static void vectorSerializeMeta(const Vector *pVector, size_t nDataSize, unsigned char *pBlob, size_t nBlobSize){ + if( pVector->type == VECTOR_TYPE_FLOAT32 ){ + // no meta for f32 type as this is "default" vector type + }else if( pVector->type == VECTOR_TYPE_FLOAT64 ){ + assert( nDataSize % 2 == 0 ); + assert( nBlobSize == nDataSize + 1 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT64; + }else if( pVector->type == VECTOR_TYPE_FLOAT1BIT ){ + assert( nBlobSize % 2 == 1 ); + assert( nBlobSize >= 3 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT1BIT; + pBlob[nBlobSize - 2] = 8 * (nBlobSize - 1) - pVector->dims; + if( vectorMetaSize(pVector->type, pVector->dims) == 3 ){ + pBlob[nBlobSize - 3] = 0; + } + }else if( pVector->type == VECTOR_TYPE_FLOAT8 ){ + assert( nBlobSize % 2 == 1 ); + assert( nDataSize % 2 == 0 ); + assert( nBlobSize == nDataSize + 3 ); + pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT8; + pBlob[nBlobSize - 2] = ALIGN(pVector->dims, sizeof(float)) - pVector->dims; + }else{ + assert( 0 ); + } +} + +void vectorSerializeWithMeta( sqlite3_context *context, const Vector *pVector ){ + unsigned char *pBlob; + size_t nBlobSize, nDataSize, nMetaSize; + + assert( pVector->dims <= MAX_VECTOR_SZ ); + + nDataSize = vectorDataSize(pVector->type, pVector->dims); + nMetaSize = vectorMetaSize(pVector->type, pVector->dims); + nBlobSize = nDataSize + nMetaSize; + if( nBlobSize == 0 ){ + sqlite3_result_zeroblob(context, 0); + return; + } + + pBlob = sqlite3_malloc64(nBlobSize); + if( pBlob == NULL ){ + sqlite3_result_error_nomem(context); + return; + } + + vectorSerializeToBlob(pVector, pBlob, nDataSize); + vectorSerializeMeta(pVector, nDataSize, pBlob, nBlobSize); + sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); 
+} + +void vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t nBlobSize){ switch (pVector->type) { case VECTOR_TYPE_FLOAT32: - vectorF32Serialize(context, pVector); + vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); break; case VECTOR_TYPE_FLOAT64: - vectorF64Serialize(context, pVector); + vectorF64SerializeToBlob(pVector, pBlob, nBlobSize); + break; + case VECTOR_TYPE_FLOAT1BIT: + vector1BitSerializeToBlob(pVector, pBlob, nBlobSize); + break; + case VECTOR_TYPE_FLOAT8: + vectorF8SerializeToBlob(pVector, pBlob, nBlobSize); break; default: assert(0); } } -size_t vectorSerializeToBlob(const Vector *pVector, unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - return vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); - case VECTOR_TYPE_FLOAT64: - return vectorF64SerializeToBlob(pVector, pBlob, nBlobSize); - default: - assert(0); +void vectorInitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ + pVector->data = (void*)pBlob; +} + +static void vectorConvertFromF32(const Vector *pFrom, Vector *pTo){ + int i; + float *src; + + u8 *dst1Bit; + double *dstF64; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT32 ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF64[i] = src[i]; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( src[i] > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); } - return 0; } -size_t vectorDeserializeFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - return vectorF32DeserializeFromBlob(pVector, pBlob, nBlobSize); - case VECTOR_TYPE_FLOAT64: - return 
vectorF64DeserializeFromBlob(pVector, pBlob, nBlobSize); - default: - assert(0); +static void vectorConvertFromF64(const Vector *pFrom, Vector *pTo){ + int i; + double *src; + + u8 *dst1Bit; + float *dstF32; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT64 ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF32[i] = src[i]; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( src[i] > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); } - return 0; } -void vectorInitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - switch (pVector->type) { - case VECTOR_TYPE_FLOAT32: - vectorF32InitFromBlob(pVector, pBlob, nBlobSize); - break; - case VECTOR_TYPE_FLOAT64: - vectorF64InitFromBlob(pVector, pBlob, nBlobSize); - break; - default: - assert(0); +static void vectorConvertFrom1Bit(const Vector *pFrom, Vector *pTo){ + int i; + u8 *src; + + float *dstF32; + double *dstF64; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT1BIT ); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + if( ((src[i / 8] >> (i & 7)) & 1) == 1 ){ + dstF32[i] = +1; + }else{ + dstF32[i] = -1; + } + } + }else if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + if( ((src[i / 8] >> (i & 7)) & 1) == 1 ){ + dstF64[i] = +1; + }else{ + dstF64[i] = -1; + } + } + }else{ + assert( 0 ); + } +} + +static void vectorConvertFromF8(const Vector *pFrom, Vector *pTo){ + int i; + u8 *src; + float alpha, shift; + + float *dstF32; + double *dstF64; + u8 *dst1Bit; + + assert( pFrom->dims == pTo->dims ); + 
assert( pFrom->type != pTo->type ); + assert( pFrom->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(pFrom->data, pFrom->dims, &alpha, &shift); + + src = pFrom->data; + if( pTo->type == VECTOR_TYPE_FLOAT32 ){ + dstF32 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF32[i] = alpha * src[i] + shift; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT64 ){ + dstF64 = pTo->data; + for(i = 0; i < pFrom->dims; i++){ + dstF64[i] = alpha * src[i] + shift; + } + }else if( pTo->type == VECTOR_TYPE_FLOAT1BIT ){ + dst1Bit = pTo->data; + for(i = 0; i < pFrom->dims; i += 8){ + dst1Bit[i / 8] = 0; + } + for(i = 0; i < pFrom->dims; i++){ + if( (alpha * src[i] + shift) > 0 ){ + dst1Bit[i / 8] |= (1 << (i & 7)); + } + } + }else{ + assert( 0 ); + } +} + +static inline int clip(float f, int minF, int maxF){ + if( f < minF ){ + return minF; + }else if( f > maxF ){ + return maxF; + } + return (int)(f + 0.5); +} + +#define MINMAX(i, value, minValue, maxValue) {if(i == 0){ minValue = (value); maxValue = (value);} else { minValue = MIN(minValue, (value)); maxValue = MAX(maxValue, (value)); }} + +static void vectorConvertToF8(const Vector *pFrom, Vector *pTo){ + int i; + u8 *dst; + float alpha, shift; + float minF = 0, maxF = 0; + + float *srcF32; + double *srcF64; + u8 *src1Bit; + + assert( pFrom->dims == pTo->dims ); + assert( pFrom->type != pTo->type ); + assert( pTo->type == VECTOR_TYPE_FLOAT8 ); + + dst = pTo->data; + if( pFrom->type == VECTOR_TYPE_FLOAT32 ){ + srcF32 = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, srcF32[i], minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip((srcF32[i] - shift) / alpha, 0, 255); + } + }else if( pFrom->type == VECTOR_TYPE_FLOAT64 ){ + srcF64 = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, srcF64[i], minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip((srcF64[i] - shift) / alpha, 0, 255); 
+ } + }else if( pFrom->type == VECTOR_TYPE_FLOAT1BIT ){ + src1Bit = pFrom->data; + for(i = 0; i < pFrom->dims; i++){ + MINMAX(i, ((src1Bit[i / 8] >> (i & 7)) & 1) ? +1 : -1, minF, maxF); + } + shift = minF; + alpha = (maxF - minF) / 255; + for(i = 0; i < pFrom->dims; i++){ + dst[i] = clip(((((src1Bit[i / 8] >> (i & 7)) & 1) ? +1 : -1) - shift) / alpha, 0, 255); + } + }else{ + assert( 0 ); + } + vectorF8SetParameters(pTo->data, pTo->dims, alpha, shift); +} + + +void vectorConvert(const Vector *pFrom, Vector *pTo){ + assert( pFrom->dims == pTo->dims ); + + if( pFrom->type == pTo->type ){ + memcpy(pTo->data, pFrom->data, vectorDataSize(pFrom->type, pFrom->dims)); + return; + } + + if( pTo->type == VECTOR_TYPE_FLOAT8 ){ + vectorConvertToF8(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT32 ){ + vectorConvertFromF32(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT64 ){ + vectorConvertFromF64(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT1BIT ){ + vectorConvertFrom1Bit(pFrom, pTo); + }else if( pFrom->type == VECTOR_TYPE_FLOAT8 ){ + vectorConvertFromF8(pFrom, pTo); + }else{ + assert( 0 ); } } @@ -448,31 +805,49 @@ static void vectorFuncHintedType( sqlite3_context *context, int argc, sqlite3_value **argv, - int typeHint + int targetType ){ char *pzErrMsg = NULL; - Vector *pVector; - int type, dims; + Vector *pVector = NULL, *pTarget = NULL; + int type, dims, typeHint = VECTOR_TYPE_FLOAT32; if( argc < 1 ){ - return; + goto out; + } + // simplification in order to support only parsing from text to f32 and f64 vectors + if( targetType == VECTOR_TYPE_FLOAT64 ){ + typeHint = targetType; } if( detectVectorParameters(argv[0], typeHint, &type, &dims, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - return; + goto out; } pVector = vectorContextAlloc(context, type, dims); - if( pVector==NULL ){ - return; + if( pVector == NULL ){ + goto out; } - if( vectorParse(argv[0], pVector, &pzErrMsg) != 0 ){ + if( 
vectorParseWithType(argv[0], pVector, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - goto out_free_vec; + goto out; + } + if( type == targetType ){ + vectorSerializeWithMeta(context, pVector); + }else{ + pTarget = vectorContextAlloc(context, targetType, dims); + if( pTarget == NULL ){ + goto out; + } + vectorConvert(pVector, pTarget); + vectorSerializeWithMeta(context, pTarget); + } +out: + if( pVector != NULL ){ + vectorFree(pVector); + } + if( pTarget != NULL ){ + vectorFree(pTarget); } - vectorSerialize(context, pVector); -out_free_vec: - vectorFree(pVector); } static void vector32Func( @@ -490,6 +865,22 @@ static void vector64Func( vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT64); } +static void vector8Func( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT8); +} + +static void vector1BitFunc( + sqlite3_context *context, + int argc, + sqlite3_value **argv +){ + vectorFuncHintedType(context, argc, argv, VECTOR_TYPE_FLOAT1BIT); +} + /* ** Implementation of vector_extract(X) function. 
*/ @@ -499,39 +890,51 @@ static void vectorExtractFunc( sqlite3_value **argv ){ char *pzErrMsg = NULL; - Vector *pVector; + Vector *pVector = NULL, *pTarget = NULL; unsigned i; int type, dims; if( argc < 1 ){ - return; + goto out; } if( detectVectorParameters(argv[0], 0, &type, &dims, &pzErrMsg) != 0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - return; + goto out; } pVector = vectorContextAlloc(context, type, dims); - if( pVector==NULL ){ - return; + if( pVector == NULL ){ + goto out; } - if( vectorParse(argv[0], pVector, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[0], pVector, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); - goto out_free; + goto out; + } + if( pVector->type == VECTOR_TYPE_FLOAT32 || pVector->type == VECTOR_TYPE_FLOAT64 ){ + vectorMarshalToText(context, pVector); + }else{ + pTarget = vectorContextAlloc(context, VECTOR_TYPE_FLOAT32, dims); + if( pTarget == NULL ){ + goto out; + } + vectorConvert(pVector, pTarget); + vectorMarshalToText(context, pTarget); + } +out: + if( pVector != NULL ){ + vectorFree(pVector); + } + if( pTarget != NULL ){ + vectorFree(pTarget); } - vectorMarshalToText(context, pVector); -out_free: - vectorFree(pVector); } -/* -** Implementation of vector_distance_cos(X, Y) function. 
-*/ -static void vectorDistanceCosFunc( +static void vectorDistanceFunc( sqlite3_context *context, int argc, - sqlite3_value **argv + sqlite3_value **argv, + float (*vectorDistance)(const Vector *pVector1, const Vector *pVector2) ){ char *pzErrMsg = NULL; Vector *pVector1 = NULL, *pVector2 = NULL; @@ -551,13 +954,19 @@ static void vectorDistanceCosFunc( goto out_free; } if( type1 != type2 ){ - pzErrMsg = sqlite3_mprintf("vector_distance_cos: vectors must have the same type: %d != %d", type1, type2); + pzErrMsg = sqlite3_mprintf("vector_distance: vectors must have the same type: %d != %d", type1, type2); sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } if( dims1 != dims2 ){ - pzErrMsg = sqlite3_mprintf("vector_distance_cos: vectors must have the same length: %d != %d", dims1, dims2); + pzErrMsg = sqlite3_mprintf("vector_distance: vectors must have the same length: %d != %d", dims1, dims2); + sqlite3_result_error(context, pzErrMsg, -1); + sqlite3_free(pzErrMsg); + goto out_free; + } + if( vectorDistance == vectorDistanceL2 && type1 == VECTOR_TYPE_FLOAT1BIT ){ + pzErrMsg = sqlite3_mprintf("vector_distance: l2 distance is not supported for float1bit vectors", dims1, dims2); sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; @@ -570,17 +979,17 @@ static void vectorDistanceCosFunc( if( pVector2==NULL ){ goto out_free; } - if( vectorParse(argv[0], pVector1, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[0], pVector1, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } - if( vectorParse(argv[1], pVector2, &pzErrMsg)<0 ){ + if( vectorParseWithType(argv[1], pVector2, &pzErrMsg)<0 ){ sqlite3_result_error(context, pzErrMsg, -1); sqlite3_free(pzErrMsg); goto out_free; } - sqlite3_result_double(context, vectorDistanceCos(pVector1, pVector2)); + sqlite3_result_double(context, vectorDistance(pVector1, pVector2)); out_free: if( pVector2 ){ vectorFree(pVector2); @@ 
-590,6 +999,20 @@ static void vectorDistanceCosFunc( } } +/* +** Implementation of vector_distance_cos(X, Y) function. +*/ +static void vectorDistanceCosFunc(sqlite3_context *context, int argc, sqlite3_value **argv){ + vectorDistanceFunc(context, argc, argv, vectorDistanceCos); +} + +/* +** Implementation of vector_distance_l2(X, Y) function. +*/ +static void vectorDistanceL2Func(sqlite3_context *context, int argc, sqlite3_value **argv){ + vectorDistanceFunc(context, argc, argv, vectorDistanceL2); +} + /* * Marker function which is used in index creation syntax: CREATE INDEX idx ON t(libsql_vector_idx(emb)); */ @@ -606,8 +1029,11 @@ void sqlite3RegisterVectorFunctions(void){ FUNCTION(vector, 1, 0, 0, vector32Func), FUNCTION(vector32, 1, 0, 0, vector32Func), FUNCTION(vector64, 1, 0, 0, vector64Func), + FUNCTION(vector1bit, 1, 0, 0, vector1BitFunc), + FUNCTION(vector8, 1, 0, 0, vector8Func), FUNCTION(vector_extract, 1, 0, 0, vectorExtractFunc), FUNCTION(vector_distance_cos, 2, 0, 0, vectorDistanceCosFunc), + FUNCTION(vector_distance_l2, 2, 0, 0, vectorDistanceL2Func), FUNCTION(libsql_vector_idx, -1, 0, 0, libsqlVectorIdx), }; diff --git a/libsql-sqlite3/src/vectorIndex.c b/libsql-sqlite3/src/vectorIndex.c index f627c98e00..96f5b450c4 100644 --- a/libsql-sqlite3/src/vectorIndex.c +++ b/libsql-sqlite3/src/vectorIndex.c @@ -49,11 +49,6 @@ ** VectorIdxParams utilities ****************************************************************************/ -// VACUUM creates tables and indices first and only then populate data -// we need to ignore inserts from 'INSERT INTO vacuum.t SELECT * FROM t' statements because -// all shadow tables will be populated by VACUUM process during regular process of table copy -#define IsVacuum(db) ((db->mDbFlags&DBFLAG_Vacuum)!=0) - void vectorIdxParamsInit(VectorIdxParams *pParams, u8 *pBinBuf, int nBinSize) { assert( nBinSize <= VECTOR_INDEX_PARAMS_BUF_SIZE ); @@ -266,7 +261,7 @@ int vectorInRowAlloc(sqlite3 *db, const UnpackedRecord *pRecord, 
VectorInRow *pV vectorInitFromBlob(pVectorInRow->pVector, sqlite3_value_blob(pVectorValue), sqlite3_value_bytes(pVectorValue)); } else if( sqlite3_value_type(pVectorValue) == SQLITE_TEXT ){ // users can put strings (e.g. '[1,2,3]') in the table and we should process them correctly - if( vectorParse(pVectorValue, pVectorInRow->pVector, pzErrMsg) != 0 ){ + if( vectorParseWithType(pVectorValue, pVectorInRow->pVector, pzErrMsg) != 0 ){ rc = SQLITE_ERROR; goto out; } @@ -378,14 +373,18 @@ void vectorOutRowsFree(sqlite3 *db, VectorOutRows *pRows) { */ struct VectorColumnType { const char *zName; - int nBits; + int type; }; static struct VectorColumnType VECTOR_COLUMN_TYPES[] = { - { "FLOAT32", 32 }, - { "FLOAT64", 64 }, - { "F32_BLOB", 32 }, - { "F64_BLOB", 64 } + { "FLOAT32", VECTOR_TYPE_FLOAT32 }, + { "F32_BLOB", VECTOR_TYPE_FLOAT32 }, + { "FLOAT64", VECTOR_TYPE_FLOAT64 }, + { "F64_BLOB", VECTOR_TYPE_FLOAT64 }, + { "FLOAT1BIT", VECTOR_TYPE_FLOAT1BIT }, + { "F1BIT_BLOB", VECTOR_TYPE_FLOAT1BIT }, + { "FLOAT8", VECTOR_TYPE_FLOAT8 }, + { "F8_BLOB", VECTOR_TYPE_FLOAT8 }, }; /* @@ -401,13 +400,16 @@ struct VectorParamName { }; static struct VectorParamName VECTOR_PARAM_NAMES[] = { - { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, - { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, - { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, - { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, - { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, - { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, + { "type", VECTOR_INDEX_TYPE_PARAM_ID, 0, "diskann", VECTOR_INDEX_TYPE_DISKANN }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "cosine", VECTOR_METRIC_TYPE_COS }, + { "metric", VECTOR_METRIC_TYPE_PARAM_ID, 0, "l2", VECTOR_METRIC_TYPE_L2 }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float1bit", VECTOR_TYPE_FLOAT1BIT 
}, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float8", VECTOR_TYPE_FLOAT8 }, + { "compress_neighbors", VECTOR_COMPRESS_NEIGHBORS_PARAM_ID, 0, "float32", VECTOR_TYPE_FLOAT32 }, + { "alpha", VECTOR_PRUNING_ALPHA_PARAM_ID, 2, 0, 0 }, + { "search_l", VECTOR_SEARCH_L_PARAM_ID, 1, 0, 0 }, + { "insert_l", VECTOR_INSERT_L_PARAM_ID, 1, 0, 0 }, + { "max_neighbors", VECTOR_MAX_NEIGHBORS_PARAM_ID, 1, 0, 0 }, }; static int parseVectorIdxParam(const char *zParam, VectorIdxParams *pParams, const char **pErrMsg) { @@ -573,14 +575,7 @@ int vectorIdxParseColumnType(const char *zType, int *pType, int *pDims, const ch } *pDims = dimensions; - if( VECTOR_COLUMN_TYPES[i].nBits == 32 ) { - *pType = VECTOR_TYPE_FLOAT32; - } else if( VECTOR_COLUMN_TYPES[i].nBits == 64 ) { - *pType = VECTOR_TYPE_FLOAT64; - } else { - *pErrMsg = "unsupported vector type"; - return -1; - } + *pType = VECTOR_COLUMN_TYPES[i].type; return 0; } *pErrMsg = "unexpected vector column type"; @@ -772,10 +767,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { // this is done to prevent unrecoverable situations where index were dropped but index parameters deletion failed and second attempt will fail on first step int rcIdx, rcParams; - if( IsVacuum(db) ){ - return SQLITE_OK; - } - assert( zDbSName != NULL ); rcIdx = diskAnnDropIndex(db, zDbSName, zIdxName); @@ -786,10 +777,6 @@ int vectorIndexDrop(sqlite3 *db, const char *zDbSName, const char *zIdxName) { int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { assert( zDbSName != NULL ); - if( IsVacuum(db) ){ - return SQLITE_OK; - } - return diskAnnClearIndex(db, zDbSName, zIdxName); } @@ -799,7 +786,7 @@ int vectorIndexClear(sqlite3 *db, const char *zDbSName, const char *zIdxName) { * this made intentionally in order to natively support upload of SQLite dumps * * dump populates tables first and create indices after - * so we must omit them because shadow tables already filled + * so we 
must omit the index refill step because shadow tables are already filled * * 1. in case of any error :-1 returned (and pParse errMsg is populated with some error message) * 2. if vector index must not be created : 0 returned @@ -815,11 +802,7 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co int i, rc = SQLITE_OK; int dims, type; int hasLibsqlVectorIdxFn = 0, hasCollation = 0; - const char *pzErrMsg; - - if( IsVacuum(pParse->db) ){ - return CREATE_IGNORE; - } + const char *pzErrMsg = NULL; assert( zDbSName != NULL ); @@ -879,11 +862,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: must contain exactly one column wrapped into the " VECTOR_INDEX_MARKER_FUNCTION " function"); return CREATE_FAIL; } - // we are able to support this but I doubt this works for now - more polishing required to make this work - if( pIdx->pPartIdxWhere != NULL ) { - sqlite3ErrorMsg(pParse, "vector index: where condition is forbidden"); - return CREATE_FAIL; - } pArgsList = pIdx->aColExpr->a[0].pExpr->x.pList; pListItem = pArgsList->a; @@ -908,7 +886,6 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: %s: %s", pzErrMsg, zEmbeddingColumnTypeName); return CREATE_FAIL; } - // schema is locked while db is initializing and we need to just proceed here if( db->init.busy == 1 ){ return CREATE_OK; @@ -931,9 +908,13 @@ int vectorIndexCreate(Parse *pParse, const Index *pIdx, const char *zDbSName, co sqlite3ErrorMsg(pParse, "vector index: unsupported for tables without ROWID and composite primary key"); return CREATE_FAIL; } - rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams); + rc = diskAnnCreateIndex(db, zDbSName, pIdx->zName, &idxKey, &idxParams, &pzErrMsg); if( rc != SQLITE_OK ){ - sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann"); + if( pzErrMsg != NULL ){ + sqlite3ErrorMsg(pParse,
"vector index: unable to initialize diskann: %s", pzErrMsg); + }else{ + sqlite3ErrorMsg(pParse, "vector index: unable to initialize diskann"); + } return CREATE_FAIL; } rc = insertIndexParameters(db, zDbSName, pIdx->zName, &idxParams); @@ -973,7 +954,6 @@ int vectorIndexSearch( VectorIdxParams idxParams; vectorIdxParamsInit(&idxParams, NULL, 0); - assert( !IsVacuum(db) ); assert( zDbSName != NULL ); if( argc != 3 ){ @@ -985,17 +965,14 @@ int vectorIndexSearch( rc = SQLITE_ERROR; goto out; } - if( type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(search): only f32 vectors are supported"); - rc = SQLITE_ERROR; - goto out; - } + assert( type == VECTOR_TYPE_FLOAT32 || type == VECTOR_TYPE_FLOAT64 || type == VECTOR_TYPE_FLOAT1BIT ); + pVector = vectorAlloc(type, dims); if( pVector == NULL ){ rc = SQLITE_NOMEM_BKPT; goto out; } - if( vectorParse(argv[1], pVector, pzErrMsg) != 0 ){ + if( vectorParseWithType(argv[1], pVector, pzErrMsg) != 0 ){ rc = SQLITE_ERROR; goto out; } @@ -1058,10 +1035,6 @@ int vectorIndexInsert( int rc; VectorInRow vectorInRow; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - rc = vectorInRowAlloc(pCur->db, pRecord, &vectorInRow, pzErrMsg); if( rc != SQLITE_OK ){ return rc; @@ -1081,10 +1054,6 @@ int vectorIndexDelete( ){ VectorInRow payload; - if( IsVacuum(pCur->db) ){ - return SQLITE_OK; - } - payload.pVector = NULL; payload.nKeys = r->nField - 1; payload.pKeyValues = r->aMem + 1; diff --git a/libsql-sqlite3/src/vectorIndexInt.h b/libsql-sqlite3/src/vectorIndexInt.h index 8f73091bb1..e65df4d515 100644 --- a/libsql-sqlite3/src/vectorIndexInt.h +++ b/libsql-sqlite3/src/vectorIndexInt.h @@ -73,10 +73,10 @@ int nodeEdgesMetadataOffset(const DiskAnnIndex *pIndex); void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Vector *pVector); void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector); u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot); 
-void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector); +void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, float *distance, Vector *pVector); int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u64 nRowid); void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPruned); -void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector); +void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, float distance, Vector *pVector); void nodeBinDeleteEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iDelete); void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot); @@ -100,43 +100,47 @@ typedef u8 MetricType; */ /* format version which can help to upgrade vector on-disk format without breaking older version of the db */ -#define VECTOR_FORMAT_PARAM_ID 1 +#define VECTOR_FORMAT_PARAM_ID 1 /* - * 1 - initial version + * 1 - v1 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u64 unused ] [u64 edge rowid] ] ... + * 2 - v2 version; node block format: [node meta] [node vector] [edge vectors] ... [ [u32 unused] [f32 distance] [u64 edge rowid] ] ... 
*/ -#define VECTOR_FORMAT_DEFAULT 1 +#define VECTOR_FORMAT_V1 1 +#define VECTOR_FORMAT_DEFAULT 2 /* type of the vector index */ -#define VECTOR_INDEX_TYPE_PARAM_ID 2 -#define VECTOR_INDEX_TYPE_DISKANN 1 +#define VECTOR_INDEX_TYPE_PARAM_ID 2 +#define VECTOR_INDEX_TYPE_DISKANN 1 /* type of the underlying vector for the vector index */ -#define VECTOR_TYPE_PARAM_ID 3 +#define VECTOR_TYPE_PARAM_ID 3 /* dimension of the underlying vector for the vector index */ -#define VECTOR_DIM_PARAM_ID 4 +#define VECTOR_DIM_PARAM_ID 4 /* metric type used for comparing two vectors */ -#define VECTOR_METRIC_TYPE_PARAM_ID 5 -#define VECTOR_METRIC_TYPE_COS 1 -#define VECTOR_METRIC_TYPE_L2 2 +#define VECTOR_METRIC_TYPE_PARAM_ID 5 +#define VECTOR_METRIC_TYPE_COS 1 +#define VECTOR_METRIC_TYPE_L2 2 /* block size */ -#define VECTOR_BLOCK_SIZE_PARAM_ID 6 -#define VECTOR_BLOCK_SIZE_DEFAULT 128 +#define VECTOR_BLOCK_SIZE_PARAM_ID 6 +#define VECTOR_BLOCK_SIZE_DEFAULT 128 -#define VECTOR_PRUNING_ALPHA_PARAM_ID 7 -#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2 +#define VECTOR_PRUNING_ALPHA_PARAM_ID 7 +#define VECTOR_PRUNING_ALPHA_DEFAULT 1.2 -#define VECTOR_INSERT_L_PARAM_ID 8 -#define VECTOR_INSERT_L_DEFAULT 70 +#define VECTOR_INSERT_L_PARAM_ID 8 +#define VECTOR_INSERT_L_DEFAULT 70 -#define VECTOR_SEARCH_L_PARAM_ID 9 -#define VECTOR_SEARCH_L_DEFAULT 200 +#define VECTOR_SEARCH_L_PARAM_ID 9 +#define VECTOR_SEARCH_L_DEFAULT 200 -#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 +#define VECTOR_MAX_NEIGHBORS_PARAM_ID 10 + +#define VECTOR_COMPRESS_NEIGHBORS_PARAM_ID 11 /* total amount of vector index parameters */ -#define VECTOR_PARAM_IDS_COUNT 9 +#define VECTOR_PARAM_IDS_COUNT 11 /* * Vector index parameters are stored in simple binary format (1 byte tag + 8 byte u64 integer / f64 float) @@ -218,7 +222,7 @@ int vectorOutRowsPut(VectorOutRows *, int, int, const u64 *, sqlite3_value *); void vectorOutRowsGet(sqlite3_context *, const VectorOutRows *, int, int); void vectorOutRowsFree(sqlite3 *, VectorOutRows *); -int 
diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *); +int diskAnnCreateIndex(sqlite3 *, const char *, const char *, const VectorIdxKey *, VectorIdxParams *, const char **); int diskAnnClearIndex(sqlite3 *, const char *, const char *); int diskAnnDropIndex(sqlite3 *, const char *, const char *); int diskAnnOpenIndex(sqlite3 *, const char *, const char *, const VectorIdxParams *, DiskAnnIndex **); diff --git a/libsql-sqlite3/src/vectorInt.h b/libsql-sqlite3/src/vectorInt.h index 8c9138b94f..1c72857326 100644 --- a/libsql-sqlite3/src/vectorInt.h +++ b/libsql-sqlite3/src/vectorInt.h @@ -19,14 +19,45 @@ typedef u32 VectorDims; */ #define MAX_VECTOR_SZ 65536 +/* + * on-disk binary format for vector of different types: + * 1. float32 + * [data[0] as f32] [data[1] as f32] ... [data[dims - 1] as f32] [1 as u8]? + * - last 'type'-byte is optional for float32 vectors + * + * 2. float64 + * [data[0] as f64] [data[1] as f64] ... [data[dims - 1] as f64] [2 as u8] + * - last 'type'-byte is mandatory for float64 vectors + * + * 3. float1bit + * [data[0] as u8] [data[1] as u8] ... [data[(dims + 7) / 8] as u8] [_ as u8; padding]? [trailing_bits as u8] [3 as u8] + * - every data byte (except for the last) represents exactly 8 components of the vector + * - last data byte represents [1..8] components of the vector + * - optional padding byte ensures that "trailing_bits" byte will be written at the odd blob position (0-based) + * - "trailing_bits" byte specify amount of trailing *bits* in the blob without last 'type'-byte which must be omitted + * (so, vector dimensions are equal to 8 * (blob_size - 1) - trailing_bits) + * - last 'type'-byte is mandatory for float1bit vectors + * + * 4. float8 + * [data[0] as u8] [data[1] as u8] ... 
[data[dims - 1] as u8] [_ as u8; alignment_padding]* [alpha as f32] [shift as f32] [padding as u8] [trailing_bytes as u8] [4 as u8] + * - every data byte represents single quantized vector component + * - "alignment_padding" has size from 0 to 3 bytes in order to pad content to multiple of 4 = sizeof(float) + * - "trailing_bytes" byte specify amount of bytes in the "alignment_padding" + * - last 'type'-byte is mandatory for float8 vectors +*/ + /* * Enumerate of supported vector types (0 omitted intentionally as we can use zero as "undefined" value) */ -#define VECTOR_TYPE_FLOAT32 1 -#define VECTOR_TYPE_FLOAT64 2 +#define VECTOR_TYPE_FLOAT32 1 +#define VECTOR_TYPE_FLOAT64 2 +#define VECTOR_TYPE_FLOAT1BIT 3 +#define VECTOR_TYPE_FLOAT8 4 #define VECTOR_FLAGS_STATIC 1 +#define ALIGN(n, size) (((n + size - 1) / size) * size) + /* * Object which represents a vector * data points to the memory which must be interpreted according to the vector type @@ -41,15 +72,20 @@ struct Vector { size_t vectorDataSize(VectorType, VectorDims); Vector *vectorAlloc(VectorType, VectorDims); void vectorFree(Vector *v); -int vectorParse(sqlite3_value *, Vector *, char **); +int vectorParseWithType(sqlite3_value *, Vector *, char **); void vectorInit(Vector *, VectorType, VectorDims, void *); /* * Dumps vector on the console (used only for debugging) */ -void vectorDump (const Vector *v); -void vectorF32Dump(const Vector *v); -void vectorF64Dump(const Vector *v); +void vectorDump (const Vector *v); +void vectorF8Dump (const Vector *v); +void vectorF32Dump (const Vector *v); +void vectorF64Dump (const Vector *v); +void vector1BitDump(const Vector *v); + +void vectorF8GetParameters(const u8 *, int, float *, float *); +void vectorF8SetParameters(u8 *, int, float, float); /* * Converts vector to the text representation and write the result to the sqlite3_context @@ -61,28 +97,30 @@ void vectorF64MarshalToText(sqlite3_context *, const Vector *); /* * Serializes vector to the blob in little-endian 
format according to the IEEE-754 standard */ -size_t vectorSerializeToBlob (const Vector *, unsigned char *, size_t); -size_t vectorF32SerializeToBlob(const Vector *, unsigned char *, size_t); -size_t vectorF64SerializeToBlob(const Vector *, unsigned char *, size_t); - -/* - * Deserializes vector from the blob in little-endian format according to the IEEE-754 standard -*/ -size_t vectorDeserializeFromBlob (Vector *, const unsigned char *, size_t); -size_t vectorF32DeserializeFromBlob(Vector *, const unsigned char *, size_t); -size_t vectorF64DeserializeFromBlob(Vector *, const unsigned char *, size_t); +void vectorSerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF8SerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF32SerializeToBlob (const Vector *, unsigned char *, size_t); +void vectorF64SerializeToBlob (const Vector *, unsigned char *, size_t); +void vector1BitSerializeToBlob(const Vector *, unsigned char *, size_t); /* * Calculates cosine distance between two vectors (vector must have same type and same dimensions) */ float vectorDistanceCos (const Vector *, const Vector *); +float vectorF8DistanceCos (const Vector *, const Vector *); float vectorF32DistanceCos (const Vector *, const Vector *); double vectorF64DistanceCos(const Vector *, const Vector *); +/* + * Calculates hamming distance between two 1-bit vectors (vector must have same dimensions) +*/ +int vector1BitDistanceHamming(const Vector *, const Vector *); + /* * Calculates L2 distance between two vectors (vector must have same type and same dimensions) */ float vectorDistanceL2 (const Vector *, const Vector *); +float vectorF8DistanceL2 (const Vector *, const Vector *); float vectorF32DistanceL2 (const Vector *, const Vector *); double vectorF64DistanceL2(const Vector *, const Vector *); @@ -91,25 +129,44 @@ double vectorF64DistanceL2(const Vector *, const Vector *); * LibSQL can append one trailing byte in the end of final blob. 
This byte will be later used to determine type of the blob * By default, blob with even length will be treated as a f32 blob */ -void vectorSerialize (sqlite3_context *, const Vector *); -void vectorF32Serialize(sqlite3_context *, const Vector *); -void vectorF64Serialize(sqlite3_context *, const Vector *); +void vectorSerializeWithMeta(sqlite3_context *, const Vector *); /* * Parses Vector content from the blob; vector type and dimensions must be filled already */ -int vectorParseSqliteBlob (sqlite3_value *, Vector *, char **); -int vectorF32ParseSqliteBlob(sqlite3_value *, Vector *, char **); -int vectorF64ParseSqliteBlob(sqlite3_value *, Vector *, char **); +int vectorParseSqliteBlobWithType(sqlite3_value *, Vector *, char **); -void vectorInitStatic(Vector *, VectorType, const unsigned char *, size_t); +void vectorF8DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vectorF32DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vectorF64DeserializeFromBlob (Vector *, const unsigned char *, size_t); +void vector1BitDeserializeFromBlob(Vector *, const unsigned char *, size_t); + +void vectorInitStatic(Vector *, VectorType, VectorDims, void *); void vectorInitFromBlob(Vector *, const unsigned char *, size_t); -void vectorF32InitFromBlob(Vector *, const unsigned char *, size_t); -void vectorF64InitFromBlob(Vector *, const unsigned char *, size_t); + +void vectorConvert(const Vector *, Vector *); /* Detect type and dimension of vector provided with first parameter of sqlite3_value * type */ int detectVectorParameters(sqlite3_value *, int, int *, int *, char **); +static inline unsigned serializeF32(unsigned char *pBuf, float value){ + u32 *p = (u32 *)&value; + pBuf[0] = *p & 0xFF; + pBuf[1] = (*p >> 8) & 0xFF; + pBuf[2] = (*p >> 16) & 0xFF; + pBuf[3] = (*p >> 24) & 0xFF; + return sizeof(float); +} + +static inline float deserializeF32(const unsigned char *pBuf){ + u32 value = 0; + value |= (u32)pBuf[0]; + value |= (u32)pBuf[1] << 8; 
+ value |= (u32)pBuf[2] << 16; + value |= (u32)pBuf[3] << 24; + return *(float *)&value; +} + #ifdef __cplusplus } /* end of the 'extern "C"' block */ #endif diff --git a/libsql-sqlite3/src/vectordiskann.c b/libsql-sqlite3/src/vectordiskann.c index 95d473b630..2585883492 100644 --- a/libsql-sqlite3/src/vectordiskann.c +++ b/libsql-sqlite3/src/vectordiskann.c @@ -53,7 +53,7 @@ #include "sqliteInt.h" #include "vectorIndexInt.h" -#define SQLITE_VECTOR_TRACE +// #define SQLITE_VECTOR_TRACE #if defined(SQLITE_DEBUG) && defined(SQLITE_VECTOR_TRACE) #define DiskAnnTrace(X) sqlite3DebugPrintf X; #else @@ -79,9 +79,19 @@ #define VECTOR_NODE_METADATA_SIZE (sizeof(u64) + sizeof(u16)) #define VECTOR_EDGE_METADATA_SIZE (sizeof(u64) + sizeof(u64)) +typedef struct VectorPair VectorPair; typedef struct DiskAnnSearchCtx DiskAnnSearchCtx; typedef struct DiskAnnNode DiskAnnNode; +// VectorPair represents single vector where pNode is an exact representation and pEdge - compressed representation +// (pEdge pointer always equals to pNode if pNodeType == pEdgeType) +struct VectorPair { + int nodeType; + int edgeType; + Vector *pNode; + Vector *pEdge; +}; + // DiskAnnNode represents single node in the DiskAnn graph struct DiskAnnNode { u64 nRowid; /* node id */ @@ -97,14 +107,18 @@ struct DiskAnnNode { * so caller which puts nodes in the context can forget about resource managmenet (context will take care of this) */ struct DiskAnnSearchCtx { - const Vector *pQuery; /* initial query vector; user query for SELECT and row vector for INSERT */ - DiskAnnNode **aCandidates; /* array of candidates ordered by distance to the query (ascending) */ - double *aDistances; /* array of distances to the query vector */ - unsigned int nCandidates; /* current size of aCandidates/aDistances arrays */ - unsigned int maxCandidates; /* max size of aCandidates/aDistances arrays */ - DiskAnnNode *visitedList; /* list of all visited candidates (so, candidates from aCandidates array either got replaced or moved 
to the visited list) */ - unsigned int nUnvisited; /* amount of unvisited candidates in the aCadidates array */ - int blobMode; /* DISKANN_BLOB_READONLY if we wont modify node blobs; DISKANN_BLOB_WRITABLE - otherwise */ + VectorPair query; /* initial query vector; user query for SELECT and row vector for INSERT */ + DiskAnnNode **aCandidates; /* array of unvisited candidates ordered by distance (possibly approximate) to the query (ascending) */ + float *aDistances; /* array of distances (possibly approximate) to the query vector */ + unsigned int nCandidates; /* current size of aCandidates/aDistances arrays */ + unsigned int maxCandidates; /* max size of aCandidates/aDistances arrays */ + DiskAnnNode **aTopCandidates; /* top candidates with exact distance calculated */ + float *aTopDistances; /* top candidates exact distances */ + int nTopCandidates; /* current size of aTopCandidates/aTopDistances arrays */ + int maxTopCandidates; /* max size of aTopCandidates/aTopDistances arrays */ + DiskAnnNode *visitedList; /* list of all visited candidates (so, candidates from aCandidates array either got replaced or moved to the visited list) */ + unsigned int nUnvisited; /* amount of unvisited candidates in the aCandidates array */ + int blobMode; /* DISKANN_BLOB_READONLY if we won't modify node blobs; DISKANN_BLOB_WRITABLE - otherwise */ }; /************************************************************************** @@ -115,6 +129,10 @@ static inline u16 readLE16(const unsigned char *p){ return (u16)p[0] | (u16)p[1] << 8; } +static inline u32 readLE32(const unsigned char *p){ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16 | (u32)p[3] << 24; +} + static inline u64 readLE64(const unsigned char *p){ return (u64)p[0] | (u64)p[1] << 8 @@ -131,6 +149,13 @@ static inline void writeLE16(unsigned char *p, u16 v){ p[1] = v >> 8; } +static inline void writeLE32(unsigned char *p, u32 v){ + p[0] = v; + p[1] = v >> 8; + p[2] = v >> 16; + p[3] = v >> 24; +} + static inline void
writeLE64(unsigned char *p, u64 v){ p[0] = v; p[1] = v >> 8; @@ -310,7 +335,7 @@ void nodeBinInit(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, u64 nRowid, Ve void nodeBinVector(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, Vector *pVector) { assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize <= pBlobSpot->nBufferSize ); - vectorInitStatic(pVector, pIndex->nNodeVectorType, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE, pIndex->nNodeVectorSize); + vectorInitStatic(pVector, pIndex->nNodeVectorType, pIndex->nVectorDims, pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE); } u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { @@ -319,20 +344,25 @@ u16 nodeBinEdges(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { return readLE16(pBlobSpot->pBuffer + sizeof(u64)); } -void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, Vector *pVector) { +void nodeBinEdge(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, int iEdge, u64 *pRowid, float *pDistance, Vector *pVector) { + u32 distance; int offset = nodeEdgesMetadataOffset(pIndex); if( pRowid != NULL ){ assert( offset + (iEdge + 1) * VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize ); *pRowid = readLE64(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u64)); } + if( pIndex->nFormatVersion != VECTOR_FORMAT_V1 && pDistance != NULL ){ + distance = readLE32(pBlobSpot->pBuffer + offset + iEdge * VECTOR_EDGE_METADATA_SIZE + sizeof(u32)); + *pDistance = *((float*)&distance); + } if( pVector != NULL ){ assert( VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nEdgeVectorSize < offset ); vectorInitStatic( pVector, pIndex->nEdgeVectorType, - pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * pIndex->nNodeVectorSize, - pIndex->nEdgeVectorSize + pIndex->nVectorDims, + pBlobSpot->pBuffer + VECTOR_NODE_METADATA_SIZE + pIndex->nNodeVectorSize + iEdge * 
pIndex->nEdgeVectorSize ); } } @@ -342,7 +372,7 @@ int nodeBinEdgeFindIdx(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot, u6 // todo: if edges will be sorted by identifiers we can use binary search here (although speed up will be visible only on pretty loaded nodes: >128 edges) for(i = 0; i < nEdges; i++){ u64 edgeId; - nodeBinEdge(pIndex, pBlobSpot, i, &edgeId, NULL); + nodeBinEdge(pIndex, pBlobSpot, i, &edgeId, NULL, NULL); if( edgeId == nRowid ){ return i; } @@ -357,7 +387,7 @@ void nodeBinPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int nPru } // replace edge at position iReplace or add new one if iReplace == nEdges -void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, Vector *pVector) { +void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iReplace, u64 nRowid, float distance, Vector *pVector) { int nMaxEdges = nodeEdgesMaxCount(pIndex); int nEdges = nodeBinEdges(pIndex, pBlobSpot); int edgeVectorOffset, edgeMetaOffset, itemsToMove; @@ -376,6 +406,7 @@ void nodeBinReplaceEdge(const DiskAnnIndex *pIndex, BlobSpot *pBlobSpot, int iRe assert( edgeMetaOffset + VECTOR_EDGE_METADATA_SIZE <= pBlobSpot->nBufferSize ); vectorSerializeToBlob(pVector, pBlobSpot->pBuffer + edgeVectorOffset, pIndex->nEdgeVectorSize); + writeLE32(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u32), *((u32*)&distance)); writeLE64(pBlobSpot->pBuffer + edgeMetaOffset + sizeof(u64), nRowid); writeLE16(pBlobSpot->pBuffer + sizeof(u64), nEdges); @@ -410,6 +441,7 @@ void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { #if defined(SQLITE_DEBUG) && defined(SQLITE_VECTOR_TRACE) int nEdges, nMaxEdges, i; u64 nRowid; + float distance = 0; Vector vector; nEdges = nodeBinEdges(pIndex, pBlobSpot); @@ -420,8 +452,8 @@ void nodeBinDebug(const DiskAnnIndex *pIndex, const BlobSpot *pBlobSpot) { DiskAnnTrace((" nEdges=%d, nMaxEdges=%d, vector=", nEdges, nMaxEdges)); vectorDump(&vector); for(i = 0; i 
< nEdges; i++){ - nodeBinEdge(pIndex, pBlobSpot, i, &nRowid, &vector); - DiskAnnTrace((" to=%lld, vector=", nRowid, nRowid)); + nodeBinEdge(pIndex, pBlobSpot, i, &nRowid, &distance, &vector); + DiskAnnTrace((" to=%lld, distance=%f, vector=", nRowid, distance)); vectorDump(&vector); } #endif @@ -436,12 +468,14 @@ int diskAnnCreateIndex( const char *zDbSName, const char *zIdxName, const VectorIdxKey *pKey, - VectorIdxParams *pParams + VectorIdxParams *pParams, + const char **pzErrMsg ){ int rc; - int type, dims; + int type, dims, metric, neighbours; u64 maxNeighborsParam, blockSizeBytes; char *zSql; + const char *zRowidColumnName; char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...) char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...) if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){ @@ -463,24 +497,36 @@ int diskAnnCreateIndex( } assert( 0 < dims && dims <= MAX_VECTOR_SZ ); + metric = vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID); + if( metric == 0 ){ + metric = VECTOR_METRIC_TYPE_COS; + if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, metric) != 0 ){ + return SQLITE_ERROR; + } + } + neighbours = vectorIdxParamsGetU64(pParams, VECTOR_COMPRESS_NEIGHBORS_PARAM_ID); + if( neighbours == VECTOR_TYPE_FLOAT1BIT && metric != VECTOR_METRIC_TYPE_COS ){ + *pzErrMsg = "1-bit compression available only for cosine metric"; + return SQLITE_ERROR; + } + if( neighbours == 0 ){ + neighbours = type; + } + maxNeighborsParam = vectorIdxParamsGetU64(pParams, VECTOR_MAX_NEIGHBORS_PARAM_ID); if( maxNeighborsParam == 0 ){ // 3 D**(1/2) gives good recall values (90%+) // we also want to keep disk overhead at moderate level - 50x of the disk size increase is the current upper bound - maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) 
/ nodeEdgeOverhead(vectorDataSize(type, dims)) + 1); + maxNeighborsParam = MIN(3 * ((int)(sqrt(dims)) + 1), (50 * nodeOverhead(vectorDataSize(type, dims))) / nodeEdgeOverhead(vectorDataSize(neighbours, dims)) + 1); } - blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(type, dims)); + blockSizeBytes = nodeOverhead(vectorDataSize(type, dims)) + maxNeighborsParam * (u64)nodeEdgeOverhead(vectorDataSize(neighbours, dims)); if( blockSizeBytes > DISKANN_MAX_BLOCK_SZ ){ return SQLITE_ERROR; } if( vectorIdxParamsPutU64(pParams, VECTOR_BLOCK_SIZE_PARAM_ID, MAX(256, blockSizeBytes)) != 0 ){ return SQLITE_ERROR; } - if( vectorIdxParamsGetU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID) == 0 ){ - if( vectorIdxParamsPutU64(pParams, VECTOR_METRIC_TYPE_PARAM_ID, VECTOR_METRIC_TYPE_COS) != 0 ){ - return SQLITE_ERROR; - } - } + if( vectorIdxParamsGetF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID) == 0 ){ if( vectorIdxParamsPutF64(pParams, VECTOR_PRUNING_ALPHA_PARAM_ID, VECTOR_PRUNING_ALPHA_DEFAULT) != 0 ){ return SQLITE_ERROR; @@ -509,6 +555,7 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "index_key"; }else{ zSql = sqlite3MPrintf( db, @@ -518,9 +565,31 @@ int diskAnnCreateIndex( columnSqlDefs, columnSqlNames ); + zRowidColumnName = "rowid"; } rc = sqlite3_exec(db, zSql, 0, 0, 0); sqlite3DbFree(db, zSql); + if( rc != SQLITE_OK ){ + return rc; + } + /* + * vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB) + * in this case, main table B-Tree takes on redundant shape where all leaf nodes have only 1 cell + * + * as we have a query which selects random row using OFFSET/LIMIT trick - we will need to read all these leaf node pages just to skip them + * so, in order to remove this overhead for random row selection - we create an index with just a single column used + * in this case B-Tree leafs will be
full of rowids and the overhead for page reads will be very small + */ + zSql = sqlite3MPrintf( + db, + "CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)", + zDbSName, + zIdxName, + zIdxName, + zRowidColumnName + ); + rc = sqlite3_exec(db, zSql, 0, 0, 0); + sqlite3DbFree(db, zSql); return rc; } @@ -550,8 +619,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid) zSql = sqlite3MPrintf( pIndex->db, - "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)", - pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow + "SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)", + pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow ); if( zSql == NULL ){ rc = SQLITE_NOMEM_BKPT; @@ -795,6 +864,83 @@ static int diskAnnDeleteShadowRow(const DiskAnnIndex *pIndex, i64 nRowid){ return rc; } +/************************************************************************** +** Generic utilities +**************************************************************************/ + +int initVectorPair(int nodeType, int edgeType, int dims, VectorPair *pPair){ + pPair->nodeType = nodeType; + pPair->edgeType = edgeType; + pPair->pNode = NULL; + pPair->pEdge = NULL; + if( pPair->nodeType == pPair->edgeType ){ + return 0; + } + pPair->pEdge = vectorAlloc(edgeType, dims); + if( pPair->pEdge == NULL ){ + return SQLITE_NOMEM_BKPT; + } + return 0; +} + +void loadVectorPair(VectorPair *pPair, const Vector *pVector){ + pPair->pNode = (Vector*)pVector; + if( pPair->edgeType != pPair->nodeType ){ + vectorConvert(pPair->pNode, pPair->pEdge); + }else{ + pPair->pEdge = pPair->pNode; + } +} + +void deinitVectorPair(VectorPair *pPair) { + if( pPair->pEdge != NULL && pPair->pNode != pPair->pEdge ){ + vectorFree(pPair->pEdge); + } +} + +int distanceBufferInsertIdx(const float *aDistances, int nSize, int nMaxSize, float distance){ + int i; +#ifdef SQLITE_DEBUG + for(i = 0; 
i < nSize - 1; i++){ + assert(aDistances[i] <= aDistances[i + 1]); + } +#endif + for(i = 0; i < nSize; i++){ + if( distance < aDistances[i] ){ + return i; + } + } + return nSize < nMaxSize ? nSize : -1; +} + +void bufferInsert(u8 *aBuffer, int nSize, int nMaxSize, int iInsert, int nItemSize, const u8 *pItem, u8 *pLast) { + int itemsToMove; + + assert( nMaxSize > 0 && nItemSize > 0 ); + assert( nSize <= nMaxSize ); + assert( 0 <= iInsert && iInsert <= nSize && iInsert < nMaxSize ); + + if( nSize == nMaxSize ){ + if( pLast != NULL ){ + memcpy(pLast, aBuffer + (nSize - 1) * nItemSize, nItemSize); + } + nSize--; + } + itemsToMove = nSize - iInsert; + memmove(aBuffer + (iInsert + 1) * nItemSize, aBuffer + iInsert * nItemSize, itemsToMove * nItemSize); + memcpy(aBuffer + iInsert * nItemSize, pItem, nItemSize); +} + +void bufferDelete(u8 *aBuffer, int nSize, int iDelete, int nItemSize) { + int itemsToMove; + + assert( nItemSize > 0 ); + assert( 0 <= iDelete && iDelete < nSize ); + + itemsToMove = nSize - iDelete - 1; + memmove(aBuffer + iDelete * nItemSize, aBuffer + (iDelete + 1) * nItemSize, itemsToMove * nItemSize); +} + /************************************************************************** ** DiskANN internals **************************************************************************/ @@ -831,26 +977,40 @@ static void diskAnnNodeFree(DiskAnnNode *pNode){ sqlite3_free(pNode); } -static int diskAnnSearchCtxInit(DiskAnnSearchCtx *pCtx, const Vector* pQuery, unsigned int maxCandidates, int blobMode){ - pCtx->pQuery = pQuery; +static int diskAnnSearchCtxInit(const DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, const Vector* pQuery, int maxCandidates, int topCandidates, int blobMode){ + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &pCtx->query) != 0 ){ + return SQLITE_NOMEM_BKPT; + } + loadVectorPair(&pCtx->query, pQuery); + pCtx->aDistances = sqlite3_malloc(maxCandidates * sizeof(double)); pCtx->aCandidates = 
sqlite3_malloc(maxCandidates * sizeof(DiskAnnNode*)); pCtx->nCandidates = 0; pCtx->maxCandidates = maxCandidates; + pCtx->aTopDistances = sqlite3_malloc(topCandidates * sizeof(double)); + pCtx->aTopCandidates = sqlite3_malloc(topCandidates * sizeof(DiskAnnNode*)); + pCtx->nTopCandidates = 0; + pCtx->maxTopCandidates = topCandidates; pCtx->visitedList = NULL; pCtx->nUnvisited = 0; pCtx->blobMode = blobMode; - if( pCtx->aDistances == NULL || pCtx->aCandidates == NULL ){ - goto out_oom; + + if( pCtx->aDistances != NULL && pCtx->aCandidates != NULL && pCtx->aTopDistances != NULL && pCtx->aTopCandidates != NULL ){ + return SQLITE_OK; } - return SQLITE_OK; -out_oom: if( pCtx->aDistances != NULL ){ sqlite3_free(pCtx->aDistances); } if( pCtx->aCandidates != NULL ){ sqlite3_free(pCtx->aCandidates); } + if( pCtx->aTopDistances != NULL ){ + sqlite3_free(pCtx->aTopDistances); + } + if( pCtx->aTopCandidates != NULL ){ + sqlite3_free(pCtx->aTopCandidates); + } + deinitVectorPair(&pCtx->query); return SQLITE_NOMEM_BKPT; } @@ -874,6 +1034,9 @@ static void diskAnnSearchCtxDeinit(DiskAnnSearchCtx *pCtx){ } sqlite3_free(pCtx->aCandidates); sqlite3_free(pCtx->aDistances); + sqlite3_free(pCtx->aTopCandidates); + sqlite3_free(pCtx->aTopDistances); + deinitVectorPair(&pCtx->query); } // check if we visited this node earlier @@ -915,7 +1078,9 @@ static int diskAnnSearchCtxShouldAddCandidate(const DiskAnnIndex *pIndex, const } // mark node as visited and put it in the head of visitedList -static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNode){ +static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNode, float distance){ + int iInsert; + assert( pCtx->nUnvisited > 0 ); assert( pNode->visited == 0 ); @@ -924,56 +1089,51 @@ static void diskAnnSearchCtxMarkVisited(DiskAnnSearchCtx *pCtx, DiskAnnNode *pNo pNode->pNext = pCtx->visitedList; pCtx->visitedList = pNode; + + iInsert = distanceBufferInsertIdx(pCtx->aTopDistances, 
pCtx->nTopCandidates, pCtx->maxTopCandidates, distance); + if( iInsert < 0 ){ + return; + } + bufferInsert((u8*)pCtx->aTopCandidates, pCtx->nTopCandidates, pCtx->maxTopCandidates, iInsert, sizeof(DiskAnnNode*), (u8*)&pNode, NULL); + bufferInsert((u8*)pCtx->aTopDistances, pCtx->nTopCandidates, pCtx->maxTopCandidates, iInsert, sizeof(float), (u8*)&distance, NULL); + pCtx->nTopCandidates = MIN(pCtx->nTopCandidates + 1, pCtx->maxTopCandidates); } static int diskAnnSearchCtxHasUnvisited(const DiskAnnSearchCtx *pCtx){ return pCtx->nUnvisited > 0; } -static DiskAnnNode* diskAnnSearchCtxGetCandidate(DiskAnnSearchCtx *pCtx, int i){ +static void diskAnnSearchCtxGetCandidate(DiskAnnSearchCtx *pCtx, int i, DiskAnnNode **ppNode, float *pDistance){ assert( 0 <= i && i < pCtx->nCandidates ); - return pCtx->aCandidates[i]; + *ppNode = pCtx->aCandidates[i]; + *pDistance = pCtx->aDistances[i]; } static void diskAnnSearchCtxDeleteCandidate(DiskAnnSearchCtx *pCtx, int iDelete){ int i; - assert( 0 <= iDelete && iDelete < pCtx->nCandidates ); assert( pCtx->nUnvisited > 0 ); assert( !pCtx->aCandidates[iDelete]->visited ); assert( pCtx->aCandidates[iDelete]->pBlobSpot == NULL ); diskAnnNodeFree(pCtx->aCandidates[iDelete]); + bufferDelete((u8*)pCtx->aCandidates, pCtx->nCandidates, iDelete, sizeof(DiskAnnNode*)); + bufferDelete((u8*)pCtx->aDistances, pCtx->nCandidates, iDelete, sizeof(float)); - for(i = iDelete + 1; i < pCtx->nCandidates; i++){ - pCtx->aCandidates[i - 1] = pCtx->aCandidates[i]; - pCtx->aDistances[i - 1] = pCtx->aDistances[i]; - } pCtx->nCandidates--; pCtx->nUnvisited--; } -static void diskAnnSearchCtxInsertCandidate(DiskAnnSearchCtx *pCtx, int iInsert, DiskAnnNode* pCandidate, float candidateDist){ - int i; - assert( 0 <= iInsert && iInsert <= pCtx->nCandidates && iInsert < pCtx->maxCandidates ); - if( pCtx->nCandidates < pCtx->maxCandidates ){ - pCtx->nCandidates++; - } else { - DiskAnnNode *pLast = pCtx->aCandidates[pCtx->nCandidates - 1]; - if( !pLast->visited ){ - // 
since pLast is not visited it should have uninitialized pBlobSpot - so it's safe to completely free the node - assert( pLast->pBlobSpot == NULL ); - pCtx->nUnvisited--; - diskAnnNodeFree(pLast); - } - } - // Shift the candidates to the right to make space for the new one. - for(i = pCtx->nCandidates - 1; i > iInsert; i--){ - pCtx->aCandidates[i] = pCtx->aCandidates[i - 1]; - pCtx->aDistances[i] = pCtx->aDistances[i - 1]; +static void diskAnnSearchCtxInsertCandidate(DiskAnnSearchCtx *pCtx, int iInsert, DiskAnnNode* pCandidate, float distance){ + DiskAnnNode *pLast = NULL; + bufferInsert((u8*)pCtx->aCandidates, pCtx->nCandidates, pCtx->maxCandidates, iInsert, sizeof(DiskAnnNode*), (u8*)&pCandidate, (u8*)&pLast); + bufferInsert((u8*)pCtx->aDistances, pCtx->nCandidates, pCtx->maxCandidates, iInsert, sizeof(float), (u8*)&distance, NULL); + pCtx->nCandidates = MIN(pCtx->nCandidates + 1, pCtx->maxCandidates); + if( pLast != NULL && !pLast->visited ){ + // since pLast is not visited it should have uninitialized pBlobSpot - so it's safe to completely free the node + assert( pLast->pBlobSpot == NULL ); + pCtx->nUnvisited--; + diskAnnNodeFree(pLast); } - // Insert the new candidate. - pCtx->aCandidates[iInsert] = pCandidate; - pCtx->aDistances[iInsert] = candidateDist; pCtx->nUnvisited++; } @@ -1003,7 +1163,14 @@ static int diskAnnSearchCtxFindClosestCandidateIdx(const DiskAnnSearchCtx *pCtx) // return position for new edge(C) which will replace previous edge on that position or -1 if we should ignore it // we also check that no current edge(B) will "prune" new vertex: i.e. 
dist(B, C) >= (means worse than) alpha * dist(node, C) for all current edges // if any edge(B) will "prune" new edge(C) we will ignore it (return -1) -static int diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, u64 newRowid, const Vector *pNewVector) { +static int diskAnnReplaceEdgeIdx( + const DiskAnnIndex *pIndex, + BlobSpot *pNodeBlob, + u64 newRowid, + VectorPair *pNewVector, + VectorPair *pPlaceholder, + float *pNodeToNew +) { int i, nEdges, nMaxEdges, iReplace = -1; Vector nodeVector, edgeVector; float nodeToNew, nodeToReplace; @@ -1011,20 +1178,27 @@ static int diskAnnReplaceEdgeIdx(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob nEdges = nodeBinEdges(pIndex, pNodeBlob); nMaxEdges = nodeEdgesMaxCount(pIndex); nodeBinVector(pIndex, pNodeBlob, &nodeVector); - nodeToNew = diskAnnVectorDistance(pIndex, &nodeVector, pNewVector); + loadVectorPair(pPlaceholder, &nodeVector); + + // we need to evaluate potentially approximate distance here in order to correctly compare it with edge distances + nodeToNew = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, pNewVector->pEdge); + *pNodeToNew = nodeToNew; for(i = nEdges - 1; i >= 0; i--){ u64 edgeRowid; float edgeToNew, nodeToEdge; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &nodeToEdge, &edgeVector); if( edgeRowid == newRowid ){ // deletes can leave "zombie" edges in the graph and we must override them and not store duplicate edges in the node return i; } - edgeToNew = diskAnnVectorDistance(pIndex, &edgeVector, pNewVector); - nodeToEdge = diskAnnVectorDistance(pIndex, &nodeVector, &edgeVector); + if( pIndex->nFormatVersion == VECTOR_FORMAT_V1 ){ + nodeToEdge = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, &edgeVector); + } + + edgeToNew = diskAnnVectorDistance(pIndex, &edgeVector, pNewVector->pEdge); if( nodeToNew > pIndex->pruningAlpha * edgeToNew ){ return -1; } @@ -1042,12 +1216,14 @@ static int diskAnnReplaceEdgeIdx(const 
DiskAnnIndex *pIndex, BlobSpot *pNodeBlob // prune edges after we inserted new edge at position iInserted // we only need to check for edges which will be pruned by new vertex // no need to check for other pairs as we checked them on previous insertions -static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, int iInserted) { +static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, int iInserted, VectorPair *pPlaceholder) { int i, s, nEdges; - Vector nodeVector, hintVector; + Vector nodeVector, hintEdgeVector; u64 hintRowid; nodeBinVector(pIndex, pNodeBlob, &nodeVector); + loadVectorPair(pPlaceholder, &nodeVector); + nEdges = nodeBinEdges(pIndex, pNodeBlob); assert( 0 <= iInserted && iInserted < nEdges ); @@ -1057,7 +1233,7 @@ static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, i nodeBinDebug(pIndex, pNodeBlob); #endif - nodeBinEdge(pIndex, pNodeBlob, iInserted, &hintRowid, &hintVector); + nodeBinEdge(pIndex, pNodeBlob, iInserted, &hintRowid, NULL, &hintEdgeVector); // remove edges which is no longer interesting due to the addition of iInserted i = 0; @@ -1065,14 +1241,17 @@ static void diskAnnPruneEdges(const DiskAnnIndex *pIndex, BlobSpot *pNodeBlob, i Vector edgeVector; float nodeToEdge, hintToEdge; u64 edgeRowid; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, &nodeToEdge, &edgeVector); if( hintRowid == edgeRowid ){ i++; continue; } - nodeToEdge = diskAnnVectorDistance(pIndex, &nodeVector, &edgeVector); - hintToEdge = diskAnnVectorDistance(pIndex, &hintVector, &edgeVector); + if( pIndex->nFormatVersion == VECTOR_FORMAT_V1 ){ + nodeToEdge = diskAnnVectorDistance(pIndex, pPlaceholder->pEdge, &edgeVector); + } + + hintToEdge = diskAnnVectorDistance(pIndex, &hintEdgeVector, &edgeVector); if( nodeToEdge > pIndex->pruningAlpha * hintToEdge ){ nodeBinDeleteEdge(pIndex, pNodeBlob, i); nEdges--; @@ -1121,7 +1300,7 @@ static int 
diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u } nodeBinVector(pIndex, start->pBlobSpot, &startVector); - startDistance = diskAnnVectorDistance(pIndex, pCtx->pQuery, &startVector); + startDistance = diskAnnVectorDistance(pIndex, pCtx->query.pNode, &startVector); if( pCtx->blobMode == DISKANN_BLOB_READONLY ){ assert( start->pBlobSpot != NULL ); @@ -1138,8 +1317,9 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u Vector vCandidate; DiskAnnNode *pCandidate; BlobSpot *pCandidateBlob; + float distance; int iCandidate = diskAnnSearchCtxFindClosestCandidateIdx(pCtx); - pCandidate = diskAnnSearchCtxGetCandidate(pCtx, iCandidate); + diskAnnSearchCtxGetCandidate(pCtx, iCandidate, &pCandidate, &distance); rc = SQLITE_OK; if( pReusableBlobSpot != NULL ){ @@ -1167,25 +1347,30 @@ static int diskAnnSearchInternal(DiskAnnIndex *pIndex, DiskAnnSearchCtx *pCtx, u goto out; } - diskAnnSearchCtxMarkVisited(pCtx, pCandidate); - nVisited += 1; DiskAnnTrace(("visiting candidate(%d): id=%lld\n", nVisited, pCandidate->nRowid)); nodeBinVector(pIndex, pCandidateBlob, &vCandidate); nEdges = nodeBinEdges(pIndex, pCandidateBlob); + // if pNodeQuery != pEdgeQuery then distance from aDistances is approximate and we must recalculate it + if( pCtx->query.pNode != pCtx->query.pEdge ){ + distance = diskAnnVectorDistance(pIndex, &vCandidate, pCtx->query.pNode); + } + + diskAnnSearchCtxMarkVisited(pCtx, pCandidate, distance); + for(i = 0; i < nEdges; i++){ u64 edgeRowid; Vector edgeVector; float edgeDistance; int iInsert; DiskAnnNode *pNewCandidate; - nodeBinEdge(pIndex, pCandidateBlob, i, &edgeRowid, &edgeVector); + nodeBinEdge(pIndex, pCandidateBlob, i, &edgeRowid, NULL, &edgeVector); if( diskAnnSearchCtxIsVisited(pCtx, edgeRowid) || diskAnnSearchCtxHasCandidate(pCtx, edgeRowid) ){ continue; } - edgeDistance = diskAnnVectorDistance(pIndex, pCtx->pQuery, &edgeVector); + edgeDistance = diskAnnVectorDistance(pIndex, pCtx->query.pEdge, 
&edgeVector); iInsert = diskAnnSearchCtxShouldAddCandidate(pIndex, pCtx, edgeDistance); if( iInsert < 0 ){ continue; @@ -1243,12 +1428,12 @@ int diskAnnSearch( *pzErrMsg = sqlite3_mprintf("vector index(search): k must be a non-negative integer"); return SQLITE_ERROR; } - if( pIndex->nVectorDims != pVector->dims ){ + if( pVector->dims != pIndex->nVectorDims ){ *pzErrMsg = sqlite3_mprintf("vector index(search): dimensions are different: %d != %d", pVector->dims, pIndex->nVectorDims); return SQLITE_ERROR; } - if( pVector->type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(search): only f32 vectors are supported"); + if( pVector->type != pIndex->nNodeVectorType ){ + *pzErrMsg = sqlite3_mprintf("vector index(search): vector type differs from column type: %d != %d", pVector->type, pIndex->nNodeVectorType); return SQLITE_ERROR; } @@ -1262,7 +1447,7 @@ int diskAnnSearch( *pzErrMsg = sqlite3_mprintf("vector index(search): failed to select start node for search"); return rc; } - rc = diskAnnSearchCtxInit(&ctx, pVector, pIndex->searchL, DISKANN_BLOB_READONLY); + rc = diskAnnSearchCtxInit(pIndex, &ctx, pVector, pIndex->searchL, k, DISKANN_BLOB_READONLY); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to initialize search context"); goto out; @@ -1271,7 +1456,7 @@ int diskAnnSearch( if( rc != SQLITE_OK ){ goto out; } - nOutRows = MIN(k, ctx.nCandidates); + nOutRows = MIN(k, ctx.nTopCandidates); rc = vectorOutRowsAlloc(pIndex->db, pRows, nOutRows, pKey->nKeyColumns, vectorIdxKeyRowidLike(pKey)); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to allocate output rows"); @@ -1279,9 +1464,9 @@ int diskAnnSearch( } for(i = 0; i < nOutRows; i++){ if( pRows->aIntValues != NULL ){ - rc = vectorOutRowsPut(pRows, i, 0, &ctx.aCandidates[i]->nRowid, NULL); + rc = vectorOutRowsPut(pRows, i, 0, &ctx.aTopCandidates[i]->nRowid, NULL); }else{ - rc = diskAnnGetShadowRowKeys(pIndex, 
ctx.aCandidates[i]->nRowid, pKey, pRows, i); + rc = diskAnnGetShadowRowKeys(pIndex, ctx.aTopCandidates[i]->nRowid, pKey, pRows, i); } if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(search): failed to put result in the output row"); @@ -1305,24 +1490,39 @@ int diskAnnInsert( BlobSpot *pBlobSpot = NULL; DiskAnnNode *pVisited; DiskAnnSearchCtx ctx; + VectorPair vInsert, vCandidate; + vInsert.pNode = NULL; vInsert.pEdge = NULL; + vCandidate.pNode = NULL; vCandidate.pEdge = NULL; if( pVectorInRow->pVector->dims != pIndex->nVectorDims ){ *pzErrMsg = sqlite3_mprintf("vector index(insert): dimensions are different: %d != %d", pVectorInRow->pVector->dims, pIndex->nVectorDims); return SQLITE_ERROR; } - if( pVectorInRow->pVector->type != VECTOR_TYPE_FLOAT32 ){ - *pzErrMsg = sqlite3_mprintf("vector index(insert): only f32 vectors are supported"); + if( pVectorInRow->pVector->type != pIndex->nNodeVectorType ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): vector type differs from column type: %d != %d", pVectorInRow->pVector->type, pIndex->nNodeVectorType); return SQLITE_ERROR; } DiskAnnTrace(("diskAnnInset started\n")); - rc = diskAnnSearchCtxInit(&ctx, pVectorInRow->pVector, pIndex->insertL, DISKANN_BLOB_WRITABLE); + rc = diskAnnSearchCtxInit(pIndex, &ctx, pVectorInRow->pVector, pIndex->insertL, 1, DISKANN_BLOB_WRITABLE); if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(insert): failed to initialize search context"); return rc; } + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &vInsert) != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): unable to allocate mem for node VectorPair"); + rc = SQLITE_NOMEM_BKPT; + goto out; + } + + if( initVectorPair(pIndex->nNodeVectorType, pIndex->nEdgeVectorType, pIndex->nVectorDims, &vCandidate) != 0 ){ + *pzErrMsg = sqlite3_mprintf("vector index(insert): unable to allocate mem for candidate VectorPair"); + rc = SQLITE_NOMEM_BKPT; + goto out; + } + 
// note: we must select random row before we will insert new row in the shadow table rc = diskAnnSelectRandomShadowRow(pIndex, &nStartRowid); if( rc == SQLITE_DONE ){ @@ -1360,28 +1560,33 @@ int diskAnnInsert( } // first pass - add all visited nodes as a potential neighbours of new node for(pVisited = ctx.visitedList; pVisited != NULL; pVisited = pVisited->pNext){ - Vector vector; + Vector nodeVector; int iReplace; + float nodeToNew; + + nodeBinVector(pIndex, pVisited->pBlobSpot, &nodeVector); + loadVectorPair(&vCandidate, &nodeVector); - nodeBinVector(pIndex, pVisited->pBlobSpot, &vector); - iReplace = diskAnnReplaceEdgeIdx(pIndex, pBlobSpot, pVisited->nRowid, &vector); + iReplace = diskAnnReplaceEdgeIdx(pIndex, pBlobSpot, pVisited->nRowid, &vCandidate, &vInsert, &nodeToNew); if( iReplace == -1 ){ continue; } - nodeBinReplaceEdge(pIndex, pBlobSpot, iReplace, pVisited->nRowid, &vector); - diskAnnPruneEdges(pIndex, pBlobSpot, iReplace); + nodeBinReplaceEdge(pIndex, pBlobSpot, iReplace, pVisited->nRowid, nodeToNew, vCandidate.pEdge); + diskAnnPruneEdges(pIndex, pBlobSpot, iReplace, &vInsert); } // second pass - add new node as a potential neighbour of all visited nodes + loadVectorPair(&vInsert, pVectorInRow->pVector); for(pVisited = ctx.visitedList; pVisited != NULL; pVisited = pVisited->pNext){ int iReplace; + float nodeToNew; - iReplace = diskAnnReplaceEdgeIdx(pIndex, pVisited->pBlobSpot, nNewRowid, pVectorInRow->pVector); + iReplace = diskAnnReplaceEdgeIdx(pIndex, pVisited->pBlobSpot, nNewRowid, &vInsert, &vCandidate, &nodeToNew); if( iReplace == -1 ){ continue; } - nodeBinReplaceEdge(pIndex, pVisited->pBlobSpot, iReplace, nNewRowid, pVectorInRow->pVector); - diskAnnPruneEdges(pIndex, pVisited->pBlobSpot, iReplace); + nodeBinReplaceEdge(pIndex, pVisited->pBlobSpot, iReplace, nNewRowid, nodeToNew, vInsert.pEdge); + diskAnnPruneEdges(pIndex, pVisited->pBlobSpot, iReplace, &vCandidate); rc = blobSpotFlush(pIndex, pVisited->pBlobSpot); if( rc != SQLITE_OK ){ @@ 
-1392,6 +1597,8 @@ int diskAnnInsert( rc = SQLITE_OK; out: + deinitVectorPair(&vInsert); + deinitVectorPair(&vCandidate); if( rc == SQLITE_OK ){ rc = blobSpotFlush(pIndex, pBlobSpot); if( rc != SQLITE_OK ){ @@ -1426,7 +1633,12 @@ int diskAnnDelete( DiskAnnTrace(("diskAnnDelete started: rowid=%lld\n", nodeRowid)); rc = blobSpotCreate(pIndex, &pNodeBlob, nodeRowid, pIndex->nBlockSize, DISKANN_BLOB_WRITABLE); - if( rc != SQLITE_OK ){ + if( rc == DISKANN_ROW_NOT_FOUND ){ + // as we omit rows with NULL values during insert, it can be the case that there is nothing to delete in the index, while row exists in the base table + // so, we must simply silently stop delete process as there is nothing to delete from index + rc = SQLITE_OK; + goto out; + }else if( rc != SQLITE_OK ){ *pzErrMsg = sqlite3_mprintf("vector index(delete): failed to create blob for node row"); goto out; } @@ -1443,7 +1655,7 @@ int diskAnnDelete( nNeighbours = nodeBinEdges(pIndex, pNodeBlob); for(i = 0; i < nNeighbours; i++){ u64 edgeRowid; - nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, NULL); + nodeBinEdge(pIndex, pNodeBlob, i, &edgeRowid, NULL, NULL); rc = blobSpotReload(pIndex, pEdgeBlob, edgeRowid, pIndex->nBlockSize); if( rc == DISKANN_ROW_NOT_FOUND ){ continue; @@ -1490,6 +1702,7 @@ int diskAnnOpenIndex( ){ DiskAnnIndex *pIndex; u64 nBlockSize; + int compressNeighbours; pIndex = sqlite3DbMallocRaw(db, sizeof(DiskAnnIndex)); if( pIndex == NULL ){ return SQLITE_NOMEM; @@ -1536,11 +1749,18 @@ int diskAnnOpenIndex( pIndex->searchL = VECTOR_SEARCH_L_DEFAULT; } pIndex->nNodeVectorSize = vectorDataSize(pIndex->nNodeVectorType, pIndex->nVectorDims); - // will change in future when we will support compression of edges vectors - pIndex->nEdgeVectorType = pIndex->nNodeVectorType; - pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize; + + compressNeighbours = vectorIdxParamsGetU64(pParams, VECTOR_COMPRESS_NEIGHBORS_PARAM_ID); + if( compressNeighbours == 0 ){ + pIndex->nEdgeVectorType = pIndex->nNodeVectorType; 
+ pIndex->nEdgeVectorSize = pIndex->nNodeVectorSize; + }else{ + pIndex->nEdgeVectorType = compressNeighbours; + pIndex->nEdgeVectorSize = vectorDataSize(compressNeighbours, pIndex->nVectorDims); + } *ppIndex = pIndex; + DiskAnnTrace(("opened index %s: max edges %d\n", zIdxName, nodeEdgesMaxCount(pIndex))); return SQLITE_OK; } diff --git a/libsql-sqlite3/src/vectorfloat1bit.c b/libsql-sqlite3/src/vectorfloat1bit.c new file mode 100644 index 0000000000..a9c4d45f26 --- /dev/null +++ b/libsql-sqlite3/src/vectorfloat1bit.c @@ -0,0 +1,140 @@ +/* +** 2024-07-04 +** +** Copyright 2024 the libSQL authors +** +** Permission is hereby granted, free of charge, to any person obtaining a copy of +** this software and associated documentation files (the "Software"), to deal in +** the Software without restriction, including without limitation the rights to +** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +** the Software, and to permit persons to whom the Software is furnished to do so, +** subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +****************************************************************************** +** +** 1-bit vector format utilities. 
+*/ +#ifndef SQLITE_OMIT_VECTOR +#include "sqliteInt.h" + +#include "vectorInt.h" + +#include + +/************************************************************************** +** Utility routines for debugging +**************************************************************************/ + +void vector1BitDump(const Vector *pVec){ + u8 *elems = pVec->data; + unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT1BIT ); + + printf("f1bit: ["); + for(i = 0; i < pVec->dims; i++){ + printf("%s%d", i == 0 ? "" : ", ", ((elems[i / 8] >> (i & 7)) & 1) ? +1 : -1); + } + printf("]\n"); +} + +/************************************************************************** +** Utility routines for vector serialization and deserialization +**************************************************************************/ + +void vector1BitSerializeToBlob( + const Vector *pVector, + unsigned char *pBlob, + size_t nBlobSize +){ + u8 *elems = pVector->data; + u8 *pPtr = pBlob; + unsigned i; + + assert( pVector->type == VECTOR_TYPE_FLOAT1BIT ); + assert( pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + for(i = 0; i < (pVector->dims + 7) / 8; i++){ + pPtr[i] = elems[i]; + } +} + +// [sum(map(int, bin(i)[2:])) for i in range(256)] +static int BitsCount[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 
6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; + +static inline int sqlite3PopCount32(u32 a){ +#if GCC_VERSION>=5004000 && !defined(__INTEL_COMPILER) + return __builtin_popcount(a); +#else + return BitsCount[a >> 24] + BitsCount[(a >> 16) & 0xff] + BitsCount[(a >> 8) & 0xff] + BitsCount[a & 0xff]; +#endif +} + +int vector1BitDistanceHamming(const Vector *v1, const Vector *v2){ + int diff = 0; + u8 *e1U8 = v1->data; + u32 *e1U32 = v1->data; + u8 *e2U8 = v2->data; + u32 *e2U32 = v2->data; + int i, len8, len32, offset8; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT1BIT ); + assert( v2->type == VECTOR_TYPE_FLOAT1BIT ); + + len8 = (v1->dims + 7) / 8; + len32 = v1->dims / 32; + offset8 = len32 * 4; + + for(i = 0; i < len32; i++){ + diff += sqlite3PopCount32(e1U32[i] ^ e2U32[i]); + } + for(i = offset8; i < len8; i++){ + diff += sqlite3PopCount32(e1U8[i] ^ e2U8[i]); + } + return diff; +} + +void vector1BitDeserializeFromBlob( + Vector *pVector, + const unsigned char *pBlob, + size_t nBlobSize +){ + u8 *elems = pVector->data; + + assert( pVector->type == VECTOR_TYPE_FLOAT1BIT ); + assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy(elems, pBlob, (pVector->dims + 7) / 8); +} + +#endif /* !defined(SQLITE_OMIT_VECTOR) */ diff --git a/libsql-sqlite3/src/vectorfloat32.c b/libsql-sqlite3/src/vectorfloat32.c index 8aeae2eb23..56d022ae9c 100644 --- a/libsql-sqlite3/src/vectorfloat32.c +++ b/libsql-sqlite3/src/vectorfloat32.c @@ -41,10 +41,11 @@ void vectorF32Dump(const Vector *pVec){ assert( pVec->type == VECTOR_TYPE_FLOAT32 ); + printf("f32: ["); for(i = 0; i < pVec->dims; i++){ - printf("%f ", elems[i]); + printf("%s%f", i == 0 ? 
"" : ", ", elems[i]); } - printf("\n"); + printf("]\n"); } /************************************************************************** @@ -56,25 +57,7 @@ static inline unsigned formatF32(float value, char *pBuf, int nBufSize){ return strlen(pBuf); } -static inline unsigned serializeF32(unsigned char *pBuf, float value){ - u32 *p = (u32 *)&value; - pBuf[0] = *p & 0xFF; - pBuf[1] = (*p >> 8) & 0xFF; - pBuf[2] = (*p >> 16) & 0xFF; - pBuf[3] = (*p >> 24) & 0xFF; - return sizeof(float); -} - -static inline float deserializeF32(const unsigned char *pBuf){ - u32 value = 0; - value |= (u32)pBuf[0]; - value |= (u32)pBuf[1] << 8; - value |= (u32)pBuf[2] << 16; - value |= (u32)pBuf[3] << 24; - return *(float *)&value; -} - -size_t vectorF32SerializeToBlob( +void vectorF32SerializeToBlob( const Vector *pVector, unsigned char *pBlob, size_t nBlobSize @@ -86,60 +69,11 @@ size_t vectorF32SerializeToBlob( assert( pVector->type == VECTOR_TYPE_FLOAT32 ); assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize >= pVector->dims * sizeof(float) ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ pPtr += serializeF32(pPtr, elems[i]); } - return sizeof(float) * pVector->dims; -} - -size_t vectorF32DeserializeFromBlob( - Vector *pVector, - const unsigned char *pBlob, - size_t nBlobSize -){ - float *elems = pVector->data; - unsigned i; - pVector->type = VECTOR_TYPE_FLOAT32; - pVector->dims = nBlobSize / sizeof(float); - - assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize % 2 == 0 || pBlob[nBlobSize - 1] == VECTOR_TYPE_FLOAT32 ); - - for(i = 0; i < pVector->dims; i++){ - elems[i] = deserializeF32(pBlob); - pBlob += sizeof(float); - } - return vectorDataSize(pVector->type, pVector->dims); -} - -void vectorF32Serialize( - sqlite3_context *context, - const Vector *pVector -){ - float *elems = pVector->data; - unsigned char *pBlob; - size_t nBlobSize; - - assert( pVector->type == VECTOR_TYPE_FLOAT32 ); - assert( 
pVector->dims <= MAX_VECTOR_SZ ); - - nBlobSize = vectorDataSize(pVector->type, pVector->dims); - - if( nBlobSize == 0 ){ - sqlite3_result_zeroblob(context, 0); - return; - } - - pBlob = sqlite3_malloc64(nBlobSize); - if( pBlob == NULL ){ - sqlite3_result_error_nomem(context); - return; - } - - vectorF32SerializeToBlob(pVector, pBlob, nBlobSize); - sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); } #define SINGLE_FLOAT_CHAR_LIMIT 32 @@ -215,37 +149,22 @@ float vectorF32DistanceL2(const Vector *v1, const Vector *v2){ return sqrt(sum); } -void vectorF32InitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - pVector->dims = nBlobSize / sizeof(float); - pVector->data = (void*)pBlob; -} - -int vectorF32ParseSqliteBlob( - sqlite3_value *arg, +void vectorF32DeserializeFromBlob( Vector *pVector, - char **pzErr + const unsigned char *pBlob, + size_t nBlobSize ){ - const unsigned char *pBlob; float *elems = pVector->data; unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT32 ); assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); - assert( sqlite3_value_type(arg) == SQLITE_BLOB ); - - pBlob = sqlite3_value_blob(arg); - if( sqlite3_value_bytes(arg) < sizeof(float) * pVector->dims ){ - *pzErr = sqlite3_mprintf("invalid f32 vector: not enough bytes for all dimensions"); - goto error; - } + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ elems[i] = deserializeF32(pBlob); pBlob += sizeof(float); } - return 0; -error: - return -1; } #endif /* !defined(SQLITE_OMIT_VECTOR) */ diff --git a/libsql-sqlite3/src/vectorfloat64.c b/libsql-sqlite3/src/vectorfloat64.c index ced2be1843..dca6bda773 100644 --- a/libsql-sqlite3/src/vectorfloat64.c +++ b/libsql-sqlite3/src/vectorfloat64.c @@ -38,10 +38,14 @@ void vectorF64Dump(const Vector *pVec){ double *elems = pVec->data; unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT64 ); + + printf("f64: ["); for(i = 0; i < pVec->dims; 
i++){ - printf("%lf ", elems[i]); + printf("%s%lf", i == 0 ? "" : ", ", elems[i]); } - printf("\n"); + printf("]\n"); } /************************************************************************** @@ -79,7 +83,7 @@ static inline double deserializeF64(const unsigned char *pBuf){ return *(double *)&value; } -size_t vectorF64SerializeToBlob( +void vectorF64SerializeToBlob( const Vector *pVector, unsigned char *pBlob, size_t nBlobSize @@ -90,63 +94,11 @@ size_t vectorF64SerializeToBlob( assert( pVector->type == VECTOR_TYPE_FLOAT64 ); assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize >= pVector->dims * sizeof(double) ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for (i = 0; i < pVector->dims; i++) { pPtr += serializeF64(pPtr, elems[i]); } - return sizeof(double) * pVector->dims; -} - -size_t vectorF64DeserializeFromBlob( - Vector *pVector, - const unsigned char *pBlob, - size_t nBlobSize -){ - double *elems = pVector->data; - unsigned i; - pVector->type = VECTOR_TYPE_FLOAT64; - pVector->dims = nBlobSize / sizeof(double); - - assert( pVector->dims <= MAX_VECTOR_SZ ); - assert( nBlobSize % 2 == 1 && pBlob[nBlobSize - 1] == VECTOR_TYPE_FLOAT64 ); - - for(i = 0; i < pVector->dims; i++){ - elems[i] = deserializeF64(pBlob); - pBlob += sizeof(double); - } - return vectorDataSize(pVector->type, pVector->dims); -} - -void vectorF64Serialize( - sqlite3_context *context, - const Vector *pVector -){ - double *elems = pVector->data; - unsigned char *pBlob; - size_t nBlobSize; - - assert( pVector->type == VECTOR_TYPE_FLOAT64 ); - assert( pVector->dims <= MAX_VECTOR_SZ ); - - // allocate one extra trailing byte with vector blob type metadata - nBlobSize = vectorDataSize(pVector->type, pVector->dims) + 1; - - if( nBlobSize == 0 ){ - sqlite3_result_zeroblob(context, 0); - return; - } - - pBlob = sqlite3_malloc64(nBlobSize); - if( pBlob == NULL ){ - sqlite3_result_error_nomem(context); - return; - } - - vectorF64SerializeToBlob(pVector, pBlob, 
nBlobSize - 1); - pBlob[nBlobSize - 1] = VECTOR_TYPE_FLOAT64; - - sqlite3_result_blob(context, (char*)pBlob, nBlobSize, sqlite3_free); } #define SINGLE_DOUBLE_CHAR_LIMIT 32 @@ -222,37 +174,22 @@ double vectorF64DistanceL2(const Vector *v1, const Vector *v2){ return sqrt(sum); } -void vectorF64InitFromBlob(Vector *pVector, const unsigned char *pBlob, size_t nBlobSize){ - pVector->dims = nBlobSize / sizeof(double); - pVector->data = (void*)pBlob; -} - -int vectorF64ParseSqliteBlob( - sqlite3_value *arg, +void vectorF64DeserializeFromBlob( Vector *pVector, - char **pzErr + const unsigned char *pBlob, + size_t nBlobSize ){ - const unsigned char *pBlob; double *elems = pVector->data; unsigned i; assert( pVector->type == VECTOR_TYPE_FLOAT64 ); assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); - assert( sqlite3_value_type(arg) == SQLITE_BLOB ); - - pBlob = sqlite3_value_blob(arg); - if( sqlite3_value_bytes(arg) < sizeof(double) * pVector->dims ){ - *pzErr = sqlite3_mprintf("invalid f64 vector: not enough bytes for all dimensions"); - goto error; - } + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); for(i = 0; i < pVector->dims; i++){ elems[i] = deserializeF64(pBlob); pBlob += sizeof(double); } - return 0; -error: - return -1; } #endif /* !defined(SQLITE_OMIT_VECTOR) */ diff --git a/libsql-sqlite3/src/vectorfloat8.c b/libsql-sqlite3/src/vectorfloat8.c new file mode 100644 index 0000000000..dd02d839b4 --- /dev/null +++ b/libsql-sqlite3/src/vectorfloat8.c @@ -0,0 +1,162 @@ +/* +** 2024-07-04 +** +** Copyright 2024 the libSQL authors +** +** Permission is hereby granted, free of charge, to any person obtaining a copy of +** this software and associated documentation files (the "Software"), to deal in +** the Software without restriction, including without limitation the rights to +** use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +** the Software, and to permit persons to whom the Software is furnished to do 
so, +** subject to the following conditions: +** +** The above copyright notice and this permission notice shall be included in all +** copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +** FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +** COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +** IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +****************************************************************************** +** +** 8-bit quantized floating point (FLOAT8) vector format utilities. +** +** The idea is to replace a vector [f_0, f_1, ..., f_k] with quantized uint8 values [q_0, q_1, ..., q_k] in such a way that +** f_i ~= alpha * q_i + shift, where alpha and shift are determined from all f_i values as follows: +** alpha = (max(f) - min(f)) / 255, shift = min(f) +** +** This differs from uint8 quantization in neural networks, which usually takes the form f_i = alpha * (q_i - z) instead. +** However, neural-network uint8 quantization is less generic and works better for distributions centered around zero (symmetric or not). +** In our implementation we want to handle more generic cases, so the benefits of neural-network-style quantization are not clear. +*/ +#ifndef SQLITE_OMIT_VECTOR +#include "sqliteInt.h" + +#include "vectorInt.h" + +#include <math.h> + +/************************************************************************** +** Utility routines for vector serialization and deserialization +**************************************************************************/ + +void vectorF8GetParameters(const u8 *pData, int dims, float *pAlpha, float *pShift){ + pData = pData + ALIGN(dims, sizeof(float)); + *pAlpha = deserializeF32(pData); + *pShift = 
deserializeF32(pData + sizeof(*pAlpha)); +} + +void vectorF8SetParameters(u8 *pData, int dims, float alpha, float shift){ + pData = pData + ALIGN(dims, sizeof(float)); + serializeF32(pData, alpha); + serializeF32(pData + sizeof(alpha), shift); +} + +void vectorF8Dump(const Vector *pVec){ + u8 *elems = pVec->data; + float alpha, shift; + unsigned i; + + assert( pVec->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(pVec->data, pVec->dims, &alpha, &shift); + + printf("f8: ["); + for(i = 0; i < pVec->dims; i++){ + printf("%s%f", i == 0 ? "" : ", ", (float)elems[i] * alpha + shift); + } + printf("]\n"); +} + +void vectorF8SerializeToBlob( + const Vector *pVector, + unsigned char *pBlob, + size_t nBlobSize +){ + float alpha, shift; + + assert( pVector->type == VECTOR_TYPE_FLOAT8 ); + assert( pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy(pBlob, pVector->data, pVector->dims); + + vectorF8GetParameters(pVector->data, pVector->dims, &alpha, &shift); + vectorF8SetParameters(pBlob, pVector->dims, alpha, shift); +} + +float vectorF8DistanceCos(const Vector *v1, const Vector *v2){ + int i; + float alpha1, shift1, alpha2, shift2; + u32 sum1 = 0, sum2 = 0, sumsq1 = 0, sumsq2 = 0, doti = 0; + float dot = 0, norm1 = 0, norm2 = 0; + u8 *data1 = v1->data, *data2 = v2->data; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT8 ); + assert( v2->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(v1->data, v1->dims, &alpha1, &shift1); + vectorF8GetParameters(v2->data, v2->dims, &alpha2, &shift2); + + /* + * (Ax + S)^2 = A^2 x^2 + 2AS x + S^2 -> we need to maintain 'sumsq' and 'sum' + * (A1x + S1) * (A2y + S2) = A1A2 xy + A1 S2 x + A2 S1 y + S1 S2 -> we need to maintain 'dot' and 'sum' again + */ + + for(i = 0; i < v1->dims; i++){ + sum1 += data1[i]; + sum2 += data2[i]; + sumsq1 += data1[i]*data1[i]; + sumsq2 += data2[i]*data2[i]; + doti += data1[i]*data2[i]; + } + + dot = alpha1 * 
alpha2 * (float)doti + alpha1 * shift2 * (float)sum1 + alpha2 * shift1 * (float)sum2 + shift1 * shift2 * v1->dims; + norm1 = alpha1 * alpha1 * (float)sumsq1 + 2 * alpha1 * shift1 * (float)sum1 + shift1 * shift1 * v1->dims; + norm2 = alpha2 * alpha2 * (float)sumsq2 + 2 * alpha2 * shift2 * (float)sum2 + shift2 * shift2 * v1->dims; + + return 1.0 - (dot / sqrt(norm1 * norm2)); +} + +float vectorF8DistanceL2(const Vector *v1, const Vector *v2){ + int i; + float alpha1, shift1, alpha2, shift2; + float sum = 0; + u8 *data1 = v1->data, *data2 = v2->data; + + assert( v1->dims == v2->dims ); + assert( v1->type == VECTOR_TYPE_FLOAT8 ); + assert( v2->type == VECTOR_TYPE_FLOAT8 ); + + vectorF8GetParameters(v1->data, v1->dims, &alpha1, &shift1); + vectorF8GetParameters(v2->data, v2->dims, &alpha2, &shift2); + + for(i = 0; i < v1->dims; i++){ + float d = (alpha1 * data1[i] + shift1) - (alpha2 * data2[i] + shift2); + sum += d*d; + } + return sqrt(sum); +} + +void vectorF8DeserializeFromBlob( + Vector *pVector, + const unsigned char *pBlob, + size_t nBlobSize +){ + float alpha, shift; + + assert( pVector->type == VECTOR_TYPE_FLOAT8 ); + assert( 0 <= pVector->dims && pVector->dims <= MAX_VECTOR_SZ ); + assert( nBlobSize >= vectorDataSize(pVector->type, pVector->dims) ); + + memcpy((u8*)pVector->data, (u8*)pBlob, ALIGN(pVector->dims, sizeof(float))); + + vectorF8GetParameters(pBlob, pVector->dims, &alpha, &shift); + vectorF8SetParameters(pVector->data, pVector->dims, alpha, shift); +} + +#endif /* !defined(SQLITE_OMIT_VECTOR) */ diff --git a/libsql-sqlite3/test/libsql_vector.test b/libsql-sqlite3/test/libsql_vector.test index cf91a7fa18..31d1edc843 100644 --- a/libsql-sqlite3/test/libsql_vector.test +++ b/libsql-sqlite3/test/libsql_vector.test @@ -50,6 +50,21 @@ do_execsql_test vector-1-func-valid { SELECT vector_distance_cos('[1,1]', '[-1,-1]'); SELECT vector_distance_cos('[1,1]', '[-1,1]'); SELECT vector_distance_cos('[1,2]', '[2,1]'); + SELECT 
vector_distance_cos(vector1bit('[10,-10]'), vector1bit('[-5,4]')); + SELECT vector_distance_cos(vector1bit('[10,-10]'), vector1bit('[20,4]')); + SELECT vector_distance_cos(vector1bit('[10,-10]'), vector1bit('[20,-2]')); + + SELECT vector_distance_cos(vector8('[10,-10]'), vector8('[10,-10]')); + SELECT vector_distance_cos(vector32('[10,-10]'), vector32('[10,-10]')); + + SELECT vector_distance_cos(vector8('[-21,-31,0,2,2.1,2.2,105]'), vector8('[-20,-30,0,1,1.1,1.2,100]')); + SELECT vector_distance_cos(vector32('[-21,-31,0,2,2.1,2.2,105]'), vector32('[-20,-30,0,1,1.1,1.2,100]')); + + SELECT vector_distance_cos(vector8('[-20,-30,0,1,1.1,1.2,100]'), vector8('[-20,-30,0,1,1.1,1.2,10000]')); + SELECT vector_distance_cos(vector32('[-20,-30,0,1,1.1,1.2,100]'), vector32('[-20,-30,0,1,1.1,1.2,10000]')); + + SELECT vector_distance_l2(vector('[1,2,2,3,4,1,5]'), vector('[2,3,1,-1,2,4,5]')); + SELECT vector_distance_l2(vector8('[1,2,2,3,4,1,5]'), vector8('[2,3,1,-1,2,4,5]')); } { {[]} {[]} @@ -65,6 +80,65 @@ do_execsql_test vector-1-func-valid { {2.0} {1.0} {0.200000002980232} + {2.0} + {1.0} + {0.0} + + {-6.10352568486405e-09} {0.0} + {0.000111237335659098} {0.000117244853754528} + {0.0576796568930149} {0.0582110174000263} + + {5.65685415267944} {5.65413522720337} +} + +do_execsql_test vector-1-conversion { + SELECT hex(vector32('[]')); + SELECT hex(vector64(vector32('[]'))); + + SELECT vector_extract(vector32(vector1bit('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector32(vector1bit('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + SELECT vector_extract(vector32(vector32('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector32(vector32('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + SELECT vector_extract(vector32(vector64('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector32(vector64('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + + SELECT vector_extract(vector64(vector1bit('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), 
hex(vector64(vector1bit('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + SELECT vector_extract(vector64(vector32('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector64(vector32('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + SELECT vector_extract(vector64(vector64('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector64(vector64('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + + SELECT vector_extract(vector1bit(vector1bit('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector1bit(vector1bit('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + SELECT vector_extract(vector1bit(vector32('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector1bit(vector32('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + SELECT vector_extract(vector1bit(vector64('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))), hex(vector1bit(vector64('[-0.000001,1e-100,1e100,-1e10,1e-10,0,1.5]'))); + + SELECT vector_extract(vector8(vector1bit('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector8(vector1bit('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); + SELECT vector_extract(vector8(vector32('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector8(vector32('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); + SELECT vector_extract(vector8(vector64('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector8(vector64('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); + SELECT vector_extract(vector8(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector8(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); + + SELECT vector_extract(vector1bit(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector1bit(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); + SELECT vector_extract(vector32(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector32(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); + SELECT vector_extract(vector64(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))), hex(vector64(vector8('[-20,-35.44,1,1.5,2,3,10,100,105,110]'))); +} { + {} + 02 + + 
{[-1,-1,1,-1,1,-1,1]} 000080BF000080BF0000803F000080BF0000803F000080BF0000803F + {[-1e-06,0,Inf,-1e+10,1e-10,0,1.5]} BD3786B5000000000000807FF90215D0FFE6DB2E000000000000C03F + {[-1e-06,0,Inf,-1e+10,1e-10,0,1.5]} BD3786B5000000000000807FF90215D0FFE6DB2E000000000000C03F + + {[-1,-1,1,-1,1,-1,1]} 000000000000F0BF000000000000F0BF000000000000F03F000000000000F0BF000000000000F03F000000000000F0BF000000000000F03F02 + {[-1e-06,0,Inf,-1e+10,1e-10,0,1.5]} 000000A0F7C6B0BE0000000000000000000000000000F07F000000205FA002C2000000E0DF7CDB3D0000000000000000000000000000F83F02 + {[-1e-06,1e-100,1e+100,-1e+10,1e-10,0,1.5]} 8DEDB5A0F7C6B0BE30058EE42EFF2B2B7DC39425AD49B254000000205FA002C2BBBDD7D9DF7CDB3D0000000000000000000000000000F83F02 + + {[-1,-1,1,-1,1,-1,1]} 540903 + {[-1,-1,1,-1,1,-1,1]} 540903 + {[-1,1,1,-1,1,-1,1]} 560903 + + {[-1,-1,1,1,1,1,1,1,1,1]} 0000FFFFFFFFFFFFFFFF00008180003C000080BF000204 + {[-20.0405,-35.44,1.06259,1.63295,2.2033,2.77365,10.1882,99.7337,104.867,110]} 1B004041424350EDF6FF0000A702123F8FC20DC2000204 + {[-20.0405,-35.44,1.06259,1.63295,2.2033,2.77365,10.1882,99.7337,104.867,110]} 1B004041424350EDF6FF0000A702123F8FC20DC2000204 + {[-20.0405,-35.44,1.06259,1.63295,2.2033,2.77365,10.1882,99.7337,104.867,110]} 1B004041424350EDF6FF0000A702123F8FC20DC2000204 + + {[-1,-1,1,1,1,1,1,1,1,1]} FC03001603 + {[-20.0405,-35.44,1.06259,1.63295,2.2033,2.77365,10.1882,99.7337,104.867,110]} E152A0C18FC20DC20003883F6004D13FD0020D408083314008032341A277C742D0BBD1420000DC42 + {[-20.0405,-35.44,1.06259,1.63295,2.2033,2.77365,10.1882,99.7337,104.867,110]} 000000205C0A34C0000000E051B841C0000000006000F13F000000008C20FA3F000000005AA001400000000070300640000000006160244000000040F4EE5840000000007A375A400000000000805B4002 } proc error_messages {sql} { @@ -89,6 +163,7 @@ do_test vector-1-func-errors { lappend ret [error_messages {SELECT vector(x'0000000000')}] lappend ret [error_messages {SELECT vector_distance_cos('[1,2,3]', '[1,2]')}] lappend ret [error_messages {SELECT 
vector_distance_cos(vector32('[1,2,3]'), vector64('[1,2,3]'))}] + lappend ret [error_messages {SELECT vector_distance_l2(vector1bit('[1,2,2,3,4,1,5]'), vector1bit('[2,3,1,-1,2,4,5]'))}] } [list {*}{ {vector: unexpected value type: got FLOAT, expected TEXT or BLOB} {vector: unexpected value type: got INTEGER, expected TEXT or BLOB} @@ -99,7 +174,8 @@ do_test vector-1-func-errors { {vector: invalid float at position 0: '[1'} {vector: invalid float at position 2: '1.1.1'} {vector: must end with ']'} - {vector: unexpected binary type: got 0, expected 1 or 2} - {vector_distance_cos: vectors must have the same length: 3 != 2} - {vector_distance_cos: vectors must have the same type: 1 != 2} + {vector: unexpected binary type: 0} + {vector_distance: vectors must have the same length: 3 != 2} + {vector_distance: vectors must have the same type: 1 != 2} + {vector_distance: l2 distance is not supported for float1bit vectors} }] diff --git a/libsql-sqlite3/test/libsql_vector_index.test b/libsql-sqlite3/test/libsql_vector_index.test index 19d31ba19c..a453af6a90 100644 --- a/libsql-sqlite3/test/libsql_vector_index.test +++ b/libsql-sqlite3/test/libsql_vector_index.test @@ -30,12 +30,17 @@ set testprefix vector sqlite3_db_config_lookaside db 0 0 0 -do_execsql_test vector-integrity { - CREATE TABLE t_integrity( v FLOAT32(3) ); - CREATE INDEX t_integrity_idx ON t_integrity( libsql_vector_idx(v) ); - INSERT INTO t_integrity VALUES (vector('[1,2,3]')); +do_execsql_test vector-pragmas { + CREATE TABLE t_pragmas( v FLOAT32(3) ); + CREATE INDEX t_pragmas_idx ON t_pragmas( libsql_vector_idx(v) ); + INSERT INTO t_pragmas VALUES (vector('[1,2,3]')); PRAGMA integrity_check; -} {{row 1 missing from index t_integrity_idx} {wrong # of entries in index t_integrity_idx}} + PRAGMA index_list='t_pragmas'; +} { + {row 1 missing from index t_pragmas_idx} + {wrong # of entries in index t_pragmas_idx} + 0 t_pragmas_idx 0 c 0 +} do_execsql_test vector-typename { CREATE TABLE t_type_spaces( v FLOAT32 ( 3 
) ); @@ -126,13 +131,23 @@ do_execsql_test vector-empty { do_execsql_test vector-null { - CREATE TABLE t_null( v FLOAT32(3)); + CREATE TABLE t_null( v FLOAT32(2)); CREATE INDEX t_null_idx ON t_null( libsql_vector_idx(v) ); - INSERT INTO t_null VALUES(vector('[1,2,3]')); + INSERT INTO t_null VALUES(vector('[1,-1]')); INSERT INTO t_null VALUES(NULL); - INSERT INTO t_null VALUES(vector('[2,3,4]')); - SELECT * FROM vector_top_k('t_null_idx', '[1,2,3]', 2); -} {1 3} + INSERT INTO t_null VALUES(vector('[-2,1]')); + SELECT * FROM vector_top_k('t_null_idx', '[1,1]', 2); + UPDATE t_null SET v = vector('[1,1]') WHERE rowid = 2; + SELECT rowid FROM vector_top_k('t_null_idx', vector('[1,1]'), 3); + UPDATE t_null SET v = NULL WHERE rowid = 3; + SELECT rowid FROM vector_top_k('t_null_idx', vector('[1,1]'), 3); + UPDATE t_null SET v = NULL; + SELECT rowid FROM vector_top_k('t_null_idx', vector('[1,1]'), 3); +} { + 1 3 + 2 1 3 + 2 1 +} do_execsql_test vector-sql { CREATE TABLE t_sql( v FLOAT32(3)); @@ -140,7 +155,7 @@ do_execsql_test vector-sql { INSERT INTO t_sql VALUES(vector('[1,2,3]')), (vector('[2,3,4]')); SELECT sql FROM sqlite_master WHERE name LIKE '%t_sql%'; SELECT name FROM libsql_vector_meta_shadow WHERE name = 't_sql_idx'; -} {{CREATE TABLE t_sql( v FLOAT32(3))} {CREATE TABLE t_sql_idx_shadow (index_key INTEGER , data BLOB, PRIMARY KEY (index_key))} {CREATE INDEX t_sql_idx ON t_sql( libsql_vector_idx(v) )} {t_sql_idx}} +} {{CREATE TABLE t_sql( v FLOAT32(3))} {CREATE TABLE t_sql_idx_shadow (index_key INTEGER , data BLOB, PRIMARY KEY (index_key))} {CREATE INDEX t_sql_idx_shadow_idx ON t_sql_idx_shadow (index_key)} {CREATE INDEX t_sql_idx ON t_sql( libsql_vector_idx(v) )} {t_sql_idx}} do_execsql_test vector-drop-index { CREATE TABLE t_index_drop( v FLOAT32(3)); @@ -236,12 +251,17 @@ do_execsql_test vector-attach { do_execsql_test vector-vacuum { CREATE TABLE t_vacuum ( emb FLOAT32(2) ); - INSERT INTO t_vacuum VALUES (vector('[1,2]')), (vector('[3,4]')); + INSERT INTO 
t_vacuum VALUES (vector('[1,2]')), (vector('[3,4]')), (vector('[5,6]')); CREATE INDEX t_vacuum_idx ON t_vacuum(libsql_vector_idx(emb)); VACUUM; SELECT COUNT(*) FROM t_vacuum; SELECT COUNT(*) FROM t_vacuum_idx_shadow; -} {2 2} + DELETE FROM t_vacuum WHERE rowid = 2; + VACUUM; + SELECT * FROM vector_top_k('t_vacuum_idx', vector('[1,2]'), 3); + SELECT * FROM vector_top_k('t_vacuum_idx', vector('[5,6]'), 3); + SELECT * FROM vector_top_k('t_vacuum_idx', vector('[3,4]'), 3); +} {3 3 1 2 2 1 2 1} do_execsql_test vector-many-columns { CREATE TABLE t_many ( i INTEGER PRIMARY KEY, e1 FLOAT32(2), e2 FLOAT32(2) ); @@ -263,13 +283,105 @@ do_execsql_test vector-transaction { SELECT * FROM vector_top_k('t_transaction_idx', vector('[1,2]'), 2); } {3 4 1 2} +do_execsql_test vector-1bit { + CREATE TABLE t_1bit( v FLOAT32(3) ); + CREATE INDEX t_1bit_idx ON t_1bit( libsql_vector_idx(v, 'compress_neighbors=float1bit') ); + INSERT INTO t_1bit VALUES (vector('[-1,-1,1]')); + INSERT INTO t_1bit VALUES (vector('[-1,1,-1.5]')); + INSERT INTO t_1bit VALUES (vector('[1,-1,-1]')); + INSERT INTO t_1bit VALUES (vector('[-1,-1,-1]')); + SELECT rowid FROM vector_top_k('t_1bit_idx', vector('[1,-1,-1]'), 4); +} {3 4 2 1} + do_execsql_test vector-all-params { CREATE TABLE t_all_params ( emb FLOAT32(2) ); - CREATE INDEX t_all_params_idx ON t_all_params(libsql_vector_idx(emb, 'type=diskann', 'metric=cos', 'alpha=1.2', 'search_l=200', 'insert_l=70', 'max_neighbors=6')); + CREATE INDEX t_all_params_idx ON t_all_params(libsql_vector_idx(emb, 'type=diskann', 'metric=cos', 'alpha=1.2', 'search_l=200', 'insert_l=70', 'max_neighbors=6', 'compress_neighbors=float1bit')); INSERT INTO t_all_params VALUES (vector('[1,2]')), (vector('[3,4]')); SELECT * FROM vector_top_k('t_all_params_idx', vector('[1,2]'), 2); } {1 2} +do_execsql_test vector-f64-index { + CREATE TABLE t_f64 ( emb FLOAT64(2) ); + CREATE INDEX t_f64_idx ON t_f64(libsql_vector_idx(emb)); + INSERT INTO t_f64 VALUES (vector64('[1,2]')), 
(vector64('[3,4]')); + SELECT * FROM vector_top_k('t_f64_idx', vector64('[1,2]'), 2); +} {1 2} + +do_execsql_test vector-partial { + CREATE TABLE t_partial( name TEXT, type INT, v FLOAT32(3)); + INSERT INTO t_partial VALUES ( 'a', 0, vector('[1,2,3]') ); + INSERT INTO t_partial VALUES ( 'b', 1, vector('[3,4,5]') ); + INSERT INTO t_partial VALUES ( 'c', 2, vector('[4,5,6]') ); + INSERT INTO t_partial VALUES ( 'd', 0, vector('[5,6,7]') ); + INSERT INTO t_partial VALUES ( 'e', 1, vector('[6,7,8]') ); + INSERT INTO t_partial VALUES ( 'f', 2, vector('[7,8,9]') ); + CREATE INDEX t_partial_idx_0 ON t_partial( libsql_vector_idx(v) ) WHERE type = 0; + CREATE INDEX t_partial_idx_1 ON t_partial( libsql_vector_idx(v) ) WHERE type = 1; + CREATE INDEX t_partial_idx_not_0 ON t_partial( libsql_vector_idx(v) ) WHERE type != 0; + SELECT id FROM vector_top_k('t_partial_idx_0', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_1', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_not_0', vector('[1,2,3]'), 10); + INSERT INTO t_partial VALUES ( 'g', 0, vector('[8,9,10]') ); + INSERT INTO t_partial VALUES ( 'h', 1, vector('[9,10,11]') ); + INSERT INTO t_partial VALUES ( 'i', 2, vector('[10,11,12]') ); + SELECT id FROM vector_top_k('t_partial_idx_0', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_1', vector('[1,2,3]'), 10); + SELECT id FROM vector_top_k('t_partial_idx_not_0', vector('[1,2,3]'), 10); +} { + 1 4 + 2 5 + 2 3 5 6 + + 1 4 7 + 2 5 8 + 2 3 5 6 8 9 +} + +do_execsql_test vector-1bit-index { + CREATE TABLE t_1bit_table( v FLOAT1BIT(4) ); + INSERT INTO t_1bit_table VALUES ( vector1bit('[1,-1,1,-1]') ); + CREATE INDEX t_1bit_table_idx ON t_1bit_table( libsql_vector_idx(v) ); + INSERT INTO t_1bit_table VALUES ( vector1bit('[-1,1,1,-1]') ); + INSERT INTO t_1bit_table VALUES ( vector1bit('[1,-1,-1,1]') ); + SELECT * FROM vector_top_k('t_1bit_table_idx', vector1bit('[10,-10,-20,20]'), 4); +} {3 1 2} + +do_execsql_test 
vector-f64-compress-f32 { + CREATE TABLE t_f64_f32( v FLOAT64(4) ); + CREATE INDEX t_f64_f32_idx ON t_f64_f32( libsql_vector_idx(v, 'compress_neighbors=float32') ); + INSERT INTO t_f64_f32 VALUES ( vector64('[1,-1,1,-1]') ); + INSERT INTO t_f64_f32 VALUES ( vector64('[-1,1,1,-1]') ); + INSERT INTO t_f64_f32 VALUES ( vector64('[1,-1,-1,1]') ); + SELECT * FROM vector_top_k('t_f64_f32_idx', vector64('[10,-10,-20,20]'), 4); +} {3 1 2} + +do_execsql_test vector-f32-compress-f8 { + CREATE TABLE t_f32_f8( v FLOAT32(4) ); + CREATE INDEX t_f32_f8_idx ON t_f32_f8( libsql_vector_idx(v, 'compress_neighbors=float8') ); + INSERT INTO t_f32_f8 VALUES ( vector('[1,-1,1,-1]') ); + INSERT INTO t_f32_f8 VALUES ( vector('[-1,1,1,-1]') ); + INSERT INTO t_f32_f8 VALUES ( vector('[1,-1,-1,1]') ); + SELECT * FROM vector_top_k('t_f32_f8_idx', vector('[10,-10,-20,20]'), 4); +} {3 1 2} + +do_execsql_test vector-f8 { + CREATE TABLE t_f8( v FLOAT8(4) ); + CREATE INDEX t_f8_idx ON t_f8( libsql_vector_idx(v) ); + INSERT INTO t_f8 VALUES ( vector8('[1,-1,1,-1]') ); + INSERT INTO t_f8 VALUES ( vector8('[-1,1,1,-1]') ); + INSERT INTO t_f8 VALUES ( vector8('[1,-1,-1,1]') ); + SELECT * FROM vector_top_k('t_f8_idx', vector8('[10,-10,-20,20]'), 4); +} {3 1 2} + +do_execsql_test vector-f8-compress-1bit { + CREATE TABLE t_f8_1bit( v FLOAT8(4) ); + CREATE INDEX t_f8_1bit_idx ON t_f8_1bit( libsql_vector_idx(v, 'compress_neighbors=float1bit') ); + INSERT INTO t_f8_1bit VALUES ( vector8('[1,-1,1,-1]') ); + INSERT INTO t_f8_1bit VALUES ( vector8('[-1,1,1,-1]') ); + INSERT INTO t_f8_1bit VALUES ( vector8('[1,-1,-1,1]') ); + SELECT * FROM vector_top_k('t_f8_1bit_idx', vector8('[10,-10,-20,20]'), 4); +} {3 1 2} + proc error_messages {sql} { set ret "" catch { @@ -280,6 +392,8 @@ proc error_messages {sql} { set ret [sqlite3_errmsg db] } +reset_db + do_test vector-errors { set ret [list] lappend ret [error_messages {CREATE INDEX t_no_idx ON t_no( libsql_vector_idx(v) )}] @@ -304,8 +418,6 @@ do_test vector-errors { 
sqlite3_exec db { CREATE TABLE t_mixed_t( v FLOAT32(3)); } sqlite3_exec db { INSERT INTO t_mixed_t VALUES('[1]'); } lappend ret [error_messages {CREATE INDEX t_mixed_t_idx ON t_mixed_t( libsql_vector_idx(v) )}] - sqlite3_exec db { CREATE TABLE t_partial( name TEXT, type INT, v FLOAT32(3)); } - lappend ret [error_messages {CREATE INDEX t_partial_idx ON t_partial( libsql_vector_idx(v) ) WHERE type = 0}] } [list {*}{ {no such table: main.t_no} {no such column: v} @@ -320,8 +432,7 @@ do_test vector-errors { {vector index: unsupported for tables without ROWID and composite primary key} {vector index(insert): dimensions are different: 1 != 4} {vector index(insert): dimensions are different: 5 != 4} - {vector index(insert): only f32 vectors are supported} + {vector index(insert): vector type differs from column type: 2 != 1} {vector index(search): dimensions are different: 2 != 4} {vector index(insert): dimensions are different: 1 != 3} - {vector index: where condition is forbidden} }] diff --git a/libsql-sqlite3/tool/mksqlite3c.tcl b/libsql-sqlite3/tool/mksqlite3c.tcl index 31e6d84a57..560992a60b 100644 --- a/libsql-sqlite3/tool/mksqlite3c.tcl +++ b/libsql-sqlite3/tool/mksqlite3c.tcl @@ -469,8 +469,10 @@ set flist { json.c vector.c vectordiskann.c + vectorfloat1bit.c vectorfloat32.c vectorfloat64.c + vectorfloat8.c vectorIndex.c vectorvtab.c rtree.c diff --git a/libsql-sqlite3/tool/showwal b/libsql-sqlite3/tool/showwal new file mode 100755 index 0000000000..0687a41595 Binary files /dev/null and b/libsql-sqlite3/tool/showwal differ diff --git a/libsql-sys/Cargo.toml b/libsql-sys/Cargo.toml index 26dd091ea9..8351012d9f 100644 --- a/libsql-sys/Cargo.toml +++ b/libsql-sys/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql-sys" -version = "0.6.0" +version = "0.7.0" edition = "2021" license = "MIT" description = "Native bindings to libSQL" @@ -12,7 +12,7 @@ categories = ["external-ffi-bindings"] [dependencies] bytes = "1.5.0" -libsql-ffi = { version = "0.3", path = 
"../libsql-ffi/" } +libsql-ffi = { version = "0.4", path = "../libsql-ffi/" } once_cell = "1.18.0" rusqlite = { workspace = true, features = ["trace"], optional = true } tracing = "0.1.37" diff --git a/libsql-wal/Cargo.toml b/libsql-wal/Cargo.toml index 9624596c28..f24f2e4c59 100644 --- a/libsql-wal/Cargo.toml +++ b/libsql-wal/Cargo.toml @@ -9,6 +9,7 @@ publish = false [dependencies] arc-swap = "1.7.1" async-stream = "0.3.5" +async-lock = "3.4.0" bitflags = "2.5.0" bytes = "1.6.0" chrono = "0.4.38" @@ -29,7 +30,7 @@ tokio-stream = "0.1.15" tracing = "0.1.40" uuid = { version = "1.8.0", features = ["v4"] } walkdir = "2.5.0" -zerocopy = { version = "0.7.32", features = ["derive", "alloc"] } +zerocopy = { workspace = true } aws-config = { version = "1", optional = true, features = ["behavior-version-latest"] } aws-sdk-s3 = { version = "1", optional = true } diff --git a/libsql-wal/src/lib.rs b/libsql-wal/src/lib.rs index df104eda49..1c0dc63566 100644 --- a/libsql-wal/src/lib.rs +++ b/libsql-wal/src/lib.rs @@ -15,6 +15,22 @@ const LIBSQL_MAGIC: u64 = u64::from_be_bytes(*b"LIBSQL\0\0"); const LIBSQL_PAGE_SIZE: u16 = 4096; const LIBSQL_WAL_VERSION: u16 = 1; +use zerocopy::byteorder::big_endian::{U16 as bu16, U64 as bu64}; +/// LibsqlFooter is located at the end of the libsql file. It contains libsql-specific metadata, +/// while remaining fully compatible with sqlite (which just ignores that footer). +/// +/// The fields are in big endian to remain coherent with sqlite. +#[derive(Copy, Clone, Debug, zerocopy::FromBytes, zerocopy::FromZeroes, zerocopy::AsBytes)] +#[repr(C)] +pub struct LibsqlFooter { + pub magic: bu64, + pub version: bu16, + /// Replication index checkpointed into this file. + /// Only valid if there are no outstanding segments to checkpoint, since a checkpoint could be + /// partial. 
+ pub replication_index: bu64, +} + #[cfg(any(debug_assertions, test))] pub mod test { use std::fs::OpenOptions; diff --git a/libsql-wal/src/replication/injector.rs b/libsql-wal/src/replication/injector.rs index 66710bbb22..a922330102 100644 --- a/libsql-wal/src/replication/injector.rs +++ b/libsql-wal/src/replication/injector.rs @@ -6,23 +6,23 @@ use crate::error::Result; use crate::io::Io; use crate::segment::Frame; use crate::shared_wal::SharedWal; -use crate::transaction::TxGuard; +use crate::transaction::TxGuardOwned; /// The injector takes frames and injects them in the wal. -pub struct Injector<'a, IO: Io> { +pub struct Injector { // The wal to which we are injecting wal: Arc>, buffer: Vec>, /// capacity of the frame buffer capacity: usize, - tx: TxGuard<'a, IO::File>, + tx: TxGuardOwned, max_tx_frame_no: u64, } -impl<'a, IO: Io> Injector<'a, IO> { +impl Injector { pub fn new( wal: Arc>, - tx: TxGuard<'a, IO::File>, + tx: TxGuardOwned, buffer_capacity: usize, ) -> Result { Ok(Self { @@ -34,7 +34,7 @@ impl<'a, IO: Io> Injector<'a, IO> { }) } - pub async fn insert_frame(&mut self, frame: Box) -> Result<()> { + pub async fn insert_frame(&mut self, frame: Box) -> Result> { let size_after = frame.size_after(); self.max_tx_frame_no = self.max_tx_frame_no.max(frame.header().frame_no()); self.buffer.push(frame); @@ -43,10 +43,10 @@ impl<'a, IO: Io> Injector<'a, IO> { self.flush(size_after).await?; } - Ok(()) + Ok(size_after.map(|_| self.max_tx_frame_no)) } - async fn flush(&mut self, size_after: Option) -> Result<()> { + pub async fn flush(&mut self, size_after: Option) -> Result<()> { let buffer = std::mem::take(&mut self.buffer); let current = self.wal.current.load(); let commit_data = size_after.map(|size| (size, self.max_tx_frame_no)); @@ -60,6 +60,11 @@ impl<'a, IO: Io> Injector<'a, IO> { Ok(()) } + + pub fn rollback(&mut self) { + self.buffer.clear(); + self.tx.reset(0); + } } #[cfg(test)] @@ -89,7 +94,10 @@ mod test { let mut tx = 
crate::transaction::Transaction::Read(replica_shared.begin_read(42)); replica_shared.upgrade(&mut tx).unwrap(); - let guard = tx.as_write_mut().unwrap().lock(); + let guard = tx + .into_write() + .unwrap_or_else(|_| panic!()) + .into_lock_owned(); let mut injector = Injector::new(replica_shared.clone(), guard, 10).unwrap(); primary_conn.execute("create table test (x)", ()).unwrap(); diff --git a/libsql-wal/src/segment/current.rs b/libsql-wal/src/segment/current.rs index d8d720a145..bda6d5742a 100644 --- a/libsql-wal/src/segment/current.rs +++ b/libsql-wal/src/segment/current.rs @@ -22,7 +22,7 @@ use crate::io::file::FileExt; use crate::io::Inspect; use crate::segment::{checked_frame_offset, SegmentFlags}; use crate::segment::{frame_offset, page_offset, sealed::SealedSegment}; -use crate::transaction::{Transaction, TxGuard}; +use crate::transaction::{Transaction, TxGuard, TxGuardOwned}; use crate::{LIBSQL_MAGIC, LIBSQL_PAGE_SIZE, LIBSQL_WAL_VERSION}; use super::list::SegmentList; @@ -125,7 +125,7 @@ impl CurrentSegment { frames: Vec>, // (size_after, last_frame_no) commit_data: Option<(u32, u64)>, - tx: &mut TxGuard<'_, F>, + tx: &mut TxGuardOwned, ) -> Result>> where F: FileExt, diff --git a/libsql-wal/src/segment/list.rs b/libsql-wal/src/segment/list.rs index 25dfa3a32a..f1e3252161 100644 --- a/libsql-wal/src/segment/list.rs +++ b/libsql-wal/src/segment/list.rs @@ -15,6 +15,7 @@ use crate::error::Result; use crate::io::buf::{ZeroCopyBoxIoBuf, ZeroCopyBuf}; use crate::io::FileExt; use crate::segment::Frame; +use crate::{LibsqlFooter, LIBSQL_MAGIC, LIBSQL_PAGE_SIZE, LIBSQL_WAL_VERSION}; use super::Segment; @@ -157,6 +158,21 @@ where buf = read_buf.into_inner(); } + // update the footer at the end of the db file. 
+ let footer = LibsqlFooter { + magic: LIBSQL_MAGIC.into(), + version: LIBSQL_WAL_VERSION.into(), + replication_index: last_replication_index.into(), + }; + + let footer_offset = size_after as usize * LIBSQL_PAGE_SIZE as usize; + let (_, ret) = db_file + .write_all_at_async(ZeroCopyBuf::new_init(footer), footer_offset as u64) + .await; + ret?; + + // todo: truncate if necessary + //// todo: make async db_file.sync_all()?; @@ -185,7 +201,7 @@ where Ok(Some(last_replication_index)) } - /// returnsstream pages from the sealed segment list, and what's the lowest replication index + /// returns a stream of pages from the sealed segment list, and what's the lowest replication index /// that was covered. If the returned index is less than start frame_no, the missing frames /// must be read somewhere else. pub async fn stream_pages_from<'a>( diff --git a/libsql-wal/src/shared_wal.rs b/libsql-wal/src/shared_wal.rs index 09a2747c5a..461ad13e03 100644 --- a/libsql-wal/src/shared_wal.rs +++ b/libsql-wal/src/shared_wal.rs @@ -20,7 +20,7 @@ use libsql_sys::name::NamespaceName; #[derive(Default)] pub struct WalLock { - pub(crate) tx_id: Arc>>, + pub(crate) tx_id: Arc>>, /// When a writer is popped from the write queue, its write transaction may not be reading from the most recent /// snapshot. In this case, we return `SQLITE_BUSY_SNAPHSOT` to the caller. 
If no reads were performed /// with that transaction before upgrading, then the caller will call us back immediately after re-acquiring @@ -108,7 +108,7 @@ impl SharedWal { Some(id) if id == read_tx.conn_id => { tracing::trace!("taking reserved slot"); reserved.take(); - let lock = self.wal_lock.tx_id.lock(); + let lock = self.wal_lock.tx_id.lock_blocking(); let write_tx = self.acquire_write(read_tx, lock, reserved)?; *tx = Transaction::Write(write_tx); return Ok(()); @@ -117,7 +117,7 @@ impl SharedWal { } } - let lock = self.wal_lock.tx_id.lock(); + let lock = self.wal_lock.tx_id.lock_blocking(); match *lock { None if self.wal_lock.waiters.is_empty() => { let write_tx = @@ -144,7 +144,7 @@ impl SharedWal { fn acquire_write( &self, read_tx: &ReadTransaction, - mut tx_id_lock: MutexGuard>, + mut tx_id_lock: async_lock::MutexGuard>, mut reserved: MutexGuard>, ) -> Result> { // we read two fields in the header. There is no risk that a transaction commit in diff --git a/libsql-wal/src/transaction.rs b/libsql-wal/src/transaction.rs index 723cffeae1..f2cdd5be70 100644 --- a/libsql-wal/src/transaction.rs +++ b/libsql-wal/src/transaction.rs @@ -4,7 +4,6 @@ use std::sync::Arc; use std::time::Instant; use libsql_sys::name::NamespaceName; -use parking_lot::{ArcMutexGuard, RawMutex}; use tokio::sync::mpsc; use crate::checkpointer::CheckpointMessage; @@ -31,6 +30,14 @@ impl Transaction { } } + pub fn into_write(self) -> Result, Self> { + if let Self::Write(v) = self { + Ok(v) + } else { + Err(self) + } + } + pub fn max_frame_no(&self) -> u64 { match self { Transaction::Write(w) => w.next_frame_no - 1, @@ -147,8 +154,27 @@ pub struct WriteTransaction { pub recompute_checksum: Option, } +pub struct TxGuardOwned { + _lock: async_lock::MutexGuardArc>, + inner: WriteTransaction, +} + +impl Deref for TxGuardOwned { + type Target = WriteTransaction; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for TxGuardOwned { + fn deref_mut(&mut self) -> &mut 
Self::Target { + &mut self.inner + } +} + pub struct TxGuard<'a, F> { - _lock: ArcMutexGuard>, + _lock: async_lock::MutexGuardArc>, inner: &'a mut WriteTransaction, } @@ -189,7 +215,7 @@ impl WriteTransaction { todo!("txn has already been commited"); } - let g = self.wal_lock.tx_id.lock_arc(); + let g = self.wal_lock.tx_id.lock_arc_blocking(); match *g { // we still hold the lock, we can proceed Some(id) if self.id == id => TxGuard { @@ -202,6 +228,25 @@ impl WriteTransaction { } } + pub fn into_lock_owned(self) -> TxGuardOwned { + if self.is_commited { + tracing::error!("transaction already commited"); + todo!("txn has already been commited"); + } + + let g = self.wal_lock.tx_id.lock_arc_blocking(); + match *g { + // we still hold the lock, we can proceed + Some(id) if self.id == id => TxGuardOwned { + _lock: g, + inner: self, + }, + // Somebody took the lock from us + Some(_) => todo!("lock stolen"), + None => todo!("not a transaction"), + } + } + pub fn reset(&mut self, savepoint_id: usize) { if savepoint_id >= self.savepoints.len() { unreachable!("savepoint doesn't exist"); @@ -231,7 +276,7 @@ impl WriteTransaction { let Self { wal_lock, read_tx, .. 
} = self; - let mut lock = wal_lock.tx_id.lock(); + let mut lock = wal_lock.tx_id.lock_blocking(); match *lock { Some(lock_id) if lock_id == read_tx.id => { lock.take(); diff --git a/libsql/Cargo.toml b/libsql/Cargo.toml index fa89cc68ad..aa78a1bf0c 100644 --- a/libsql/Cargo.toml +++ b/libsql/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql" -version = "0.5.0-alpha.2" +version = "0.5.1" edition = "2021" description = "libSQL library: the main gateway for interacting with the database" repository = "https://github.com/tursodatabase/libsql" @@ -11,7 +11,7 @@ tracing = { version = "0.1.37", default-features = false } thiserror = "1.0.40" futures = { version = "0.3.28", optional = true } -libsql-sys = { version = "0.6", path = "../libsql-sys", optional = true } +libsql-sys = { version = "0.7", path = "../libsql-sys", optional = true } libsql-hrana = { version = "0.2", path = "../libsql-hrana", optional = true } tokio = { version = "1.29.1", features = ["sync"], optional = true } tokio-util = { version = "0.7", features = ["io-util", "codec"], optional = true } @@ -20,7 +20,7 @@ hyper = { workspace = true, features = ["client", "stream"], optional = true } hyper-rustls = { version = "0.25", features = ["webpki-roots"], optional = true } base64 = { version = "0.21", optional = true } serde = { version = "1", features = ["derive"], optional = true } -serde_json = { version = "1", optional = true } +serde_json = { version = "1", features = ["float_roundtrip"], optional = true } async-trait = "0.1" bitflags = { version = "2.4.0", optional = true } tower = { workspace = true, features = ["util"], optional = true } @@ -37,10 +37,10 @@ tower-http = { version = "0.4.4", features = ["trace", "set-header", "util"], op http = { version = "0.2", optional = true } zerocopy = { version = "0.7.28", optional = true } -sqlite3-parser = { package = "libsql-sqlite3-parser", path = "../vendored/sqlite3-parser", version = "0.12", optional = true } +sqlite3-parser = { package = 
"libsql-sqlite3-parser", path = "../vendored/sqlite3-parser", version = "0.13", optional = true } fallible-iterator = { version = "0.3", optional = true } -libsql_replication = { version = "0.4", path = "../libsql-replication", optional = true } +libsql_replication = { version = "0.5", path = "../libsql-replication", optional = true } async-stream = { version = "0.3.5", optional = true } [dev-dependencies] diff --git a/libsql/src/database.rs b/libsql/src/database.rs index e87def367d..d14cf2e42c 100644 --- a/libsql/src/database.rs +++ b/libsql/src/database.rs @@ -7,9 +7,10 @@ pub use builder::Builder; #[cfg(feature = "core")] pub use libsql_sys::{Cipher, EncryptionConfig}; -use std::fmt; - use crate::{Connection, Result}; +use std::fmt; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; cfg_core! { bitflags::bitflags! { @@ -76,6 +77,9 @@ impl fmt::Debug for DbType { /// not do much work until the [`Database::connect`] fn is called. pub struct Database { db_type: DbType, + /// The maximum replication index returned from a write performed using any connection created using this Database object. + #[allow(dead_code)] + max_write_replication_index: Arc, } cfg_core! { @@ -87,6 +91,7 @@ cfg_core! { Ok(Database { db_type: DbType::Memory { db }, + max_write_replication_index: Default::default(), }) } @@ -105,6 +110,7 @@ cfg_core! { flags, encryption_config: None, }, + max_write_replication_index: Default::default(), }) } } @@ -130,6 +136,7 @@ cfg_replication! { Ok(Database { db_type: DbType::Sync { db, encryption_config }, + max_write_replication_index: Default::default(), }) } @@ -191,6 +198,7 @@ cfg_replication! { Ok(Database { db_type: DbType::Sync { db, encryption_config }, + max_write_replication_index: Default::default(), }) } @@ -317,6 +325,7 @@ cfg_replication! { Ok(Database { db_type: DbType::Sync { db, encryption_config }, + max_write_replication_index: Default::default(), }) } @@ -331,6 +340,16 @@ cfg_replication! 
{ } } + /// Sync database from remote until it gets to a given replication_index or further, + /// and returns the committed frame_no after syncing, if applicable. + pub async fn sync_until(&self, replication_index: FrameNo) -> Result { + if let DbType::Sync { db, encryption_config: _ } = &self.db_type { + db.sync_until(replication_index).await + } else { + Err(Error::SyncNotSupported(format!("{:?}", self.db_type))) + } + } + /// Apply a set of frames to the database and returns the committed frame_no after syncing, if /// applicable. pub async fn sync_frames(&self, frames: crate::replication::Frames) -> Result> { @@ -372,12 +391,25 @@ cfg_replication! { DbType::Sync { db, .. } => { let path = db.path().to_string(); Ok(Database { - db_type: DbType::File { path, flags: OpenFlags::default(), encryption_config: None} + db_type: DbType::File { path, flags: OpenFlags::default(), encryption_config: None}, + max_write_replication_index: Default::default(), }) } t => Err(Error::FreezeNotSupported(format!("{:?}", t))) } } + + /// Get the maximum replication index returned from a write performed using any connection created using this Database object. + pub fn max_write_replication_index(&self) -> Option { + let index = self + .max_write_replication_index + .load(std::sync::atomic::Ordering::SeqCst); + if index == 0 { + None + } else { + Some(index) + } + } } } @@ -445,6 +477,7 @@ cfg_remote! 
{ connector: crate::util::ConnectorService::new(svc), version, }, + max_write_replication_index: Default::default(), }) } } @@ -552,7 +585,11 @@ impl Database { let local = LibsqlConnection { conn }; let writer = local.conn.new_connection_writer(); - let remote = crate::replication::RemoteConnection::new(local, writer); + let remote = crate::replication::RemoteConnection::new( + local, + writer, + self.max_write_replication_index.clone(), + ); let conn = std::sync::Arc::new(remote); Ok(Connection { conn }) diff --git a/libsql/src/database/builder.rs b/libsql/src/database/builder.rs index 8749b6452b..35cd93f899 100644 --- a/libsql/src/database/builder.rs +++ b/libsql/src/database/builder.rs @@ -135,6 +135,7 @@ cfg_core! { let db = crate::local::Database::open(":memory:", crate::OpenFlags::default())?; Database { db_type: DbType::Memory { db } , + max_write_replication_index: Default::default(), } } else { let path = self @@ -150,6 +151,7 @@ cfg_core! { flags: self.inner.flags, encryption_config: self.inner.encryption_config, }, + max_write_replication_index: Default::default(), } }; @@ -291,6 +293,7 @@ cfg_replication! { Ok(Database { db_type: DbType::Sync { db, encryption_config }, + max_write_replication_index: Default::default(), }) } } @@ -360,6 +363,7 @@ cfg_replication! { Ok(Database { db_type: DbType::Sync { db, encryption_config }, + max_write_replication_index: Default::default(), }) } } @@ -414,6 +418,7 @@ cfg_remote! 
{ connector, version, }, + max_write_replication_index: Default::default(), }) } } diff --git a/libsql/src/de.rs b/libsql/src/de.rs index 63ee71f598..44f231c134 100644 --- a/libsql/src/de.rs +++ b/libsql/src/de.rs @@ -68,7 +68,7 @@ impl<'de> Deserializer<'de> for RowDeserializer<'de> { visitor.visit_map(RowMapAccess { row: self.row, - idx: 0..self.row.inner.column_count(), + idx: 0..(self.row.inner.column_count() as usize), value: None, }) } diff --git a/libsql/src/hrana/mod.rs b/libsql/src/hrana/mod.rs index 9befe549de..4a6fd0c63a 100644 --- a/libsql/src/hrana/mod.rs +++ b/libsql/src/hrana/mod.rs @@ -24,7 +24,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use super::rows::{RowInner, RowsInner}; +use super::rows::{ColumnsInner, RowInner, RowsInner}; pub(crate) type Result = std::result::Result; @@ -261,7 +261,12 @@ where async fn next(&mut self) -> crate::Result> { self.next().await } +} +impl ColumnsInner for HranaRows +where + S: Stream> + Send + Sync + Unpin, +{ fn column_count(&self) -> i32 { self.column_count() } @@ -303,13 +308,6 @@ impl RowInner for Row { Ok(into_value2(v)) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.cols - .get(idx as usize) - .and_then(|c| c.name.as_ref()) - .map(|s| s.as_str()) - } - fn column_str(&self, idx: i32) -> crate::Result<&str> { if let Some(value) = self.inner.get(idx as usize) { if let proto::Value::Text { value } = value { @@ -321,6 +319,15 @@ impl RowInner for Row { Err(crate::Error::ColumnNotFound(idx)) } } +} + +impl ColumnsInner for Row { + fn column_name(&self, idx: i32) -> Option<&str> { + self.cols + .get(idx as usize) + .and_then(|c| c.name.as_ref()) + .map(|s| s.as_str()) + } fn column_type(&self, idx: i32) -> crate::Result { if let Some(value) = self.inner.get(idx as usize) { @@ -337,8 +344,8 @@ impl RowInner for Row { } } - fn column_count(&self) -> usize { - self.cols.len() + fn column_count(&self) -> i32 { + self.cols.len() as i32 } } @@ -417,7 +424,9 @@ impl 
RowsInner for StmtResultRows { inner: Box::new(row), })) } +} +impl ColumnsInner for StmtResultRows { fn column_count(&self) -> i32 { self.cols.len() as i32 } diff --git a/libsql/src/local/database.rs b/libsql/src/local/database.rs index 2892d809cc..3453a777c9 100644 --- a/libsql/src/local/database.rs +++ b/libsql/src/local/database.rs @@ -277,6 +277,29 @@ impl Database { Ok(self.sync_oneshot().await?) } + #[cfg(feature = "replication")] + /// Sync with primary at least to a given replication index + pub async fn sync_until(&self, replication_index: FrameNo) -> Result { + if let Some(ctx) = &self.replication_ctx { + let mut frame_no: Option = ctx.replicator.committed_frame_no().await; + let mut frames_synced: usize = 0; + while frame_no.unwrap_or(0) < replication_index { + let res = ctx.replicator.sync_oneshot().await?; + frame_no = res.frame_no(); + frames_synced += res.frames_synced(); + } + Ok(crate::replication::Replicated { + frame_no, + frames_synced, + }) + } else { + Err(crate::errors::Error::Misuse( + "No replicator available. 
Use Database::with_replicator() to enable replication" + .to_string(), + )) + } + } + #[cfg(feature = "replication")] pub async fn sync_frames(&self, frames: Frames) -> Result> { if let Some(ref ctx) = self.replication_ctx { diff --git a/libsql/src/local/impls.rs b/libsql/src/local/impls.rs index 8a9a5f440e..2338317a34 100644 --- a/libsql/src/local/impls.rs +++ b/libsql/src/local/impls.rs @@ -5,7 +5,7 @@ use crate::connection::BatchRows; use crate::{ connection::Conn, params::Params, - rows::{RowInner, RowsInner}, + rows::{ColumnsInner, RowInner, RowsInner}, statement::Stmt, transaction::Tx, Column, Connection, Result, Row, Rows, Statement, Transaction, TransactionBehavior, Value, @@ -159,7 +159,9 @@ impl RowsInner for LibsqlRows { Ok(row) } +} +impl ColumnsInner for LibsqlRows { fn column_count(&self) -> i32 { self.0.column_count() } @@ -180,20 +182,22 @@ impl RowInner for LibsqlRow { self.0.get_value(idx) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.0.column_name(idx) - } - fn column_str(&self, idx: i32) -> Result<&str> { self.0.get::<&str>(idx) } +} + +impl ColumnsInner for LibsqlRow { + fn column_name(&self, idx: i32) -> Option<&str> { + self.0.column_name(idx) + } fn column_type(&self, idx: i32) -> Result { self.0.column_type(idx).map(ValueType::from) } - fn column_count(&self) -> usize { - self.0.stmt.column_count() + fn column_count(&self) -> i32 { + self.0.stmt.column_count() as i32 } } diff --git a/libsql/src/local/rows.rs b/libsql/src/local/rows.rs index 7eb52d461b..4d4e622c75 100644 --- a/libsql/src/local/rows.rs +++ b/libsql/src/local/rows.rs @@ -1,6 +1,6 @@ use crate::local::{Connection, Statement}; use crate::params::Params; -use crate::rows::{RowInner, RowsInner}; +use crate::rows::{ColumnsInner, RowInner, RowsInner}; use crate::{errors, Error, Result}; use crate::{Value, ValueRef}; use libsql_sys::ValueType; @@ -213,7 +213,9 @@ impl RowsInner for BatchedRows { Ok(None) } } +} +impl ColumnsInner for BatchedRows { fn 
column_count(&self) -> i32 { self.cols.len() as i32 } @@ -244,10 +246,6 @@ impl RowInner for BatchedRow { .ok_or(Error::InvalidColumnIndex) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.cols.get(idx as usize).map(|c| c.0.as_str()) - } - fn column_str(&self, idx: i32) -> Result<&str> { self.row .get(idx as usize) @@ -258,9 +256,15 @@ impl RowInner for BatchedRow { .ok_or(Error::InvalidColumnType) }) } +} + +impl ColumnsInner for BatchedRow { + fn column_name(&self, idx: i32) -> Option<&str> { + self.cols.get(idx as usize).map(|c| c.0.as_str()) + } - fn column_count(&self) -> usize { - self.cols.len() + fn column_count(&self) -> i32 { + self.cols.len() as i32 } fn column_type(&self, idx: i32) -> Result { diff --git a/libsql/src/local/statement.rs b/libsql/src/local/statement.rs index 70116a152e..c28a66f18f 100644 --- a/libsql/src/local/statement.rs +++ b/libsql/src/local/statement.rs @@ -250,15 +250,15 @@ impl Statement { /// sure that current statement has already been stepped once before /// calling this method. pub fn column_names(&self) -> Vec<&str> { - let n = self.column_count(); - let mut cols = Vec::with_capacity(n); - for i in 0..n { - let s = self.column_name(i); - if let Some(s) = s { - cols.push(s); - } - } - cols + let n = self.column_count(); + let mut cols = Vec::with_capacity(n); + for i in 0..n { + let s = self.column_name(i); + if let Some(s) = s { + cols.push(s); + } + } + cols } /// Return the number of columns in the result set returned by the prepared @@ -314,12 +314,11 @@ impl Statement { /// the specified `name`. pub fn column_index(&self, name: &str) -> Result { let bytes = name.as_bytes(); - let n = self.column_count() as i32; + let n = self.column_count(); for i in 0..n { // Note: `column_name` is only fallible if `i` is out of bounds, // which we've already checked. 
let col_name = self - .inner .column_name(i) .ok_or_else(|| Error::InvalidColumnName(name.to_string()))?; if bytes.eq_ignore_ascii_case(col_name.as_bytes()) { diff --git a/libsql/src/replication/connection.rs b/libsql/src/replication/connection.rs index c82f523559..593bd634a1 100644 --- a/libsql/src/replication/connection.rs +++ b/libsql/src/replication/connection.rs @@ -2,7 +2,7 @@ use std::str::FromStr; use std::sync::Arc; - +use std::sync::atomic::AtomicU64; use libsql_replication::rpc::proxy::{ describe_result, query_result::RowResult, Cond, DescribeResult, ExecuteResults, NotCond, OkCond, Positional, Query, ResultRows, State as RemoteState, Step, @@ -11,7 +11,7 @@ use parking_lot::Mutex; use crate::parser; use crate::parser::StmtKind; -use crate::rows::{RowInner, RowsInner}; +use crate::rows::{ColumnsInner, RowInner, RowsInner}; use crate::statement::Stmt; use crate::transaction::Tx; use crate::{ @@ -28,6 +28,7 @@ pub struct RemoteConnection { pub(self) local: LibsqlConnection, writer: Option, inner: Arc>, + max_write_replication_index: Arc, } #[derive(Default, Debug)] @@ -166,12 +167,25 @@ impl From for State { } impl RemoteConnection { - pub(crate) fn new(local: LibsqlConnection, writer: Option) -> Self { + pub(crate) fn new(local: LibsqlConnection, writer: Option, max_write_replication_index: Arc) -> Self { let state = Arc::new(Mutex::new(Inner::default())); Self { local, writer, inner: state, + max_write_replication_index, + } + } + + fn update_max_write_replication_index(&self, index: Option) { + if let Some(index) = index { + let mut current = self.max_write_replication_index.load(std::sync::atomic::Ordering::SeqCst); + while index > current { + match self.max_write_replication_index.compare_exchange(current, index, std::sync::atomic::Ordering::SeqCst, std::sync::atomic::Ordering::SeqCst) { + Ok(_) => break, + Err(new_current) => current = new_current, + } + } } } @@ -201,6 +215,8 @@ impl RemoteConnection { .into(); } + 
self.update_max_write_replication_index(res.current_frame_no); + if let Some(replicator) = writer.replicator() { replicator.sync_oneshot().await?; } @@ -226,6 +242,8 @@ impl RemoteConnection { .into(); } + self.update_max_write_replication_index(res.current_frame_no); + if let Some(replicator) = writer.replicator() { replicator.sync_oneshot().await?; } @@ -780,7 +798,9 @@ impl RowsInner for RemoteRows { let row = RemoteRow(values, self.0.column_descriptions.clone()); Ok(Some(row).map(Box::new).map(|inner| Row { inner })) } +} +impl ColumnsInner for RemoteRows { fn column_count(&self) -> i32 { self.0.column_descriptions.len() as i32 } @@ -813,10 +833,6 @@ impl RowInner for RemoteRow { .ok_or(Error::InvalidColumnIndex) } - fn column_name(&self, idx: i32) -> Option<&str> { - self.1.get(idx as usize).map(|s| s.name.as_str()) - } - fn column_str(&self, idx: i32) -> Result<&str> { let value = self.0.get(idx as usize).ok_or(Error::InvalidColumnIndex)?; @@ -825,6 +841,12 @@ impl RowInner for RemoteRow { _ => Err(Error::InvalidColumnType), } } +} + +impl ColumnsInner for RemoteRow { + fn column_name(&self, idx: i32) -> Option<&str> { + self.1.get(idx as usize).map(|s| s.name.as_str()) + } fn column_type(&self, idx: i32) -> Result { let col = self.1.get(idx as usize).unwrap(); @@ -835,8 +857,8 @@ impl RowInner for RemoteRow { .ok_or(Error::InvalidColumnType) } - fn column_count(&self) -> usize { - self.1.len() + fn column_count(&self) -> i32 { + self.1.len() as i32 } } diff --git a/libsql/src/replication/local_client.rs b/libsql/src/replication/local_client.rs index 2d7b940c92..d3c713f530 100644 --- a/libsql/src/replication/local_client.rs +++ b/libsql/src/replication/local_client.rs @@ -3,6 +3,7 @@ use std::pin::Pin; use futures::{StreamExt, TryStreamExt}; use libsql_replication::{ + rpc::replication::Frame as RpcFrame, frame::{Frame, FrameNo}, meta::WalIndexMeta, replicator::{Error, ReplicatorClient}, @@ -35,7 +36,7 @@ impl LocalClient { #[async_trait::async_trait] impl 
ReplicatorClient for LocalClient { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { @@ -46,7 +47,7 @@ impl ReplicatorClient for LocalClient { async fn next_frames(&mut self) -> Result { match self.frames.take() { Some(Frames::Vec(f)) => { - let iter = f.into_iter().map(Ok); + let iter = f.into_iter().map(|f| RpcFrame { data: f.bytes(), timestamp: None }).map(Ok); Ok(Box::pin(tokio_stream::iter(iter))) } Some(f @ Frames::Snapshot(_)) => { @@ -70,7 +71,8 @@ impl ReplicatorClient for LocalClient { if s.as_mut().peek().await.is_none() { next.header_mut().size_after = size_after.into(); } - yield Frame::from(next); + let frame = Frame::from(next); + yield RpcFrame { data: frame.bytes(), timestamp: None }; } }; @@ -95,8 +97,9 @@ impl ReplicatorClient for LocalClient { #[cfg(test)] mod test { - use libsql_replication::snapshot::SnapshotFile; + use libsql_replication::{frame::FrameHeader, snapshot::SnapshotFile}; use tempfile::tempdir; + use zerocopy::FromBytes; use super::*; @@ -111,7 +114,8 @@ mod test { let mut s = client.snapshot().await.unwrap(); assert!(matches!(s.next().await, Some(Ok(_)))); let last = s.next().await.unwrap().unwrap(); - assert_eq!(last.header().size_after.get(), 2); + let header: FrameHeader = FrameHeader::read_from_prefix(&last.data[..]).unwrap(); + assert_eq!(header.size_after.get(), 2); assert!(s.next().await.is_none()); } } diff --git a/libsql/src/replication/mod.rs b/libsql/src/replication/mod.rs index 69cc0b5db2..116839a54f 100644 --- a/libsql/src/replication/mod.rs +++ b/libsql/src/replication/mod.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use std::time::Duration; pub use libsql_replication::frame::{Frame, FrameNo}; +use libsql_replication::injector::SqliteInjector; use libsql_replication::replicator::{Either, Replicator}; pub use libsql_replication::snapshot::SnapshotFile; @@ -35,8 +36,8 @@ pub(crate) mod 
remote_client; #[derive(Debug)] pub struct Replicated { - frame_no: Option, - frames_synced: usize, + pub(crate) frame_no: Option, + pub(crate) frames_synced: usize, } impl Replicated { @@ -129,7 +130,7 @@ impl Writer { #[derive(Clone)] pub(crate) struct EmbeddedReplicator { - replicator: Arc>>>, + replicator: Arc, SqliteInjector>>>, bg_abort: Option>, last_frames_synced: Arc, } @@ -149,7 +150,7 @@ impl EmbeddedReplicator { perodic_sync: Option, ) -> Result { let replicator = Arc::new(Mutex::new( - Replicator::new( + Replicator::new_sqlite( Either::Left(client), db_path, auto_checkpoint, @@ -193,7 +194,7 @@ impl EmbeddedReplicator { encryption_config: Option, ) -> Result { let replicator = Arc::new(Mutex::new( - Replicator::new( + Replicator::new_sqlite( Either::Right(client), db_path, auto_checkpoint, diff --git a/libsql/src/replication/remote_client.rs b/libsql/src/replication/remote_client.rs index dbab056938..864392ddb5 100644 --- a/libsql/src/replication/remote_client.rs +++ b/libsql/src/replication/remote_client.rs @@ -4,12 +4,12 @@ use std::pin::Pin; use std::time::{Duration, Instant}; use bytes::Bytes; -use futures::StreamExt as _; -use libsql_replication::frame::{Frame, FrameHeader, FrameNo}; +use futures::{StreamExt as _, TryStreamExt}; +use libsql_replication::frame::{FrameHeader, FrameNo}; use libsql_replication::meta::WalIndexMeta; -use libsql_replication::replicator::{map_frame_err, Error, ReplicatorClient}; +use libsql_replication::replicator::{Error, ReplicatorClient}; use libsql_replication::rpc::replication::{ - verify_session_token, Frames, HelloRequest, HelloResponse, LogOffset, SESSION_TOKEN_KEY, + Frame as RpcFrame, verify_session_token, Frames, HelloRequest, HelloResponse, LogOffset, SESSION_TOKEN_KEY, }; use tokio_stream::Stream; use tonic::metadata::AsciiMetadataValue; @@ -119,6 +119,7 @@ impl RemoteClient { let hello_req = self.make_request(HelloRequest::new()); let log_offset_req = self.make_request(LogOffset { next_offset: 
self.next_offset(), + wal_flavor: None, }); let mut client_clone = self.remote.clone(); let hello_fut = time(async { @@ -135,7 +136,7 @@ impl RemoteClient { (hello_fut.await, None) }; self.prefetched_batch_log_entries = if let Ok(true) = hello.0 { - tracing::warn!( + tracing::debug!( "Frames prefetching failed because of new session token returned by handshake" ); None @@ -160,7 +161,7 @@ impl RemoteClient { let frames_iter = frames .into_iter() - .map(|f| Frame::try_from(&*f.data).map_err(|e| Error::Client(e.into()))); + .map(Ok); let stream = tokio_stream::iter(frames_iter); @@ -178,6 +179,7 @@ impl RemoteClient { None => { let req = self.make_request(LogOffset { next_offset: self.next_offset(), + wal_flavor: None, }); time(self.remote.replication.batch_log_entries(req)).await } @@ -189,6 +191,7 @@ impl RemoteClient { async fn do_snapshot(&mut self) -> Result<::FrameStream, Error> { let req = self.make_request(LogOffset { next_offset: self.next_offset(), + wal_flavor: None, }); let mut frames = self .remote @@ -196,7 +199,7 @@ impl RemoteClient { .snapshot(req) .await? 
.into_inner() - .map(map_frame_err) + .map_err(|e| e.into()) .peekable(); { @@ -204,7 +207,8 @@ impl RemoteClient { // the first frame is the one with the highest frame_no in the snapshot if let Some(Ok(f)) = frames.peek().await { - self.last_received = Some(f.header().frame_no.get()); + let header: FrameHeader = FrameHeader::read_from_prefix(&f.data[..]).unwrap(); + self.last_received = Some(header.frame_no.get()); } } @@ -239,7 +243,7 @@ fn maybe_log( #[async_trait::async_trait] impl ReplicatorClient for RemoteClient { - type FrameStream = Pin> + Send + 'static>>; + type FrameStream = Pin> + Send + 'static>>; /// Perform handshake with remote async fn handshake(&mut self) -> Result<(), Error> { diff --git a/libsql/src/rows.rs b/libsql/src/rows.rs index b97aeac203..a10d82b827 100644 --- a/libsql/src/rows.rs +++ b/libsql/src/rows.rs @@ -38,14 +38,8 @@ impl Column<'_> { } #[async_trait::async_trait] -pub(crate) trait RowsInner { +pub(crate) trait RowsInner: ColumnsInner { async fn next(&mut self) -> Result>; - - fn column_count(&self) -> i32; - - fn column_name(&self, idx: i32) -> Option<&str>; - - fn column_type(&self, idx: i32) -> Result; } /// A set of rows returned from a connection. @@ -131,7 +125,7 @@ impl Row { } /// Get the count of columns in this set of rows. 
- pub fn column_count(&self) -> usize { + pub fn column_count(&self) -> i32 { self.inner.column_count() } @@ -284,12 +278,15 @@ where } impl Sealed for Option {} -pub(crate) trait RowInner: fmt::Debug { - fn column_value(&self, idx: i32) -> Result; - fn column_str(&self, idx: i32) -> Result<&str>; +pub(crate) trait ColumnsInner { fn column_name(&self, idx: i32) -> Option<&str>; fn column_type(&self, idx: i32) -> Result; - fn column_count(&self) -> usize; + fn column_count(&self) -> i32; +} + +pub(crate) trait RowInner: ColumnsInner + fmt::Debug { + fn column_value(&self, idx: i32) -> Result; + fn column_str(&self, idx: i32) -> Result<&str>; } mod sealed { diff --git a/libsql/tests/integration_tests.rs b/libsql/tests/integration_tests.rs index 0f8e575949..cdb0a985c3 100644 --- a/libsql/tests/integration_tests.rs +++ b/libsql/tests/integration_tests.rs @@ -596,6 +596,26 @@ async fn debug_print_row() { ); } +#[tokio::test] +async fn fts5_invalid_tokenizer() { + let db = Database::open(":memory:").unwrap(); + let conn = db.connect().unwrap(); + assert!(conn + .execute( + "CREATE VIRTUAL TABLE t USING fts5(s, tokenize='trigram case_sensitive ')", + (), + ) + .await + .is_err()); + assert!(conn + .execute( + "CREATE VIRTUAL TABLE t USING fts5(s, tokenize='trigram remove_diacritics ')", + (), + ) + .await + .is_err()); +} + #[cfg(feature = "serde")] #[tokio::test] async fn deserialize_row() { diff --git a/vendored/rusqlite/Cargo.toml b/vendored/rusqlite/Cargo.toml index 2d332f3279..d9fbcc525e 100644 --- a/vendored/rusqlite/Cargo.toml +++ b/vendored/rusqlite/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "libsql-rusqlite" # Note: Update version in README.md when you change this. 
-version = "0.31.0" +version = "0.32.0" authors = ["The rusqlite developers"] edition = "2018" description = "Ergonomic wrapper for SQLite (libsql fork)" @@ -109,7 +109,7 @@ fallible-iterator = "0.2" fallible-streaming-iterator = "0.1" uuid = { version = "1.0", optional = true } smallvec = "1.6.1" -libsql-ffi = { version = "0.3", path = "../../libsql-ffi" } +libsql-ffi = { version = "0.4", path = "../../libsql-ffi" } [dev-dependencies] doc-comment = "0.3" diff --git a/vendored/sqlite3-parser/Cargo.toml b/vendored/sqlite3-parser/Cargo.toml index 5ed9e31f4d..0381ac1d99 100644 --- a/vendored/sqlite3-parser/Cargo.toml +++ b/vendored/sqlite3-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libsql-sqlite3-parser" -version = "0.12.0" +version = "0.13.0" edition = "2021" authors = ["gwenn"] description = "SQL parser (as understood by SQLite) (libsql fork)"