From 38fcf2b1da7e1c39b4269970f851b6acef05573e Mon Sep 17 00:00:00 2001
From: Alexius Wadell
Date: Wed, 5 Feb 2025 13:29:11 -0500
Subject: [PATCH 1/5] Update tokenizers and pyO3 for py3.13

---
 Cargo.toml       | 18 ++++++++-----
 pyproject.toml   | 14 +++++-----
 src/lib.rs       |  7 +++--
 src/tokenizer.rs | 66 ++++++++++++++++++++++++++++--------------------
 4 files changed, 61 insertions(+), 44 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9db83d7..bcdbd57 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,29 +1,35 @@
 [package]
 name = "smirk"
-version = "0.1.1"
+version = "0.2.0"
 edition = "2021"
+license = "Apache-2.0"
+description = "A chemically complete tokenizer for OpenSMILES"
+authors = [
+    "Alexius Wadell ",
+    "Anoushka Bhutani",
+]
+readme = "README.md"
+repository = "https://github.com/BattModels/smirk"
 
 [lib]
 name = "smirk"
-
-# "cdylib" is necessary to produce a shared library for Python to import from.
 crate-type = ["cdylib"]
 
 [dependencies]
 clap = "4.5.1"
 const_format = "0.2.32"
 derive_builder = "0.20.0"
-dict_derive = "0.5.0"
+dict_derive = "0.6.0"
 either = "1.13.0"
 macro_rules_attribute = "0.2.0"
 once_cell = "1.19.0"
 paste = "1.0.14"
-pyo3 = { version = "^0.20", features = ["extension-module"] }
+pyo3 = { version = "^0.23", features = ["extension-module"] }
 regex = "1.10.3"
 serde = "1.0.197"
 serde_json = "1.0.114"
 serde_with = "3.8.0"
-tokenizers = { version = "^0.19"}
+tokenizers = { version = "^0.21"}
 
 [dev-dependencies]
 tempfile = "3.10.1"
diff --git a/pyproject.toml b/pyproject.toml
index 6f49ca5..ff028ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,22 +1,24 @@
 [project]
 name = "smirk"
-requires-python = ">=3.7"
+requires-python = ">=3.9"
 license = {file = "LICENSE"}
+dynamic = ["version"]
 classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-    "transformers ~= 4.40",
+    "transformers >=4.40,<5",
 ]
 
-[project.optional-dependencies]
-test = [
+[dependency-groups]
+dev = [
     "pytest~=8.3",
     "parameterized==0.9.0",
-    "torch",
-    "numpy",
+    "ruff~=0.9.4",
+    "torch~=2.0",
+    "numpy~=2.0",
 ]
 
 [build-system]
diff --git a/src/lib.rs b/src/lib.rs
index 9af0e34..d849c86 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,13 +4,12 @@ mod tokenizer;
 mod wrapper;
 
 use pyo3::prelude::*;
-use tokenizer::SmirkTokenizer;
 
 /// A Python module implemented in Rust.
 #[pymodule]
-fn smirk(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_class::<SmirkTokenizer>()?;
-    Ok(())
+mod smirk {
+    #[pymodule_export]
+    use crate::tokenizer::SmirkTokenizer;
 }
 
 #[cfg(test)]
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3462f0e..d39f3ee 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -3,10 +3,9 @@ use std::collections::{HashMap, HashSet};
 
 use crate::gpe::{GpeTrainer, GPE};
 use crate::pre_tokenizers::{split_structure, SmirkPreTokenizer};
 use crate::wrapper::{ModelWrapper, PreTokenizerWrapper, TrainerWrapper};
-use dict_derive::{FromPyObject, IntoPyObject};
 use pyo3::exceptions::PyValueError;
-use pyo3::types::{PyDict, PyList, PyString};
-use pyo3::{pyclass, pymethods, PyResult, Python};
+use pyo3::prelude::*;
+use pyo3::types::{PyAnyMethods, PyDict, PyString};
 use regex::Regex;
 use tokenizers::decoders::fuse::Fuse;
@@ -84,8 +83,8 @@ impl SmirkTokenizer {
         SmirkTokenizer::new(tokenizer)
     }
 
-    fn pretokenize(&self, smile: &PyString) -> PyResult> {
-        let mut pretokenized = PreTokenizedString::from(smile.to_str().unwrap());
+    fn pretokenize(&self, smile: String) -> PyResult> {
+        let mut pretokenized = PreTokenizedString::from(smile);
         let _ = self
             .tokenizer
             .get_pre_tokenizer()
@@ -100,8 +99,8 @@ impl SmirkTokenizer {
     }
 
     #[pyo3(signature = (smile, add_special_tokens = true))]
-    fn encode(&self, smile: &PyString, add_special_tokens: bool) -> PyResult<Encoding> {
-        let input = EncodeInput::from(smile.to_str().unwrap());
+    fn encode(&self, smile: String, add_special_tokens: bool) -> PyResult<Encoding> {
+        let input = EncodeInput::from(smile);
         let encoding = self
             .tokenizer
             .encode_char_offsets(input, add_special_tokens)
@@ -118,12 +117,12 @@ impl SmirkTokenizer {
     fn encode_batch(
         &self,
         py: Python<'_>,
-        examples: Vec<&PyString>,
+        examples: Vec<Bound<'_, PyString>>,
         add_special_tokens: bool,
     ) -> PyResult<Vec<Encoding>> {
         let inputs: Vec<EncodeInput> = examples
             .into_iter()
-            .map(|x| EncodeInput::from(x.to_str().unwrap()))
+            .map(|x| EncodeInput::from(x.to_string()))
             .collect();
         // Release the GIL while tokenizing batch
         let out = py.allow_threads(|| {
@@ -179,7 +178,7 @@ impl SmirkTokenizer {
             .special_tokens(special_tokens)
             .build()
             .unwrap();
-        self.tokenizer.with_post_processor(tp);
+        self.tokenizer.with_post_processor(Some(tp));
         Ok(())
     }
 
@@ -250,11 +249,11 @@ impl SmirkTokenizer {
     }
 
     #[pyo3(signature = (**kwargs))]
-    fn with_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn with_padding(&mut self, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> {
         let mut params = PaddingParams::default();
         if let Some(kwargs) = kwargs {
-            for (key, value) in kwargs {
-                let key: &str = key.extract().unwrap();
+            for (key, value) in kwargs.iter() {
+                let key: &str = key.extract()?;
                 match key {
                     "direction" => {
                         let value: &str = value.extract().unwrap();
@@ -286,7 +285,7 @@ impl SmirkTokenizer {
     }
 
     #[pyo3(signature = (**kwargs))]
-    fn with_truncation(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn with_truncation(&mut self, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> {
         let mut params = TruncationParams::default();
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
@@ -330,23 +329,30 @@ impl SmirkTokenizer {
         Ok(())
     }
 
-    fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
-        let tokens = tokens
+    fn add_tokens(&mut self, tokens: Vec<Bound<'_, PyAny>>) -> PyResult<usize> {
+        let tokens: Vec<AddedToken> = tokens
             .into_iter()
-            .map(|token| AddedToken {
-                content: token.getattr("content").unwrap().to_string(),
-                lstrip: token.getattr("lstrip").unwrap().extract().unwrap(),
-                rstrip: token.getattr("rstrip").unwrap().extract().unwrap(),
-                normalized: token.getattr("normalized").unwrap().extract().unwrap(),
-                single_word: token.getattr("single_word").unwrap().extract().unwrap(),
-                special: token.getattr("special").unwrap().extract().unwrap(),
+            .map(|kwargs| {
+                Ok(AddedToken {
+                    content: kwargs.getattr("content")?.extract::<String>()?,
+                    single_word: kwargs.getattr("single_word")?.extract::<bool>()?,
+                    lstrip: kwargs.getattr("lstrip")?.extract::<bool>()?,
+                    rstrip: kwargs.getattr("rstrip")?.extract::<bool>()?,
+                    normalized: kwargs.getattr("normalized")?.extract::<bool>()?,
+                    special: kwargs.getattr("special")?.extract::<bool>()?,
+                })
             })
-            .collect::<Vec<_>>();
+            .collect::<Result<Vec<AddedToken>, PyErr>>()?;
         Ok(self.tokenizer.add_tokens(&tokens))
     }
 
     #[pyo3(signature = (files, **kwargs))]
-    fn train(&self, py: Python, files: Vec<String>, kwargs: Option<&PyDict>) -> PyResult {
+    fn train(
+        &self,
+        py: Python,
+        files: Vec<String>,
+        kwargs: Option<Bound<'_, PyDict>>,
+    ) -> PyResult {
         // Construct Trainable Tokenizer
         let model: ModelWrapper = match self.tokenizer.get_model() {
             ModelWrapper::ModelWrapper(mw) => match mw {
@@ -427,13 +433,13 @@ impl SmirkTokenizer {
     }
 }
 
-#[derive(FromPyObject, IntoPyObject, Debug)]
+#[derive(IntoPyObject, IntoPyObjectRef)]
 pub struct Encoding {
     pub input_ids: Vec<u32>,
     pub token_type_ids: Vec<u32>,
     pub attention_mask: Vec<u32>,
     pub special_tokens_mask: Vec<u32>,
-    pub offsets: Vec<(usize, usize)>,
+    pub offsets: Vec<(u64, u64)>,
 }
 
 impl From for Encoding {
@@ -443,7 +449,11 @@ impl From for Encoding {
             token_type_ids: encoding.get_type_ids().to_vec(),
             attention_mask: encoding.get_attention_mask().to_vec(),
             special_tokens_mask: encoding.get_special_tokens_mask().to_vec(),
-            offsets: encoding.get_offsets().to_vec(),
+            offsets: encoding
+                .get_offsets()
+                .into_iter()
+                .map(|&(start, end)| (start as u64, end as u64))
+                .collect(),
         }
     }
 }

From 01b49acb0ea2ccd39e763dc9e41193c577be2b6d Mon Sep 17 00:00:00 2001
From: Alexius Wadell
Date: Wed, 5 Feb 2025 13:32:45 -0500
Subject: [PATCH 2/5] add dependabot

---
 dependabot.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 dependabot.yaml

diff --git a/dependabot.yaml b/dependabot.yaml
new file mode 100644
index 0000000..8244ed7
--- /dev/null
+++ b/dependabot.yaml
@@ -0,0 +1,11 @@
+version: 2
+updates:
+  - package-ecosystem: cargo
+    directory: /
+    schedule:
+      interval: monthly
+    versioning-strategy: increase
+  - package-ecosystem: pip
+    directory: /
+    schedule:
+      interval: monthly

From a59c98024ae6857ea9c9eaa33dc7ce990dca27a8 Mon Sep 17 00:00:00 2001
From: Alexius Wadell
Date: Wed, 12 Feb 2025 20:19:03 -0500
Subject: [PATCH 3/5] added a changelog

---
 CHANGELOG.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..155d17a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,48 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+- Started a changelog ([#2](https://github.com/BattModels/smirk/pull/2))
+
+### Changed
+
+- Bumped PyO3, tokenizers and dict_derive dependencies ([#2](https://github.com/BattModels/smirk/pull/2))
+
+### Breaking
+
+- Increased minimum python version to 3.9 ([#2](https://github.com/BattModels/smirk/pull/2))
+
+### Fixed
+
+- Mark version as dynamic in pyproject ([#2](https://github.com/BattModels/smirk/pull/2))
+
+### Removed
+
+## [0.1.1] - 2024-12-09
+
+Preprint v2 posted: [arXiv:2409.15370v2](https://arxiv.org/abs/2409.15370v2)
+
+### Added
+
+- Added support for post-processing templates to `SmirkTokenizerFast` ([#1](https://github.com/BattModels/smirk/pull/1))
+- Registered smirk with transformer's AutoTokenizer ([#1](https://github.com/BattModels/smirk/pull/1))
+- Added `vocab`, `convert_ids_to_tokens` and `convert_tokens_to_ids` methods ([#1](https://github.com/BattModels/smirk/pull/1))
+- Added support for truncating and padding during tokenization ([#1](https://github.com/BattModels/smirk/pull/1))
+
+### Fixed
+
+- Fixed CI to install test dependencies ([#1](https://github.com/BattModels/smirk/pull/1))
+
+## [0.1.0] - 2024-09-11
+
+Preprint posted: [arXiv:2409.15370v1](https://arxiv.org/abs/2409.15370v1)
+
+### Added
+
+- Initial tagged version of smirk

From 70c2eb468771afd5d2c3f433b3ebee9d9fb5f77a Mon Sep 17 00:00:00 2001
From: Alexius Wadell
Date: Tue, 4 Mar 2025 09:41:43 -0500
Subject: [PATCH 4/5] use uv for ci / pre-commit workflows

---
 .github/workflows/CI.yaml        | 23 ++++++++---------------
 .github/workflows/pre-commit.yml | 15 +++++++++------
 CHANGELOG.md                     |  1 +
 pyproject.toml                   |  1 +
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml
index 63c396c..42e2f40 100644
--- a/.github/workflows/CI.yaml
+++ b/.github/workflows/CI.yaml
@@ -1,12 +1,9 @@
 name: CI
-
 on:
   push:
   pull_request:
-
 env:
   CARGO_TERM_COLOR: always
-
 jobs:
   check:
     name: Check
@@ -15,7 +12,6 @@ jobs:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@stable
      - run: cargo check
-
   test:
     name: Test Suite
     runs-on: ubuntu-latest
@@ -24,23 +20,20 @@ jobs:
       - uses: dtolnay/rust-toolchain@stable
       - uses: Swatinem/rust-cache@v2
       - run: cargo test --benches --verbose --all
-
   test_python:
     name: Test Python
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - name: Install uv and set the python version
+        uses: astral-sh/setup-uv@v5
         with:
-          python-version: '3.10'
-          cache: 'pip'
+          python-version: "3.10"
+          enable-cache: true
+          cache-dependency-glob: pyproject.toml
       - uses: dtolnay/rust-toolchain@stable
       - uses: Swatinem/rust-cache@v2
+      - name: install project
+        run: uv sync --all-extras --dev
       - name: pytest
-        shell: bash
-        run: |
-          set -e
-          python -m venv .venv
-          source .venv/bin/activate
-          pip install '.[test]'
-          pytest
+        run: uv run pytest
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 2b11178..89ff056 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -1,14 +1,17 @@
 name: pre-commit
-
 on:
   pull_request:
   push:
-    branches: [main]
-
+    branches:
+      - main
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v3
-    - uses: pre-commit/action@v3.0.1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+      - uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+          cache-dependency-glob: pyproject.toml
+      - run: uv run --only-group dev pre-commit run --all
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 155d17a..ff8dec8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 ### Changed
 
 - Bumped PyO3, tokenizers and dict_derive dependencies ([#2](https://github.com/BattModels/smirk/pull/2))
+- Switched to uv for CI/pre-commit workflows ([#2](https://github.com/BattModels/smirk/pull/2))
 
 ### Breaking
 
diff --git a/pyproject.toml b/pyproject.toml
index ff028ce..c41a663 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dev = [
     "ruff~=0.9.4",
     "torch~=2.0",
     "numpy~=2.0",
+    "pre-commit~=4.1.0",
 ]
 
 [build-system]

From 0d3a304aeea47eb793bb355dc8b916be70a73da6 Mon Sep 17 00:00:00 2001
From: Alexius Wadell
Date: Tue, 4 Mar 2025 09:52:46 -0500
Subject: [PATCH 5/5] Dropping CD workflow

Premature, costly and often broken
---
 .github/workflows/CD.yaml | 182 --------------------------------------
 1 file changed, 182 deletions(-)
 delete mode 100644 .github/workflows/CD.yaml

diff --git a/.github/workflows/CD.yaml b/.github/workflows/CD.yaml
deleted file mode 100644
index 3ec9a68..0000000
--- a/.github/workflows/CD.yaml
+++ /dev/null
@@ -1,182 +0,0 @@
-# This file is autogenerated by maturin v1.7.1
-# To update, run
-#
-#    maturin generate-ci github --pytest
-#
-name: Build and Test Artifacts
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - '*'
-  pull_request:
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  linux:
-    runs-on: ${{ matrix.platform.runner }}
-    strategy:
-      matrix:
-        platform:
-          - runner: ubuntu-latest
-            target: x86_64
-          - runner: ubuntu-latest
-            target: x86
-          - runner: ubuntu-latest
-            target: aarch64
-    steps:
-      - uses: actions/checkout@v4
-      - uses: dtolnay/rust-toolchain@stable
-      - uses: Swatinem/rust-cache@v2
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-      - name: Build wheels
-        uses: PyO3/maturin-action@v1
-        with:
-          target: ${{ matrix.platform.target }}
-          args: --release --out dist -i python3.11 -i python3.10
-          sccache: 'true'
-          manylinux: auto
-      - name: Upload wheels
-        uses: actions/upload-artifact@v4
-        with:
-          name: wheels-linux-${{ matrix.platform.target }}
-          path: dist
-      - name: pytest
-        if: ${{ startsWith(matrix.platform.target, 'x86_64') }}
-        shell: bash
-        run: |
-          set -e
-          python3 -m venv .venv
-          source .venv/bin/activate
-          pip install smirk[test] --find-links dist --force-reinstall
-          pytest
-      - name: pytest
-        if: ${{ !startsWith(matrix.platform.target, 'x86') && matrix.platform.target != 'ppc64' }}
-        uses: uraimo/run-on-arch-action@v2
-        with:
-          arch: ${{ matrix.platform.target }}
-          distro: ubuntu22.04
-          githubToken: ${{ github.token }}
-          install: |
-            set -e
-            apt-get update
-            apt-get install -y --no-install-recommends python3 python3-pip python3-venv
-          run: |
-            python3 -m venv .venv
-            source .venv/bin/activate
-            pip install smirk[test] --find-links dist --force-reinstall
-            pytest
-
-  windows:
-    runs-on: ${{ matrix.platform.runner }}
-    strategy:
-      matrix:
-        platform:
-          - runner: windows-latest
-            target: x64
-    steps:
-      - uses: actions/checkout@v4
-      - uses: dtolnay/rust-toolchain@stable
-      - uses: Swatinem/rust-cache@v2
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          cache: 'pip'
-          architecture: ${{ matrix.platform.target }}
-      - name: Build wheels
-        uses: PyO3/maturin-action@v1
-        with:
-          target: ${{ matrix.platform.target }}
-          args: --release --out dist -i python3.11 -i python3.10
-          sccache: 'true'
-      - name: Upload wheels
-        uses: actions/upload-artifact@v4
-        with:
-          name: wheels-windows-${{ matrix.platform.target }}
-          path: dist
-      - name: pytest
-        if: ${{ !startsWith(matrix.platform.target, 'aarch64') }}
-        shell: bash
-        run: |
-          set -e
-          python3 -m venv .venv
-          source .venv/Scripts/activate
-          pip install smirk[test] --find-links dist --force-reinstall
-          pytest
-
-  macos:
-    runs-on: ${{ matrix.platform.runner }}
-    strategy:
-      matrix:
-        platform:
-          - runner: macos-12
-            target: x86_64
-          - runner: macos-14
-            target: aarch64
-    steps:
-      - uses: actions/checkout@v4
-      - uses: dtolnay/rust-toolchain@stable
-      - uses: Swatinem/rust-cache@v2
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-          cache: 'pip'
-      - name: Build wheels
-        uses: PyO3/maturin-action@v1
-        with:
-          target: ${{ matrix.platform.target }}
-          args: --release --out dist -i python3.11 -i python3.10
-          sccache: 'true'
-      - name: Upload wheels
-        uses: actions/upload-artifact@v4
-        with:
-          name: wheels-macos-${{ matrix.platform.target }}
-          path: dist
-      - name: install
-        run: |
-          set -e
-          python3 -m venv .venv
-          source .venv/bin/activate
-          pip install smirk[test] --find-links dist --force-reinstall
-          pytest
-
-  sdist:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Build sdist
-        uses: PyO3/maturin-action@v1
-        with:
-          command: sdist
-          args: --out dist
-      - name: Upload sdist
-        uses: actions/upload-artifact@v4
-        with:
-          name: wheels-sdist
-          path: dist
-
-  release:
-    name: Release
-    runs-on: ubuntu-latest
-    if: "startsWith(github.ref, 'refs/tags/')"
-    needs: [linux, windows, macos, sdist]
-    steps:
-      - uses: actions/download-artifact@v4
-      - name: Publish to PyPI
-        uses: PyO3/maturin-action@v1
-        env:
-          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
-        with:
-          command: upload
-          args: --non-interactive --skip-existing wheels-*/*
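
For context on the PyO3 `^0.20` -> `^0.23` bump in PATCH 1/5: the new `src/lib.rs` uses PyO3's declarative module syntax (an inline `mod` marked `#[pymodule]`, with `#[pymodule_export]` pulling in the exported class), and keyword arguments now arrive as the GIL-bound `Bound<'_, PyDict>` smart pointer instead of `&PyDict`. The following is a minimal, self-contained sketch of those two patterns only; the `demo` module, `Greeter` class, and `punct` keyword are hypothetical stand-ins for illustration and are not part of smirk.

```rust
use pyo3::prelude::*;
use pyo3::types::PyDict;

/// A stand-in #[pyclass], playing the role SmirkTokenizer plays in the patch.
#[pyclass]
struct Greeter {
    name: String,
}

#[pymethods]
impl Greeter {
    #[new]
    fn new(name: String) -> Self {
        Greeter { name }
    }

    /// Optional keyword arguments arrive as a GIL-bound dictionary,
    /// mirroring the `Option<&Bound<'_, PyDict>>` signatures in tokenizer.rs.
    #[pyo3(signature = (**kwargs))]
    fn greet(&self, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<String> {
        let mut punct = String::from("!");
        if let Some(kwargs) = kwargs {
            // get_item returns PyResult<Option<Bound<PyAny>>>; extract converts to String.
            if let Some(value) = kwargs.get_item("punct")? {
                punct = value.extract()?;
            }
        }
        Ok(format!("Hello, {}{}", self.name, punct))
    }
}

/// Declarative module registration, as in the new src/lib.rs: the exported
/// class is brought into the module with `#[pymodule_export]`.
#[pymodule]
mod demo {
    #[pymodule_export]
    use super::Greeter;
}
```

Compiled as a `cdylib` extension module (for example with maturin), this sketch would be used from Python as `demo.Greeter("smirk").greet(punct="?")`.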