diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dbe73bd..48b6bec 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -14,11 +14,8 @@ jobs: fail-fast: false matrix: # We test only the minimum and the maximum supported versions of python - python-version: ["3.8", "3.11"] + python-version: ["3.9", "3.12"] pandas-version: ["1.4", "2.1"] - exclude: - - python-version: "3.8" - pandas-version: "2.1" steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index b9c6e77..ec08757 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ description of the framework and the attack algorithms can be found in the paper ## Setup and installation -`Anonymeter` requires Python 3.8.x, 3.9.x or 3.10.x installed. The simplest way to install `Anonymeter` is from `PyPi`. Simply run +`Anonymeter` supports Python from 3.9 to 3.12. The simplest way to install `Anonymeter` is from `PyPi`. Simply run ``` pip install anonymeter diff --git a/pyproject.toml b/pyproject.toml index 9818a40..c0f8537 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ ] description = "Measure singling out, linkability, and inference risk for synthetic data." 
readme = "README.md" -requires-python = "<3.12, >3.7" # limited by Numba support +requires-python = "<3.13, >=3.9" license = {file = "LICENSE.md"} classifiers = [ "Programming Language :: Python :: 3", @@ -23,10 +23,10 @@ classifiers = [ dependencies = [ "scikit-learn~=1.2", - "numpy >=1.22, <1.27", # limited by Numba support + "numpy >=1.22, <1.27", # capped to support pandas ~1.4 "pandas>=1.4", "joblib~=1.2", - "numba~=0.58", + "numba~=0.59", "polars>=1.8", ] diff --git a/src/anonymeter/evaluators/inference_evaluator.py b/src/anonymeter/evaluators/inference_evaluator.py index 92f1534..94c87dd 100644 --- a/src/anonymeter/evaluators/inference_evaluator.py +++ b/src/anonymeter/evaluators/inference_evaluator.py @@ -3,7 +3,7 @@ # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. """Privacy evaluator that measures the inference risk.""" -from typing import List, Optional +from typing import Optional import numpy as np import numpy.typing as npt @@ -17,7 +17,7 @@ def _run_attack( target: pd.DataFrame, syn: pd.DataFrame, n_attacks: int, - aux_cols: List[str], + aux_cols: list[str], secret: str, n_jobs: int, naive: bool, @@ -149,7 +149,7 @@ def __init__( self, ori: pd.DataFrame, syn: pd.DataFrame, - aux_cols: List[str], + aux_cols: list[str], secret: str, regression: Optional[bool] = None, n_attacks: int = 500, diff --git a/src/anonymeter/evaluators/linkability_evaluator.py b/src/anonymeter/evaluators/linkability_evaluator.py index 0eb05fb..0e0655c 100644 --- a/src/anonymeter/evaluators/linkability_evaluator.py +++ b/src/anonymeter/evaluators/linkability_evaluator.py @@ -3,7 +3,7 @@ # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Privacy evaluator that measures the linkability risk.""" import logging -from typing import Dict, List, Optional, Set, Tuple, cast +from typing import Optional, cast import numpy as np import numpy.typing as npt @@ -37,7 +37,7 @@ def __init__(self, idx_0: npt.NDArray, idx_1: npt.NDArray): self._idx_0 = idx_0 self._idx_1 = idx_1 - def find_links(self, n_neighbors: int) -> Dict[int, Set[int]]: + def find_links(self, n_neighbors: int) -> dict[int, set[int]]: """Return synthetic records that link originals in the split datasets. Parameters @@ -86,9 +86,9 @@ def count_links(self, n_neighbors: int) -> int: return _count_links(links) -def _count_links(links: Dict[int, Set[int]]) -> int: +def _count_links(links: dict[int, set[int]]) -> int: """Count links.""" - linkable: Set[int] = set() + linkable: set[int] = set() for ori_idx in links: linkable = linkable | {ori_idx} @@ -127,7 +127,7 @@ def _linkability_attack( ori: pd.DataFrame, syn: pd.DataFrame, n_attacks: int, - aux_cols: Tuple[List[str], List[str]], + aux_cols: tuple[list[str], list[str]], n_neighbors: int, n_jobs: int, ) -> LinkabilityIndexes: @@ -185,7 +185,7 @@ def __init__( self, ori: pd.DataFrame, syn: pd.DataFrame, - aux_cols: Tuple[List[str], List[str]], + aux_cols: tuple[list[str], list[str]], n_attacks: Optional[int] = 500, n_neighbors: int = 1, control: Optional[pd.DataFrame] = None, diff --git a/src/anonymeter/evaluators/singling_out_evaluator.py b/src/anonymeter/evaluators/singling_out_evaluator.py index 17a1132..d4a30c4 100644 --- a/src/anonymeter/evaluators/singling_out_evaluator.py +++ b/src/anonymeter/evaluators/singling_out_evaluator.py @@ -5,8 +5,9 @@ import logging import operator +from collections.abc import Sequence from functools import reduce -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union, cast +from typing import Any, Callable, Optional, Union, cast import numpy as np import numpy.typing as npt @@ -26,7 +27,7 @@ def _escape_quotes(string: str) -> str: 
def _query_from_record( record: dict, dtypes: dict, # map col -> pl.DataType - columns: List[str], + columns: list[str], medians: dict, # map col -> median value rng: np.random.Generator, ) -> pl.Expr: @@ -92,9 +93,9 @@ def _random_operator( def _random_query( - unique_values: Dict[str, List[Any]], - cols: List[str], - column_types: Dict[str, str], + unique_values: dict[str, list[Any]], + cols: list[str], + column_types: dict[str, str], rng: np.random.Generator, ) -> pl.Expr: exprs = [] @@ -140,7 +141,7 @@ def _random_queries( n_queries: int, n_cols: int, rng: np.random.Generator, -) -> List[pl.Expr]: +) -> list[pl.Expr]: unique_values = {col: df[col].unique().to_list() for col in df.columns} column_types = { col: _convert_polars_dtype(df[col].dtype) @@ -212,8 +213,8 @@ def singling_out_probability_integral( def _measure_queries_success( - df: pl.DataFrame, queries: List[pl.Expr], n_repeat: int, n_meas: int -) -> Tuple[npt.NDArray, npt.NDArray]: + df: pl.DataFrame, queries: list[pl.Expr], n_repeat: int, n_meas: int +) -> tuple[npt.NDArray, npt.NDArray]: sizes, successes = [], [] min_rows = min(1000, len(df)) @@ -247,7 +248,7 @@ def _fit_model(sizes: npt.NDArray, successes: npt.NDArray) -> Callable: return lambda x: _model(x, *popt) -def fit_correction_term(df: pl.DataFrame, queries: List[pl.Expr]) -> Callable: +def fit_correction_term(df: pl.DataFrame, queries: list[pl.Expr]) -> Callable: """Fit correction for different size of the control dataset. Parameters @@ -280,11 +281,11 @@ class UniqueSinglingOutQueries: """ def __init__(self, max_size: Optional[int] = None): - self._set: Set[str] = set() - self._list: List[pl.Expr] = [] + self._set: set[str] = set() + self._list: list[pl.Expr] = [] self._max_size: Optional[int] = max_size - def check_and_extend(self, queries: List[pl.Expr], df: pl.DataFrame): + def check_and_extend(self, queries: list[pl.Expr], df: pl.DataFrame): """Add singling-out queries to the collection. 
Only queries that are not already in this collection can be added. @@ -317,14 +318,14 @@ def __len__(self): return len(self._list) @property - def queries(self) -> List[pl.Expr]: + def queries(self) -> list[pl.Expr]: """Queries that are present in the collection.""" return self._list def univariate_singling_out_queries( df: pl.DataFrame, n_queries: int, rng: np.random.Generator -) -> List[pl.Expr]: +) -> list[pl.Expr]: """Generate singling out queries from rare attributes. Parameters @@ -388,7 +389,7 @@ def multivariate_singling_out_queries( max_attempts: Optional[int], rng: np.random.Generator, batch_size: int = 1000, -) -> List[pl.Expr]: +) -> list[pl.Expr]: """Generates singling out queries from a combination of attributes. Parameters @@ -478,8 +479,8 @@ def multivariate_singling_out_queries( def _evaluate_queries( - df: pl.DataFrame, queries: List[pl.Expr] -) -> Tuple[int, ...]: + df: pl.DataFrame, queries: list[pl.Expr] +) -> tuple[int, ...]: if len(queries) == 0: return () @@ -494,8 +495,8 @@ def _evaluate_queries( def _evaluate_queries_and_return_successful( - df: pl.DataFrame, queries: List[pl.Expr] -) -> List[pl.Expr]: + df: pl.DataFrame, queries: list[pl.Expr] +) -> list[pl.Expr]: counts = _evaluate_queries(df=df, queries=queries) counts_np = np.array(counts, dtype=float) @@ -517,7 +518,7 @@ def _generate_singling_out_queries( n_cols: int, max_attempts: Optional[int], rng: np.random.Generator, -) -> List[pl.Expr]: +) -> list[pl.Expr]: if mode == "univariate": queries = univariate_singling_out_queries( df=df, n_queries=n_attacks, rng=rng @@ -615,12 +616,12 @@ def __init__( control = pl.DataFrame(control) self._control = control.unique(maintain_order=True) self._max_attempts = max_attempts - self._queries: List[pl.Expr] = [] - self._random_queries: List[pl.Expr] = [] + self._queries: list[pl.Expr] = [] + self._random_queries: list[pl.Expr] = [] self._evaluated = False self._rng = np.random.default_rng() if seed is None else np.random.default_rng(seed) - def 
queries(self, baseline: bool = False) -> List[pl.Expr]: + def queries(self, baseline: bool = False) -> list[pl.Expr]: """Successful singling out queries. Parameters diff --git a/src/anonymeter/neighbors/mixed_types_kneighbors.py b/src/anonymeter/neighbors/mixed_types_kneighbors.py index 944491a..36c94b8 100644 --- a/src/anonymeter/neighbors/mixed_types_kneighbors.py +++ b/src/anonymeter/neighbors/mixed_types_kneighbors.py @@ -4,7 +4,7 @@ """Nearest neighbor search for mixed type data.""" import logging from math import fabs, isnan -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import numpy as np import numpy.typing as npt @@ -76,7 +76,7 @@ def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> flo @jit(nopython=True, nogil=True) def _nearest_neighbors( queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int -) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]: +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]: r"""For every element of ``queries``, find its nearest neighbors in ``candidates``. Parameters @@ -147,7 +147,7 @@ def __init__(self, n_neighbors: int = 5, n_jobs: int = -2): self._n_neighbors = n_neighbors self._n_jobs = n_jobs - def fit(self, candidates: pd.DataFrame, ctypes: Optional[Dict[str, List[str]]] = None): + def fit(self, candidates: pd.DataFrame, ctypes: Optional[dict[str, list[str]]] = None): """Prepare for nearest neighbor search. Parameters @@ -167,7 +167,7 @@ def fit(self, candidates: pd.DataFrame, ctypes: Optional[Dict[str, List[str]]] = def kneighbors( self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False - ) -> Union[Tuple[npt.NDArray, npt.NDArray], npt.NDArray]: + ) -> Union[tuple[npt.NDArray, npt.NDArray], npt.NDArray]: """Find the nearest neighbors for a set of query points. 
Note diff --git a/src/anonymeter/preprocessing/transformations.py b/src/anonymeter/preprocessing/transformations.py index 08d99c6..0ce0d15 100644 --- a/src/anonymeter/preprocessing/transformations.py +++ b/src/anonymeter/preprocessing/transformations.py @@ -3,7 +3,6 @@ # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. """Data pre-processing and transformations for the privacy evaluators.""" import logging -from typing import List, Tuple import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -14,7 +13,7 @@ def _encode_categorical( df1: pd.DataFrame, df2: pd.DataFrame, -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Encode dataframes with categorical values keeping label consistend.""" encoded = pd.concat((df1, df2), keys=["df1", "df2"]) @@ -24,7 +23,7 @@ def _encode_categorical( return encoded.loc["df1"], encoded.loc["df2"] -def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: +def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """Scale dataframes with *only* numerical values.""" df1_min, df1_max = df1.min(), df1.max() df2_min, df2_max = df2.min(), df2.max() @@ -50,8 +49,8 @@ def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame def mixed_types_transform( - df1: pd.DataFrame, df2: pd.DataFrame, num_cols: List[str], cat_cols: List[str] -) -> Tuple[pd.DataFrame, pd.DataFrame]: + df1: pd.DataFrame, df2: pd.DataFrame, num_cols: list[str], cat_cols: list[str] +) -> tuple[pd.DataFrame, pd.DataFrame]: """Combination of an encoder and a scaler to treat mixed type data. 
Numerical columns are scaled by dividing them by their range across both diff --git a/src/anonymeter/preprocessing/type_detection.py b/src/anonymeter/preprocessing/type_detection.py index bb8f193..ec35754 100644 --- a/src/anonymeter/preprocessing/type_detection.py +++ b/src/anonymeter/preprocessing/type_detection.py @@ -1,12 +1,11 @@ # This file is part of Anonymeter and is released under BSD 3-Clause Clear License. # Copyright (c) 2022 Anonos IP LLC. # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. -from typing import Dict, List import pandas as pd -def detect_col_types(df: pd.DataFrame) -> Dict[str, List[str]]: +def detect_col_types(df: pd.DataFrame) -> dict[str, list[str]]: """Identify numerical and non-numerical columns in the dataframe. Parameters @@ -21,8 +20,8 @@ def detect_col_types(df: pd.DataFrame) -> Dict[str, List[str]]: Values are lists of column names. """ - num_cols: List[str] = list(df.select_dtypes("number").columns.values) - cat_cols: List[str] = [cn for cn in df.columns.values if cn not in num_cols] + num_cols: list[str] = list(df.select_dtypes("number").columns.values) + cat_cols: list[str] = [cn for cn in df.columns.values if cn not in num_cols] return {"num": sorted(num_cols), "cat": sorted(cat_cols)} diff --git a/src/anonymeter/stats/confidence.py b/src/anonymeter/stats/confidence.py index 852414a..b95049c 100644 --- a/src/anonymeter/stats/confidence.py +++ b/src/anonymeter/stats/confidence.py @@ -5,7 +5,7 @@ import warnings from math import sqrt -from typing import NamedTuple, Optional, Tuple +from typing import NamedTuple, Optional from scipy.stats import norm @@ -23,7 +23,7 @@ class PrivacyRisk(NamedTuple): """ value: float - ci: Tuple[float, float] + ci: tuple[float, float] class SuccessRate(NamedTuple): diff --git a/tests/test_inference_evaluator.py b/tests/test_inference_evaluator.py index ba146ca..c8cf9ab 100644 --- a/tests/test_inference_evaluator.py +++ b/tests/test_inference_evaluator.py @@ -1,7 +1,7 @@ 
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License. # Copyright (c) 2022 Anonos IP LLC. # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. -from typing import Iterable +from collections.abc import Iterable import numpy as np import pandas as pd