Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,8 @@ jobs:
fail-fast: false
matrix:
# We test only the minimum and the maximum supported versions of python
python-version: ["3.8", "3.11"]
python-version: ["3.9", "3.12"]
pandas-version: ["1.4", "2.1"]
exclude:
- python-version: "3.8"
pandas-version: "2.1"

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ description of the framework and the attack algorithms can be found in the paper

## Setup and installation

`Anonymeter` requires Python 3.8.x, 3.9.x or 3.10.x installed. The simplest way to install `Anonymeter` is from `PyPi`. Simply run
`Anonymeter` supports Python 3.9 through 3.12. The simplest way to install `Anonymeter` is from `PyPI`. Simply run

```
pip install anonymeter
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ authors = [
]
description = "Measure singling out, linkability, and inference risk for synthetic data."
readme = "README.md"
requires-python = "<3.12, >3.7" # limited by Numba support
requires-python = "<3.13, >=3.9"
license = {file = "LICENSE.md"}
classifiers = [
"Programming Language :: Python :: 3",
Expand All @@ -23,10 +23,10 @@ classifiers = [

dependencies = [
"scikit-learn~=1.2",
"numpy >=1.22, <1.27", # limited by Numba support
"numpy >=1.22, <1.27", # capped to support pandas ~1.4
"pandas>=1.4",
"joblib~=1.2",
"numba~=0.58",
"numba~=0.59",
"polars>=1.8",
]

Expand Down
6 changes: 3 additions & 3 deletions src/anonymeter/evaluators/inference_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Privacy evaluator that measures the inference risk."""

from typing import List, Optional
from typing import Optional

import numpy as np
import numpy.typing as npt
Expand All @@ -17,7 +17,7 @@ def _run_attack(
target: pd.DataFrame,
syn: pd.DataFrame,
n_attacks: int,
aux_cols: List[str],
aux_cols: list[str],
secret: str,
n_jobs: int,
naive: bool,
Expand Down Expand Up @@ -149,7 +149,7 @@ def __init__(
self,
ori: pd.DataFrame,
syn: pd.DataFrame,
aux_cols: List[str],
aux_cols: list[str],
secret: str,
regression: Optional[bool] = None,
n_attacks: int = 500,
Expand Down
12 changes: 6 additions & 6 deletions src/anonymeter/evaluators/linkability_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Privacy evaluator that measures the linkability risk."""
import logging
from typing import Dict, List, Optional, Set, Tuple, cast
from typing import Optional, cast

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -37,7 +37,7 @@ def __init__(self, idx_0: npt.NDArray, idx_1: npt.NDArray):
self._idx_0 = idx_0
self._idx_1 = idx_1

def find_links(self, n_neighbors: int) -> Dict[int, Set[int]]:
def find_links(self, n_neighbors: int) -> dict[int, set[int]]:
"""Return synthetic records that link originals in the split datasets.

Parameters
Expand Down Expand Up @@ -86,9 +86,9 @@ def count_links(self, n_neighbors: int) -> int:
return _count_links(links)


def _count_links(links: Dict[int, Set[int]]) -> int:
def _count_links(links: dict[int, set[int]]) -> int:
"""Count links."""
linkable: Set[int] = set()
linkable: set[int] = set()

for ori_idx in links:
linkable = linkable | {ori_idx}
Expand Down Expand Up @@ -127,7 +127,7 @@ def _linkability_attack(
ori: pd.DataFrame,
syn: pd.DataFrame,
n_attacks: int,
aux_cols: Tuple[List[str], List[str]],
aux_cols: tuple[list[str], list[str]],
n_neighbors: int,
n_jobs: int,
) -> LinkabilityIndexes:
Expand Down Expand Up @@ -185,7 +185,7 @@ def __init__(
self,
ori: pd.DataFrame,
syn: pd.DataFrame,
aux_cols: Tuple[List[str], List[str]],
aux_cols: tuple[list[str], list[str]],
n_attacks: Optional[int] = 500,
n_neighbors: int = 1,
control: Optional[pd.DataFrame] = None,
Expand Down
47 changes: 24 additions & 23 deletions src/anonymeter/evaluators/singling_out_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

import logging
import operator
from collections.abc import Sequence
from functools import reduce
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union, cast
from typing import Any, Callable, Optional, Union, cast

import numpy as np
import numpy.typing as npt
Expand All @@ -26,7 +27,7 @@ def _escape_quotes(string: str) -> str:
def _query_from_record(
record: dict,
dtypes: dict, # map col -> pl.DataType
columns: List[str],
columns: list[str],
medians: dict, # map col -> median value
rng: np.random.Generator,
) -> pl.Expr:
Expand Down Expand Up @@ -92,9 +93,9 @@ def _random_operator(


def _random_query(
unique_values: Dict[str, List[Any]],
cols: List[str],
column_types: Dict[str, str],
unique_values: dict[str, list[Any]],
cols: list[str],
column_types: dict[str, str],
rng: np.random.Generator,
) -> pl.Expr:
exprs = []
Expand Down Expand Up @@ -140,7 +141,7 @@ def _random_queries(
n_queries: int,
n_cols: int,
rng: np.random.Generator,
) -> List[pl.Expr]:
) -> list[pl.Expr]:
unique_values = {col: df[col].unique().to_list() for col in df.columns}
column_types = {
col: _convert_polars_dtype(df[col].dtype)
Expand Down Expand Up @@ -212,8 +213,8 @@ def singling_out_probability_integral(


def _measure_queries_success(
df: pl.DataFrame, queries: List[pl.Expr], n_repeat: int, n_meas: int
) -> Tuple[npt.NDArray, npt.NDArray]:
df: pl.DataFrame, queries: list[pl.Expr], n_repeat: int, n_meas: int
) -> tuple[npt.NDArray, npt.NDArray]:
sizes, successes = [], []
min_rows = min(1000, len(df))

Expand Down Expand Up @@ -247,7 +248,7 @@ def _fit_model(sizes: npt.NDArray, successes: npt.NDArray) -> Callable:
return lambda x: _model(x, *popt)


def fit_correction_term(df: pl.DataFrame, queries: List[pl.Expr]) -> Callable:
def fit_correction_term(df: pl.DataFrame, queries: list[pl.Expr]) -> Callable:
"""Fit correction for different size of the control dataset.

Parameters
Expand Down Expand Up @@ -280,11 +281,11 @@ class UniqueSinglingOutQueries:
"""

def __init__(self, max_size: Optional[int] = None):
self._set: Set[str] = set()
self._list: List[pl.Expr] = []
self._set: set[str] = set()
self._list: list[pl.Expr] = []
self._max_size: Optional[int] = max_size

def check_and_extend(self, queries: List[pl.Expr], df: pl.DataFrame):
def check_and_extend(self, queries: list[pl.Expr], df: pl.DataFrame):
"""Add singling-out queries to the collection.

Only queries that are not already in this collection can be added.
Expand Down Expand Up @@ -317,14 +318,14 @@ def __len__(self):
return len(self._list)

@property
def queries(self) -> List[pl.Expr]:
def queries(self) -> list[pl.Expr]:
"""Queries that are present in the collection."""
return self._list


def univariate_singling_out_queries(
df: pl.DataFrame, n_queries: int, rng: np.random.Generator
) -> List[pl.Expr]:
) -> list[pl.Expr]:
"""Generate singling out queries from rare attributes.

Parameters
Expand Down Expand Up @@ -388,7 +389,7 @@ def multivariate_singling_out_queries(
max_attempts: Optional[int],
rng: np.random.Generator,
batch_size: int = 1000,
) -> List[pl.Expr]:
) -> list[pl.Expr]:
"""Generate singling out queries from a combination of attributes.

Parameters
Expand Down Expand Up @@ -478,8 +479,8 @@ def multivariate_singling_out_queries(


def _evaluate_queries(
df: pl.DataFrame, queries: List[pl.Expr]
) -> Tuple[int, ...]:
df: pl.DataFrame, queries: list[pl.Expr]
) -> tuple[int, ...]:
if len(queries) == 0:
return ()

Expand All @@ -494,8 +495,8 @@ def _evaluate_queries(


def _evaluate_queries_and_return_successful(
df: pl.DataFrame, queries: List[pl.Expr]
) -> List[pl.Expr]:
df: pl.DataFrame, queries: list[pl.Expr]
) -> list[pl.Expr]:
counts = _evaluate_queries(df=df, queries=queries)

counts_np = np.array(counts, dtype=float)
Expand All @@ -517,7 +518,7 @@ def _generate_singling_out_queries(
n_cols: int,
max_attempts: Optional[int],
rng: np.random.Generator,
) -> List[pl.Expr]:
) -> list[pl.Expr]:
if mode == "univariate":
queries = univariate_singling_out_queries(
df=df, n_queries=n_attacks, rng=rng
Expand Down Expand Up @@ -615,12 +616,12 @@ def __init__(
control = pl.DataFrame(control)
self._control = control.unique(maintain_order=True)
self._max_attempts = max_attempts
self._queries: List[pl.Expr] = []
self._random_queries: List[pl.Expr] = []
self._queries: list[pl.Expr] = []
self._random_queries: list[pl.Expr] = []
self._evaluated = False
self._rng = np.random.default_rng() if seed is None else np.random.default_rng(seed)

def queries(self, baseline: bool = False) -> List[pl.Expr]:
def queries(self, baseline: bool = False) -> list[pl.Expr]:
"""Successful singling out queries.

Parameters
Expand Down
8 changes: 4 additions & 4 deletions src/anonymeter/neighbors/mixed_types_kneighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""Nearest neighbor search for mixed type data."""
import logging
from math import fabs, isnan
from typing import Dict, List, Optional, Tuple, Union
from typing import Optional, Union

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -76,7 +76,7 @@ def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> flo
@jit(nopython=True, nogil=True)
def _nearest_neighbors(
queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]:
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]:
r"""For every element of ``queries``, find its nearest neighbors in ``candidates``.

Parameters
Expand Down Expand Up @@ -147,7 +147,7 @@ def __init__(self, n_neighbors: int = 5, n_jobs: int = -2):
self._n_neighbors = n_neighbors
self._n_jobs = n_jobs

def fit(self, candidates: pd.DataFrame, ctypes: Optional[Dict[str, List[str]]] = None):
def fit(self, candidates: pd.DataFrame, ctypes: Optional[dict[str, list[str]]] = None):
"""Prepare for nearest neighbor search.

Parameters
Expand All @@ -167,7 +167,7 @@ def fit(self, candidates: pd.DataFrame, ctypes: Optional[Dict[str, List[str]]] =

def kneighbors(
self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
) -> Union[Tuple[npt.NDArray, npt.NDArray], npt.NDArray]:
) -> Union[tuple[npt.NDArray, npt.NDArray], npt.NDArray]:
"""Find the nearest neighbors for a set of query points.

Note
Expand Down
9 changes: 4 additions & 5 deletions src/anonymeter/preprocessing/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Data pre-processing and transformations for the privacy evaluators."""
import logging
from typing import List, Tuple

import pandas as pd
from sklearn.preprocessing import LabelEncoder
Expand All @@ -14,7 +13,7 @@
def _encode_categorical(
df1: pd.DataFrame,
df2: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Encode dataframes with categorical values keeping labels consistent."""
encoded = pd.concat((df1, df2), keys=["df1", "df2"])

Expand All @@ -24,7 +23,7 @@ def _encode_categorical(
return encoded.loc["df1"], encoded.loc["df2"]


def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Scale dataframes with *only* numerical values."""
df1_min, df1_max = df1.min(), df1.max()
df2_min, df2_max = df2.min(), df2.max()
Expand All @@ -50,8 +49,8 @@ def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame


def mixed_types_transform(
df1: pd.DataFrame, df2: pd.DataFrame, num_cols: List[str], cat_cols: List[str]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
df1: pd.DataFrame, df2: pd.DataFrame, num_cols: list[str], cat_cols: list[str]
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Combination of an encoder and a scaler to treat mixed type data.

Numerical columns are scaled by dividing them by their range across both
Expand Down
7 changes: 3 additions & 4 deletions src/anonymeter/preprocessing/type_detection.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
from typing import Dict, List

import pandas as pd


def detect_col_types(df: pd.DataFrame) -> Dict[str, List[str]]:
def detect_col_types(df: pd.DataFrame) -> dict[str, list[str]]:
"""Identify numerical and non-numerical columns in the dataframe.

Parameters
Expand All @@ -21,8 +20,8 @@ def detect_col_types(df: pd.DataFrame) -> Dict[str, List[str]]:
Values are lists of column names.

"""
num_cols: List[str] = list(df.select_dtypes("number").columns.values)
cat_cols: List[str] = [cn for cn in df.columns.values if cn not in num_cols]
num_cols: list[str] = list(df.select_dtypes("number").columns.values)
cat_cols: list[str] = [cn for cn in df.columns.values if cn not in num_cols]

return {"num": sorted(num_cols), "cat": sorted(cat_cols)}

Expand Down
4 changes: 2 additions & 2 deletions src/anonymeter/stats/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import warnings
from math import sqrt
from typing import NamedTuple, Optional, Tuple
from typing import NamedTuple, Optional

from scipy.stats import norm

Expand All @@ -23,7 +23,7 @@ class PrivacyRisk(NamedTuple):
"""

value: float
ci: Tuple[float, float]
ci: tuple[float, float]


class SuccessRate(NamedTuple):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_inference_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
from typing import Iterable
from collections.abc import Iterable

import numpy as np
import pandas as pd
Expand Down