Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ dmypy.json
# Pyre type checker
.pyre/

# vscode
# IDE
.vscode/
.idea/

**/*DS_Store*
**/*DS_Store*
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,3 @@ This `bibtex` entry can be used to refer to the paper:
### License

Licensed under Clear BSD License, see `LICENSE.md` to see the full license text. Patent-pending code (application US-20230401336-A1).

79 changes: 45 additions & 34 deletions src/anonymeter/evaluators/inference_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,27 @@
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""Privacy evaluator that measures the inference risk."""

from typing import Optional

import numpy as np
import numpy.typing as npt
import pandas as pd

from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors
from anonymeter.evaluators.inference_predictor import InferencePredictor
from anonymeter.neighbors.mixed_types_kneighbors import KNNInferencePredictor
from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk


def _run_attack(
target: pd.DataFrame,
syn: pd.DataFrame,
n_attacks: int,
aux_cols: list[str],
secret: str,
n_jobs: int,
naive: bool,
regression: Optional[bool],
target: pd.DataFrame,
syn: pd.DataFrame,
n_attacks: int,
aux_cols: list[str],
secret: str,
n_jobs: int,
naive: bool,
regression: Optional[bool],
inference_model: Optional[InferencePredictor],
) -> int:
if regression is None:
regression = pd.api.types.is_numeric_dtype(target[secret])
Expand All @@ -30,21 +31,17 @@ def _run_attack(

if naive:
guesses = syn.sample(n_attacks)[secret]

else:
nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=syn[aux_cols])

guesses_idx = nn.kneighbors(queries=targets[aux_cols])
if isinstance(guesses_idx, tuple):
raise RuntimeError("guesses_idx cannot be a tuple")

guesses = syn.iloc[guesses_idx.flatten()][secret]
# Instantiate the default KNN model if no other model is passed through `inference_model`.
if inference_model is None:
inference_model = KNNInferencePredictor(data=syn, columns=aux_cols, target_col=secret, n_jobs=n_jobs)
guesses = inference_model.predict(targets)

return evaluate_inference_guesses(guesses=guesses, secrets=targets[secret], regression=regression).sum()


def evaluate_inference_guesses(
guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05
) -> npt.NDArray:
"""Evaluate the success of an inference attack.

Expand Down Expand Up @@ -142,23 +139,33 @@ class InferenceEvaluator:
the variable.
n_attacks : int, default is 500
Number of attack attempts.
In case the whole dataset size should be used, set this to np.inf.
inference_model: InferencePredictor
An ml model fitted on `syn` as training data, and `secret` as target, that supports ::predict(x).
If not None, it will be used over the MixedTypeKNeighbors in the attack.

"""

def __init__(
self,
ori: pd.DataFrame,
syn: pd.DataFrame,
aux_cols: list[str],
secret: str,
regression: Optional[bool] = None,
n_attacks: int = 500,
control: Optional[pd.DataFrame] = None,
self,
ori: pd.DataFrame,
syn: pd.DataFrame,
aux_cols: list[str],
secret: str,
regression: Optional[bool] = None,
n_attacks: int = 500,
control: Optional[pd.DataFrame] = None,
inference_model: Optional[InferencePredictor] = None
):
self._ori = ori
self._syn = syn
self._control = control
self._n_attacks = n_attacks
self._inference_model = inference_model

self._n_attacks_ori = min(n_attacks, self._ori.shape[0])
self._n_attacks_baseline = min(self._syn.shape[0], self._n_attacks_ori)
self._n_attacks_control = -1 if self._control is None else min(n_attacks, self._control.shape[0])

# check if secret is a string column
if not isinstance(secret, str):
Expand All @@ -173,16 +180,17 @@ def __init__(
self._aux_cols = aux_cols
self._evaluated = False

def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int:
def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int, n_attacks: int) -> int:
return _run_attack(
target=target,
syn=self._syn,
n_attacks=self._n_attacks,
n_attacks=n_attacks,
aux_cols=self._aux_cols,
secret=self._secret,
n_jobs=n_jobs,
naive=naive,
regression=self._regression,
inference_model=self._inference_model,
)

def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
Expand All @@ -199,11 +207,14 @@ def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator":
The evaluated ``InferenceEvaluator`` object.

"""
self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs)
self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs)
self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs,
n_attacks=self._n_attacks_baseline)
self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs,
n_attacks=self._n_attacks_ori)
self._n_control = (
None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs)
)
None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs,
n_attacks=self._n_attacks_control)
)

self._evaluated = True
return self
Expand All @@ -226,7 +237,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults:
raise RuntimeError("The inference evaluator wasn't evaluated yet. Please, run `evaluate()` first.")

return EvaluationResults(
n_attacks=self._n_attacks,
n_attacks=(self._n_attacks_ori, self._n_attacks_baseline, self._n_attacks_control),
n_success=self._n_success,
n_baseline=self._n_baseline,
n_control=self._n_control,
Expand Down
31 changes: 31 additions & 0 deletions src/anonymeter/evaluators/inference_predictor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""A protocol for a custom inference predictor."""
from typing import Protocol

import pandas as pd


class InferencePredictor(Protocol):
    """Structural interface for pluggable inference models.

    Any object that exposes a ``predict`` method with this signature can be
    passed as ``inference_model`` to the ``InferenceEvaluator`` in
    inference_evaluator.py.

    For a concrete example see ``SklearnInferencePredictor`` in
    sklearn_inference_predictor.py.
    """

    def predict(self, x: pd.DataFrame) -> pd.Series:
        """Return the predicted targets for every record in `x`.

        Parameters
        ----------
        x : pd.DataFrame
            The input records to predict on.

        Returns
        -------
        pd.Series
            One prediction per row of `x`.

        """
        ...
44 changes: 44 additions & 0 deletions src/anonymeter/evaluators/sklearn_inference_predictor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# This file is part of Anonymeter and is released under BSD 3-Clause Clear License.
# Copyright (c) 2022 Anonos IP LLC.
# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details.
"""A wrapper class around a sklearn model implementing the InferencePredictor."""
import pandas as pd
from sklearn.base import BaseEstimator, is_classifier, is_regressor

from anonymeter.evaluators.inference_predictor import InferencePredictor


class SklearnInferencePredictor(InferencePredictor):
    """Wrapper class to use sklearn methods in the inference evaluator.

    Parameters
    ----------
    model : sklearn.base.BaseEstimator
        A classifier or regressor which implements ::predict().
        The model needs to be fitted, it must contain its own preprocessing
        pipeline, and it needs to respect the index of the input data.

    Raises
    ------
    ValueError
        If `model` is neither a classifier nor a regressor, or if it does
        not expose a ``predict`` method.

    """
    def __init__(self, model: BaseEstimator):
        # NOTE: use f-strings here — the original logging-style
        # `ValueError("... %s", model)` form never interpolates the
        # placeholder, so the exception carried a two-element args tuple
        # instead of a readable message.
        if not (is_classifier(estimator=model) or is_regressor(estimator=model)):
            raise ValueError(f"Model must be classifier or regressor, got: {model}")
        if not hasattr(model, "predict"):
            raise ValueError(f"Model must have a predict method, got: {model}")
        self._model = model

    def predict(self, x: pd.DataFrame) -> pd.Series:
        """Predict the targets for input `x`.

        Parameters
        ----------
        x : pd.DataFrame
            The input data to predict.

        Returns
        -------
        pd.Series
            The predictions as pd.Series, re-indexed to match `x`.

        """
        prediction = self._model.predict(x)
        # sklearn returns a bare ndarray; wrap it so downstream code can
        # rely on pandas semantics and on the caller's index.
        return pd.Series(prediction, index=x.index)
49 changes: 46 additions & 3 deletions src/anonymeter/neighbors/mixed_types_kneighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from joblib import Parallel, delayed
from numba import jit

from anonymeter.evaluators.inference_predictor import InferencePredictor
from anonymeter.preprocessing.transformations import mixed_types_transform
from anonymeter.preprocessing.type_detection import detect_consistent_col_types

Expand Down Expand Up @@ -75,7 +76,7 @@ def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> flo

@jit(nopython=True, nogil=True)
def _nearest_neighbors(
queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]:
r"""For every element of ``queries``, find its nearest neighbors in ``candidates``.

Expand Down Expand Up @@ -166,7 +167,7 @@ def fit(self, candidates: pd.DataFrame, ctypes: Optional[dict[str, list[str]]] =
return self

def kneighbors(
self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False
) -> Union[tuple[npt.NDArray, npt.NDArray], npt.NDArray]:
"""Find the nearest neighbors for a set of query points.

Expand Down Expand Up @@ -220,7 +221,7 @@ def kneighbors(
with Parallel(n_jobs=self._n_jobs, backend="threading") as executor:
res = executor(
delayed(_nearest_neighbors)(
queries=queries[ii : ii + 1],
queries=queries[ii: ii + 1],
candidates=candidates,
cat_cols_index=len(self._ctypes["num"]),
n_neighbors=n_neighbors,
Expand All @@ -235,3 +236,45 @@ def kneighbors(
return distances, indexes

return indexes


class KNNInferencePredictor(InferencePredictor):
    """Wrapper class to use MixedTypeKNeighbors in the inference evaluator.

    Predicts the target of each query record as the target value of its
    single nearest neighbor in `data`.

    Parameters
    ----------
    data : pd.DataFrame
        The train data to fit the model on (usually the synthetic data).
    columns : list[str]
        The auxiliary columns of `data`, used as input to the model.
    target_col : str
        The target column of `data`.
    n_jobs : int, default is -2
        Number of jobs to use. It follows joblib convention, so that ``n_jobs = -1``
        means all available cores

    """

    # The signature now carries the -2 default that the docstring already
    # promised; previously `n_jobs` was a required argument.
    def __init__(self, data: pd.DataFrame, columns: list[str], target_col: str, n_jobs: int = -2):
        # Fit once at construction time; `predict` then only runs queries.
        self._nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=data[columns])
        self._target_series = data[target_col]
        self._columns = columns

    def predict(self, x: pd.DataFrame) -> pd.Series:
        """Predict the targets for input `x`.

        Parameters
        ----------
        x : pd.DataFrame
            The input data to predict.

        Returns
        -------
        pd.Series
            The predictions as pd.Series.
            NOTE(review): the result keeps the index of the training `data`,
            not of `x` (unlike SklearnInferencePredictor) — confirm callers
            compare by position rather than by index alignment.

        """
        guesses_idx = self._nn.kneighbors(queries=x[self._columns])
        # kneighbors only returns a (distances, indexes) tuple when
        # return_distance=True, which we never request here.
        if isinstance(guesses_idx, tuple):
            raise RuntimeError("guesses_idx cannot be a tuple")
        return self._target_series.iloc[guesses_idx.flatten()]
26 changes: 19 additions & 7 deletions src/anonymeter/stats/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import warnings
from math import sqrt
from typing import NamedTuple, Optional
from typing import NamedTuple, Optional, Union

from scipy.stats import norm

Expand Down Expand Up @@ -174,8 +174,12 @@ class EvaluationResults:

Parameters
----------
n_attacks : int
n_attacks : Union[int, tuple[int, int, int]]
Total number of attacks performed.
It can be a single number (int) which will apply to all three: main (ori), baseline, and control attack,
or a tuple (n_attacks_ori, n_attacks_baseline, n_attacks_control) - (int, int, int) which will contain
different numbers of attacks in case the user wants to perform different number of attacks for each
main (ori), baseline and control target dataset.
n_success : int
Number of successful attacks.
n_baseline : int
Expand All @@ -194,23 +198,31 @@ class EvaluationResults:

def __init__(
self,
n_attacks: int,
n_attacks: Union[int, tuple[int, int, int]],
n_success: int,
n_baseline: int,
n_control: Optional[int] = None,
confidence_level: float = 0.95,
):
self.attack_rate = success_rate(n_total=n_attacks, n_success=n_success, confidence_level=confidence_level)
if isinstance(n_attacks, int):
self.n_attacks_ori = n_attacks
self.n_attacks_baseline = n_attacks
self.n_attacks_control = n_attacks
elif isinstance(n_attacks, tuple):
self.n_attacks_ori, self.n_attacks_baseline, self.n_attacks_control = n_attacks
else:
raise ValueError(f"n_attacks must be an integer or a tuple of three integers, got {n_attacks}")

self.attack_rate = success_rate(n_total=self.n_attacks_ori, n_success=n_success, confidence_level=confidence_level)

self.baseline_rate = success_rate(n_total=n_attacks, n_success=n_baseline, confidence_level=confidence_level)
self.baseline_rate = success_rate(n_total=self.n_attacks_baseline, n_success=n_baseline, confidence_level=confidence_level)

self.control_rate = (
None
if n_control is None
else success_rate(n_total=n_attacks, n_success=n_control, confidence_level=confidence_level)
else success_rate(n_total=self.n_attacks_control, n_success=n_control, confidence_level=confidence_level)
)

self.n_attacks = n_attacks
self.n_success = n_success
self.n_baseline = n_baseline
self.n_control = n_control
Expand Down
Loading