diff --git a/.gitignore b/.gitignore index 4dba1c9..75ec26b 100644 --- a/.gitignore +++ b/.gitignore @@ -128,7 +128,8 @@ dmypy.json # Pyre type checker .pyre/ -# vscode +# IDE .vscode/ +.idea/ -**/*DS_Store* \ No newline at end of file +**/*DS_Store* diff --git a/README.md b/README.md index ec08757..9bbfd40 100644 --- a/README.md +++ b/README.md @@ -159,4 +159,3 @@ This `bibtex` entry can be used to refer to the paper: ### License Licensed under Clear BSD License, see `LICENSE.md` to see the full license text. Patent-pending code (application US-20230401336-A1). - diff --git a/src/anonymeter/evaluators/inference_evaluator.py b/src/anonymeter/evaluators/inference_evaluator.py index 94c87dd..a9ad545 100644 --- a/src/anonymeter/evaluators/inference_evaluator.py +++ b/src/anonymeter/evaluators/inference_evaluator.py @@ -2,26 +2,27 @@ # Copyright (c) 2022 Anonos IP LLC. # See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. """Privacy evaluator that measures the inference risk.""" - from typing import Optional import numpy as np import numpy.typing as npt import pandas as pd -from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors +from anonymeter.evaluators.inference_predictor import InferencePredictor +from anonymeter.neighbors.mixed_types_kneighbors import KNNInferencePredictor from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk def _run_attack( - target: pd.DataFrame, - syn: pd.DataFrame, - n_attacks: int, - aux_cols: list[str], - secret: str, - n_jobs: int, - naive: bool, - regression: Optional[bool], + target: pd.DataFrame, + syn: pd.DataFrame, + n_attacks: int, + aux_cols: list[str], + secret: str, + n_jobs: int, + naive: bool, + regression: Optional[bool], + inference_model: Optional[InferencePredictor], ) -> int: if regression is None: regression = pd.api.types.is_numeric_dtype(target[secret]) @@ -30,21 +31,17 @@ def _run_attack( if naive: guesses = syn.sample(n_attacks)[secret] - else: - nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=syn[aux_cols]) - - guesses_idx = nn.kneighbors(queries=targets[aux_cols]) - if isinstance(guesses_idx, tuple): - raise RuntimeError("guesses_idx cannot be a tuple") - - guesses = syn.iloc[guesses_idx.flatten()][secret] + # Instantiate the default KNN model if no other model is passed through `inference_model`. + if inference_model is None: + inference_model = KNNInferencePredictor(data=syn, columns=aux_cols, target_col=secret, n_jobs=n_jobs) + guesses = inference_model.predict(targets) return evaluate_inference_guesses(guesses=guesses, secrets=targets[secret], regression=regression).sum() def evaluate_inference_guesses( - guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05 + guesses: pd.Series, secrets: pd.Series, regression: bool, tolerance: float = 0.05 ) -> npt.NDArray: """Evaluate the success of an inference attack. @@ -142,23 +139,33 @@ class InferenceEvaluator: the variable. n_attacks : int, default is 500 Number of attack attempts. + In case the whole dataset size should be used, set this to np.inf. + inference_model: InferencePredictor + An ml model fitted on `syn` as training data, and `secret` as target, that supports ::predict(x). + If not None, it will be used over the MixedTypeKNeighbors in the attack. """ def __init__( - self, - ori: pd.DataFrame, - syn: pd.DataFrame, - aux_cols: list[str], - secret: str, - regression: Optional[bool] = None, - n_attacks: int = 500, - control: Optional[pd.DataFrame] = None, + self, + ori: pd.DataFrame, + syn: pd.DataFrame, + aux_cols: list[str], + secret: str, + regression: Optional[bool] = None, + n_attacks: int = 500, + control: Optional[pd.DataFrame] = None, + inference_model: Optional[InferencePredictor] = None ): self._ori = ori self._syn = syn self._control = control self._n_attacks = n_attacks + self._inference_model = inference_model + + self._n_attacks_ori = min(n_attacks, self._ori.shape[0]) + self._n_attacks_baseline = min(self._syn.shape[0], self._n_attacks_ori) + self._n_attacks_control = -1 if self._control is None else min(n_attacks, self._control.shape[0]) # check if secret is a string column if not isinstance(secret, str): @@ -173,16 +180,17 @@ def __init__( self._aux_cols = aux_cols self._evaluated = False - def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int: + def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int, n_attacks: int) -> int: return _run_attack( target=target, syn=self._syn, - n_attacks=self._n_attacks, + n_attacks=n_attacks, aux_cols=self._aux_cols, secret=self._secret, n_jobs=n_jobs, naive=naive, regression=self._regression, + inference_model=self._inference_model, ) def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator": @@ -199,11 +207,14 @@ def evaluate(self, n_jobs: int = -2) -> "InferenceEvaluator": The evaluated ``InferenceEvaluator`` object. """ - self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs) - self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs) + self._n_baseline = self._attack(target=self._ori, naive=True, n_jobs=n_jobs, + n_attacks=self._n_attacks_baseline) + self._n_success = self._attack(target=self._ori, naive=False, n_jobs=n_jobs, + n_attacks=self._n_attacks_ori) self._n_control = ( - None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs) - ) + None if self._control is None else self._attack(target=self._control, naive=False, n_jobs=n_jobs, + n_attacks=self._n_attacks_control) + ) self._evaluated = True return self @@ -226,7 +237,7 @@ def results(self, confidence_level: float = 0.95) -> EvaluationResults: raise RuntimeError("The inference evaluator wasn't evaluated yet. Please, run `evaluate()` first.") return EvaluationResults( - n_attacks=self._n_attacks, + n_attacks=(self._n_attacks_ori, self._n_attacks_baseline, self._n_attacks_control), n_success=self._n_success, n_baseline=self._n_baseline, n_control=self._n_control, diff --git a/src/anonymeter/evaluators/inference_predictor.py b/src/anonymeter/evaluators/inference_predictor.py new file mode 100644 index 0000000..b8b96d0 --- /dev/null +++ b/src/anonymeter/evaluators/inference_predictor.py @@ -0,0 +1,31 @@ +# This file is part of Anonymeter and is released under BSD 3-Clause Clear License. +# Copyright (c) 2022 Anonos IP LLC. +# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. +"""A protocol for a custom inference predictor.""" +from typing import Protocol + +import pandas as pd + + +class InferencePredictor(Protocol): + """Interface for custom inference models. + + It is used as `inference_model` in the InferenceEvaluator in inference_evaluator.py. + + For an example usage refer to the SklearnInferencePredictor in sklearn_inference_predictor.py. + """ + def predict(self, x: pd.DataFrame) -> pd.Series: + """Predict the targets for input `x`. + + Parameters + ---------- + x : pd.DataFrame + The input data to predict. + + Returns + ------- + pd.Series + The predictions as pd.Series. + + """ + ... diff --git a/src/anonymeter/evaluators/sklearn_inference_predictor.py b/src/anonymeter/evaluators/sklearn_inference_predictor.py new file mode 100644 index 0000000..4418aa2 --- /dev/null +++ b/src/anonymeter/evaluators/sklearn_inference_predictor.py @@ -0,0 +1,44 @@ +# This file is part of Anonymeter and is released under BSD 3-Clause Clear License. +# Copyright (c) 2022 Anonos IP LLC. +# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. +"""A wrapper class around a sklearn model implementing the InferencePredictor.""" +import pandas as pd +from sklearn.base import BaseEstimator, is_classifier, is_regressor + +from anonymeter.evaluators.inference_predictor import InferencePredictor + + +class SklearnInferencePredictor(InferencePredictor): + """Wrapper class to use sklearn methods in the inference evaluator. + + Parameters + ---------- + model : sklearn.base.BaseEstimator + A classifier or regressor which implements ::predict(). + The model needs to be fitted, it must contain its own preprocessing pipeline, + and it needs to respect the index of the input data. + + """ + def __init__(self, model: BaseEstimator): + if not (is_classifier(estimator=model) or is_regressor(estimator=model)): + raise ValueError("Model must be classifier or regressor %s", model) + if not hasattr(model, "predict"): + raise ValueError("Model must have a predict method, %s", model) + self._model = model + + def predict(self, x: pd.DataFrame) -> pd.Series: + """Predict the targets for input `x`. + + Parameters + ---------- + x : pd.DataFrame + The input data to predict. + + Returns + ------- + pd.Series + The predictions as pd.Series. + + """ + prediction = self._model.predict(x) + return pd.Series(prediction, index=x.index) diff --git a/src/anonymeter/neighbors/mixed_types_kneighbors.py b/src/anonymeter/neighbors/mixed_types_kneighbors.py index 36c94b8..318628a 100644 --- a/src/anonymeter/neighbors/mixed_types_kneighbors.py +++ b/src/anonymeter/neighbors/mixed_types_kneighbors.py @@ -12,6 +12,7 @@ from joblib import Parallel, delayed from numba import jit +from anonymeter.evaluators.inference_predictor import InferencePredictor from anonymeter.preprocessing.transformations import mixed_types_transform from anonymeter.preprocessing.type_detection import detect_consistent_col_types @@ -75,7 +76,7 @@ def gower_distance(r0: npt.NDArray, r1: npt.NDArray, cat_cols_index: int) -> flo @jit(nopython=True, nogil=True) def _nearest_neighbors( - queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int + queries: npt.NDArray, candidates: npt.NDArray, cat_cols_index: int, n_neighbors: int ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.float64]]: r"""For every element of ``queries``, find its nearest neighbors in ``candidates``. @@ -166,7 +167,7 @@ def fit(self, candidates: pd.DataFrame, ctypes: Optional[dict[str, list[str]]] = return self def kneighbors( - self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False + self, queries: pd.DataFrame, n_neighbors: Optional[int] = None, return_distance: bool = False ) -> Union[tuple[npt.NDArray, npt.NDArray], npt.NDArray]: """Find the nearest neighbors for a set of query points. @@ -220,7 +221,7 @@ def kneighbors( with Parallel(n_jobs=self._n_jobs, backend="threading") as executor: res = executor( delayed(_nearest_neighbors)( - queries=queries[ii : ii + 1], + queries=queries[ii: ii + 1], candidates=candidates, cat_cols_index=len(self._ctypes["num"]), n_neighbors=n_neighbors, @@ -235,3 +236,45 @@ def kneighbors( return distances, indexes return indexes + + +class KNNInferencePredictor(InferencePredictor): + """Wrapper class to use MixedTypeKNeighbors in the inference evaluator. + + Parameters + ---------- + data : pd.DataFrame + The train data to fit the model on (usually the synthetic data). + columns : list[str] + The auxiliary columns of `data`, used as input to the model. + target_col : str + The target column of `data`. + n_jobs : int, default is -2 + Number of jobs to use. It follows joblib convention, so that ``n_jobs = -1`` + means all available cores + + """ + + def __init__(self, data: pd.DataFrame, columns: list[str], target_col: str, n_jobs: int): + self._nn = MixedTypeKNeighbors(n_jobs=n_jobs, n_neighbors=1).fit(candidates=data[columns]) + self._target_series = data[target_col] + self._columns = columns + + def predict(self, x: pd.DataFrame) -> pd.Series: + """Predict the targets for input `x`. + + Parameters + ---------- + x : pd.DataFrame + The input data to predict. + + Returns + ------- + pd.Series + The predictions as pd.Series. + + """ + guesses_idx = self._nn.kneighbors(queries=x[self._columns]) + if isinstance(guesses_idx, tuple): + raise RuntimeError("guesses_idx cannot be a tuple") + return self._target_series.iloc[guesses_idx.flatten()] diff --git a/src/anonymeter/stats/confidence.py b/src/anonymeter/stats/confidence.py index b95049c..8ddae4b 100644 --- a/src/anonymeter/stats/confidence.py +++ b/src/anonymeter/stats/confidence.py @@ -5,7 +5,7 @@ import warnings from math import sqrt -from typing import NamedTuple, Optional +from typing import NamedTuple, Optional, Union from scipy.stats import norm @@ -174,8 +174,12 @@ class EvaluationResults: Parameters ---------- - n_attacks : int + n_attacks : Union[int, tuple[int, int, int]] Total number of attacks performed. + It can be a single number (int) which will apply to all three: main (ori), baseline, and control attack, + or a tuple (n_attacks_ori, n_attacks_baseline, n_attacks_control) - (int, int, int) which will contain + different numbers of attacks in case the user wants to perform different number of attacks for each + main (ori), baseline and control target dataset. n_success : int Number of successful attacks. n_baseline : int @@ -194,23 +198,31 @@ class EvaluationResults: def __init__( self, - n_attacks: int, + n_attacks: Union[int, tuple[int, int, int]], n_success: int, n_baseline: int, n_control: Optional[int] = None, confidence_level: float = 0.95, ): - self.attack_rate = success_rate(n_total=n_attacks, n_success=n_success, confidence_level=confidence_level) + if isinstance(n_attacks, int): + self.n_attacks_ori = n_attacks + self.n_attacks_baseline = n_attacks + self.n_attacks_control = n_attacks + elif isinstance(n_attacks, tuple): + self.n_attacks_ori, self.n_attacks_baseline, self.n_attacks_control = n_attacks + else: + raise ValueError(f"n_attacks must be an integer or a tuple of three integers, got {n_attacks}") + + self.attack_rate = success_rate(n_total=self.n_attacks_ori, n_success=n_success, confidence_level=confidence_level) - self.baseline_rate = success_rate(n_total=n_attacks, n_success=n_baseline, confidence_level=confidence_level) + self.baseline_rate = success_rate(n_total=self.n_attacks_baseline, n_success=n_baseline, confidence_level=confidence_level) self.control_rate = ( None if n_control is None - else success_rate(n_total=n_attacks, n_success=n_control, confidence_level=confidence_level) + else success_rate(n_total=self.n_attacks_control, n_success=n_control, confidence_level=confidence_level) ) - self.n_attacks = n_attacks self.n_success = n_success self.n_baseline = n_baseline self.n_control = n_control diff --git a/tests/test_inference_evaluator.py b/tests/test_inference_evaluator.py index c8cf9ab..eeaa934 100644 --- a/tests/test_inference_evaluator.py +++ b/tests/test_inference_evaluator.py @@ -72,9 +72,9 @@ def test_evaluate_inference_guesses_regression_tolerance(guesses, secrets, toler ], ) def test_inference_evaluator_rates( - ori: Iterable, - syn: Iterable, - expected: float, + ori: Iterable, + syn: Iterable, + expected: float, ): # created a dataframe from ori and name columns c0 and c1 ori = pd.DataFrame(ori, columns=pd.Index(["c0", "c1"])) diff --git a/tests/test_sklearn_inference_model.py b/tests/test_sklearn_inference_model.py new file mode 100644 index 0000000..ac45d2a --- /dev/null +++ b/tests/test_sklearn_inference_model.py @@ -0,0 +1,56 @@ +# This file is part of Anonymeter and is released under BSD 3-Clause Clear License. +# Copyright (c) 2022 Anonos IP LLC. +# See https://github.com/statice/anonymeter/blob/main/LICENSE.md for details. + +import numpy as np +import pytest +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.tree import DecisionTreeRegressor + +from anonymeter.evaluators.inference_evaluator import InferenceEvaluator +from anonymeter.evaluators.sklearn_inference_predictor import SklearnInferencePredictor + +from tests.fixtures import get_adult + + +@pytest.mark.parametrize( + "aux_cols", + [ + ["type_employer", "capital_loss", "hr_per_week", "age"], + ["education_num", "type_employer", "capital_loss"], + ["age", "type_employer", "race"], + ], +) +@pytest.mark.parametrize("secret", ["capital_gain", "capital_loss"]) +def test_inference_evaluator_custom_model_regressor(aux_cols, secret): + ori = get_adult("ori", n_samples=10) + + # Inference model prep + categorical_cols = ori[aux_cols].select_dtypes(include=["object"]).columns + numeric_cols = ori[aux_cols].select_dtypes(include=["number"]).columns + + preprocess = ColumnTransformer( + transformers=[ + ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols), + ("num", "passthrough", numeric_cols) + ] + ) + tree = DecisionTreeRegressor(random_state=42) + + model = Pipeline(steps=[ + ("preprocess", preprocess), + ("tree", tree) + ]) + model.fit(ori[aux_cols], ori[secret]) + inference_model = SklearnInferencePredictor(model) + + # Evaluator + evaluator = InferenceEvaluator(ori=ori, syn=ori, control=ori, aux_cols=aux_cols, secret=secret, n_attacks=10, + inference_model=inference_model, regression=True) + evaluator.evaluate(n_jobs=1) + results = evaluator.results(confidence_level=0) + + np.testing.assert_equal(results.attack_rate, (1, 0)) + np.testing.assert_equal(results.control_rate, (1, 0))