From a500325fcc444be8ade00b2dc9a8ebe5d3ef6c7f Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Fri, 26 Apr 2024 09:46:40 -0700 Subject: [PATCH 01/11] added data folder to exclude from git --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d032ff6..ccef881 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +data/ .env/ .idea/ .DS_Store From cb25d41b0c3fe13bd796c8f2908d605df9d4eb9a Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Thu, 9 May 2024 09:15:53 -0700 Subject: [PATCH 02/11] removed redundant presidio yml --- ai_sanitizer_app/presidio_config.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 ai_sanitizer_app/presidio_config.yml diff --git a/ai_sanitizer_app/presidio_config.yml b/ai_sanitizer_app/presidio_config.yml new file mode 100644 index 0000000..e69de29 From 46c0f97e5c783583beb9991a2c41c8fb9682714c Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Thu, 9 May 2024 09:16:38 -0700 Subject: [PATCH 03/11] removed redundant presidio yml --- ai_sanitizer_app/presidio_config.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 ai_sanitizer_app/presidio_config.yml diff --git a/ai_sanitizer_app/presidio_config.yml b/ai_sanitizer_app/presidio_config.yml deleted file mode 100644 index e69de29..0000000 From ab603d7df3b4100ffd2d4982ff83749abf161b3c Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Fri, 10 May 2024 09:03:04 -0700 Subject: [PATCH 04/11] added presidio library --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements.txt b/requirements.txt index f831cf4..3def881 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,8 @@ matplotlib seaborn tqdm pytest +presidio-anonymizer +presidio_analyzer + + + From 53afef37989e4a3fd12a03c777c917fc44980fe0 Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Fri, 10 May 2024 09:05:41 -0700 Subject: [PATCH 05/11] changed config placeholders to match presidio, removed person name from regex based --- ai_sanitizer_app/config.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/ai_sanitizer_app/config.py b/ai_sanitizer_app/config.py index add422c..1c0c44b 100644 --- a/ai_sanitizer_app/config.py +++ b/ai_sanitizer_app/config.py @@ -1,43 +1,38 @@ SENSITIVE_DATA_CONFIGS = { "EMAIL": { "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", - "placeholder": "[EMAIL_ADDRESS]" + "placeholder": "" }, "CREDIT_CARD": { "pattern": r"\b(?:\d{4}[-\s]?){3}\d{4}\b", - "placeholder": "[CREDIT_CARD]" + "placeholder": "" }, "US_SSN": { "pattern": r"\b(?!\d{9}$)\d{3}-?\d{2}-?\d{4}\b", - "placeholder": "[US_SSN]" + "placeholder": "" }, "US_BANK_ACCOUNT": { "pattern": r"\b\d{9}\b", - "placeholder": "[US_BANK_ACCOUNT]" + "placeholder": "" }, "PHONE_NUMBER": { "pattern": r"\b\d{3}-?\d{3}-?\d{4}\b", - "placeholder": "[PHONE_NUMBER]" + "placeholder": "" }, "IP_ADDRESS": { "pattern": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", - "placeholder": "[IP_ADDRESS]" + "placeholder": "" }, "UUID": { "pattern": r"\b[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\b", - "placeholder": "[UUID]" + "placeholder": "" }, "US_DRIVING_LICENSE": { "pattern": r"\b[A-Z]{1,2}\d{4,8}\b", - "placeholder": "[US_DRIVING_LICENSE]" + "placeholder": "" }, "IBAN_CODE": { "pattern": r"\b[A-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{14}\b", - "placeholder": "[IBAN_CODE]" + "placeholder": "" }, - "PERSON_NAME": { - "pattern": r"\b[A-Z][a-z]*(?:-[A-Z][a-z]*)? " - r"(?:[A-Z]\.? )?[A-Z][a-z]*(?:-[A-Z][a-z]*)?(?:,? (Jr\.|Sr\.|III|IV|Ph\.D\.))?", - "placeholder": "[PERSON_NAME]" - } } \ No newline at end of file From 80e4f1fc74b02256bc59b67a3ea10c876d571dc4 Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Fri, 10 May 2024 11:37:45 -0700 Subject: [PATCH 06/11] added entity tag for presidio --- ai_sanitizer_app/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ai_sanitizer_app/config.py b/ai_sanitizer_app/config.py index 1c0c44b..39b25a2 100644 --- a/ai_sanitizer_app/config.py +++ b/ai_sanitizer_app/config.py @@ -35,4 +35,8 @@ "pattern": r"\b[A-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{14}\b", "placeholder": "" }, + "OTHER": { + "pattern": r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b", + "placeholder": "" + } } \ No newline at end of file From ae737346f7158d227cfe10b37239e9bdb0d79bac Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Fri, 10 May 2024 12:03:16 -0700 Subject: [PATCH 07/11] implemented regex-first then presidio logic --- ai_sanitizer_app/sensitive_data_sanitizer.py | 21 +++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ai_sanitizer_app/sensitive_data_sanitizer.py b/ai_sanitizer_app/sensitive_data_sanitizer.py index 52c6ef4..a7ae803 100644 --- a/ai_sanitizer_app/sensitive_data_sanitizer.py +++ b/ai_sanitizer_app/sensitive_data_sanitizer.py @@ -1,17 +1,32 @@ import re from config import SENSITIVE_DATA_CONFIGS +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine class SensitiveDataSanitizer: def __init__(self) -> None: self.sensitive_data = SENSITIVE_DATA_CONFIGS + self.analyzer = AnalyzerEngine() + self.anonymizer = AnonymizerEngine() def sanitize_input(self, input_content: str) -> str: sanitized_content = input_content for entity, details in self.sensitive_data.items(): - regex_pattern = details["pattern"] - placeholder = details["placeholder"] - sanitized_content = re.sub(regex_pattern, placeholder, sanitized_content, flags=re.IGNORECASE) + if entity == "OTHER": + analysis_results = self.analyzer.analyze( + text=sanitized_content, + language="en", + ) + anonymized_results = self.anonymizer.anonymize( + text=sanitized_content, + analyzer_results=analysis_results + ) + sanitized_content = anonymized_results.text + else: + regex_pattern = details["pattern"] + placeholder = details["placeholder"] + sanitized_content = re.sub(regex_pattern, placeholder, sanitized_content, flags=re.IGNORECASE) return sanitized_content From 80ae9dbdaa570ca9e79fc9fceaaf29ea460ecd22 Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Fri, 10 May 2024 12:03:41 -0700 Subject: [PATCH 08/11] removed person name test from regex logic --- tests/test_sensitive_data_sanitizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_sensitive_data_sanitizer.py b/tests/test_sensitive_data_sanitizer.py index 11061e5..b317bd8 100644 --- a/tests/test_sensitive_data_sanitizer.py +++ b/tests/test_sensitive_data_sanitizer.py @@ -12,7 +12,6 @@ "UUID": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11", "US_DRIVING_LICENSE": "CA1234567", "US_BANK_ACCOUNT": "123456789", - "PERSON_NAME": "John A. Doe Jr.", } From 31d8ffb8726211a8740ea72a19638ce0a41def85 Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Mon, 13 May 2024 20:25:43 -0700 Subject: [PATCH 09/11] edited import path so that tested in jupyter --- ai_sanitizer_app/sensitive_data_sanitizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai_sanitizer_app/sensitive_data_sanitizer.py b/ai_sanitizer_app/sensitive_data_sanitizer.py index a7ae803..2353d7c 100644 --- a/ai_sanitizer_app/sensitive_data_sanitizer.py +++ b/ai_sanitizer_app/sensitive_data_sanitizer.py @@ -1,5 +1,5 @@ import re -from config import SENSITIVE_DATA_CONFIGS +from ai_sanitizer_app.config import SENSITIVE_DATA_CONFIGS from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine From 52181c45de3066383724989636d2f50ca875e125 Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Mon, 13 May 2024 20:27:12 -0700 Subject: [PATCH 10/11] renamed folder and module --- {ai_sanitizer_app => sensitive_info_disclosure}/__init__.py | 0 {ai_sanitizer_app => sensitive_info_disclosure}/config.py | 0 .../sensitive_data_sanitizer.py | 0 tests/test_sensitive_data_sanitizer.py | 4 ++-- 4 files changed, 2 insertions(+), 2 deletions(-) rename {ai_sanitizer_app => sensitive_info_disclosure}/__init__.py (100%) rename {ai_sanitizer_app => sensitive_info_disclosure}/config.py (100%) rename {ai_sanitizer_app => sensitive_info_disclosure}/sensitive_data_sanitizer.py (100%) diff --git a/ai_sanitizer_app/__init__.py b/sensitive_info_disclosure/__init__.py similarity index 100% rename from ai_sanitizer_app/__init__.py rename to sensitive_info_disclosure/__init__.py diff --git a/ai_sanitizer_app/config.py b/sensitive_info_disclosure/config.py similarity index 100% rename from ai_sanitizer_app/config.py rename to sensitive_info_disclosure/config.py diff --git a/ai_sanitizer_app/sensitive_data_sanitizer.py b/sensitive_info_disclosure/sensitive_data_sanitizer.py similarity index 100% rename from ai_sanitizer_app/sensitive_data_sanitizer.py rename to sensitive_info_disclosure/sensitive_data_sanitizer.py diff --git a/tests/test_sensitive_data_sanitizer.py b/tests/test_sensitive_data_sanitizer.py index b317bd8..ca6f874 100644 --- a/tests/test_sensitive_data_sanitizer.py +++ b/tests/test_sensitive_data_sanitizer.py @@ -1,6 +1,6 @@ import pytest -from ai_sanitizer_app.sensitive_data_sanitizer import SensitiveDataSanitizer -from ai_sanitizer_app.config import SENSITIVE_DATA_CONFIGS +from sensitive_info_disclosure.sensitive_data_sanitizer import SensitiveDataSanitizer +from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS test_cases = { "CREDIT_CARD": "4012-8888-8888-8881", From 8561bab1e8de5197440c8639c91f8ad3f0a65d75 Mon Sep 17 00:00:00 2001 From: Akerke Balgabekova Date: Mon, 13 May 2024 20:27:53 -0700 Subject: [PATCH 11/11] renamed module for webapp --- sensitive_info_disclosure/sensitive_data_sanitizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sensitive_info_disclosure/sensitive_data_sanitizer.py b/sensitive_info_disclosure/sensitive_data_sanitizer.py index 2353d7c..cea79fd 100644 --- a/sensitive_info_disclosure/sensitive_data_sanitizer.py +++ b/sensitive_info_disclosure/sensitive_data_sanitizer.py @@ -1,5 +1,5 @@ import re -from ai_sanitizer_app.config import SENSITIVE_DATA_CONFIGS +from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine