diff --git a/.gitignore b/.gitignore index d032ff6..ccef881 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +data/ .env/ .idea/ .DS_Store diff --git a/ai_sanitizer_app/sensitive_data_sanitizer.py b/ai_sanitizer_app/sensitive_data_sanitizer.py deleted file mode 100644 index 52c6ef4..0000000 --- a/ai_sanitizer_app/sensitive_data_sanitizer.py +++ /dev/null @@ -1,17 +0,0 @@ -import re -from config import SENSITIVE_DATA_CONFIGS - - -class SensitiveDataSanitizer: - def __init__(self) -> None: - self.sensitive_data = SENSITIVE_DATA_CONFIGS - - def sanitize_input(self, input_content: str) -> str: - sanitized_content = input_content - for entity, details in self.sensitive_data.items(): - regex_pattern = details["pattern"] - placeholder = details["placeholder"] - sanitized_content = re.sub(regex_pattern, placeholder, sanitized_content, flags=re.IGNORECASE) - return sanitized_content - - diff --git a/requirements.txt b/requirements.txt index f831cf4..3def881 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,8 @@ matplotlib seaborn tqdm pytest +presidio-anonymizer +presidio_analyzer + + + diff --git a/ai_sanitizer_app/__init__.py b/sensitive_info_disclosure/__init__.py similarity index 100% rename from ai_sanitizer_app/__init__.py rename to sensitive_info_disclosure/__init__.py diff --git a/ai_sanitizer_app/config.py b/sensitive_info_disclosure/config.py similarity index 58% rename from ai_sanitizer_app/config.py rename to sensitive_info_disclosure/config.py index add422c..39b25a2 100644 --- a/ai_sanitizer_app/config.py +++ b/sensitive_info_disclosure/config.py @@ -1,43 +1,42 @@ SENSITIVE_DATA_CONFIGS = { "EMAIL": { "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", - "placeholder": "[EMAIL_ADDRESS]" + "placeholder": "" }, "CREDIT_CARD": { "pattern": r"\b(?:\d{4}[-\s]?){3}\d{4}\b", - "placeholder": "[CREDIT_CARD]" + "placeholder": "" }, "US_SSN": { "pattern": r"\b(?!\d{9}$)\d{3}-?\d{2}-?\d{4}\b", - "placeholder": "[US_SSN]" + "placeholder": "" }, "US_BANK_ACCOUNT": { "pattern": r"\b\d{9}\b", - "placeholder": "[US_BANK_ACCOUNT]" + "placeholder": "" }, "PHONE_NUMBER": { "pattern": r"\b\d{3}-?\d{3}-?\d{4}\b", - "placeholder": "[PHONE_NUMBER]" + "placeholder": "" }, "IP_ADDRESS": { "pattern": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", - "placeholder": "[IP_ADDRESS]" + "placeholder": "" }, "UUID": { "pattern": r"\b[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\b", - "placeholder": "[UUID]" + "placeholder": "" }, "US_DRIVING_LICENSE": { "pattern": r"\b[A-Z]{1,2}\d{4,8}\b", - "placeholder": "[US_DRIVING_LICENSE]" + "placeholder": "" }, "IBAN_CODE": { "pattern": r"\b[A-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{14}\b", - "placeholder": "[IBAN_CODE]" + "placeholder": "" }, - "PERSON_NAME": { - "pattern": r"\b[A-Z][a-z]*(?:-[A-Z][a-z]*)? " - r"(?:[A-Z]\.? )?[A-Z][a-z]*(?:-[A-Z][a-z]*)?(?:,? (Jr\.|Sr\.|III|IV|Ph\.D\.))?", - "placeholder": "[PERSON_NAME]" + "OTHER": { + "pattern": r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b", + "placeholder": "" } } \ No newline at end of file diff --git a/sensitive_info_disclosure/sensitive_data_sanitizer.py b/sensitive_info_disclosure/sensitive_data_sanitizer.py new file mode 100644 index 0000000..cea79fd --- /dev/null +++ b/sensitive_info_disclosure/sensitive_data_sanitizer.py @@ -0,0 +1,32 @@ +import re +from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine + + +class SensitiveDataSanitizer: + def __init__(self) -> None: + self.sensitive_data = SENSITIVE_DATA_CONFIGS + self.analyzer = AnalyzerEngine() + self.anonymizer = AnonymizerEngine() + + def sanitize_input(self, input_content: str) -> str: + sanitized_content = input_content + for entity, details in self.sensitive_data.items(): + if entity == "OTHER": + analysis_results = self.analyzer.analyze( + text=sanitized_content, + language="en", + ) + anonymized_results = self.anonymizer.anonymize( + text=sanitized_content, + analyzer_results=analysis_results + ) + sanitized_content = anonymized_results.text + else: + regex_pattern = details["pattern"] + placeholder = details["placeholder"] + sanitized_content = re.sub(regex_pattern, placeholder, sanitized_content, flags=re.IGNORECASE) + return sanitized_content + + diff --git a/tests/test_sensitive_data_sanitizer.py b/tests/test_sensitive_data_sanitizer.py index 11061e5..ca6f874 100644 --- a/tests/test_sensitive_data_sanitizer.py +++ b/tests/test_sensitive_data_sanitizer.py @@ -1,6 +1,6 @@ import pytest -from ai_sanitizer_app.sensitive_data_sanitizer import SensitiveDataSanitizer -from ai_sanitizer_app.config import SENSITIVE_DATA_CONFIGS +from sensitive_info_disclosure.sensitive_data_sanitizer import SensitiveDataSanitizer +from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS test_cases = { "CREDIT_CARD": "4012-8888-8888-8881", @@ -12,7 +12,6 @@ "UUID": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11", "US_DRIVING_LICENSE": "CA1234567", "US_BANK_ACCOUNT": "123456789", - "PERSON_NAME": "John A. Doe Jr.", }