Skip to content
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
data/
.env/
.idea/
.DS_Store
Expand Down
17 changes: 0 additions & 17 deletions ai_sanitizer_app/sensitive_data_sanitizer.py

This file was deleted.

5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ matplotlib
seaborn
tqdm
pytest
presidio-anonymizer
presidio_analyzer



File renamed without changes.
25 changes: 12 additions & 13 deletions ai_sanitizer_app/config.py → sensitive_info_disclosure/config.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,42 @@
# Registry of sensitive-data entities. Each entry maps an entity name to:
#   "pattern"     - regular expression used to locate the value in text
#   "placeholder" - text substituted for every match
# Insertion order matters: the sanitizer applies the entries in dict order.
SENSITIVE_DATA_CONFIGS = {
    "EMAIL": {
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not alternation, so it must not appear here.
        "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        "placeholder": "<EMAIL_ADDRESS>",
    },
    "CREDIT_CARD": {
        # Four groups of four digits, optionally separated by "-" or whitespace.
        "pattern": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
        "placeholder": "<CREDIT_CARD>",
    },
    "US_SSN": {
        "pattern": r"\b(?!\d{9}$)\d{3}-?\d{2}-?\d{4}\b",
        "placeholder": "<US_SSN>",
    },
    "US_BANK_ACCOUNT": {
        # Exactly nine consecutive digits.
        "pattern": r"\b\d{9}\b",
        "placeholder": "<US_BANK_ACCOUNT>",
    },
    "PHONE_NUMBER": {
        "pattern": r"\b\d{3}-?\d{3}-?\d{4}\b",
        "placeholder": "<PHONE_NUMBER>",
    },
    "IP_ADDRESS": {
        # Dotted quad; octet values are not range-checked.
        "pattern": r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
        "placeholder": "<IP_ADDRESS>",
    },
    "UUID": {
        "pattern": r"\b[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\b",
        "placeholder": "<UUID>",
    },
    "US_DRIVING_LICENSE": {
        "pattern": r"\b[A-Z]{1,2}\d{4,8}\b",
        "placeholder": "<US_DRIVING_LICENSE>",
    },
    "IBAN_CODE": {
        "pattern": r"\b[A-Z]{2}[0-9]{2}[a-zA-Z0-9]{4}[0-9]{14}\b",
        "placeholder": "<IBAN_CODE>",
    },
    # SensitiveDataSanitizer special-cases the "OTHER" key and does not read
    # this entry's pattern or placeholder.
    "OTHER": {
        "pattern": r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b",
        "placeholder": "",
    },
}
32 changes: 32 additions & 0 deletions sensitive_info_disclosure/sensitive_data_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import re
from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine


class SensitiveDataSanitizer:
    """Replace sensitive values in free text with placeholders.

    Every entry of ``SENSITIVE_DATA_CONFIGS`` is applied in dict order.
    Regular entries are redacted with their regex ``pattern`` and
    ``placeholder``; the special ``"OTHER"`` entry hands the text to the
    Presidio analyzer and anonymizer engines instead.
    """

    def __init__(self) -> None:
        self.sensitive_data = SENSITIVE_DATA_CONFIGS
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def sanitize_input(self, input_content: str) -> str:
        """Return ``input_content`` with all configured entities redacted.

        Args:
            input_content: Text to sanitize.

        Returns:
            The sanitized text. An empty string is returned unchanged
            without running any regex or Presidio pass.
        """
        # Nothing to redact: skip the regex passes and the analyzer call.
        if input_content == "":
            return input_content

        sanitized_content = input_content
        for entity, details in self.sensitive_data.items():
            if entity == "OTHER":
                sanitized_content = self._anonymize_with_presidio(sanitized_content)
                continue
            # Patterns are matched case-insensitively regardless of how
            # they are written in the config.
            sanitized_content = re.sub(
                details["pattern"],
                details["placeholder"],
                sanitized_content,
                flags=re.IGNORECASE,
            )
        return sanitized_content

    def _anonymize_with_presidio(self, text: str) -> str:
        """Run the Presidio analyzer on ``text`` and return the anonymized text."""
        analysis_results = self.analyzer.analyze(
            text=text,
            language="en",
        )
        anonymized_results = self.anonymizer.anonymize(
            text=text,
            analyzer_results=analysis_results,
        )
        return anonymized_results.text


5 changes: 2 additions & 3 deletions tests/test_sensitive_data_sanitizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
from ai_sanitizer_app.sensitive_data_sanitizer import SensitiveDataSanitizer
from ai_sanitizer_app.config import SENSITIVE_DATA_CONFIGS
from sensitive_info_disclosure.sensitive_data_sanitizer import SensitiveDataSanitizer
from sensitive_info_disclosure.config import SENSITIVE_DATA_CONFIGS

test_cases = {
"CREDIT_CARD": "4012-8888-8888-8881",
Expand All @@ -12,7 +12,6 @@
"UUID": "a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11",
"US_DRIVING_LICENSE": "CA1234567",
"US_BANK_ACCOUNT": "123456789",
"PERSON_NAME": "John A. Doe Jr.",
}


Expand Down