|
| 1 | +""" |
| 2 | +DataFog Core API - Lightweight PII detection functions. |
| 3 | +
|
| 4 | +This module provides simple, lightweight functions for PII detection and anonymization |
| 5 | +without requiring heavy dependencies like spaCy or PyTorch. |
| 6 | +""" |
| 7 | + |
| 8 | +from typing import Dict, List, Union |
| 9 | + |
| 10 | +from datafog.models.anonymizer import AnonymizerType |
| 11 | + |
# Engine types as constants.
# REGEX_ENGINE is the value this module passes as TextService(engine=...).
# SPACY_ENGINE and AUTO_ENGINE are not referenced elsewhere in this module;
# presumably they name other TextService engines - confirm against TextService.
REGEX_ENGINE = "regex"
SPACY_ENGINE = "spacy"
AUTO_ENGINE = "auto"
| 16 | + |
| 17 | + |
def detect_pii(text: str) -> Dict[str, List[str]]:
    """
    Detect PII in text using only the lightweight regex engine.

    Args:
        text: Text to scan for PII

    Returns:
        Dictionary mapping each detected entity type to the list of matched
        strings, in the order the engine reported them. Matches that are
        empty or whitespace-only are left out.

    Raises:
        ImportError: If an import in the detection path fails; re-raised
            with an installation hint and chained to the original error.

    Example:
        >>> result = detect_pii("Contact john@example.com at (555) 123-4567")
        >>> print(result)
        {'EMAIL': ['john@example.com'], 'PHONE': ['(555) 123-4567']}
    """
    try:
        from datafog.services.text_service import TextService

        # Regex engine only: this path must not pull in heavy NLP dependencies
        spans = TextService(engine=REGEX_ENGINE).annotate_text_sync(
            text, structured=True
        )

        found = {}
        for span in spans:
            # Skip matches that are empty or whitespace-only
            if not span.text.strip():
                continue
            found.setdefault(span.label, []).append(span.text)

        return found

    except ImportError as e:
        raise ImportError(
            "Core dependencies missing. Install with: pip install datafog[all]"
        ) from e
| 55 | + |
| 56 | + |
def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> str:
    """
    Anonymize PII in text using only the lightweight regex engine.

    Args:
        text: Text to anonymize
        method: Anonymization method, either an AnonymizerType member or one
            of the strings 'redact', 'replace', or 'hash' (case-insensitive)

    Returns:
        Anonymized text string

    Raises:
        ValueError: If method is a string that does not name a known method.
        ImportError: If an import in the anonymization path fails; re-raised
            with an installation hint and chained to the original error.

    Example:
        >>> result = anonymize_text("Contact john@example.com", method="redact")
        >>> print(result)
        Contact [EMAIL_REDACTED]
    """
    try:
        # Imported lazily so that importing this module stays lightweight.
        # AnonymizerType is already imported at module level, so it is not
        # re-imported (and shadowed) here.
        from datafog.models.annotator import AnnotationResult
        from datafog.models.anonymizer import Anonymizer
        from datafog.services.text_service import TextService

        # Convert string method to enum if needed
        if isinstance(method, str):
            method_map = {
                "redact": AnonymizerType.REDACT,
                "replace": AnonymizerType.REPLACE,
                "hash": AnonymizerType.HASH,
            }
            # Lower-case before the lookup so "REDACT" or "Hash" are accepted
            resolved = method_map.get(method.lower())
            if resolved is None:
                raise ValueError(
                    f"Invalid method: {method}. Use 'redact', 'replace', or 'hash'"
                )
            method = resolved

        # Use lightweight regex engine only
        span_results = TextService(engine=REGEX_ENGINE).annotate_text_sync(
            text, structured=True
        )

        # Convert Span objects to the AnnotationResult format the anonymizer
        # takes, dropping matches that are empty or whitespace-only
        annotations = [
            AnnotationResult(
                entity_type=span.label,
                start=span.start,
                end=span.end,
                score=1.0,  # Regex matches are certain
                recognition_metadata=None,
            )
            for span in span_results
            if span.text.strip()
        ]

        # Create anonymizer and apply
        anonymizer = Anonymizer(anonymizer_type=method)
        return anonymizer.anonymize(text, annotations).anonymized_text

    except ImportError as e:
        raise ImportError(
            "Core dependencies missing. Install with: pip install datafog[all]"
        ) from e
| 118 | + |
| 119 | + |
def scan_text(
    text: str, return_entities: bool = False
) -> Union[bool, Dict[str, List[str]]]:
    """
    Check whether text contains any PII, optionally returning what was found.

    Args:
        text: Text to scan
        return_entities: If True, return the detected entities; if False,
            return only whether any were detected

    Returns:
        The dictionary produced by detect_pii when return_entities is True,
        otherwise True if that dictionary is non-empty and False if it is empty

    Example:
        >>> has_pii = scan_text("Contact john@example.com")
        >>> print(has_pii)
        True

        >>> entities = scan_text("Contact john@example.com", return_entities=True)
        >>> print(entities)
        {'EMAIL': ['john@example.com']}
    """
    detected = detect_pii(text)
    return detected if return_entities else bool(detected)
| 148 | + |
| 149 | + |
def get_supported_entities() -> List[str]:
    """
    List the PII entity types the regex engine can detect.

    Returns:
        The ``value`` of every item in ``RegexAnnotator().supported_entities``,
        or a built-in default list when an ImportError is raised while
        loading or querying the annotator

    Example:
        >>> entities = get_supported_entities()
        >>> print(entities)
        ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DOB', 'ZIP']
    """
    # Returned only when the regex annotator cannot be imported
    fallback = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"]

    try:
        from datafog.processing.text_processing.regex_annotator.regex_annotator import (
            RegexAnnotator,
        )

        return [item.value for item in RegexAnnotator().supported_entities]

    except ImportError:
        return fallback
| 173 | + |
| 174 | + |
# Backward compatibility aliases: `detect` and `process` are bound to the same
# function objects as `detect_pii` and `anonymize_text`, so callers using the
# shorter names keep working.
detect = detect_pii
process = anonymize_text
0 commit comments