Skip to content

Commit 0ac5c44

Browse files
sidmohan0 and claude committed
feat(telemetry): add anonymous opt-out PostHog telemetry for v4.3.0
Add lightweight, privacy-preserving usage telemetry to understand which engines, functions, and features are actually used. Zero new dependencies (stdlib urllib.request only). Fire-and-forget daemon threads ensure zero latency impact.

- Create datafog/telemetry.py with PostHog /capture/ integration
- Instrument detect, process, detect_pii, anonymize_text, scan_text, get_supported_entities, DataFog class, TextService, and CLI commands
- Wire track_error() into exception handlers for error visibility
- Opt-out via DATAFOG_NO_TELEMETRY=1 or DO_NOT_TRACK=1
- Anonymous ID via SHA-256 of machine info (no PII)
- Text lengths bucketed, error messages never sent
- Thread-local dedup prevents double-counting nested calls
- Fix services/__init__.py to lazy-import ImageService and SparkService, so TextService works on minimal installs without aiohttp/PIL/pyspark
- Fix pre-existing NameError in __init__.py detect() for RegexAnnotator
- 44 tests covering opt-out, privacy, non-blocking, payloads, integration, error tracking, and edge cases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e6796dc commit 0ac5c44

File tree

9 files changed

+1223
-17
lines changed

9 files changed

+1223
-17
lines changed

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,29 @@ async def redact_pii_middleware(request, call_next):
294294

295295
---
296296

297+
## Privacy & Telemetry
298+
299+
DataFog collects **anonymous** usage telemetry to help us understand which features are used and prioritize development. This data contains:
300+
301+
- Function and engine usage (e.g., "regex" vs "gliner"), plus the number and type names of detected entities (e.g., "EMAIL") — never the matched values
302+
- Coarse text-length and duration buckets (e.g., "10-100ms"), never exact text lengths or timings
303+
- Error class names only (e.g., "ImportError"), never error messages or stack traces
304+
- A one-way hashed machine identifier — no IP addresses, usernames, or file paths
305+
306+
**No text content or personally identifiable information (PII) is ever collected.**
307+
308+
To opt out, set either environment variable before running DataFog:
309+
310+
```bash
311+
export DATAFOG_NO_TELEMETRY=1
312+
# or
313+
export DO_NOT_TRACK=1
314+
```
315+
316+
Telemetry uses only Python's standard library (`urllib.request`) — no additional dependencies are installed. All sends are fire-and-forget in background threads and will never affect performance or raise exceptions.
317+
318+
---
319+
297320
## Common Use Cases
298321

299322
### Enterprise

datafog/__init__.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ def detect(text: str) -> list:
149149
>>> detect("Contact john@example.com")
150150
[{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}]
151151
"""
152+
import time as _time
153+
154+
_start = _time.monotonic()
155+
156+
_lazy_import_regex_annotator()
152157
annotator = RegexAnnotator()
153158
# Use the structured output to get proper positions
154159
_, result = annotator.annotate_with_spans(text)
@@ -166,6 +171,27 @@ def detect(text: str) -> list:
166171
}
167172
)
168173

174+
try:
175+
from .telemetry import (
176+
_get_duration_bucket,
177+
_get_text_length_bucket,
178+
track_function_call,
179+
)
180+
181+
_duration = (_time.monotonic() - _start) * 1000
182+
entity_types = list({e["type"] for e in entities})
183+
track_function_call(
184+
function_name="detect",
185+
module="datafog",
186+
engine="regex",
187+
text_length_bucket=_get_text_length_bucket(len(text)),
188+
entity_count=len(entities),
189+
entity_types_found=entity_types,
190+
duration_ms_bucket=_get_duration_bucket(_duration),
191+
)
192+
except Exception:
193+
pass
194+
169195
return entities
170196

171197

@@ -190,6 +216,10 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
190216
'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}]
191217
}
192218
"""
219+
import time as _time
220+
221+
_start = _time.monotonic()
222+
193223
findings = detect(text)
194224

195225
result = {"original": text, "findings": findings}
@@ -216,6 +246,21 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
216246

217247
result["anonymized"] = anonymized
218248

249+
try:
250+
from .telemetry import _get_duration_bucket, track_function_call
251+
252+
_duration = (_time.monotonic() - _start) * 1000
253+
track_function_call(
254+
function_name="process",
255+
module="datafog",
256+
anonymize=anonymize,
257+
method=method,
258+
entity_count=len(findings),
259+
duration_ms_bucket=_get_duration_bucket(_duration),
260+
)
261+
except Exception:
262+
pass
263+
219264
return result
220265

221266

datafog/client.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,26 @@ def scan_image(
4848
try:
4949
results = asyncio.run(ocr_client.run_ocr_pipeline(image_urls=image_urls))
5050
typer.echo(f"OCR Pipeline Results: {results}")
51+
52+
try:
53+
from .telemetry import track_function_call
54+
55+
track_function_call(
56+
function_name="scan_image",
57+
module="datafog.client",
58+
source="cli",
59+
batch_size=len(image_urls),
60+
)
61+
except Exception:
62+
pass
5163
except Exception as e:
5264
logging.exception("Error in run_ocr_pipeline")
65+
try:
66+
from .telemetry import track_error
67+
68+
track_error("scan_image", type(e).__name__, source="cli")
69+
except Exception:
70+
pass
5371
typer.echo(f"Error: {str(e)}", err=True)
5472
raise typer.Exit(code=1)
5573

@@ -83,8 +101,27 @@ def scan_text(
83101
try:
84102
results = text_client.run_text_pipeline_sync(str_list=str_list)
85103
typer.echo(f"Text Pipeline Results: {results}")
104+
105+
try:
106+
from .telemetry import track_function_call
107+
108+
track_function_call(
109+
function_name="scan_text",
110+
module="datafog.client",
111+
source="cli",
112+
batch_size=len(str_list),
113+
operations=[op.value for op in operation_list],
114+
)
115+
except Exception:
116+
pass
86117
except Exception as e:
87118
logging.exception("Text pipeline error")
119+
try:
120+
from .telemetry import track_error
121+
122+
track_error("scan_text", type(e).__name__, source="cli")
123+
except Exception:
124+
pass
88125
typer.echo(f"Error: {str(e)}", err=True)
89126
raise typer.Exit(code=1)
90127

@@ -245,6 +282,18 @@ def redact_text(text: str = typer.Argument(None, help="Text to redact")):
245282
result = anonymizer.anonymize(text, annotations)
246283
typer.echo(result.anonymized_text)
247284

285+
try:
286+
from .telemetry import track_function_call
287+
288+
track_function_call(
289+
function_name="redact_text",
290+
module="datafog.client",
291+
source="cli",
292+
method="redact",
293+
)
294+
except Exception:
295+
pass
296+
248297

249298
@app.command()
250299
def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
@@ -266,6 +315,18 @@ def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
266315
result = anonymizer.anonymize(text, annotations)
267316
typer.echo(result.anonymized_text)
268317

318+
try:
319+
from .telemetry import track_function_call
320+
321+
track_function_call(
322+
function_name="replace_text",
323+
module="datafog.client",
324+
source="cli",
325+
method="replace",
326+
)
327+
except Exception:
328+
pass
329+
269330

270331
@app.command()
271332
def hash_text(
@@ -291,6 +352,19 @@ def hash_text(
291352
result = anonymizer.anonymize(text, annotations)
292353
typer.echo(result.anonymized_text)
293354

355+
try:
356+
from .telemetry import track_function_call
357+
358+
track_function_call(
359+
function_name="hash_text",
360+
module="datafog.client",
361+
source="cli",
362+
method="hash",
363+
hash_type=hash_type.value,
364+
)
365+
except Exception:
366+
pass
367+
294368

295369
if __name__ == "__main__":
296370
app()

datafog/core.py

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ def detect_pii(text: str) -> Dict[str, List[str]]:
3030
>>> print(result)
3131
{'EMAIL': ['john@example.com'], 'PHONE': ['(555) 123-4567']}
3232
"""
33+
import time as _time
34+
35+
_start = _time.monotonic()
36+
3337
try:
3438
from datafog.services.text_service import TextService
3539

@@ -46,9 +50,36 @@ def detect_pii(text: str) -> Dict[str, List[str]]:
4650
pii_dict[entity_type] = []
4751
pii_dict[entity_type].append(annotation.text)
4852

53+
try:
54+
from datafog.telemetry import (
55+
_get_duration_bucket,
56+
_get_text_length_bucket,
57+
track_function_call,
58+
)
59+
60+
_duration = (_time.monotonic() - _start) * 1000
61+
entity_count = sum(len(v) for v in pii_dict.values())
62+
track_function_call(
63+
function_name="detect_pii",
64+
module="datafog.core",
65+
engine="regex",
66+
text_length_bucket=_get_text_length_bucket(len(text)),
67+
entity_count=entity_count,
68+
entity_types_found=list(pii_dict.keys()),
69+
duration_ms_bucket=_get_duration_bucket(_duration),
70+
)
71+
except Exception:
72+
pass
73+
4974
return pii_dict
5075

5176
except ImportError as e:
77+
try:
78+
from datafog.telemetry import track_error
79+
80+
track_error("detect_pii", type(e).__name__, engine="regex")
81+
except Exception:
82+
pass
5283
raise ImportError(
5384
"Core dependencies missing. Install with: pip install datafog[all]"
5485
) from e
@@ -70,6 +101,11 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") ->
70101
>>> print(result)
71102
"Contact [EMAIL_REDACTED]"
72103
"""
104+
import time as _time
105+
106+
_start = _time.monotonic()
107+
_method_str = method if isinstance(method, str) else method.value
108+
73109
try:
74110
from datafog.models.anonymizer import Anonymizer, AnonymizerType
75111
from datafog.services.text_service import TextService
@@ -109,9 +145,34 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") ->
109145
# Create anonymizer and apply
110146
anonymizer = Anonymizer(anonymizer_type=method)
111147
result = anonymizer.anonymize(text, annotations)
148+
149+
try:
150+
from datafog.telemetry import (
151+
_get_duration_bucket,
152+
_get_text_length_bucket,
153+
track_function_call,
154+
)
155+
156+
_duration = (_time.monotonic() - _start) * 1000
157+
track_function_call(
158+
function_name="anonymize_text",
159+
module="datafog.core",
160+
method=_method_str,
161+
text_length_bucket=_get_text_length_bucket(len(text)),
162+
duration_ms_bucket=_get_duration_bucket(_duration),
163+
)
164+
except Exception:
165+
pass
166+
112167
return result.anonymized_text
113168

114169
except ImportError as e:
170+
try:
171+
from datafog.telemetry import track_error
172+
173+
track_error("anonymize_text", type(e).__name__, method=_method_str)
174+
except Exception:
175+
pass
115176
raise ImportError(
116177
"Core dependencies missing. Install with: pip install datafog[all]"
117178
) from e
@@ -139,12 +200,28 @@ def scan_text(
139200
>>> print(entities)
140201
{'EMAIL': ['john@example.com']}
141202
"""
203+
import time as _time
204+
205+
_start = _time.monotonic()
206+
142207
entities = detect_pii(text)
143208

144-
if return_entities:
145-
return entities
146-
else:
147-
return len(entities) > 0
209+
result = entities if return_entities else len(entities) > 0
210+
211+
try:
212+
from datafog.telemetry import _get_duration_bucket, track_function_call
213+
214+
_duration = (_time.monotonic() - _start) * 1000
215+
track_function_call(
216+
function_name="scan_text",
217+
module="datafog.core",
218+
return_entities=return_entities,
219+
duration_ms_bucket=_get_duration_bucket(_duration),
220+
)
221+
except Exception:
222+
pass
223+
224+
return result
148225

149226

150227
def get_supported_entities() -> List[str]:
@@ -165,7 +242,19 @@ def get_supported_entities() -> List[str]:
165242
)
166243

167244
annotator = RegexAnnotator()
168-
return [entity.value for entity in annotator.supported_entities]
245+
result = [entity.value for entity in annotator.supported_entities]
246+
247+
try:
248+
from datafog.telemetry import track_function_call
249+
250+
track_function_call(
251+
function_name="get_supported_entities",
252+
module="datafog.core",
253+
)
254+
except Exception:
255+
pass
256+
257+
return result
169258

170259
except ImportError:
171260
# Fallback to basic list if imports fail

0 commit comments

Comments
 (0)