Skip to content

Commit 63e6403

Browse files
sidmohan0 and claude authored
feat(telemetry): add anonymous opt-out PostHog telemetry for v4.3.0 (#115)
Add lightweight, privacy-preserving usage telemetry to understand which engines, functions, and features are actually used. Zero new dependencies (stdlib urllib.request only). Fire-and-forget daemon threads ensure zero latency impact.

- Create datafog/telemetry.py with PostHog /capture/ integration
- Instrument detect, process, detect_pii, anonymize_text, scan_text, get_supported_entities, DataFog class, TextService, and CLI commands
- Wire track_error() into exception handlers for error visibility
- Opt-out via DATAFOG_NO_TELEMETRY=1 or DO_NOT_TRACK=1
- Anonymous ID via SHA-256 of machine info (no PII)
- Text lengths bucketed, error messages never sent
- Thread-local dedup prevents double-counting nested calls
- Fix services/__init__.py to lazy-import ImageService and SparkService, so TextService works on minimal installs without aiohttp/PIL/pyspark
- Fix pre-existing NameError in __init__.py detect() for RegexAnnotator
- 44 tests covering opt-out, privacy, non-blocking, payloads, integration, error tracking, and edge cases

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e6796dc commit 63e6403

File tree

11 files changed

+1236
-24
lines changed

11 files changed

+1236
-24
lines changed

.coveragerc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
[run]
22
source = datafog
3-
omit =
3+
omit =
44
*/tests/*
55
*/test_*
66
*/__pycache__/*
77
*/venv/*
88
*/env/*
99
setup.py
10+
datafog/__init___lean.py
11+
datafog/__init___original.py
12+
datafog/main_lean.py
13+
datafog/main_original.py
14+
datafog/services/text_service_lean.py
15+
datafog/services/text_service_original.py
1016

1117
[report]
1218
exclude_lines =

.github/workflows/ci.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ jobs:
3838
sudo apt-get update
3939
sudo apt-get install -y tesseract-ocr libtesseract-dev
4040
41-
- name: Install minimal dependencies to prevent segfault
41+
- name: Install dependencies
4242
run: |
4343
python -m pip install --upgrade pip
44-
pip install -e ".[dev]"
44+
pip install -e ".[dev]"
4545
pip install -r requirements-dev.txt
46-
# Add only safe extras that don't include heavy ML dependencies
47-
pip install -e ".[cli]"
46+
pip install -e ".[nlp,cli]"
47+
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
4848
4949
- name: Run test suite (ignore segfault during cleanup)
5050
run: |
@@ -86,9 +86,9 @@ jobs:
8686
exit(1)
8787
"
8888
89-
- name: Run coverage on core modules only
89+
- name: Run coverage
9090
run: |
91-
python -m pytest tests/test_text_service.py tests/test_regex_annotator.py tests/test_anonymizer.py --cov=datafog --cov-report=xml --cov-config=.coveragerc
91+
python -m pytest tests/ -v --ignore=tests/test_gliner_annotator.py --cov=datafog --cov-report=xml --cov-config=.coveragerc
9292
9393
- name: Upload coverage
9494
uses: codecov/codecov-action@v4

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,29 @@ async def redact_pii_middleware(request, call_next):
294294

295295
---
296296

297+
## Privacy & Telemetry
298+
299+
DataFog collects **anonymous** usage telemetry to help us understand which features are used and prioritize development. This data contains:
300+
301+
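- Function and engine usage (e.g., "regex" vs "gliner"), along with coarse text-length buckets and the counts and type names (e.g., "EMAIL") of detected entities — never the matched values themselves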
302+
- Coarse performance buckets (e.g., "10-100ms"), never exact timings
303+
- Error class names only (e.g., "ImportError"), never error messages or stack traces
304+
- A one-way hashed machine identifier — no IP addresses, usernames, or file paths
305+
306+
**No text content or personally identifiable information (PII) is ever collected.**
307+
308+
To opt out, set either environment variable before running DataFog:
309+
310+
```bash
311+
export DATAFOG_NO_TELEMETRY=1
312+
# or
313+
export DO_NOT_TRACK=1
314+
```
315+
316+
Telemetry uses only Python's standard library (`urllib.request`) — no additional dependencies are installed. All sends are fire-and-forget in background threads and will never affect performance or raise exceptions.
317+
318+
---
319+
297320
## Common Use Cases
298321

299322
### Enterprise

datafog/__init__.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ def detect(text: str) -> list:
149149
>>> detect("Contact john@example.com")
150150
[{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}]
151151
"""
152+
import time as _time
153+
154+
_start = _time.monotonic()
155+
156+
_lazy_import_regex_annotator()
152157
annotator = RegexAnnotator()
153158
# Use the structured output to get proper positions
154159
_, result = annotator.annotate_with_spans(text)
@@ -166,6 +171,27 @@ def detect(text: str) -> list:
166171
}
167172
)
168173

174+
try:
175+
from .telemetry import (
176+
_get_duration_bucket,
177+
_get_text_length_bucket,
178+
track_function_call,
179+
)
180+
181+
_duration = (_time.monotonic() - _start) * 1000
182+
entity_types = list({e["type"] for e in entities})
183+
track_function_call(
184+
function_name="detect",
185+
module="datafog",
186+
engine="regex",
187+
text_length_bucket=_get_text_length_bucket(len(text)),
188+
entity_count=len(entities),
189+
entity_types_found=entity_types,
190+
duration_ms_bucket=_get_duration_bucket(_duration),
191+
)
192+
except Exception:
193+
pass
194+
169195
return entities
170196

171197

@@ -190,6 +216,10 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
190216
'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}]
191217
}
192218
"""
219+
import time as _time
220+
221+
_start = _time.monotonic()
222+
193223
findings = detect(text)
194224

195225
result = {"original": text, "findings": findings}
@@ -216,6 +246,21 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict:
216246

217247
result["anonymized"] = anonymized
218248

249+
try:
250+
from .telemetry import _get_duration_bucket, track_function_call
251+
252+
_duration = (_time.monotonic() - _start) * 1000
253+
track_function_call(
254+
function_name="process",
255+
module="datafog",
256+
anonymize=anonymize,
257+
method=method,
258+
entity_count=len(findings),
259+
duration_ms_bucket=_get_duration_bucket(_duration),
260+
)
261+
except Exception:
262+
pass
263+
219264
return result
220265

221266

datafog/client.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,26 @@ def scan_image(
4848
try:
4949
results = asyncio.run(ocr_client.run_ocr_pipeline(image_urls=image_urls))
5050
typer.echo(f"OCR Pipeline Results: {results}")
51+
52+
try:
53+
from .telemetry import track_function_call
54+
55+
track_function_call(
56+
function_name="scan_image",
57+
module="datafog.client",
58+
source="cli",
59+
batch_size=len(image_urls),
60+
)
61+
except Exception:
62+
pass
5163
except Exception as e:
5264
logging.exception("Error in run_ocr_pipeline")
65+
try:
66+
from .telemetry import track_error
67+
68+
track_error("scan_image", type(e).__name__, source="cli")
69+
except Exception:
70+
pass
5371
typer.echo(f"Error: {str(e)}", err=True)
5472
raise typer.Exit(code=1)
5573

@@ -83,8 +101,27 @@ def scan_text(
83101
try:
84102
results = text_client.run_text_pipeline_sync(str_list=str_list)
85103
typer.echo(f"Text Pipeline Results: {results}")
104+
105+
try:
106+
from .telemetry import track_function_call
107+
108+
track_function_call(
109+
function_name="scan_text",
110+
module="datafog.client",
111+
source="cli",
112+
batch_size=len(str_list),
113+
operations=[op.value for op in operation_list],
114+
)
115+
except Exception:
116+
pass
86117
except Exception as e:
87118
logging.exception("Text pipeline error")
119+
try:
120+
from .telemetry import track_error
121+
122+
track_error("scan_text", type(e).__name__, source="cli")
123+
except Exception:
124+
pass
88125
typer.echo(f"Error: {str(e)}", err=True)
89126
raise typer.Exit(code=1)
90127

@@ -245,6 +282,18 @@ def redact_text(text: str = typer.Argument(None, help="Text to redact")):
245282
result = anonymizer.anonymize(text, annotations)
246283
typer.echo(result.anonymized_text)
247284

285+
try:
286+
from .telemetry import track_function_call
287+
288+
track_function_call(
289+
function_name="redact_text",
290+
module="datafog.client",
291+
source="cli",
292+
method="redact",
293+
)
294+
except Exception:
295+
pass
296+
248297

249298
@app.command()
250299
def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
@@ -266,6 +315,18 @@ def replace_text(text: str = typer.Argument(None, help="Text to replace PII")):
266315
result = anonymizer.anonymize(text, annotations)
267316
typer.echo(result.anonymized_text)
268317

318+
try:
319+
from .telemetry import track_function_call
320+
321+
track_function_call(
322+
function_name="replace_text",
323+
module="datafog.client",
324+
source="cli",
325+
method="replace",
326+
)
327+
except Exception:
328+
pass
329+
269330

270331
@app.command()
271332
def hash_text(
@@ -291,6 +352,19 @@ def hash_text(
291352
result = anonymizer.anonymize(text, annotations)
292353
typer.echo(result.anonymized_text)
293354

355+
try:
356+
from .telemetry import track_function_call
357+
358+
track_function_call(
359+
function_name="hash_text",
360+
module="datafog.client",
361+
source="cli",
362+
method="hash",
363+
hash_type=hash_type.value,
364+
)
365+
except Exception:
366+
pass
367+
294368

295369
if __name__ == "__main__":
296370
app()

0 commit comments

Comments
 (0)