
Commit 3e9683a

Merge pull request #56 from DataFog/feat/4.1-baseline-fixes
Feat/4.1 baseline fixes
2 parents 4be5015 + b6afabc commit 3e9683a

12 files changed: +459 −85 lines

.codecov.yml

Lines changed: 14 additions & 0 deletions
@@ -1 +1,15 @@
 comment: no
+
+coverage:
+  status:
+    project:
+      default:
+        # Target overall coverage percentage
+        target: 74%
+        # Allow coverage to drop by this amount without failing
+        # threshold: 0.5% # Optional: uncomment to allow small drops
+    patch:
+      default:
+        # Target coverage percentage for the changes in the PR/commit
+        target: 20% # Lower target for patch coverage
+        # threshold: 1% # Optional: Allow patch coverage to drop

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -25,4 +25,5 @@ repos:
     rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
+        types: [yaml, markdown] # Explicitly define file types
         exclude: .venv

README.md

Lines changed: 22 additions & 1 deletion
@@ -21,10 +21,31 @@
 
 DataFog can be installed via pip:
 
-```
+```bash
 pip install datafog
 ```
 
+### Optional Features (Extras)
+
+DataFog uses `extras` to manage dependencies for optional features like specific OCR engines or Apache Spark integration. You can install these as needed:
+
+- **OCR (Tesseract):** For image scanning using Tesseract. Requires Tesseract OCR engine to be installed on your system separately.
+  ```bash
+  pip install "datafog[ocr]"
+  ```
+- **OCR (Donut):** For image scanning using the Donut document understanding model.
+  ```bash
+  pip install "datafog[donut]"
+  ```
+- **Spark:** For processing data using PySpark.
+  ```bash
+  pip install "datafog[spark]"
+  ```
+- **All:** To install all optional features at once.
+  ```bash
+  pip install "datafog[all]"
+  ```
+
 # CLI
 
 ## 📚 Quick Reference
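With the extras documented, missing optional dependencies now surface as import-time errors instead of silent runtime `pip install`s (see the module-level `try`/`except` blocks in the diffs below). A minimal sketch of how a caller might probe for the Spark extra; the `SparkService` import path comes from this commit, while the handling around it is illustrative:

```python
# Illustrative only: detect whether the optional Spark extra is installed.
# The spark_service module now raises ModuleNotFoundError at import time when
# pyspark is missing, with a hint to run: pip install "datafog[spark]"
try:
    from datafog.services.spark_service import SparkService
except ModuleNotFoundError as exc:
    print(f"Spark support unavailable: {exc}")
else:
    service = SparkService()  # builds (or reuses) a SparkSession
```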

datafog/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "3.3.0"
+__version__ = "4.1.0"

datafog/processing/image_processing/donut_processor.py

Lines changed: 15 additions & 15 deletions
@@ -19,6 +19,21 @@
 
 from .image_downloader import ImageDownloader
 
+# Attempt imports and provide helpful error messages
+try:
+    import torch
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "torch is not installed. Please install it to use Donut features: pip install 'datafog[donut]'"
+    )
+try:
+    from transformers import DonutProcessor as TransformersDonutProcessor
+    from transformers import VisionEncoderDecoderModel
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "transformers is not installed. Please install it to use Donut features: pip install 'datafog[donut]'"
+    )
+
 
 class DonutProcessor:
     """
@@ -30,28 +45,13 @@ class DonutProcessor:
     """
 
     def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"):
-        self.ensure_installed("torch")
-        self.ensure_installed("transformers")
-
-        import torch
-        from transformers import DonutProcessor as TransformersDonutProcessor
-        from transformers import VisionEncoderDecoderModel
-
         self.processor = TransformersDonutProcessor.from_pretrained(model_path)
         self.model = VisionEncoderDecoderModel.from_pretrained(model_path)
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
         self.model.eval()
         self.downloader = ImageDownloader()
 
-    def ensure_installed(self, package_name):
-        try:
-            importlib.import_module(package_name)
-        except ImportError:
-            subprocess.check_call(
-                [sys.executable, "-m", "pip", "install", package_name]
-            )
-
     def preprocess_image(self, image: Image.Image) -> np.ndarray:
         # Convert to RGB if the image is not already in RGB mode
         if image.mode != "RGB":
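A short usage sketch for the now fail-fast Donut path. The class, default model path, and `preprocess_image` signature come from the diff above; the image file name is illustrative, and the `datafog[donut]` extra (torch, transformers) must already be installed:

```python
from PIL import Image

from datafog.processing.image_processing.donut_processor import DonutProcessor

# Defaults to the naver-clova-ix/donut-base-finetuned-cord-v2 checkpoint and
# picks CUDA automatically when torch sees a GPU (per __init__ above).
processor = DonutProcessor()

image = Image.open("receipt.png")  # illustrative file name
pixel_array = processor.preprocess_image(image)  # converts to RGB, returns an np.ndarray
```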

datafog/processing/spark_processing/pyspark_udfs.py

Lines changed: 37 additions & 44 deletions
@@ -8,70 +8,63 @@
 """
 
 import importlib
+import logging
 import subprocess
 import sys
+import traceback
+from typing import List
 
-PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
-MAXIMAL_STRING_SIZE = 1000000
-
-
-def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
-    """Extract features using en_core_web_lg model.
-
-    Returns:
-        list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
-    """
-    ensure_installed("pyspark")
-    ensure_installed("spacy")
+try:
     import spacy
+except ImportError:
+    print("Spacy not found. Please install it: pip install spacy")
+    print("and download the model: python -m spacy download en_core_web_lg")
+    spacy = None
+    traceback.print_exc()
+    sys.exit(1)
+
+try:
     from pyspark.sql import SparkSession
     from pyspark.sql.functions import udf
-    from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+    from pyspark.sql.types import ArrayType, StringType
+except ImportError:
+    print(
+        "PySpark not found. Please install it with the [spark] extra: pip install 'datafog[spark]'"
+    )
+
+    # Set placeholders to allow module import even if pyspark is not installed
+    def placeholder_udf(*args, **kwargs):
+        return None
+
+    def placeholder_arraytype(x):
+        return None
 
-    if text:
-        if len(text) > MAXIMAL_STRING_SIZE:
-            # Cut the strings for required sizes
-            text = text[:MAXIMAL_STRING_SIZE]
-        nlp = broadcasted_nlp.value
-        doc = nlp(text)
+    def placeholder_stringtype():
+        return None
 
-        # Pre-create dictionary with labels matching to expected extracted entities
-        classified_entities: dict[str, list[str]] = {
-            _label: [] for _label in PII_ANNOTATION_LABELS
-        }
-        for ent in doc.ents:
-            # Add entities from extracted values
-            classified_entities[ent.label_].append(ent.text)
+    udf = placeholder_udf
+    ArrayType = placeholder_arraytype
+    StringType = placeholder_stringtype
+    SparkSession = None  # Define a placeholder
+    traceback.print_exc()
+    # Do not exit, allow basic import but functions using Spark will fail later if called
 
-        return [_ent for _ent in classified_entities.values()]
-    else:
-        return [[] for _ in PII_ANNOTATION_LABELS]
+from datafog.processing.text_processing.spacy_pii_annotator import pii_annotator
+
+PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
+MAXIMAL_STRING_SIZE = 1000000
 
 
 def broadcast_pii_annotator_udf(
     spark_session=None, spacy_model: str = "en_core_web_lg"
 ):
     """Broadcast PII annotator across Spark cluster and create UDF"""
-    ensure_installed("pyspark")
-    ensure_installed("spacy")
-    import spacy
-    from pyspark.sql import SparkSession
-    from pyspark.sql.functions import udf
-    from pyspark.sql.types import ArrayType, StringType, StructField, StructType
-
     if not spark_session:
-        spark_session = SparkSession.builder.getOrCreate()
+        spark_session = SparkSession.builder.getOrCreate()  # noqa: F821
     broadcasted_nlp = spark_session.sparkContext.broadcast(spacy.load(spacy_model))
 
     pii_annotation_udf = udf(
         lambda text: pii_annotator(text, broadcasted_nlp),
         ArrayType(ArrayType(StringType())),
     )
     return pii_annotation_udf
-
-
-def ensure_installed(self, package_name):
-    try:
-        importlib.import_module(package_name)
-    except ImportError:
-        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
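A hedged usage sketch for `broadcast_pii_annotator_udf`. The function name, label list, and return type (`ArrayType(ArrayType(StringType()))`) come from the diff above; the DataFrame, column name, and sample text are illustrative, and both the `[spark]` extra and the `en_core_web_lg` spaCy model must be installed:

```python
from pyspark.sql import SparkSession

from datafog.processing.spark_processing.pyspark_udfs import (
    PII_ANNOTATION_LABELS,
    broadcast_pii_annotator_udf,
)

spark = SparkSession.builder.appName("datafog-example").getOrCreate()
df = spark.createDataFrame([("Contact John Doe in Paris on 2024-01-01",)], ["text"])

pii_udf = broadcast_pii_annotator_udf(spark)  # broadcasts the spaCy model once per cluster
annotated = df.withColumn("pii", pii_udf("text"))

# Each "pii" cell holds one list of entity strings per label,
# in PII_ANNOTATION_LABELS order.
annotated.show(truncate=False)
```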

datafog/services/spark_service.py

Lines changed: 21 additions & 18 deletions
@@ -7,9 +7,21 @@
 
 import importlib
 import json
+import logging
 import subprocess
 import sys
-from typing import Any, List
+from typing import Any, List, Optional
+
+# Attempt to import pyspark and provide a helpful error message if missing
+try:
+    from pyspark.sql import DataFrame, SparkSession
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "pyspark is not installed. Please install it to use Spark features: pip install datafog[spark]"
+    )
+
+from pyspark.sql.functions import udf
+from pyspark.sql.types import ArrayType, StringType
 
 
 class SparkService:
@@ -20,30 +32,21 @@ class SparkService:
     data reading and package installation.
     """
 
-    def __init__(self):
-        self.spark = self.create_spark_session()
-        self.ensure_installed("pyspark")
-
-        from pyspark.sql import DataFrame, SparkSession
-        from pyspark.sql.functions import udf
-        from pyspark.sql.types import ArrayType, StringType
+    def __init__(self, spark_session: Optional[SparkSession] = None):
+        if spark_session:
+            self.spark = spark_session
+        else:
+            self.spark = self.create_spark_session()
 
-        self.SparkSession = SparkSession
         self.DataFrame = DataFrame
         self.udf = udf
         self.ArrayType = ArrayType
        self.StringType = StringType
 
+        logging.info("SparkService initialized.")
+
     def create_spark_session(self):
-        return self.SparkSession.builder.appName("datafog").getOrCreate()
+        return SparkSession.builder.appName("datafog").getOrCreate()
 
     def read_json(self, path: str) -> List[dict]:
         return self.spark.read.json(path).collect()
-
-    def ensure_installed(self, package_name):
-        try:
-            importlib.import_module(package_name)
-        except ImportError:
-            subprocess.check_call(
-                [sys.executable, "-m", "pip", "install", package_name]
-            )
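A sketch of the new constructor injection in `SparkService`; the signature and `read_json` are from the diff above, while the session name and JSON path are illustrative:

```python
from pyspark.sql import SparkSession

from datafog.services.spark_service import SparkService

# Reuse a session you already manage (e.g. from a notebook or test fixture)...
existing = SparkSession.builder.appName("my-app").getOrCreate()
service = SparkService(spark_session=existing)

# ...or fall back to the service's own "datafog" session, as before.
default_service = SparkService()

rows = service.read_json("events.json")  # illustrative path; returns collected rows
```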

notes/ROADMAP.md

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
---

### **v4.1.0 — Baseline stability**

* **MUST** read `__version__` from `datafog/__about__.py` and import it in `setup.py`; delete the duplicate there.
* **MUST** remove every `ensure_installed()` runtime `pip install`; fail fast instead.
* **MUST** document OCR/Donut extras in `setup.py[extras]`.

---
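As an illustration of the single-sourcing called for in the first v4.1.0 bullet, a minimal `setup.py` sketch that reads the version from `datafog/__about__.py` without importing the package; the extras lists here are illustrative, not the committed dependency pins:

```python
import os
import re

from setuptools import find_packages, setup

here = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(here, "datafog", "__about__.py"), encoding="utf-8") as f:
    version = re.search(r'__version__\s*=\s*"(.+?)"', f.read()).group(1)

setup(
    name="datafog",
    version=version,  # single source of truth: datafog/__about__.py
    packages=find_packages(),
    extras_require={
        # Illustrative extras; the real pins belong in setup.py itself.
        "ocr": ["pytesseract", "Pillow"],
        "donut": ["torch", "transformers"],
        "spark": ["pyspark"],
    },
)
```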
### **v4.2.0 — Faster spaCy path**

- **MUST** hold the spaCy `nlp` object in a module-level cache (singleton).
- **MUST** replace per-doc loops with `nlp.pipe(batch_size=?, n_process=-1)`.
- **MUST** run spaCy and Tesseract calls in `asyncio.to_thread()` (or a thread-pool) so the event-loop stays free.
- **SHOULD** expose `PIPE_BATCH_SIZE` env var for tuning.

---
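A sketch of what the v4.2.0 items could look like: a module-level model cache, batched `nlp.pipe`, and a thread hand-off so the event loop stays free. `PIPE_BATCH_SIZE` mirrors the proposed env var; everything else is illustrative:

```python
import asyncio
import os
from typing import Iterable, List, Optional

import spacy

PIPE_BATCH_SIZE = int(os.getenv("PIPE_BATCH_SIZE", "64"))
_NLP: Optional[spacy.language.Language] = None


def get_nlp(model: str = "en_core_web_lg") -> spacy.language.Language:
    """Load the spaCy model once per process and reuse it (module-level singleton)."""
    global _NLP
    if _NLP is None:
        _NLP = spacy.load(model)
    return _NLP


def annotate_many(texts: Iterable[str]) -> List[List[str]]:
    """Batch texts through nlp.pipe instead of calling nlp(text) per document."""
    nlp = get_nlp()
    return [
        [ent.label_ for ent in doc.ents]
        for doc in nlp.pipe(texts, batch_size=PIPE_BATCH_SIZE)
    ]


async def annotate_many_async(texts: List[str]) -> List[List[str]]:
    # Push the blocking spaCy call onto a worker thread (Python 3.9+).
    return await asyncio.to_thread(annotate_many, texts)
```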
### **v4.3.0 — Strong types, predictable output**

- **MUST** make `_process_text` always return `Dict[str, Dict]`.
- **MUST** add `mypy --strict` to CI; fix any revealed issues.
- **SHOULD** convert `datafog.config` to a Pydantic v2 `BaseSettings`.

---
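A sketch of the Pydantic v2 conversion suggested for `datafog.config`, using the `pydantic-settings` package; the field names are illustrative, and only the env-var names `DATAFOG_DEFAULT_OCR` and `PIPE_BATCH_SIZE` appear elsewhere in this roadmap:

```python
from pydantic_settings import BaseSettings, SettingsConfigDict


class DataFogSettings(BaseSettings):
    """Illustrative settings object; these are not the library's real config keys."""

    model_config = SettingsConfigDict(env_prefix="DATAFOG_")

    default_ocr: str = "tesseract"   # overridable via DATAFOG_DEFAULT_OCR
    pipe_batch_size: int = 64        # overridable via DATAFOG_PIPE_BATCH_SIZE
    spacy_model: str = "en_core_web_lg"


settings = DataFogSettings()  # environment variables win over the defaults above
```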
### **v4.4.0 — Clean OCR architecture**

- **MUST** split `ImageService` into `TesseractOCR` and `DonutOCR`, each with `extract_text(Image)->str`.
- **MUST** let users pick via `ImageService(backend="tesseract"|"donut")` or the `DATAFOG_DEFAULT_OCR` env var.
- **SHOULD** add unit tests that stub each backend independently.

---
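A sketch of the backend split described above. The class names, `extract_text` signature, backend argument, and `DATAFOG_DEFAULT_OCR` env var come from the roadmap item; the bodies are placeholders, not DataFog code:

```python
import os
from abc import ABC, abstractmethod
from typing import Optional

from PIL import Image


class OCRBackend(ABC):
    @abstractmethod
    def extract_text(self, image: Image.Image) -> str: ...


class TesseractOCR(OCRBackend):
    def extract_text(self, image: Image.Image) -> str:
        import pytesseract  # needs the system tesseract binary installed

        return pytesseract.image_to_string(image)


class DonutOCR(OCRBackend):
    def extract_text(self, image: Image.Image) -> str:
        raise NotImplementedError("Donut inference would run here")


class ImageService:
    _backends = {"tesseract": TesseractOCR, "donut": DonutOCR}

    def __init__(self, backend: Optional[str] = None):
        # Explicit argument wins, then the DATAFOG_DEFAULT_OCR env var.
        name = backend or os.getenv("DATAFOG_DEFAULT_OCR", "tesseract")
        self.ocr: OCRBackend = self._backends[name]()
```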
### **v4.5.0 — Rust-powered pattern matching (optional wheel)**

- **MUST** create a PyO3 extension `datafog._fastregex` that wraps `aho-corasick` / `regex-automata`.
- **MUST** auto-import it when available; fall back to pure-Python silently.
- **SHOULD** publish platform wheels under `pip install "datafog[fastregex]"`.

---
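A sketch of the "auto-import, silent fallback" behaviour this item asks for. `datafog._fastregex` does not exist yet, so its API here is hypothetical; the stdlib `re` module stands in for the pure-Python path:

```python
import re

try:
    from datafog import _fastregex as fastregex  # optional compiled wheel (hypothetical)
    HAVE_FASTREGEX = True
except ImportError:
    fastregex = None
    HAVE_FASTREGEX = False


def find_all(pattern: str, text: str):
    if HAVE_FASTREGEX:
        return fastregex.find_all(pattern, text)  # hypothetical API
    return re.findall(pattern, text)  # pure-Python fallback
```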
### **v4.6.0 — Streaming and zero-copy**

- **MUST** add `async def stream_text_pipeline(iterable[str]) -> AsyncIterator[Result]`.
- **MUST** scan CSV/JSON via `pyarrow.dataset` to avoid reading the whole file into RAM.
- **SHOULD** provide example notebook comparing latency/bandwidth vs. v4.5.

---
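A sketch of the `pyarrow.dataset` scan mentioned above, reading a large CSV in record batches instead of loading it whole; the directory, column name, and batch size are illustrative:

```python
import pyarrow.dataset as ds

dataset = ds.dataset("input_data/", format="csv")

for batch in dataset.to_batches(columns=["text"], batch_size=10_000):
    for text in batch.to_pydict()["text"]:
        ...  # feed each chunk into the (future) streaming pipeline
```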
### **v4.7.0 — GPU / transformer toggle**

- **MUST** accept `DataFog(use_gpu=True)` which loads `en_core_web_trf` in half precision if CUDA is present.
- **MUST** fall back gracefully on CPU-only hosts.
- **SHOULD** benchmark and log model choice at INFO level.

---
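A sketch of the GPU toggle with a graceful CPU fallback and INFO-level logging of the model choice; the half-precision detail is omitted and the helper name is illustrative:

```python
import logging

import spacy


def load_pipeline(use_gpu: bool = False):
    # spacy.prefer_gpu() returns False on CPU-only hosts, so this degrades cleanly.
    if use_gpu and spacy.prefer_gpu():
        model = "en_core_web_trf"
    else:
        model = "en_core_web_lg"
    logging.info("Loading spaCy model %s", model)
    return spacy.load(model)
```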
### **v4.8.0 — Fast anonymizer core**

- **MUST** rewrite `Anonymizer.replace_pii/redact_pii/hash_pii` in Cython (single-pass over the string).
- **MUST** switch hashing to OpenSSL EVP via `cffi` for SHA-256/SHA3-256.
- **SHOULD** guard with `pip install "datafog[fast]"`.

---

### **v4.9.0 — Edge & CI polish**

- **MUST** compile the annotator and anonymizer to WebAssembly using `maturin`, package as `_datafog_wasm`.
- **MUST** auto-load WASM build on `wasmtime` when `import datafog.wasm` succeeds.
- **MUST** cache spaCy model artefacts in GitHub Actions with `actions/cache`, keyed by `model-hash`.
- **SHOULD** update docs and `README.md` badges for new extras and WASM support.

---

Use this ladder as-is, bumping **only the minor version** each time, so v4.0.x callers never break.
