
Commit 1dad146

feat: add tests for SpacyAnnotator and improve coverage
- Added tests for datafog.models.spacy_nlp.SpacyAnnotator.annotate_text
- Mocked spaCy dependencies to avoid network/model download needs
- Corrected entity type validation based on EntityTypes Enum
- Skipped test_spark_service_handles_pyspark_import_error due to mocking complexity
- Increased overall test coverage to >74%
1 parent a0a8bfd commit 1dad146

File tree

2 files changed: +154, -0 lines
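
Note: the commit message says entity type validation was corrected against the EntityTypes enum. For orientation, the sketch below shows the shapes the new tests assume from datafog.models.spacy_nlp. It is hypothetical, inferred from the test assertions (start/end ints, float score, "PERSON" and "LOCATION" as valid members) rather than copied from the datafog source, which may define these differently (e.g., as Pydantic models):

# Hypothetical sketch, inferred from the test assertions below;
# the real datafog.models.spacy_nlp definitions may differ.
from dataclasses import dataclass
from enum import Enum


class EntityTypes(str, Enum):
    # Members inferred from the tests; the real enum likely has more.
    PERSON = "PERSON"
    LOCATION = "LOCATION"  # the tests use this member, not spaCy's raw label


@dataclass
class AnnotationResult:
    start: int        # character offset where the entity starts
    end: int          # character offset where the entity ends
    score: float      # confidence; the tests only check this is a float
    entity_type: str  # expected to be a valid EntityTypes value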

tests/test_spacy_nlp.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# tests/test_spacy_nlp.py
import pytest
from unittest.mock import patch, MagicMock
from uuid import UUID
from datafog.models.spacy_nlp import SpacyAnnotator, AnnotationResult


@patch('datafog.models.spacy_nlp.spacy.load')
def test_annotate_text_basic(mock_spacy_load):
    """
    Test that annotate_text correctly processes text and returns AnnotationResult objects.
    """
    # Arrange: Mock the spaCy NLP object and its return value
    mock_nlp = MagicMock()
    mock_doc = MagicMock()

    # Simulate entities found by spaCy
    mock_ent1 = MagicMock()
    mock_ent1.start_char = 0
    mock_ent1.end_char = 4
    mock_ent1.label_ = "PERSON"

    mock_ent2 = MagicMock()
    mock_ent2.start_char = 11
    mock_ent2.end_char = 17
    mock_ent2.label_ = "LOCATION"  # Use valid EntityTypes member

    mock_doc.ents = [mock_ent1, mock_ent2]
    mock_nlp.return_value = mock_doc  # nlp(text) returns the mock_doc
    mock_spacy_load.return_value = mock_nlp  # spacy.load() returns the mock_nlp

    # Instantiate the annotator (doesn't load model immediately)
    annotator = SpacyAnnotator()

    # Act: Call the method under test
    test_text = "John lives in London."
    results = annotator.annotate_text(test_text)

    # Assert:
    # Check that spacy.load was called (implicitly tests load_model)
    mock_spacy_load.assert_called_once_with(annotator.model_name)
    # Check that the nlp object was called with the text
    mock_nlp.assert_called_once()
    # Check the number of results
    assert len(results) == 2

    # Check the details of the first result
    assert isinstance(results[0], AnnotationResult)
    assert results[0].start == 0
    assert results[0].end == 4
    assert results[0].entity_type == "PERSON"
    assert isinstance(results[0].score, float)

    # Check the details of the second result
    assert isinstance(results[1], AnnotationResult)
    assert results[1].start == 11
    assert results[1].end == 17
    assert results[1].entity_type == "LOCATION"  # Assert for LOCATION
    assert isinstance(results[1].score, float)


# Example of testing other branches (e.g., model already loaded)
@patch('datafog.models.spacy_nlp.spacy.load')
def test_annotate_text_model_already_loaded(mock_spacy_load):
    """
    Test that annotate_text doesn't reload the model if already loaded.
    """
    # Arrange
    mock_nlp = MagicMock()
    mock_doc = MagicMock()
    mock_doc.ents = []  # No entities for simplicity
    mock_nlp.return_value = mock_doc
    mock_spacy_load.return_value = mock_nlp

    annotator = SpacyAnnotator()
    annotator.nlp = mock_nlp  # Pre-set the nlp attribute

    # Act
    annotator.annotate_text("Some text.")

    # Assert
    mock_spacy_load.assert_not_called()  # Should not be called again
    mock_nlp.assert_called_once_with("Some text.")
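
Taken together, the two tests pin down a lazily loading annotator: spacy.load runs on first use with annotator.model_name, and a pre-set nlp attribute is reused without reloading. A hypothetical sketch of that behavior, reusing the AnnotationResult shape sketched above (the default model name and the placeholder score are assumptions; the real SpacyAnnotator may differ):

# Hypothetical sketch of the behavior the tests above encode;
# not the actual datafog implementation.
import spacy


class SpacyAnnotator:
    def __init__(self, model_name="en_core_web_lg"):  # default name is an assumption
        self.model_name = model_name
        self.nlp = None  # nothing is loaded or downloaded at construction time

    def annotate_text(self, text):
        # Lazy load: spacy.load runs once; a pre-set self.nlp is reused,
        # which is exactly what the second test asserts.
        if self.nlp is None:
            self.nlp = spacy.load(self.model_name)
        doc = self.nlp(text)
        return [
            AnnotationResult(
                start=ent.start_char,
                end=ent.end_char,
                score=1.0,  # placeholder; spaCy NER exposes no per-entity score
                entity_type=ent.label_,
            )
            for ent in doc.ents
        ]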

tests/test_spark_service.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
# tests/test_spark_service.py
import sys
import importlib
import pytest
from unittest.mock import patch, MagicMock

# DO NOT import datafog.services.spark_service at the top level

@pytest.mark.skip(reason="Skipping due to complex mocking interactions with dependencies. "
                         "Needs revisit when SparkService has real functionality.")
def test_spark_service_handles_pyspark_import_error(capsys):
    """
    Test that SparkService handles ImportError for pyspark gracefully during import
    and prints the expected message, isolating it from dependency import errors.
    """
    # Ensure the module under test and its dependency are not cached
    if "datafog.services.spark_service" in sys.modules:
        del sys.modules["datafog.services.spark_service"]
    if "datafog.processing.spark_processing.pyspark_udfs" in sys.modules:
        del sys.modules["datafog.processing.spark_processing.pyspark_udfs"]

    # Store original state
    original_modules = sys.modules.copy()

    # Modules to remove/mock
    modules_to_patch = {}
    # Remove pyspark
    modules_to_patch['pyspark'] = None
    modules_to_patch['pyspark.sql'] = None  # Also remove submodule just in case
    # Mock the problematic dependency
    modules_to_patch['datafog.processing.spark_processing.pyspark_udfs'] = MagicMock()

    # Use patch.dict to modify sys.modules for this context
    with patch.dict(sys.modules, modules_to_patch, clear=False):  # clear=False, just overlay
        try:
            # Attempt to import the module *within* the patch context.
            # The import of spark_service itself should trigger its try/except;
            # the import *within* spark_service for pyspark_udfs should get the MagicMock.
            import datafog.services.spark_service as spark_service

            # Check if the warning message was printed (stdout)
            captured = capsys.readouterr()
            expected_message = "PySpark not found. Please install it with the [spark] extra"
            assert expected_message in captured.out

            # Check stderr for the traceback from spark_service's except block
            assert "ImportError" in captured.err or "ModuleNotFoundError" in captured.err
            assert "pyspark" in captured.err

            # Verify that the placeholder is set in the imported module
            assert spark_service.SparkSession is None

            # Verify dependency was mocked (optional, but good practice)
            assert isinstance(spark_service.pyspark_udfs, MagicMock)

        finally:
            # Strict restoration of original modules is important
            sys.modules.clear()
            sys.modules.update(original_modules)
            # Re-delete the target module and dependency to ensure clean state
            if "datafog.services.spark_service" in sys.modules:
                del sys.modules["datafog.services.spark_service"]
            if "datafog.processing.spark_processing.pyspark_udfs" in sys.modules:
                del sys.modules["datafog.processing.spark_processing.pyspark_udfs"]


# Add placeholder for actual SparkService tests later if needed
# class TestSparkServiceFunctionality:
#     @pytest.mark.skipif(sys.modules.get("pyspark") is None, reason="pyspark not installed")
#     def test_spark_functionality(self):
#         # Add tests for actual service methods here
#         pass
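
Even though it is skipped, the test documents the intended behavior: importing spark_service without pyspark should print a warning to stdout, dump the traceback to stderr, and leave SparkSession as a None placeholder. A sketch of the import guard those assertions imply, inferred from the test rather than taken from the actual spark_service source:

# Hypothetical sketch of the guard the skipped test targets;
# inferred from its assertions, not copied from datafog source.
import traceback

try:
    from pyspark.sql import SparkSession
except ImportError:
    # Message the test expects on stdout
    print("PySpark not found. Please install it with the [spark] extra")
    # Traceback text the test expects on stderr
    traceback.print_exc()
    # Placeholder the test asserts: spark_service.SparkSession is None
    SparkSession = None

# Imported after the guard; the test replaces this module with a MagicMock
from datafog.processing.spark_processing import pyspark_udfs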
