@@ -241,6 +241,97 @@ def _annotate_with_smart_cascade(
241241 return result .spans
242242 return regex_result
243243
244+ def _annotate_single_chunk (
245+ self , text : str , structured : bool = False
246+ ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
247+ """Annotate a single chunk of text based on the engine type."""
248+ if self .engine == "regex" :
249+ if structured :
250+ _ , result = self .regex_annotator .annotate_with_spans (text )
251+ return result .spans
252+ return self .regex_annotator .annotate (text )
253+ elif self .engine == "spacy" :
254+ if self .spacy_annotator is None :
255+ raise ImportError (
256+ "SpaCy engine not available. Install with: pip install datafog[nlp]"
257+ )
258+ return self .spacy_annotator .annotate (text )
259+ elif self .engine == "gliner" :
260+ if self .gliner_annotator is None :
261+ raise ImportError (
262+ "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
263+ )
264+ return self .gliner_annotator .annotate (text )
265+ elif self .engine == "smart" :
266+ return self ._annotate_with_smart_cascade (text , structured )
267+ elif self .engine == "auto" :
268+ return self ._annotate_with_auto_engine (text , structured )
269+
270+ def _annotate_with_auto_engine (
271+ self , text : str , structured : bool = False
272+ ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
273+ """Handle auto engine annotation with regex fallback to spacy."""
274+ # Try regex first
275+ if structured :
276+ # For structured output, use annotate_with_spans directly to avoid double processing
277+ _ , result = self .regex_annotator .annotate_with_spans (text )
278+ regex_result = {}
279+ for span in result .spans :
280+ if span .label not in regex_result :
281+ regex_result [span .label ] = []
282+ regex_result [span .label ].append (span .text )
283+
284+ # Check if regex found any entities
285+ if any (entities for entities in regex_result .values ()):
286+ return result .spans
287+ else :
288+ regex_result = self .regex_annotator .annotate (text )
289+
290+ # Check if regex found any entities
291+ if any (entities for entities in regex_result .values ()):
292+ return regex_result
293+
294+ # Fall back to spacy if available
295+ if self .spacy_annotator is not None :
296+ return self .spacy_annotator .annotate (text )
297+
298+ # Return regex result even if empty
299+ if structured :
300+ # We already have the result from above in structured mode
301+ return result .spans
302+ return regex_result
303+
def _annotate_multiple_chunks_structured(self, chunks: List[str]) -> List["Span"]:
    """Annotate each chunk in structured mode and merge the spans.

    Each span's ``start`` and ``end`` are shifted by the combined length of
    the chunks that came before it, and a new span object is built with the
    original ``text`` and ``label``.

    Args:
        chunks: The text chunks, in order.

    Returns:
        All shifted spans, in chunk order.
    """
    # Resolve the span class once rather than on every iteration.
    span_cls = _get_span_class()

    merged = []
    offset = 0
    for piece in chunks:
        piece_spans = self._annotate_single_chunk(piece, structured=True)
        merged.extend(
            span_cls(
                start=found.start + offset,
                end=found.end + offset,
                text=found.text,
                label=found.label,
            )
            for found in piece_spans
        )
        # Advance by this chunk's length so the next chunk's spans line up.
        # NOTE(review): assumes chunks are contiguous, non-overlapping slices
        # of the original text — confirm against _chunk_text.
        offset += len(piece)

    return merged
326+
327+ def _annotate_multiple_chunks_dict (self , chunks : List [str ]) -> Dict [str , List [str ]]:
328+ """Handle dictionary annotation across multiple chunks."""
329+ chunk_annotations = []
330+ for chunk in chunks :
331+ chunk_result = self ._annotate_single_chunk (chunk , structured = False )
332+ chunk_annotations .append (chunk_result )
333+ return self ._combine_annotations (chunk_annotations )
334+
244335 def annotate_text_sync (
245336 self , text : str , structured : bool = False
246337 ) -> Union [Dict [str , List [str ]], List ["Span" ]]:
@@ -256,88 +347,15 @@ def annotate_text_sync(
256347 """
257348 if len (text ) <= self .text_chunk_length :
258349 # Single chunk processing
259- if self .engine == "regex" :
260- if structured :
261- _ , result = self .regex_annotator .annotate_with_spans (text )
262- return result .spans
263- return self .regex_annotator .annotate (text )
264- elif self .engine == "spacy" :
265- if self .spacy_annotator is None :
266- raise ImportError (
267- "SpaCy engine not available. Install with: pip install datafog[nlp]"
268- )
269- return self .spacy_annotator .annotate (text )
270- elif self .engine == "gliner" :
271- if self .gliner_annotator is None :
272- raise ImportError (
273- "GLiNER engine not available. Install with: pip install datafog[nlp-advanced]"
274- )
275- return self .gliner_annotator .annotate (text )
276- elif self .engine == "smart" :
277- return self ._annotate_with_smart_cascade (text , structured )
278- elif self .engine == "auto" :
279- # Try regex first
280- if structured :
281- # For structured output, use annotate_with_spans directly to avoid double processing
282- _ , result = self .regex_annotator .annotate_with_spans (text )
283- regex_result = {}
284- for span in result .spans :
285- if span .label not in regex_result :
286- regex_result [span .label ] = []
287- regex_result [span .label ].append (span .text )
288-
289- # Check if regex found any entities
290- if any (entities for entities in regex_result .values ()):
291- return result .spans
292- else :
293- regex_result = self .regex_annotator .annotate (text )
294-
295- # Check if regex found any entities
296- if any (entities for entities in regex_result .values ()):
297- return regex_result
298-
299- # Fall back to spacy if available
300- if self .spacy_annotator is not None :
301- return self .spacy_annotator .annotate (text )
302-
303- # Return regex result even if empty
304- if structured :
305- # We already have the result from above in structured mode
306- return result .spans
307- return regex_result
350+ return self ._annotate_single_chunk (text , structured )
308351 else :
309352 # Multi-chunk processing
310353 chunks = self ._chunk_text (text )
311354
312355 if structured :
313- # For structured output, we need to handle span positions across chunks
314- all_spans = []
315- current_offset = 0
316-
317- # Get Span class once outside the loop for efficiency
318- SpanClass = _get_span_class ()
319-
320- for chunk in chunks :
321- chunk_spans = self .annotate_text_sync (chunk , structured = True )
322- # Adjust span positions to account for chunk offset
323- for span in chunk_spans :
324- adjusted_span = SpanClass (
325- start = span .start + current_offset ,
326- end = span .end + current_offset ,
327- text = span .text ,
328- label = span .label ,
329- )
330- all_spans .append (adjusted_span )
331- current_offset += len (chunk )
332-
333- return all_spans
356+ return self ._annotate_multiple_chunks_structured (chunks )
334357 else :
335- # Dictionary format - combine annotations
336- chunk_annotations = []
337- for chunk in chunks :
338- chunk_result = self .annotate_text_sync (chunk , structured = False )
339- chunk_annotations .append (chunk_result )
340- return self ._combine_annotations (chunk_annotations )
358+ return self ._annotate_multiple_chunks_dict (chunks )
341359
342360 async def annotate_text_async (
343361 self , text : str , structured : bool = False
0 commit comments