From 709f33a2cf159b2060f740d3275c85d6958efedf Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 23 Sep 2025 04:42:43 -0400 Subject: [PATCH 1/8] Updating WtP models. Adding sentence splitting option. --- detection/nlp_text_splitter/README.md | 18 +- detection/nlp_text_splitter/install.sh | 18 +- .../nlp_text_splitter/__init__.py | 202 ++++++++++++------ .../tests/test_text_splitter.py | 39 +++- 4 files changed, 196 insertions(+), 81 deletions(-) diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md index e8c7d14..3db47e1 100644 --- a/detection/nlp_text_splitter/README.md +++ b/detection/nlp_text_splitter/README.md @@ -1,7 +1,7 @@ # Overview This directory contains the source code, test examples, and installation script -for the OpenMPF NlpTextSplitter tool, which uses WtP and spaCy libraries +for the OpenMPF NlpTextSplitter tool, which uses **SaT (Segment any Text)**, **WtP**, and **spaCy** to detect sentences in a given chunk of text. # Background @@ -10,14 +10,17 @@ Our primary motivation for creating this tool was to find a lightweight, accurat sentence detection capability to support a large variety of text processing tasks including translation and tagging. -Through preliminary investigation, we identified the [WtP library ("Where's the -Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence +Through preliminary investigation, we identified the [WtP/SaT library ("Where's the +Point"/"Segment any Text")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence detection model](https://spacy.io/models) for identifying sentence breaks in a large section of text. WtP models are trained to split up multilingual text by sentence without the need of an input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 -GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection +GB of GPU memory. 
SaT is the newer successor to WtP from the same authors and +generally offers better accuracy/efficiency. + +On the other hand, spaCy has a single multilingual sentence detection that appears to work better for splitting up English text in certain cases. Unfortunately this model lacks support handling for Chinese punctuation. @@ -40,12 +43,13 @@ Please note that several customizations are supported: setup a PyTorch installation with CUDA (GPU) libraries. - `--wtp-models-dir |-m `: Add this parameter to - change the default WtP model installation directory + change the default WtP/SaT model installation directory (default: `/opt/wtp/models`). - `--install-wtp-model|-w `: Add this parameter to specify - additional WTP models for installation. This parameter can be provided - multiple times to install more than one model. + additional WtP/SaT models for installation. Accepts both **WtP** names + (e.g., `wtp-bert-mini`) and **SaT** names (e.g., `sat-3l-sm`). + This parameter can be provided multiple times to install more than one model. - `--install-spacy-model|-s `: Add this parameter to specify additional spaCy models for installation. This parameter can be provided diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index 38d1f5c..eb3fac9 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -107,10 +107,20 @@ download_wtp_models() { for model_name in "${model_names[@]}"; do echo "Downloading the $model_name model to $wtp_models_dir." - local wtp_model_dir="$wtp_models_dir/$model_name" - python3 -c \ - "from huggingface_hub import snapshot_download; \ - snapshot_download('benjamin/$model_name', local_dir='$wtp_model_dir')" + local model_dir="$wtp_models_dir/$model_name" + + # Decide which HF org to use based on model prefix. 
+ # - WtP: benjamin/ + # - SaT: segment-any-text/ + local hf_owner="benjamin" + case "$model_name" in + sat-*) hf_owner="segment-any-text" ;; + esac + + python3 - << PY + from huggingface_hub import snapshot_download + snapshot_download(repo_id="${hf_owner}/${model_name}", local_dir="${model_dir}") +PY done } diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py index 3913b9a..69d9330 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -29,32 +29,36 @@ import importlib.resources from importlib.resources.abc import Traversable +from enum import Enum import spacy -from wtpsplit import WtP +import torch + +from wtpsplit import WtP, SaT from typing import Callable, List, Optional, Tuple from .wtp_lang_settings import WtpLanguageSettings -import torch - +class SplitMode(Enum): + DEFAULT = 'DEFAULT' + SENTENCE = 'SENTENCE' DEFAULT_WTP_MODELS = "/opt/wtp/models" # If we want to package model installation with this utility in the future: -WTP_MODELS_PATH: Traversable = importlib.resources.files(__name__) / 'models' +MODELS_PATH: Traversable = importlib.resources.files(__name__) / 'models' log = logging.getLogger(__name__) # These models must have an specified language during sentence splitting. 
-WTP_MANDATORY_ADAPTOR = ['wtp-canine-s-1l', - 'wtp-canine-s-3l', - 'wtp-canine-s-6l', - 'wtp-canine-s-9l', - 'wtp-canine-s-12l'] +WTP_MANDATORY_ADAPTOR = { + 'wtp-canine-s-1l', + 'wtp-canine-s-3l', + 'wtp-canine-s-6l', + 'wtp-canine-s-9l', + 'wtp-canine-s-12l', +} -GPU_AVAILABLE = False -if torch.cuda.is_available(): - GPU_AVAILABLE = True +GPU_AVAILABLE = torch.cuda.is_available() class TextSplitterModel: @@ -68,68 +72,95 @@ def __init__(self, model_name: str, model_setting: str, default_lang: str = "en" self.split = lambda t, **param: [t] self.update_model(model_name, model_setting, default_lang) - def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str="en"): - if model_name: - if "wtp" in model_name: - self._update_wtp_model(model_name, model_setting, default_lang) - self.split = self._split_wtp - log.info(f"Setup WtP model: {model_name}") - else: - self._update_spacy_model(model_name) - self.split = self._split_spacy - log.info(f"Setup spaCy model: {model_name}") - - def _update_wtp_model(self, wtp_model_name: str, - model_setting: str, - default_lang: str) -> None: + def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str = "en"): + if not model_name: + return + + lower_name = model_name.lower() + if lower_name.startswith("wtp"): + self._update_wtp_model(model_name, model_setting, default_lang) + self.split = self._split_wtp + log.info("Setup WtP model: %s", model_name) + elif lower_name.startswith("sat"): + self._update_sat_model(model_name, model_setting, default_lang) + self.split = self._split_sat + log.info("Setup SaT model: %s", model_name) + else: + self._update_spacy_model(model_name) + self.split = self._split_spacy + log.info("Setup spaCy model: %s", model_name) + def _resolve_cpu_gpu_device(self, model_setting: str) -> str: if model_setting == "gpu" or model_setting == "cuda": if GPU_AVAILABLE: - model_setting = "cuda" + return "cuda" else: log.warning("PyTorch determined that CUDA is not 
available. " "You may need to update the NVIDIA driver for the host system, " "or reinstall PyTorch with GPU support by setting " "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.") - model_setting = "cpu" - elif model_setting != "cpu": - log.warning("Invalid WtP model setting. Only `cpu` and `cuda` " + return "cpu" + if model_setting != "cpu": + log.warning( + "Invalid model setting '%s'. Only `cpu` and `cuda` " "(or `gpu`) WtP model options available at this time. " - "Defaulting to `cpu` mode.") - model_setting = "cpu" + "Defaulting to `cpu` mode.", model_setting) + return "cpu" - if wtp_model_name in WTP_MANDATORY_ADAPTOR: - self._mandatory_wtp_language = True - self._default_lang = default_lang + def _find_local_model_path(self, model_name: str) -> Optional[str]: + candidate = MODELS_PATH / model_name + if candidate.is_file() or candidate.is_dir(): + with importlib.resources.as_file(candidate) as path: + return str(path) - if self._model_name == wtp_model_name and self._model_setting == model_setting: - log.info(f"Using cached model, running on {self._model_setting}: " - f"{self._model_name}") + fallback = os.path.join(DEFAULT_WTP_MODELS, model_name) + if os.path.exists(fallback): + return fallback + return None + + def _update_wtp_model(self, wtp_model_name: str, + model_setting: str, + default_lang: str) -> None: + device = self._resolve_cpu_gpu_device(model_setting) + + self._model_name = wtp_model_name + self._model_setting = device + self._default_lang = default_lang + self._mandatory_wtp_language = (wtp_model_name in WTP_MANDATORY_ADAPTOR) + + local_path = self._find_local_model_path(wtp_model_name) + + if local_path: + log.info("Using downloaded WtP model at %s", local_path) + self.wtp_model = WtP(local_path) else: - self._model_setting = model_setting - self._model_name = wtp_model_name - # Check if model has been downloaded - if (WTP_MODELS_PATH / wtp_model_name).is_file(): - log.info(f"Using downloaded {wtp_model_name} model.") - 
with importlib.resources.as_file(WTP_MODELS_PATH / wtp_model_name) as path: - self.wtp_model = WtP(str(path)) - elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS, - wtp_model_name)): - - log.info(f"Using downloaded {wtp_model_name} model.") - wtp_model_name = os.path.join(DEFAULT_WTP_MODELS, - wtp_model_name) - self.wtp_model = WtP(wtp_model_name) - else: - log.warning(f"Model {wtp_model_name} not found, " - "downloading from hugging face.") - self.wtp_model = WtP(wtp_model_name) + log.warning("WtP model '%s' not found locally; downloading from Hugging Face.", wtp_model_name) + self.wtp_model = WtP(wtp_model_name) + self.wtp_model.to(device) + + def _update_sat_model(self, sat_model_name: str, model_setting: str, default_lang: str) -> None: + device = self._resolve_cpu_gpu_device(model_setting) + + self._model_name = sat_model_name + self._model_setting = device + self._default_lang = default_lang + self._mandatory_wtp_language = (sat_model_name in WTP_MANDATORY_ADAPTOR) + + local_path = self._find_local_model_path(sat_model_name) + + if local_path: + log.info("Using downloaded SaT model at %s", local_path) + self.sat_model = SaT(local_path) + else: + log.warning("SaT model '%s' not found locally; downloading from Hugging Face.", sat_model_name) + self.sat_model = SaT(sat_model_name) + + # Move model to device; SaT benefits from half precision on GPU. + if device == "cuda": + self.sat_model.half().to("cuda") + else: + self.sat_model.to("cpu") - if model_setting != "cpu" and model_setting != "cuda": - log.warning(f"Invalid setting for WtP runtime {model_setting}. 
" - "Defaulting to CPU mode.") - model_setting = "cpu" - self.wtp_model.to(model_setting) def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: if lang: @@ -152,6 +183,10 @@ def _update_spacy_model(self, spacy_model_name: str): self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) self.spacy_model.enable_pipe("senter") + def _split_sat(self, text: str, lang: Optional[str] = None) -> List[str]: + # TODO: For now, we'll only use the SaT models that are language agnostic. + return self.sat_model.split(text) + def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]: # TODO: We may add an auto model selection for spaCy in the future. # However, the drawback is we will also need to @@ -165,7 +200,9 @@ def __init__( self, text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], sentence_model: TextSplitterModel, - in_lang: Optional[str] = None) -> None: + in_lang: Optional[str] = None, + split_mode: SplitMode = SplitMode.DEFAULT) -> None: + self._sentence_model = sentence_model self._limit = limit self._num_boundary_chars = num_boundary_chars @@ -175,6 +212,7 @@ def __init__( self._overhead_size = 0 self._soft_limit = self._limit self._in_lang = in_lang + self._split_mode = split_mode if text: self.set_text(text) @@ -218,17 +256,45 @@ def _isolate_largest_section(self, text:str) -> str: def split(cls, text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], sentence_model: TextSplitterModel, - in_lang: Optional[str] = None - ): - return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split() - + in_lang: Optional[str] = None, + split_mode: SplitMode = SplitMode.DEFAULT, + ): + return cls(text, limit, num_boundary_chars, get_text_size, + sentence_model, in_lang, split_mode)._split() def _split(self): + if self._split_mode == SplitMode.SENTENCE: + yield from self._split_sentences_individually() + else: + yield from self._split_default() 
+ + def _split_default(self): if self._text_full_size <= self._limit: yield self._text else: yield from self._split_internal(self._text) + def _split_sentences_individually(self): + """ + Yield one sentence at a time. If any individual sentence exceeds the limit, + reuse the internal chunking logic to subdivide that sentence. + """ + sentences = self._sentence_model.split(self._text, lang=self._in_lang) + for sentence in sentences: + if self._get_text_size(sentence) <= self._limit: + yield sentence + else: + # Split oversized sentence using the default internal logic. + yield from self._split_sentence_text(sentence) + + def _split_sentence_text(self, text: str): + saved = (self._text, self._text_full_size, self._overhead_size, self._soft_limit) + try: + self.set_text(text) + yield from self._split_internal(text) + finally: + self._text, self._text_full_size, self._overhead_size, self._soft_limit = saved + def _split_internal(self, text): right = text while True: @@ -250,9 +316,7 @@ def _divide(self, text) -> Tuple[str, str]: left = self._isolate_largest_section(left) return left, text[len(left):] - char_per_size = len(left) / left_size - - + char_per_size = len(left) / max(left_size, 1) limit = int(self._limit * char_per_size) - self._overhead_size if limit < 1: diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py index 9782870..b24faaa 100644 --- a/detection/nlp_text_splitter/tests/test_text_splitter.py +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -27,7 +27,7 @@ import pathlib import unittest -from nlp_text_splitter import TextSplitterModel, TextSplitter +from nlp_text_splitter import TextSplitterModel, TextSplitter, SplitMode TEST_DATA = pathlib.Path(__file__).parent / 'test_data' @@ -38,6 +38,35 @@ def setUpClass(cls): cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "zh") cls.spacy_model = 
TextSplitterModel("xx_sent_ud_sm", "cpu", "en") + cls.sat_model = TextSplitterModel("sat-3l-sm", "cpu", "en") + + def test_sat_basic_sentence_split(self): + input_text = 'Hello, what is your name? My name is John.' + actual = list(TextSplitter.split(input_text, + 100, + 100, + len, + self.sat_model, + split_mode=SplitMode.SENTENCE)) + + self.assertEqual(2, len(actual)) + self.assertEqual('Hello, what is your name? ', actual[0]) + self.assertEqual('My name is John.', actual[1]) + + def test_sat_chunk_split(self): + input_text = 'Hello, what is your name? My name is John.' + actual = list(TextSplitter.split(input_text, + 28, + 28, + len, + self.sat_model, + split_mode=SplitMode.DEFAULT)) + + + self.assertEqual(2, len(actual)) + self.assertEqual('Hello, what is your name? ', actual[0]) + self.assertEqual('My name is John.', actual[1]) + def test_split_engine_difference(self): # Note: Only WtP's multilingual models @@ -68,6 +97,14 @@ def test_guess_split_simple_sentence(self): self.assertEqual(input_text, ''.join(actual)) self.assertEqual(2, len(actual)) + actual = list(TextSplitter.split(input_text, + 500, + 500, + len, + self.sat_model,split_mode=SplitMode.SENTENCE)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) + # "Hello, what is your name?" self.assertEqual('Hello, what is your name? ', actual[0]) # " My name is John." From 40d4bb790685ef86967d35ed5f3c3e04023cf618 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 23 Sep 2025 09:37:39 -0400 Subject: [PATCH 2/8] Updating WtP models. Adding sentence splitting option. 
--- detection/nlp_text_splitter/install.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index eb3fac9..c657a2e 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -117,10 +117,10 @@ download_wtp_models() { sat-*) hf_owner="segment-any-text" ;; esac - python3 - << PY - from huggingface_hub import snapshot_download - snapshot_download(repo_id="${hf_owner}/${model_name}", local_dir="${model_dir}") -PY + python3 -c \ + "from huggingface_hub import snapshot_download; \ + snapshot_download(repo_id='${hf_owner}/${model_name}', local_dir='${model_dir}')" + done } From be38c7880223ea2e021650782aaa09561bcf351a Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 23 Sep 2025 10:09:00 -0400 Subject: [PATCH 3/8] Updating WtP models. Adding sentence splitting option. --- detection/nlp_text_splitter/install.sh | 2 +- detection/nlp_text_splitter/nlp_text_splitter/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index c657a2e..be7a6e5 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -37,7 +37,7 @@ main() { fi eval set -- "$options" local wtp_models_dir=/opt/wtp/models - local wtp_models=("wtp-bert-mini") + local wtp_models=("wtp-bert-mini", "sat-3l-sm") local spacy_models=("xx_sent_ud_sm") while true; do case "$1" in diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py index 69d9330..e3dd40a 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -152,7 +152,7 @@ def _update_sat_model(self, sat_model_name: str, model_setting: str, default_lan log.info("Using downloaded SaT model at %s", local_path) 
self.sat_model = SaT(local_path) else: - log.warning("SaT model '%s' not found locally; downloading from Hugging Face.", sat_model_name) + log.warning("SaT model '%s' not found locally; downloading from Hugging Face.", sat_model_name) self.sat_model = SaT(sat_model_name) # Move model to device; SaT benefits from half precision on GPU. From b008096c732cd820706509c25e366e2b581c711d Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 23 Sep 2025 10:37:34 -0400 Subject: [PATCH 4/8] Minor bugfix --- detection/nlp_text_splitter/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index be7a6e5..34615c6 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -37,7 +37,7 @@ main() { fi eval set -- "$options" local wtp_models_dir=/opt/wtp/models - local wtp_models=("wtp-bert-mini", "sat-3l-sm") + local wtp_models=("wtp-bert-mini" "sat-3l-sm") local spacy_models=("xx_sent_ud_sm") while true; do case "$1" in From cfd4e9060c29ff9815ebbcb7652a9d96135904d1 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 14 Oct 2025 00:57:18 -0400 Subject: [PATCH 5/8] Adding newline processing to text splitter. 
--- LICENSE | 7 +- detection/nlp_text_splitter/README.md | 41 ++++- detection/nlp_text_splitter/install.sh | 10 +- .../nlp_text_splitter/__init__.py | 72 ++++---- .../nlp_text_splitter/newline_behavior.py | 159 ++++++++++++++++++ .../nlp_text_splitter/wtp_lang_settings.py | 4 +- detection/nlp_text_splitter/pyproject.toml | 4 +- .../tests/test_text_splitter.py | 77 +++++---- 8 files changed, 298 insertions(+), 76 deletions(-) create mode 100644 detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py diff --git a/LICENSE b/LICENSE index c4d1011..f79bf7b 100644 --- a/LICENSE +++ b/LICENSE @@ -25,13 +25,14 @@ The nlp_text_splitter utlity uses the following sentence detection libraries: ***************************************************************************** -The WtP, "Where the Point", sentence segmentation library falls under the MIT License: +The WtP, "Where's the Point", and SaT, "Segment any Text", sentence segmentation +libraries fall under the MIT License: -https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE +https://github.com/segment-any-text/wtpsplit/blob/main/LICENSE MIT License -Copyright (c) 2024 Benjamin Minixhofer +Copyright (c) 2024 Benjamin Minixhofer, Markus Frohmann, Igor Sterner Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/detection/nlp_text_splitter/README.md b/detection/nlp_text_splitter/README.md index 3db47e1..f050a41 100644 --- a/detection/nlp_text_splitter/README.md +++ b/detection/nlp_text_splitter/README.md @@ -1,8 +1,8 @@ # Overview This directory contains the source code, test examples, and installation script -for the OpenMPF NlpTextSplitter tool, which uses **SaT (Segment any Text)**, **WtP**, and **spaCy** -to detect sentences in a given chunk of text. 
+for the OpenMPF NlpTextSplitter tool, which uses **SaT (Segment any Text)**, +**WtP (Where's the Point)**, and **spaCy** to detect sentences in a given chunk of text. # Background @@ -21,8 +21,37 @@ GB of GPU memory. SaT is the newer successor to WtP from the same authors and generally offers better accuracy/efficiency. On the other hand, spaCy has a single multilingual sentence detection -that appears to work better for splitting up English text in certain cases. Unfortunately -this model lacks support handling for Chinese punctuation. +that appears to work better for splitting up English text in certain cases. + +This component has been updated to use the Azure Translation Component's NewLineBehavior class +for either replacing newlines with whitespace or removing them altogether, based on the detected script. + +The reason we need to consider the script/character encoding is that certain languages +treat whitespace between words as changing the meaning. For instance, in Chinese + +`电脑` would mean `computer` but `电 脑` would mean `electricity brain`. + +When calling the NLP text splitter, users can adjust the following parameters to control sentence +splitting behavior: + +- `split_mode`: set to `DEFAULT` for splitting by chunk size and `SENTENCE` when splitting by sentences + +- `newline_behavior`: controls how newlines are handled in a submitted input text. Options include: + - `GUESS` to choose ' ' for space-separated languages; '' for Chinese/Japanese/Korean. + - `SPACE` to always replace with a single space. + - `REMOVE` to always remove (no space). + - `NONE` to leave newlines unchanged. + +For instance: +``` + result = list(TextSplitter.split(input_text, + ... + self.sat_model, + split_mode='DEFAULT', + newline_behavior='NONE')) +``` +This will attempt to split using an SaT model, using the default chunking mode and no newline adjustments. 
+ # Installation @@ -47,8 +76,8 @@ Please note that several customizations are supported: (default: `/opt/wtp/models`). - `--install-wtp-model|-w `: Add this parameter to specify - additional WtP/SaT models for installation. Accepts both **WtP** names - (e.g., `wtp-bert-mini`) and **SaT** names (e.g., `sat-3l-sm`). + additional WtP/SaT models for installation. Accepts both WtP names + (e.g., `wtp-bert-mini`) and SaT names (e.g., `sat-3l-sm`). This parameter can be provided multiple times to install more than one model. - `--install-spacy-model|-s `: Add this parameter to specify diff --git a/detection/nlp_text_splitter/install.sh b/detection/nlp_text_splitter/install.sh index 34615c6..749b682 100755 --- a/detection/nlp_text_splitter/install.sh +++ b/detection/nlp_text_splitter/install.sh @@ -7,11 +7,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2024 The MITRE Corporation # +# Copyright 2025 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -159,12 +159,12 @@ Options --text-splitter-dir, -t : Path to text splitter source code. (defaults to to the same directory as this script) --gpu, -g: Install the GPU version of PyTorch - --wtp-models-dir , -m : Path where WTP models will be stored. + --wtp-models-dir , -m : Path where WtP/SaT models will be stored. (defaults to /opt/wtp/models) - --install-wtp-model, -w : Name of a WTP model to install in addtion to wtp-bert-mini. + --install-wtp-model, -w : Name of a WTP or SaT model to install in addition to 'wtp-bert-mini' and 'sat-3l-sm. 
This option can be provided more than once to specify multiple models. - --install-spacy-model | -s : Names of a spaCy model to install in addtion to + --install-spacy-model | -s : Names of a spaCy model to install in addition to xx_sent_ud_sm. The option can be provided more than once to specify multiple models. " diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py index e3dd40a..0f5f5cd 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2024 The MITRE Corporation # +# Copyright 2025 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -34,13 +34,10 @@ import torch from wtpsplit import WtP, SaT -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple, Union from .wtp_lang_settings import WtpLanguageSettings - -class SplitMode(Enum): - DEFAULT = 'DEFAULT' - SENTENCE = 'SENTENCE' +from .newline_behavior import NewLineBehavior, SplitMode DEFAULT_WTP_MODELS = "/opt/wtp/models" @@ -49,6 +46,7 @@ class SplitMode(Enum): log = logging.getLogger(__name__) + # These models must have an specified language during sentence splitting. 
WTP_MANDATORY_ADAPTOR = { 'wtp-canine-s-1l', @@ -62,7 +60,7 @@ class SplitMode(Enum): class TextSplitterModel: - # To hold spaCy, WtP, and other potential sentence detection models in cache + # To hold spaCy, WtP, SaT, and other potential sentence detection models in cache def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None: self._model_name = "" @@ -80,18 +78,18 @@ def update_model(self, model_name: str, model_setting: str = "cpu", default_lang if lower_name.startswith("wtp"): self._update_wtp_model(model_name, model_setting, default_lang) self.split = self._split_wtp - log.info("Setup WtP model: %s", model_name) + log.info(f"Setup WtP model: {model_name}") elif lower_name.startswith("sat"): self._update_sat_model(model_name, model_setting, default_lang) self.split = self._split_sat - log.info("Setup SaT model: %s", model_name) + log.info(f"Setup SaT model: {model_name}" else: self._update_spacy_model(model_name) self.split = self._split_spacy - log.info("Setup spaCy model: %s", model_name) + log.info(f"Setup spaCy model: {model_name}") def _resolve_cpu_gpu_device(self, model_setting: str) -> str: - if model_setting == "gpu" or model_setting == "cuda": + if model_setting in ("gpu", "cuda"): if GPU_AVAILABLE: return "cuda" else: @@ -102,9 +100,9 @@ def _resolve_cpu_gpu_device(self, model_setting: str) -> str: return "cpu" if model_setting != "cpu": log.warning( - "Invalid model setting '%s'. Only `cpu` and `cuda` " - "(or `gpu`) WtP model options available at this time. " - "Defaulting to `cpu` mode.", model_setting) + f"Invalid model setting {model_setting}. Only `cpu` and `cuda` " + "(or `gpu`) WtP/SaT model options available at this time. 
" + "Defaulting to `cpu` mode.") return "cpu" def _find_local_model_path(self, model_name: str) -> Optional[str]: @@ -131,10 +129,10 @@ def _update_wtp_model(self, wtp_model_name: str, local_path = self._find_local_model_path(wtp_model_name) if local_path: - log.info("Using downloaded WtP model at %s", local_path) + log.info(f"Using downloaded WtP model at {local_path}") self.wtp_model = WtP(local_path) else: - log.warning("WtP model '%s' not found locally; downloading from Hugging Face.", wtp_model_name) + log.warning(f"WtP model {wtp_model_name} not found locally; downloading from Hugging Face.") self.wtp_model = WtP(wtp_model_name) self.wtp_model.to(device) @@ -149,10 +147,10 @@ def _update_sat_model(self, sat_model_name: str, model_setting: str, default_lan local_path = self._find_local_model_path(sat_model_name) if local_path: - log.info("Using downloaded SaT model at %s", local_path) + log.info(f"Using downloaded SaT model at {local_path}") self.sat_model = SaT(local_path) else: - log.warning("SaT model '%s' not found locally; downloading from Hugging Face.", sat_model_name) + log.warning(f"SaT model {sat_model_name} not found locally; downloading from Hugging Face.") self.sat_model = SaT(sat_model_name) # Move model to device; SaT benefits from half precision on GPU. 
@@ -195,32 +193,46 @@ def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]: return [sent.text_with_ws for sent in processed_text.sents] class TextSplitter: + NewLineBehaviorType = Union[ + NewLineBehavior.Behavior, # 'GUESS' | 'SPACE' | 'REMOVE' | 'NONE' | callable | None + ] def __init__( self, text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], sentence_model: TextSplitterModel, in_lang: Optional[str] = None, - split_mode: SplitMode = SplitMode.DEFAULT) -> None: + split_mode: SplitMode = SplitMode.DEFAULT, + newline_behavior: NewLineBehaviorType = 'GUESS' + ) -> None: self._sentence_model = sentence_model self._limit = limit self._num_boundary_chars = num_boundary_chars self._get_text_size = get_text_size + self._in_lang = in_lang + self._split_mode = split_mode + + self._newline_fn: Callable[[str, Optional[str]], str] = NewLineBehavior.get(newline_behavior) self._text = "" self._text_full_size = 0 self._overhead_size = 0 self._soft_limit = self._limit - self._in_lang = in_lang - self._split_mode = split_mode if text: self.set_text(text) def set_text(self, text: str): - self._text = text - self._text_full_size = self._get_text_size(text) - chars_per_size = len(text) / self._text_full_size + + if text: + self._text = self._newline_fn(text, self._in_lang) + else: + self._text = text + + self._text_full_size = self._get_text_size(self._text) + + text_size = self._text_full_size if self._text_full_size > 0 else 1 + chars_per_size = len(self._text) / text_size self._overhead_size = self._get_text_size('') self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size @@ -232,7 +244,6 @@ def set_text(self, text: str): # before applying chars_per_size weighting. 
self._soft_limit = max(1, int((self._limit - self._overhead_size) * chars_per_size)) - def _isolate_largest_section(self, text:str) -> str: # Using cached word splitting model, isolate largest section of text string_length = len(text) @@ -247,7 +258,7 @@ def _isolate_largest_section(self, text:str) -> str: substring_list = self._sentence_model.split(substring, lang = self._in_lang) div_index = string_length - len(substring_list[-1]) - if div_index==start_indx: + if div_index == start_indx: return text return text[0:div_index] @@ -258,9 +269,12 @@ def split(cls, sentence_model: TextSplitterModel, in_lang: Optional[str] = None, split_mode: SplitMode = SplitMode.DEFAULT, + newline_behavior: NewLineBehavior.Behavior = 'GUESS' # <-- NEW ): - return cls(text, limit, num_boundary_chars, get_text_size, - sentence_model, in_lang, split_mode)._split() + return cls( + text, limit, num_boundary_chars, get_text_size, + sentence_model, in_lang, split_mode, newline_behavior + )._split() def _split(self): if self._split_mode == SplitMode.SENTENCE: diff --git a/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py b/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py new file mode 100644 index 0000000..203435e --- /dev/null +++ b/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py @@ -0,0 +1,159 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. 
# +############################################################################# + +############################################################################# +# Copyright 2025 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from __future__ import annotations + +import bisect +import re +from typing import Callable, Literal, Optional, Union + +import mpf_component_api as mpf + +# Languages that typically do NOT use spaces between words +NO_SPACE_LANGS = ('JA', 'YUE', 'ZH-HANS', 'ZH-HANT') + +class SplitMode(Enum): + DEFAULT = 'DEFAULT' + SENTENCE = 'SENTENCE' + +class ChineseAndJapaneseCodePoints: + # From http://www.unicode.org/charts/ + RANGES = sorted(( + range(0x2e80, 0x2fe0), + range(0x2ff0, 0x3130), + range(0x3190, 0x3300), + range(0x3400, 0x4dc0), + range(0x4e00, 0xa4d0), + range(0xf900, 0xfb00), + range(0xfe10, 0xfe20), + range(0xfe30, 0xfe70), + range(0xff00, 0xffa0), + range(0x16f00, 0x16fa0), + range(0x16fe0, 0x18d09), + range(0x1b000, 0x1b300), + range(0x1f200, 0x1f300), + range(0x20000, 0x2a6de), + range(0x2a700, 0x2ebe1), + range(0x2f800, 0x2fa20), + range(0x30000, 0x3134b) + ), key=lambda r: r.start) + + RANGE_BEGINS = [r.start for r in RANGES] + + @classmethod + def check_char(cls, char: str) -> bool: + """ + Determine whether or not the given character is in the Unicode code point ranges assigned + to Chinese and 
Japanese. + """ + code_point = ord(char[0]) + if code_point < cls.RANGE_BEGINS[0]: + return False + else: + idx = bisect.bisect_right(cls.RANGE_BEGINS, code_point) + return code_point in cls.RANGES[idx - 1] + + +class NewLineBehavior: + """ + Provides a callable to normalize *single* newline events while preserving intended breaks. + Modes: + - 'GUESS' : choose ' ' for space-separated langs; '' for CJK. + - 'SPACE' : always replace with a single space. + - 'REMOVE' : always remove (no space). + - 'NONE' : no change. + + Users can also provide a custom callable to augment NewLineBehavior. + """ + + Behavior = Union[ + Literal['GUESS', 'SPACE', 'REMOVE', 'NONE'], + Callable[[str, Optional[str]], str], + None + + ] + + @classmethod + def get(cls, behavior: Behavior) -> Callable[[str, Optional[str]], str]: + if callable(behavior): + return behavior + + # Default to GUESS if None or invalid string + if behavior is None: + behavior = 'GUESS' + + behavior = behavior.upper() + + if behavior == 'GUESS': + return lambda s, l: cls._replace_new_lines(s, cls._guess_lang_separator(s, l)) + elif behavior == 'REMOVE': + return lambda s, _: cls._replace_new_lines(s, '') + elif behavior == 'SPACE': + return lambda s, _: cls._replace_new_lines(s, ' ') + elif behavior == 'NONE': + return lambda s, _: s + else: + raise mpf.DetectionError.INVALID_PROPERTY.exception( + f'"{behavior}" is not a valid value for the "STRIP_NEW_LINE_BEHAVIOR" property. ' + 'Valid value are GUESS, REMOVE, SPACE, NONE.') + + @staticmethod + def _guess_lang_separator(text: str, language: Optional[str]) -> Literal['', ' ']: + if language: + if language.upper() in NO_SPACE_LANGS: + return '' + else: + return ' ' + else: + first_alpha_letter = next((ch for ch in text if ch.isalpha()), 'a') + if ChineseAndJapaneseCodePoints.check_char(first_alpha_letter): + return '' + else: + return ' ' + + + REPLACE_NEW_LINE_REGEX = re.compile(r''' + \s? # Include preceding whitespace character if present + (? 
str: + + def do_replacement(match: Match[str]) -> str: + match_text = match.group(0) + if match_text == '\n': + # Surrounding characters are not whitespace. + return replacement + else: + # There is already whitespace next to newline character, so it can just be removed. + return match_text.replace('\n', '', 1) + + return cls.REPLACE_NEW_LINE_REGEX.sub(do_replacement, text) \ No newline at end of file diff --git a/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py index c682fd3..05a3936 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/wtp_lang_settings.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2024 The MITRE Corporation # +# Copyright 2025 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/detection/nlp_text_splitter/pyproject.toml b/detection/nlp_text_splitter/pyproject.toml index 992a847..cffb338 100644 --- a/detection/nlp_text_splitter/pyproject.toml +++ b/detection/nlp_text_splitter/pyproject.toml @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. 
# ############################################################################# ############################################################################# -# Copyright 2024 The MITRE Corporation # +# Copyright 2025 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py index b24faaa..3a76c29 100644 --- a/detection/nlp_text_splitter/tests/test_text_splitter.py +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +# Copyright 2025 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2024 The MITRE Corporation # +# Copyright 2025 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -43,29 +43,15 @@ def setUpClass(cls): def test_sat_basic_sentence_split(self): input_text = 'Hello, what is your name? My name is John.' actual = list(TextSplitter.split(input_text, - 100, - 100, - len, - self.sat_model, - split_mode=SplitMode.SENTENCE)) - + 100, + 100, + len, + self.sat_model, + split_mode=SplitMode.SENTENCE)) self.assertEqual(2, len(actual)) self.assertEqual('Hello, what is your name? ', actual[0]) self.assertEqual('My name is John.', actual[1]) - def test_sat_chunk_split(self): - input_text = 'Hello, what is your name? My name is John.' 
- actual = list(TextSplitter.split(input_text, - 28, - 28, - len, - self.sat_model, - split_mode=SplitMode.DEFAULT)) - - - self.assertEqual(2, len(actual)) - self.assertEqual('Hello, what is your name? ', actual[0]) - self.assertEqual('My name is John.', actual[1]) def test_split_engine_difference(self): @@ -87,8 +73,14 @@ def test_split_engine_difference(self): actual = self.wtp_model._split_wtp(text) self.assertEqual(10, len(actual)) + # SaT seems to try to split using additional features, in addition to newlines. + actual = self.sat_model._split_sat(text) + self.assertEqual(19, len(actual)) + def test_guess_split_simple_sentence(self): - input_text = 'Hello, what is your name? My name is John.' + input_text = 'Hello, what is your name? My name is John. C. Finn.' + + # WtP Produces a clean split. actual = list(TextSplitter.split(input_text, 28, 28, @@ -97,20 +89,27 @@ def test_guess_split_simple_sentence(self): self.assertEqual(input_text, ''.join(actual)) self.assertEqual(2, len(actual)) + # "Hello, what is your name?" + self.assertEqual('Hello, what is your name? ', actual[0]) + # " My name is John." + self.assertEqual('My name is John. C. Finn.', actual[1]) + + # Seems SaT is a bit more aggressive at splitting text. actual = list(TextSplitter.split(input_text, 500, 500, len, - self.sat_model,split_mode=SplitMode.SENTENCE)) + self.sat_model, + split_mode=SplitMode.SENTENCE)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(2, len(actual)) + self.assertEqual(3, len(actual)) # "Hello, what is your name?" self.assertEqual('Hello, what is your name? ', actual[0]) # " My name is John." - self.assertEqual('My name is John.', actual[1]) + self.assertEqual('My name is John. ', actual[1]) + self.assertEqual('C. Finn.', actual[2]) - input_text = 'Hello, what is your name? My name is John.' actual = list(TextSplitter.split(input_text, 28, 28, @@ -122,7 +121,7 @@ def test_guess_split_simple_sentence(self): # "Hello, what is your name?" 
self.assertEqual('Hello, what is your name? ', actual[0]) # " My name is John." - self.assertEqual('My name is John.', actual[1]) + self.assertEqual('My name is John. C. Finn.', actual[1]) def test_split_sentence_end_punctuation(self): input_text = 'Hello. How are you? asdfasdf' @@ -161,7 +160,8 @@ def test_guess_split_edge_cases(self): 30, 30, len, - self.wtp_model)) + self.wtp_model, + newline_behavior = "NONE")) self.assertEqual(input_text, ''.join(actual)) self.assertEqual(4, len(actual)) @@ -172,11 +172,30 @@ def test_guess_split_edge_cases(self): self.assertEqual("Maybe...maybe not? \n ", actual[2]) self.assertEqual("All done, I think!", actual[3]) + # Split using WtP model. + actual = list(TextSplitter.split(input_text, + 30, + 30, + len, + self.wtp_model, + newline_behavior = "GUESS")) + + self.assertEqual(input_text.replace('\n',''), ''.join(actual)) + self.assertEqual(4, len(actual)) + + # WtP should detect and split out each sentence + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + + actual = list(TextSplitter.split(input_text, 35, 35, len, - self.spacy_model)) + self.spacy_model, + newline_behavior = "NONE")) self.assertEqual(input_text, ''.join(actual)) self.assertEqual(4, len(actual)) From b4daeca17f21c716bee2da5074feea96cd3c1ae6 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 14 Oct 2025 01:29:14 -0400 Subject: [PATCH 6/8] Adding newline processing to text splitter. 
--- detection/nlp_text_splitter/nlp_text_splitter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py index 0f5f5cd..18f7139 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -82,7 +82,7 @@ def update_model(self, model_name: str, model_setting: str = "cpu", default_lang elif lower_name.startswith("sat"): self._update_sat_model(model_name, model_setting, default_lang) self.split = self._split_sat - log.info(f"Setup SaT model: {model_name}" + log.info(f"Setup SaT model: {model_name}") else: self._update_spacy_model(model_name) self.split = self._split_spacy From 5674a7b19bb83b12b074f5146f4de63f8ed87fd3 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 14 Oct 2025 01:45:06 -0400 Subject: [PATCH 7/8] Adding newline processing to text splitter. --- .../nlp_text_splitter/nlp_text_splitter/__init__.py | 11 +++++------ .../nlp_text_splitter/newline_behavior.py | 4 ---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py index 18f7139..804dc83 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/__init__.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/__init__.py @@ -29,7 +29,6 @@ import importlib.resources from importlib.resources.abc import Traversable -from enum import Enum import spacy import torch @@ -37,7 +36,7 @@ from typing import Callable, List, Optional, Tuple, Union from .wtp_lang_settings import WtpLanguageSettings -from .newline_behavior import NewLineBehavior, SplitMode +from .newline_behavior import NewLineBehavior DEFAULT_WTP_MODELS = "/opt/wtp/models" @@ -202,7 +201,7 @@ def __init__( get_text_size: Callable[[str], int], sentence_model: TextSplitterModel, in_lang: Optional[str] = None, 
- split_mode: SplitMode = SplitMode.DEFAULT, + split_mode: str = 'DEFAULT', newline_behavior: NewLineBehaviorType = 'GUESS' ) -> None: @@ -268,8 +267,8 @@ def split(cls, text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], sentence_model: TextSplitterModel, in_lang: Optional[str] = None, - split_mode: SplitMode = SplitMode.DEFAULT, - newline_behavior: NewLineBehavior.Behavior = 'GUESS' # <-- NEW + split_mode: str = 'DEFAULT', + newline_behavior: NewLineBehavior.Behavior = 'GUESS' ): return cls( text, limit, num_boundary_chars, get_text_size, @@ -277,7 +276,7 @@ def split(cls, )._split() def _split(self): - if self._split_mode == SplitMode.SENTENCE: + if self._split_mode == 'SENTENCE': yield from self._split_sentences_individually() else: yield from self._split_default() diff --git a/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py b/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py index 203435e..8424438 100644 --- a/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py +++ b/detection/nlp_text_splitter/nlp_text_splitter/newline_behavior.py @@ -35,10 +35,6 @@ # Languages that typically do NOT use spaces between words NO_SPACE_LANGS = ('JA', 'YUE', 'ZH-HANS', 'ZH-HANT') -class SplitMode(Enum): - DEFAULT = 'DEFAULT' - SENTENCE = 'SENTENCE' - class ChineseAndJapaneseCodePoints: # From http://www.unicode.org/charts/ RANGES = sorted(( From a74d7e77e0ed7d5ddaacb81048cd1fd5684926b3 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 14 Oct 2025 01:54:35 -0400 Subject: [PATCH 8/8] Adding newline processing to text splitter. 
--- detection/nlp_text_splitter/tests/test_text_splitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/detection/nlp_text_splitter/tests/test_text_splitter.py b/detection/nlp_text_splitter/tests/test_text_splitter.py index 3a76c29..030f3db 100644 --- a/detection/nlp_text_splitter/tests/test_text_splitter.py +++ b/detection/nlp_text_splitter/tests/test_text_splitter.py @@ -27,7 +27,7 @@ import pathlib import unittest -from nlp_text_splitter import TextSplitterModel, TextSplitter, SplitMode +from nlp_text_splitter import TextSplitterModel, TextSplitter TEST_DATA = pathlib.Path(__file__).parent / 'test_data' @@ -47,7 +47,7 @@ def test_sat_basic_sentence_split(self): 100, len, self.sat_model, - split_mode=SplitMode.SENTENCE)) + split_mode='SENTENCE')) self.assertEqual(2, len(actual)) self.assertEqual('Hello, what is your name? ', actual[0]) self.assertEqual('My name is John.', actual[1]) @@ -100,7 +100,7 @@ def test_guess_split_simple_sentence(self): 500, len, self.sat_model, - split_mode=SplitMode.SENTENCE)) + split_mode='SENTENCE')) self.assertEqual(input_text, ''.join(actual)) self.assertEqual(3, len(actual))