From 61abb88a502ea3cd4c6a91ac111f6e8229903c17 Mon Sep 17 00:00:00 2001
From: dmnc-grdnr
Date: Sun, 10 Aug 2025 12:46:20 +0200
Subject: [PATCH] #328 - Remove annotations in a specified range

- added functions for removing annotations from cas
  - cut_sofa_string_to_range
  - remove_in_range
- added tests
---
 cassis/cas.py     |  70 +++++++++++++++++++++++++++
 tests/test_cas.py | 119 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)

diff --git a/cassis/cas.py b/cassis/cas.py
index 1125abc..ca0c163 100644
--- a/cassis/cas.py
+++ b/cassis/cas.py
@@ -367,6 +367,47 @@ def add_annotations(self, annotations: Iterable[FeatureStructure]):
         """
         self.add_all(annotations)
 
+    def cut_sofa_string_to_range(self, sofa_begin: int, sofa_end: int, overlap: bool = True):
+        """Replaces the sofa string with the cutout `[sofa_begin, sofa_end)` and re-bases the annotations.
+
+        Annotations lying completely inside the cutout are kept and shifted by `-sofa_begin`. Annotations
+        sharing only a part of their span with the cutout are clipped to its borders if `overlap` is true
+        and removed otherwise. All remaining annotations are removed. Feature structures without
+        `begin`/`end` offsets are left untouched.
+
+        Args:
+            sofa_begin: Offset in the current sofa string at which the cutout starts (inclusive).
+            sofa_end: Offset in the current sofa string at which the cutout ends (exclusive).
+            overlap: If true, keeps annotations crossing a cutout border and clips them to the cutout.
+
+        Raises:
+            ValueError: If `0 <= sofa_begin < sofa_end <= len(sofa_string)` does not hold.
+        """
+        if not 0 <= sofa_begin < sofa_end <= len(self.sofa_string or ""):
+            raise ValueError(f"Invalid indices for begin {sofa_begin} and end {sofa_end}")
+
+        kept = []
+        for annotation in list(self.select_all()):
+            begin = getattr(annotation, "begin", None)
+            end = getattr(annotation, "end", None)
+            if begin is None or end is None:
+                continue
+
+            # Take the annotation out of the index *before* changing its offsets; it is re-added below.
+            # NOTE(review): presumably the index is ordered by offsets, so in-place edits could corrupt it.
+            self.remove(annotation)
+
+            inside = sofa_begin <= begin and end <= sofa_end
+            crossing = begin < sofa_end and sofa_begin < end
+            if inside or (overlap and crossing):
+                # Clip to the cutout borders, then shift into the coordinates of the new sofa string.
+                annotation.begin = max(begin, sofa_begin) - sofa_begin
+                annotation.end = min(end, sofa_end) - sofa_begin
+                kept.append(annotation)
+
+        self.sofa_string = self.sofa_string[sofa_begin:sofa_end]
+        self.add_all(kept)
+
     def remove(self, annotation: FeatureStructure):
         """Removes an annotation from an index. This throws if the annotation was not present.
 
@@ -386,6 +427,35 @@ def remove_annotation(self, annotation: FeatureStructure):
         """
         self.remove(annotation)
 
+    def remove_in_range(self, cut_begin: int, cut_end: int, type_: Union[Type, str, None] = None):
+        """Removes the annotations lying completely inside `[cut_begin, cut_end)` of the sofa string.
+
+        Only annotations with `cut_begin <= begin < end <= cut_end` are removed. Zero-width annotations,
+        annotations crossing a border of the range and feature structures without `begin`/`end` offsets
+        are kept. The sofa string itself is not modified.
+
+        Args:
+            cut_begin: Offset at which the range starts (inclusive).
+            cut_end: Offset at which the range ends (exclusive).
+            type_: The type or the name of the type whose instances are to be removed. If `None`,
+                instances of all types are considered.
+
+        Raises:
+            ValueError: If `0 <= cut_begin < cut_end <= len(sofa_string)` does not hold.
+        """
+        if not 0 <= cut_begin < cut_end <= len(self.sofa_string or ""):
+            raise ValueError(f"Invalid indices for begin {cut_begin} and end {cut_end}")
+
+        candidates = self.select_all() if type_ is None else self.select(type_)
+        # Iterate over a snapshot: removing changes the index the candidates were selected from.
+        for annotation in list(candidates):
+            begin = getattr(annotation, "begin", None)
+            end = getattr(annotation, "end", None)
+            if begin is None or end is None:
+                continue
+            if cut_begin <= begin < end <= cut_end:
+                self.remove(annotation)
+
     @deprecation.deprecated(details="Use annotation.get_covered_text()")
     def get_covered_text(self, annotation: FeatureStructure) -> str:
         """Gets the text that is covered by `annotation`.
diff --git a/tests/test_cas.py b/tests/test_cas.py
index 670db07..1a56a96 100644
--- a/tests/test_cas.py
+++ b/tests/test_cas.py
@@ -540,3 +540,122 @@ def test_covered_text_on_annotation_without_sofa():
 
     with pytest.raises(AnnotationHasNoSofa):
         ann.get_covered_text()
+
+
+def test_remove_in_range(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    begin = 10
+    end = 20
+
+    expected_leftover_annotations = [
+        annotation for annotation in cas.select_all() if not (begin <= annotation.begin < annotation.end <= end)
+    ]
+
+    cas.remove_in_range(begin, end)
+
+    result_leftover_annotations = cas.select_all()
+
+    assert len(result_leftover_annotations) == len(expected_leftover_annotations)
+
+    for annotation in expected_leftover_annotations:
+        assert annotation in result_leftover_annotations
+
+
+def test_remove_in_range_with_type(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    begin = 0
+    end = 27
+    type_ = "cassis.Token"
+    expected_leftover_annotations = [
+        annotation
+        for annotation in cas.select_all()
+        if not (begin <= annotation.begin < annotation.end <= end and annotation.type.name == type_)
+    ]
+
+    cas.remove_in_range(begin, end, type_)
+
+    result_leftover_annotations = cas.select_all()
+
+    assert len(result_leftover_annotations) == len(expected_leftover_annotations)
+
+    for annotation in expected_leftover_annotations:
+        assert annotation in result_leftover_annotations
+        if begin <= annotation.begin < annotation.end <= end:
+            assert annotation.type.name != type_
+
+
+def test_remove_in_range_with_invalid_range(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    with pytest.raises(ValueError):
+        cas.remove_in_range(20, 10)
+
+
+def test_cut_sofa_string_to_range(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    begin = 10
+    end = 20
+
+    expected_sofa_string = cas.sofa_string[begin:end]
+    # Everything inside the cutout or sharing characters with it survives, clipped to the cutout borders.
+    expected_offsets = sorted(
+        (max(annotation.begin, begin) - begin, min(annotation.end, end) - begin)
+        for annotation in cas.select_all()
+        if (begin <= annotation.begin and annotation.end <= end) or (annotation.begin < end and begin < annotation.end)
+    )
+
+    cas.cut_sofa_string_to_range(begin, end)
+
+    assert cas.sofa_string == expected_sofa_string
+    assert sorted((annotation.begin, annotation.end) for annotation in cas.select_all()) == expected_offsets
+
+
+def test_cut_sofa_string_to_range_no_overlap(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    begin = 10
+    end = 20
+
+    expected_sofa_string = cas.sofa_string[begin:end]
+    expected_offsets = sorted(
+        (annotation.begin - begin, annotation.end - begin)
+        for annotation in cas.select_all()
+        if begin <= annotation.begin and annotation.end <= end
+    )
+
+    cas.cut_sofa_string_to_range(begin, end, overlap=False)
+
+    assert cas.sofa_string == expected_sofa_string
+    assert sorted((annotation.begin, annotation.end) for annotation in cas.select_all()) == expected_offsets
+
+
+def test_cut_sofa_string_to_range_clips_at_left_border(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    # Pick a token that can be cut in two while the cutout still extends beyond the end of the token.
+    token = next(t for t in cas.select("cassis.Token") if t.end - t.begin >= 2 and t.end < len(cas.sofa_string))
+    begin = token.begin + 1
+    end = token.end + 1
+    expected_text = cas.sofa_string[begin : token.end]
+
+    cas.cut_sofa_string_to_range(begin, end)
+
+    assert (token.begin, token.end) == (0, len(expected_text))
+    assert cas.sofa_string[token.begin : token.end] == expected_text
+
+
+def test_cut_sofa_string_to_range_with_invalid_range(small_typesystem_xml, small_xmi):
+    typesystem = load_typesystem(small_typesystem_xml)
+    cas = load_cas_from_xmi(small_xmi, typesystem)
+
+    with pytest.raises(ValueError):
+        cas.cut_sofa_string_to_range(20, 10)