11import unicodedata
22from collections import OrderedDict
3- from typing import Any , Dict , Tuple , Iterator , OrderedDict as OrderedDictType
3+ from typing import Any , Dict , Tuple , Iterator , OrderedDict as OrderedDictType , List
44from functools import total_ordering
55
66def get_match_ordered_dict () -> OrderedDictType [str , type ]:
@@ -58,21 +58,29 @@ def auto_type(obj: Any) -> Any:
5858 except ValueError :
5959 return obj
6060
61- """ Sample match JSON:
62- {
63- 'message': 'Possible spelling mistake found.',
64- 'shortMessage': 'Spelling mistake',
65- 'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
66- 'offset': 8,
67- 'length': 4,
68- 'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
69- 'type': {'typeName': 'Other'},
70- 'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
71- 'ignoreForIncompleteSentence': False,
72- 'contextForSureMatch': 0
73- }
74-
75- """
def four_byte_char_positions(text: str) -> List[int]:
    """
    Identify positions of 4-byte encoded characters in a UTF-8 string.

    This function scans through the input text and identifies the characters
    that are encoded with 4 bytes in UTF-8, i.e. code points above U+FFFF.
    These are the non-BMP (Basic Multilingual Plane) characters, such as
    certain emoji and some rare Chinese, Japanese, and Korean characters.

    The returned positions are not plain Python string indices: every
    4-byte character seen so far advances the running position by two
    instead of one, because LanguageTool counts such a character as two
    units where Python counts a single code point. The positions are
    therefore directly comparable with offsets reported by LanguageTool.

    Lone surrogate code points (which cannot be encoded to UTF-8) are
    treated as ordinary single-unit characters instead of raising
    ``UnicodeEncodeError``.

    :param text: The input string to be analyzed.
    :type text: str
    :return: A list of positions where 4-byte encoded characters are found.
    :rtype: List[int]
    """
    positions = []
    # Running position expressed in LanguageTool's units, not Python indices.
    unit_index = 0
    for char in text:
        # A code point above U+FFFF is exactly what UTF-8 encodes on 4 bytes;
        # comparing the ordinal avoids encoding every character and cannot
        # fail on unencodable (surrogate) code points.
        if ord(char) > 0xFFFF:
            positions.append(unit_index)
            # Such a character occupies two units on the LanguageTool side.
            unit_index += 2
        else:
            unit_index += 1
    return positions
7684
7785@total_ordering
7886class Match :
@@ -92,6 +100,8 @@ class Match:
92100
93101 - 'message': The message describing the error.
94102 :type attrib: Dict[str, Any]
103+ :param text: The original text in which the error occurred (the whole text, not just the context).
104+ :type text: str
95105
96106 Attributes:
97107 ruleId (str): The ID of the rule that was violated.
@@ -103,12 +113,31 @@ class Match:
103113 errorLength (int): The length of the error.
104114 category (str): The category of the rule that was violated.
105115 ruleIssueType (str): The issue type of the rule that was violated.
116+
117+ Example of a match object received from the LanguageTool API:
118+
119+ ```
120+ {
121+ 'message': 'Possible spelling mistake found.',
122+ 'shortMessage': 'Spelling mistake',
123+ 'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
124+ 'offset': 8,
125+ 'length': 4,
126+ 'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
127+ 'type': {'typeName': 'Other'},
128+ 'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
129+ 'ignoreForIncompleteSentence': False,
130+ 'contextForSureMatch': 0
131+ }
132+ ```
106133 """
107134
108- def __init__ (self , attrib : Dict [str , Any ]) -> None :
135+ def __init__ (self , attrib : Dict [str , Any ], text : str ) -> None :
109136 """
110137 Initialize a Match object with the given attributes.
111138 The method processes and normalizes the attributes before storing them on the object.
139+ This method adjusts the positions of 4-byte encoded characters in the text
140+ to ensure the offsets of the matches are correct.
112141 """
113142 # Process rule.
114143 attrib ['category' ] = attrib ['rule' ]['category' ]['id' ]
@@ -127,6 +156,11 @@ def __init__(self, attrib: Dict[str, Any]) -> None:
127156 # Store objects on self.
128157 for k , v in attrib .items ():
129158 setattr (self , k , v )
159+
160+ # Get the positions of 4-byte encoded characters in the text because without
161+ # carrying out this step, the offsets of the matches could be incorrect.
162+ four_byte_positions = four_byte_char_positions (text )
163+ self .offset -= sum (1 for pos in four_byte_positions if pos < self .offset )
130164
131165 def __repr__ (self ) -> str :
132166 """
0 commit comments