Skip to content

Commit a1f6670

Browse files
committed
fix: moving the offset correction for the matches in the match definition, so the correction is applied every time the match is used and not just in the correct method
1 parent e90810e commit a1f6670

File tree

3 files changed

+52
-50
lines changed

3 files changed

+52
-50
lines changed

language_tool_python/match.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unicodedata
22
from collections import OrderedDict
3-
from typing import Any, Dict, Tuple, Iterator, OrderedDict as OrderedDictType
3+
from typing import Any, Dict, Tuple, Iterator, OrderedDict as OrderedDictType, List
44
from functools import total_ordering
55

66
def get_match_ordered_dict() -> OrderedDictType[str, type]:
@@ -58,21 +58,29 @@ def auto_type(obj: Any) -> Any:
5858
except ValueError:
5959
return obj
6060

61-
""" Sample match JSON:
62-
{
63-
'message': 'Possible spelling mistake found.',
64-
'shortMessage': 'Spelling mistake',
65-
'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
66-
'offset': 8,
67-
'length': 4,
68-
'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
69-
'type': {'typeName': 'Other'},
70-
'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
71-
'ignoreForIncompleteSentence': False,
72-
'contextForSureMatch': 0
73-
}
74-
75-
"""
61+
def four_byte_char_positions(text: str) -> List[int]:
62+
"""
63+
Identify positions of 4-byte encoded characters in a UTF-8 string.
64+
This function scans through the input text and identifies the positions
65+
of characters that are encoded with 4 bytes in UTF-8. These characters
66+
are typically non-BMP (Basic Multilingual Plane) characters, such as
67+
certain emoji and some rare Chinese, Japanese, and Korean characters.
68+
69+
:param text: The input string to be analyzed.
70+
:type text: str
71+
:return: A list of positions where 4-byte encoded characters are found.
72+
:rtype: List[int]
73+
"""
74+
positions = []
75+
char_index = 0
76+
for char in text:
77+
if len(char.encode('utf-8')) == 4:
78+
positions.append(char_index)
79+
# Advance the index by one extra step because a 4-byte character counts as
80+
# 2 UTF-16 code units in LanguageTool, instead of 1 code point in Python.
81+
char_index += 1
82+
char_index += 1
83+
return positions
7684

7785
@total_ordering
7886
class Match:
@@ -92,6 +100,8 @@ class Match:
92100
93101
- 'message': The message describing the error.
94102
:type attrib: Dict[str, Any]
103+
:param text: The original text in which the error occurred (the whole text, not just the context).
104+
:type text: str
95105
96106
Attributes:
97107
ruleId (str): The ID of the rule that was violated.
@@ -103,12 +113,31 @@ class Match:
103113
errorLength (int): The length of the error.
104114
category (str): The category of the rule that was violated.
105115
ruleIssueType (str): The issue type of the rule that was violated.
116+
117+
Example of a match object received from the LanguageTool API:
118+
119+
```
120+
{
121+
'message': 'Possible spelling mistake found.',
122+
'shortMessage': 'Spelling mistake',
123+
'replacements': [{'value': 'newt'}, {'value': 'not'}, {'value': 'new', 'shortDescription': 'having just been made'}, {'value': 'news'}, {'value': 'foot', 'shortDescription': 'singular'}, {'value': 'root', 'shortDescription': 'underground organ of a plant'}, {'value': 'boot'}, {'value': 'noon'}, {'value': 'loot', 'shortDescription': 'plunder'}, {'value': 'moot'}, {'value': 'Root'}, {'value': 'soot', 'shortDescription': 'carbon black'}, {'value': 'newts'}, {'value': 'nook'}, {'value': 'Lieut'}, {'value': 'coot'}, {'value': 'hoot'}, {'value': 'toot'}, {'value': 'snoot'}, {'value': 'neut'}, {'value': 'nowt'}, {'value': 'Noor'}, {'value': 'noob'}],
124+
'offset': 8,
125+
'length': 4,
126+
'context': {'text': 'This is noot okay. ', 'offset': 8, 'length': 4}, 'sentence': 'This is noot okay.',
127+
'type': {'typeName': 'Other'},
128+
'rule': {'id': 'MORFOLOGIK_RULE_EN_US', 'description': 'Possible spelling mistake', 'issueType': 'misspelling', 'category': {'id': 'TYPOS', 'name': 'Possible Typo'}},
129+
'ignoreForIncompleteSentence': False,
130+
'contextForSureMatch': 0
131+
}
132+
```
106133
"""
107134

108-
def __init__(self, attrib: Dict[str, Any]) -> None:
135+
def __init__(self, attrib: Dict[str, Any], text: str) -> None:
109136
"""
110137
Initialize a Match object with the given attributes.
111138
The method processes and normalizes the attributes before storing them on the object.
139+
This method also shifts the match offset to account for the 4-byte encoded
140+
characters that precede it in the text, so that the offset is correct in Python.
112141
"""
113142
# Process rule.
114143
attrib['category'] = attrib['rule']['category']['id']
@@ -127,6 +156,11 @@ def __init__(self, attrib: Dict[str, Any]) -> None:
127156
# Store objects on self.
128157
for k, v in attrib.items():
129158
setattr(self, k, v)
159+
160+
# Get the positions of 4-byte encoded characters in the text because without
161+
# carrying out this step, the offsets of the matches could be incorrect.
162+
four_byte_positions = four_byte_char_positions(text)
163+
self.offset -= sum(1 for pos in four_byte_positions if pos < self.offset)
130164

131165
def __repr__(self) -> str:
132166
"""

language_tool_python/server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def check(self, text: str) -> List[Match]:
273273
url = urllib.parse.urljoin(self._url, 'check')
274274
response = self._query_server(url, self._create_params(text))
275275
matches = response['matches']
276-
return [Match(match) for match in matches]
276+
return [Match(match, text) for match in matches]
277277

278278
def _create_params(self, text: str) -> Dict[str, str]:
279279
"""

language_tool_python/utils.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -88,37 +88,9 @@ def parse_url(url_str: str) -> str:
8888
return urllib.parse.urlparse(url_str).geturl()
8989

9090

91-
def _4_bytes_encoded_positions(text: str) -> List[int]:
92-
"""
93-
Identify positions of 4-byte encoded characters in a UTF-8 string.
94-
This function scans through the input text and identifies the positions
95-
of characters that are encoded with 4 bytes in UTF-8. These characters
96-
are typically non-BMP (Basic Multilingual Plane) characters, such as
97-
certain emoji and some rare Chinese, Japanese, and Korean characters.
98-
99-
:param text: The input string to be analyzed.
100-
:type text: str
101-
:return: A list of positions where 4-byte encoded characters are found.
102-
:rtype: List[int]
103-
"""
104-
positions = []
105-
char_index = 0
106-
for char in text:
107-
if len(char.encode('utf-8')) == 4:
108-
positions.append(char_index)
109-
# Adding 1 to the index because 4 byte characters are
110-
# 2 bytes in length in LanguageTool, instead of 1 byte in Python.
111-
char_index += 1
112-
char_index += 1
113-
return positions
114-
115-
11691
def correct(text: str, matches: List[Match]) -> str:
11792
"""
11893
Corrects the given text based on the provided matches.
119-
This function adjusts the positions of 4-byte encoded characters in the text
120-
to ensure the offsets of the matches are correct. It then applies the corrections
121-
specified in the matches to the text.
12294
Only the first replacement for each match is applied to the text.
12395
12496
:param text: The original text to be corrected.
@@ -128,10 +100,6 @@ def correct(text: str, matches: List[Match]) -> str:
128100
:return: The corrected text.
129101
:rtype: str
130102
"""
131-
# Get the positions of 4-byte encoded characters in the text because without
132-
# carrying out this step, the offsets of the matches could be incorrect.
133-
for match in matches:
134-
match.offset -= sum(1 for i in _4_bytes_encoded_positions(text) if i <= match.offset)
135103
ltext = list(text)
136104
matches = [match for match in matches if match.replacements]
137105
errors = [ltext[match.offset:match.offset + match.errorLength]

0 commit comments

Comments
 (0)