Skip to content

Commit de938bb

Browse files
committed
perf: compute the positions of 4-byte encoded characters only once for matches from the same text
1 parent a1f6670 commit de938bb

File tree

1 file changed

+15
-3
lines changed

1 file changed

+15
-3
lines changed

language_tool_python/match.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unicodedata
22
from collections import OrderedDict
3-
from typing import Any, Dict, Tuple, Iterator, OrderedDict as OrderedDictType, List
3+
from typing import Any, Dict, Tuple, Iterator, OrderedDict as OrderedDictType, List, Optional
44
from functools import total_ordering
55

66
def get_match_ordered_dict() -> OrderedDictType[str, type]:
@@ -104,6 +104,8 @@ class Match:
104104
:type text: str
105105
106106
Attributes:
107+
PREVIOUS_MATCHES_TEXT (Optional[str]): The text of the previous match object.
108+
FOUR_BYTES_POSITIONS (Optional[List[int]]): The positions of 4-byte encoded characters in the text, registered by the previous match object (kept for optimization purposes if the text is the same).
107109
ruleId (str): The ID of the rule that was violated.
108110
message (str): The message describing the error.
109111
replacements (list): A list of suggested replacements for the error.
@@ -131,6 +133,9 @@ class Match:
131133
}
132134
```
133135
"""
136+
137+
PREVIOUS_MATCHES_TEXT: Optional[str] = None
138+
FOUR_BYTES_POSITIONS: Optional[List[int]] = None
134139

135140
def __init__(self, attrib: Dict[str, Any], text: str) -> None:
136141
"""
@@ -139,6 +144,11 @@ def __init__(self, attrib: Dict[str, Any], text: str) -> None:
139144
This method adjusts the positions of 4-byte encoded characters in the text
140145
to ensure the offsets of the matches are correct.
141146
"""
147+
if text is None:
148+
raise ValueError("The text parameter must not be None")
149+
elif not isinstance(text, str):
150+
raise TypeError("The text parameter must be a string")
151+
142152
# Process rule.
143153
attrib['category'] = attrib['rule']['category']['id']
144154
attrib['ruleId'] = attrib['rule']['id']
@@ -157,10 +167,12 @@ def __init__(self, attrib: Dict[str, Any], text: str) -> None:
157167
for k, v in attrib.items():
158168
setattr(self, k, v)
159169

170+
if Match.PREVIOUS_MATCHES_TEXT != text:
171+
Match.PREVIOUS_MATCHES_TEXT = text
172+
Match.FOUR_BYTES_POSITIONS = four_byte_char_positions(text)
160173
# Get the positions of 4-byte encoded characters in the text because without
161174
# carrying out this step, the offsets of the matches could be incorrect.
162-
four_byte_positions = four_byte_char_positions(text)
163-
self.offset -= sum(1 for pos in four_byte_positions if pos < self.offset)
175+
self.offset -= sum(1 for pos in Match.FOUR_BYTES_POSITIONS if pos < self.offset)
164176

165177
def __repr__(self) -> str:
166178
"""

0 commit comments

Comments
 (0)