Skip to content

Commit 75fbc2c

Browse files
authored
Merge pull request jxmorris12#94 from mdevolde/patch-4-bytes-encoded
Correction of incorrect offsets to apply corrections when there are characters encoded on 4 bytes in the text to be corrected
2 parents 90cfd79 + a1fdbc1 commit 75fbc2c

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

language_tool_python/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,26 @@ def parse_url(url_str):
5757
return urllib.parse.urlparse(url_str).geturl()
5858

5959

60+
def _4_bytes_encoded_positions(text: str) -> List[int]:
61+
"""Return a list of positions of 4-byte encoded characters in the text."""
62+
positions = []
63+
char_index = 0
64+
for char in text:
65+
if len(char.encode('utf-8')) == 4:
66+
positions.append(char_index)
67+
# Adding 1 to the index because 4 byte characters are
68+
# 2 bytes in length in LanguageTool, instead of 1 byte in Python.
69+
char_index += 1
70+
char_index += 1
71+
return positions
72+
73+
6074
def correct(text: str, matches: List[Match]) -> str:
6175
"""Automatically apply suggestions to the text."""
76+
# Get the positions of 4-byte encoded characters in the text because without
77+
# carrying out this step, the offsets of the matches could be incorrect.
78+
for match in matches:
79+
match.offset -= sum(1 for i in _4_bytes_encoded_positions(text) if i <= match.offset)
6280
ltext = list(text)
6381
matches = [match for match in matches if match.replacements]
6482
errors = [ltext[match.offset:match.offset + match.errorLength]

0 commit comments

Comments
 (0)