diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..755184dd90 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -8,6 +8,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import html import os import re import string @@ -423,6 +424,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split): if TRACE_TOK: logger_debug(' get_tokens: before preped line: ' + repr(line)) + line = html.unescape(line) last_line = line if TRACE_TOK: @@ -2216,7 +2218,7 @@ def build_detection_from_node( (r'^electronics?$', 'NNP'), # proper nouns with digits - (r'^([A-Z][a-z0-9]+){1,2}[\.,]?$', 'NNP'), + (r'^([A-Z][a-zà-ÿ0-9]+){1,2}[\.,]?$', 'NNP'), # saxon genitive, ie. Philippe's (r"^[A-Z][a-z]+'s$", 'NNP'), @@ -2256,7 +2258,7 @@ def build_detection_from_node( # proper noun: first CAP, as in JohnGlen including optional trailing por comma. # Was before this problematic regex: r'^([A-Z][a-zA-Z0-9]+){,2}\.?,?$': # this was capturing AbCdEf or a bare comma. - (r'^([A-Z][a-z0-9]+){1,2}\.?,?$', 'NNP'), + (r'^([A-Z][a-zà-ÿ0-9]+){1,2}\.?,?$', 'NNP'), ############################################################################ # URLS and emails diff --git a/tests/cluecode/data/authors/author_html_entity.java b/tests/cluecode/data/authors/author_html_entity.java new file mode 100644 index 0000000000..5591a4f468 --- /dev/null +++ b/tests/cluecode/data/authors/author_html_entity.java @@ -0,0 +1,7 @@ +/** + * @author Alexander Dorokhine + * @author Robert Elliot + * @author Ceki Gülcü + */ +public class LoggerFactory { +} diff --git a/tests/cluecode/data/authors/author_html_entity.java.yml b/tests/cluecode/data/authors/author_html_entity.java.yml new file mode 100644 index 0000000000..392c77eb97 --- /dev/null +++ b/tests/cluecode/data/authors/author_html_entity.java.yml @@ -0,0 +1,7 @@ +what: + - authors + +authors: + - Alexander Dorokhine + - Robert Elliot + - Ceki Gülcü