diff --git a/setup.cfg b/setup.cfg
--- a/setup.cfg
+++ b/setup.cfg
@@ -189,6 +189,7 @@ scancode_scan =
     packages = packagedcode.plugin_package:PackageScanner
     emails = cluecode.plugin_email:EmailScanner
     urls = cluecode.plugin_url:UrlScanner
+    patents = cluecode.plugin_patent:PatentScanner
     generated = summarycode.generated:GeneratedCodeDetector
 
 
diff --git a/src/cluecode/patents.py b/src/cluecode/patents.py
new file mode 100644
--- /dev/null
+++ b/src/cluecode/patents.py
@@ -0,0 +1,85 @@
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Detect patent numbers and patent-related keywords in text files.
+"""
+
+import re
+
+# Keywords that indicate patent-related references
+PATENT_KEYWORDS = [
+    "patent pending",
+    "patented",
+    "patent application",
+    "patent number",
+]
+
+# Precompile keyword regex patterns (case-insensitive)
+KEYWORD_REGEXES = [
+    re.compile(rf"\b{re.escape(keyword)}\b", re.IGNORECASE)
+    for keyword in PATENT_KEYWORDS
+]
+
+# Regex for patent numbers and international formats
+PATENT_NUMBER_REGEX = re.compile(
+    r"""
+    \b
+    (?:
+        # Country codes are matched case-sensitively: under IGNORECASE alone,
+        # common words such as "in" or "us" followed by a number ("in 2019")
+        # would be reported as patent numbers.
+        (?-i:US|EP|WO|JP|CN|KR|GB|IN)
+        \s*
+        (?:Patent(?:\s+No\.?)?\s*)?      # Optional 'Patent' or 'Patent No.'
+        \d+(?:[,\/]\d+)*                 # Number part (allow commas/slashes)
+        \s*(?:A1|A2|B1|B2)?              # Optional kind codes
+    )
+    \b
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+
+def _scan_line(line, line_num):
+    """
+    Yield (kind, value, line_number) tuples for the patent numbers, then the
+    patent keywords, found in a single ``line`` of text.
+    """
+    for match in PATENT_NUMBER_REGEX.finditer(line):
+        yield "number", match.group().strip(), line_num
+
+    # Report every keyword occurrence, not only the first one on the line
+    for regex in KEYWORD_REGEXES:
+        for match in regex.finditer(line):
+            yield "keyword", match.group(), line_num
+
+
+def find_patents(location):
+    """
+    Detect patent references and patent-related keywords in a file.
+
+    Return a list of tuples:
+        (kind, value, line_number)
+    where:
+        kind: "number" or "keyword"
+        value: matched text (original casing preserved)
+        line_number: 1-based line where the match occurred
+
+    The file at ``location`` is read as UTF-8 text, ignoring undecodable
+    bytes. If the file cannot be opened, return an empty list; if reading
+    fails part way, return the matches collected up to that point.
+    """
+    results = []
+
+    try:
+        # An explicit encoding keeps detection independent from the platform
+        # locale; iterating the file avoids loading every line in memory.
+        with open(location, "r", encoding="utf-8", errors="ignore") as f:
+            for line_num, line in enumerate(f, start=1):
+                results.extend(_scan_line(line, line_num))
+    except OSError:
+        # Best effort: an unreadable file is not an error for this scan
+        pass
+
+    return results
diff --git a/src/cluecode/plugin_patent.py b/src/cluecode/plugin_patent.py
new file mode 100644
--- /dev/null
+++ b/src/cluecode/plugin_patent.py
@@ -0,0 +1,61 @@
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from functools import partial
+
+import attr
+
+from commoncode.cliutils import PluggableCommandLineOption
+from commoncode.cliutils import OTHER_SCAN_GROUP
+from commoncode.cliutils import SCAN_OPTIONS_GROUP
+from plugincode.scan import ScanPlugin
+from plugincode.scan import scan_impl
+
+
+@scan_impl
+class PatentScanner(ScanPlugin):
+    """
+    Scan a Resource for patent references and patent numbers.
+    """
+    resource_attributes = dict(
+        patent_detections=attr.ib(default=attr.Factory(list))
+    )
+
+    run_order = 8
+    sort_order = 8
+
+    options = [
+        PluggableCommandLineOption(
+            ('--patent',),
+            is_flag=True,
+            default=False,
+            help='Scan for patent references and patent numbers.',
+            # NOTE(review): the help fixtures updated in this change list
+            # --patent before the "other scans:" heading; confirm that this
+            # is the group the option is meant to be displayed in.
+            help_group=OTHER_SCAN_GROUP,
+        ),
+        PluggableCommandLineOption(
+            ('--max-patent',),
+            type=int,
+            default=50,
+            metavar='INT',
+            show_default=True,
+            required_options=['patent'],
+            help='Report only up to INT patent references found in a file. Use 0 for no limit.',
+            help_group=SCAN_OPTIONS_GROUP,
+        ),
+    ]
+
+    def is_enabled(self, patent, **kwargs):
+        # Enabled when the --patent option value is truthy
+        return patent
+
+    def get_scanner(self, max_patent=50, **kwargs):
+        from scancode.api import get_patents
+
+        # Bind the --max-patent value as the detection limit
+        return partial(
+            get_patents,
+            threshold=max_patent,
+        )
diff --git a/src/scancode/api.py b/src/scancode/api.py
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -366,3 +366,40 @@ def get_file_info(location, **kwargs):
     result['is_source'] = bool(collector.is_source)
     result['is_script'] = bool(collector.is_script)
     return result
+
+
+def get_patents(location, threshold=50, **kwargs):
+    """
+    Return a mapping with a single 'patent_detections' key with a value that
+    is a list of mappings for the patent references detected in the file at
+    `location`.
+
+    Each detection has a 'type' ("number" or "keyword"), the matched
+    'patent_reference' text and its 'start_line' and 'end_line'. A reference
+    repeated with the same type and text on the same line is reported once.
+    Report at most `threshold` detections: a `threshold` of None, 0 or less
+    means no limit.
+    """
+    from cluecode.patents import find_patents
+
+    limit = threshold if threshold and threshold > 0 else None
+
+    seen = set()
+    results = []
+    for kind, value, line_num in find_patents(location):
+        key = (kind, value, line_num)
+        if key in seen:
+            continue
+        seen.add(key)
+
+        results.append({
+            "type": kind,
+            "patent_reference": value,
+            "start_line": line_num,
+            "end_line": line_num,
+        })
+        # Stop as soon as the limit is reached
+        if limit and len(results) >= limit:
+            break
+
+    return dict(patent_detections=results)
diff --git a/tests/cluecode/test_plugin_patent.py b/tests/cluecode/test_plugin_patent.py
new file mode 100644
--- /dev/null
+++ b/tests/cluecode/test_plugin_patent.py
@@ -0,0 +1,79 @@
+from scancode.cli_test_utils import run_scan_click
+from scancode.cli_test_utils import load_json_result
+
+
+def scan_patents(tmp_path, text, *options):
+    """
+    Scan a file containing ``text`` with --patent and any extra ``options``,
+    and return the list of patent detections reported for that file.
+    """
+    test_file = tmp_path / "test.txt"
+    test_file.write_text(text)
+    result_file = tmp_path / "result.json"
+
+    run_scan_click(
+        ["--patent", *options, "--json", str(result_file), str(test_file)]
+    )
+
+    result = load_json_result(result_file)
+    return result["files"][0].get("patent_detections", [])
+
+
+def test_patent_detection_basic(tmp_path):
+    detections = scan_patents(
+        tmp_path, "US Patent 8,123,456 B2 and patent pending."
+    )
+
+    assert len(detections) == 2
+
+    values = [d["patent_reference"] for d in detections]
+
+    assert "US Patent 8,123,456 B2" in values
+    assert "patent pending" in values
+
+    for d in detections:
+        assert "type" in d
+        assert "start_line" in d
+        assert "end_line" in d
+
+
+def test_patent_detection_none(tmp_path):
+    detections = scan_patents(tmp_path, "This file has no patent reference.")
+
+    assert detections == []
+
+
+def test_patent_international_formats(tmp_path):
+    detections = scan_patents(
+        tmp_path, "EP1234567B1\nWO 2019/123456\nUS20190012345A1"
+    )
+
+    values = [d["patent_reference"] for d in detections]
+
+    assert any("EP1234567B1" in v for v in values)
+    assert any("WO 2019/123456" in v for v in values)
+    assert any("US20190012345A1" in v for v in values)
+
+
+def test_patent_no_false_positive(tmp_path):
+    detections = scan_patents(tmp_path, "This is unpatented technology.")
+
+    assert detections == []
+
+
+def test_patent_country_code_is_case_sensitive(tmp_path):
+    # Lowercase words that look like country codes must not start a number
+    detections = scan_patents(
+        tmp_path, "Shipped in 2019, it took us 2 weeks."
+    )
+
+    assert detections == []
+
+
+def test_patent_threshold(tmp_path):
+    detections = scan_patents(
+        tmp_path, "US Patent 1\nUS Patent 2\nUS Patent 3", "--max-patent", "1"
+    )
+
+    assert len(detections) == 1
+    assert detections[0]["patent_reference"] == "US Patent 1"
diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt
index e725888ead..63915bce61 100644
--- a/tests/scancode/data/help/help.txt
+++ b/tests/scancode/data/help/help.txt
@@ -19,6 +19,7 @@ Options:
                       license/copyright detection and top-level package
                       creation.
     -c, --copyright   Scan for copyrights.
+    --patent          Scan for patent references and patent numbers.
 
   other scans:
     -i, --info   Scan for file information (size, checksums, etc).
@@ -45,6 +46,8 @@ Options:
                                 0 for no limit.  [default: 50]
     --max-url INT               Report only up to INT urls found in a file. Use
                                 0 for no limit.  [default: 50]
+    --max-patent INT            Report only up to INT patent references found in a
+                                file. Use 0 for no limit.  [default: 50]
     --unknown-licenses          [EXPERIMENTAL] Detect unknown licenses.
 
   output formats:
diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt
--- a/tests/scancode/data/help/help_linux.txt
+++ b/tests/scancode/data/help/help_linux.txt
@@ -21,6 +21,7 @@ Options:
     -c, --copyright   Scan for copyrights.
     --go-symbol       Collect Go symbols.
     --rust-symbol     Collect Rust symbols from rust binaries.
+    --patent          Scan for patent references and patent numbers.
 
   other scans:
     -i, --info   Scan for file information (size, checksums, etc).
@@ -47,6 +48,8 @@ Options:
                                 0 for no limit.  [default: 50]
     --max-url INT               Report only up to INT urls found in a file. Use
                                 0 for no limit.  [default: 50]
+    --max-patent INT            Report only up to INT patent references found in a
+                                file. Use 0 for no limit.  [default: 50]
     --unknown-licenses          [EXPERIMENTAL] Detect unknown licenses.
 
   output formats: