diff --git a/setup.cfg b/setup.cfg
index 7c45f388fd..9f8090cfbc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -189,9 +189,9 @@ scancode_scan =
packages = packagedcode.plugin_package:PackageScanner
emails = cluecode.plugin_email:EmailScanner
urls = cluecode.plugin_url:UrlScanner
+ patents = cluecode.plugin_patent:PatentScanner
generated = summarycode.generated:GeneratedCodeDetector
-
# scancode_post_scan is the entry point for post_scan plugins executed after the
# scan plugins and before the output plugins. See also plugincode.post_scan
# module for details and doc.
@@ -237,3 +237,5 @@ scancode_output =
yaml = formattedcode.output_yaml:YamlOutput
cyclonedx = formattedcode.output_cyclonedx:CycloneDxJsonOutput
cyclonedx-xml = formattedcode.output_cyclonedx:CycloneDxXmlOutput
+
+
diff --git a/src/cluecode/patents.py b/src/cluecode/patents.py
new file mode 100644
index 0000000000..63783c440e
--- /dev/null
+++ b/src/cluecode/patents.py
@@ -0,0 +1,66 @@
+import re
+
+# Phrases whose presence on a line signals a patent-related statement.
+PATENT_KEYWORDS = [
+    "patent pending",
+    "patented",
+    "patent application",
+    "patent number",
+]
+
+# One case-insensitive, whole-phrase pattern per keyword, in the same order.
+KEYWORD_REGEXES = [
+    re.compile(r"\b" + re.escape(phrase) + r"\b", flags=re.IGNORECASE)
+    for phrase in PATENT_KEYWORDS
+]
+
+# Regex for patent numbers in US and international formats. Country codes
+# must be upper case so that prose such as "in 2019" is not reported; the
+# rest of the pattern stays case-insensitive.
+PATENT_NUMBER_REGEX = re.compile(
+    r"""
+    \b
+    (?-i:US|EP|WO|JP|CN|KR|GB|IN)       # Country codes (case-sensitive)
+    \s*
+    (?:Patent(?:\s+No\.?)?\s*)?         # Optional 'Patent' or 'Patent No.'
+    \d+(?:[,\/]\d+)*                    # Number part (allow commas/slashes)
+    \s*(?:A1|A2|B1|B2)?                 # Optional kind codes
+    \b
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+
+def find_patents(location):
+    """
+    Return a list of (kind, value, line_number) tuples for the patent
+    references found in the text file at `location`.
+
+    kind is "number" or "keyword", value is the matched text (original
+    casing preserved) and line_number is the 1-based line of the match.
+    Return an empty list when the file cannot be opened or read.
+    """
+    results = []
+
+    try:
+        # Decode as UTF-8 explicitly so results do not depend on the locale;
+        # errors="ignore" drops undecodable bytes instead of failing.
+        with open(location, "r", encoding="utf-8", errors="ignore") as f:
+            lines = f.readlines()
+    except OSError:
+        # Best effort: an unreadable file yields no detections.
+        return results
+
+    for line_num, line in enumerate(lines, start=1):
+        # Report every patent number found on the line.
+        for match in PATENT_NUMBER_REGEX.finditer(line):
+            results.append(("number", match.group().strip(), line_num))
+
+        # Report each keyword at most once per line (first hit only).
+        for regex in KEYWORD_REGEXES:
+            match = regex.search(line)
+            if match:
+                results.append(("keyword", match.group(), line_num))
+
+    return results
+
diff --git a/src/cluecode/plugin_patent.py b/src/cluecode/plugin_patent.py
new file mode 100644
index 0000000000..6bea5f024f
--- /dev/null
+++ b/src/cluecode/plugin_patent.py
@@ -0,0 +1,54 @@
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from functools import partial
+import attr
+
+from commoncode.cliutils import PluggableCommandLineOption
+from commoncode.cliutils import OTHER_SCAN_GROUP
+from commoncode.cliutils import SCAN_OPTIONS_GROUP
+from plugincode.scan import ScanPlugin
+from plugincode.scan import scan_impl
+
+
+@scan_impl
+class PatentScanner(ScanPlugin):
+    """
+    Scan plugin that reports the patent numbers and patent-related keywords
+    found in a Resource under its "patent_detections" attribute.
+    """
+
+    run_order = 8
+    sort_order = 8
+
+    resource_attributes = {
+        'patent_detections': attr.ib(default=attr.Factory(list)),
+    }
+
+    options = [
+        PluggableCommandLineOption(
+            ('--patent',),
+            is_flag=True,
+            default=False,
+            # NOTE(review): the help fixtures list --patent right after
+            # --copyright, ahead of "other scans:" — confirm the group.
+            help_group=OTHER_SCAN_GROUP,
+            help='Scan for patent references and patent numbers.',
+        ),
+        PluggableCommandLineOption(
+            ('--max-patent',),
+            type=int, default=50, metavar='INT', show_default=True,
+            required_options=['patent'],
+            help_group=SCAN_OPTIONS_GROUP,
+            help='Report only up to INT patent references found in a file. Use 0 for no limit.',
+        ),
+    ]
+
+    def is_enabled(self, patent, **kwargs):
+        """Return the value of the --patent flag: truthy when the scan is requested."""
+        return patent
+
+    def get_scanner(self, max_patent=50, **kwargs):
+        """Return get_patents with its `threshold` bound to `max_patent`."""
+        from scancode.api import get_patents
+        return partial(get_patents, threshold=max_patent)
diff --git a/src/scancode/api.py b/src/scancode/api.py
index 71382f4a6a..7b7be03c0c 100644
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -366,3 +366,33 @@ def get_file_info(location, **kwargs):
result['is_source'] = bool(collector.is_source)
result['is_script'] = bool(collector.is_script)
return result
+
+def get_patents(location, threshold=50, **kwargs):
+    """
+    Return a mapping with a "patent_detections" list holding one mapping for
+    each unique patent reference found in the file at `location`.
+
+    Keep at most `threshold` references; a zero, None or negative `threshold`
+    means no limit.
+    """
+    from itertools import islice
+    from cluecode.patents import find_patents
+
+    # Dict keys drop repeated detections while keeping first-seen order.
+    unique = dict.fromkeys(
+        (kind, value, line_num)
+        for kind, value, line_num in find_patents(location)
+    )
+    limit = threshold if threshold and threshold > 0 else None
+    detections = [
+        {
+            "type": kind,
+            "patent_reference": value,
+            "start_line": line_num,
+            "end_line": line_num,
+        }
+        for kind, value, line_num in islice(unique, limit)
+    ]
+    return dict(patent_detections=detections)
+
+
\ No newline at end of file
diff --git a/tests/cluecode/test_plugin_patent.py b/tests/cluecode/test_plugin_patent.py
new file mode 100644
index 0000000000..aaffc35713
--- /dev/null
+++ b/tests/cluecode/test_plugin_patent.py
@@ -0,0 +1,108 @@
+import json
+from scancode.cli_test_utils import run_scan_click
+from scancode.cli_test_utils import load_json_result
+
+
+def test_patent_detection_basic(tmp_path):
+    """A patent number and a keyword on one line give two detections."""
+
+    scanned = tmp_path / "test.txt"
+    scanned.write_text("US Patent 8,123,456 B2 and patent pending.")
+    output = tmp_path / "result.json"
+
+    args = ["--patent", "--json", str(output), str(scanned)]
+    run_scan_click(args)
+
+    scan = load_json_result(output)
+    first_file = scan["files"][0]
+    detections = first_file.get("patent_detections", [])
+
+    assert len(detections) == 2
+
+    references = [found["patent_reference"] for found in detections]
+    assert "US Patent 8,123,456 B2" in references
+    assert "patent pending" in references
+
+    # Every detection carries its type and its line span.
+    for found in detections:
+        assert "type" in found
+        assert "start_line" in found
+        assert "end_line" in found
+
+
+def test_patent_detection_none(tmp_path):
+    """A file that only mentions the word "patent" gives no detection."""
+
+    scanned = tmp_path / "test.txt"
+    scanned.write_text("This file has no patent reference.")
+    output = tmp_path / "result.json"
+
+    args = ["--patent", "--json", str(output), str(scanned)]
+    run_scan_click(args)
+
+    scan = load_json_result(output)
+    first_file = scan["files"][0]
+    detections = first_file.get("patent_detections", [])
+
+    assert detections == []
+
+
+def test_patent_international_formats(tmp_path):
+    """EP, WO and US publication numbers are each detected."""
+
+    scanned = tmp_path / "test.txt"
+    scanned.write_text(
+        "EP1234567B1\nWO 2019/123456\nUS20190012345A1"
+    )
+    output = tmp_path / "result.json"
+
+    args = ["--patent", "--json", str(output), str(scanned)]
+    run_scan_click(args)
+
+    scan = load_json_result(output)
+    first_file = scan["files"][0]
+    detections = first_file.get("patent_detections", [])
+    references = [found["patent_reference"] for found in detections]
+
+    # Substring checks: extra text around a reported value is tolerated.
+    expected_numbers = ("EP1234567B1", "WO 2019/123456", "US20190012345A1")
+    for number in expected_numbers:
+        assert any(number in reference for reference in references)
+
+
+def test_patent_no_false_positive(tmp_path):
+    """Text containing only the word "unpatented" gives no detection."""
+
+    scanned = tmp_path / "test.txt"
+    scanned.write_text("This is unpatented technology.")
+    output = tmp_path / "result.json"
+
+    args = ["--patent", "--json", str(output), str(scanned)]
+    run_scan_click(args)
+
+    scan = load_json_result(output)
+    first_file = scan["files"][0]
+    detections = first_file.get("patent_detections", [])
+
+    assert detections == []
+
+
+def test_patent_threshold(tmp_path):
+    """--max-patent 1 keeps only the first of three patent numbers."""
+
+    scanned = tmp_path / "test.txt"
+    scanned.write_text("US Patent 1\nUS Patent 2\nUS Patent 3")
+    output = tmp_path / "result.json"
+
+    args = ["--patent", "--max-patent", "1", "--json", str(output), str(scanned)]
+    run_scan_click(args)
+
+    scan = load_json_result(output)
+    first_file = scan["files"][0]
+    detections = first_file.get("patent_detections", [])
+
+    assert len(detections) == 1
+
+    only_detection = detections[0]
+    assert only_detection["patent_reference"] == "US Patent 1"
+
\ No newline at end of file
diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt
index e725888ead..63915bce61 100644
--- a/tests/scancode/data/help/help.txt
+++ b/tests/scancode/data/help/help.txt
@@ -19,6 +19,7 @@ Options:
license/copyright detection and top-level package
creation.
-c, --copyright Scan for copyrights.
+ --patent Scan for patent references and patent numbers.
other scans:
-i, --info Scan for file information (size, checksums, etc).
@@ -45,6 +46,8 @@ Options:
0 for no limit. [default: 50]
--max-url INT Report only up to INT urls found in a file. Use 0
for no limit. [default: 50]
+ --max-patent INT Report only up to INT patent references found in a
+ file. Use 0 for no limit. [default: 50]
--unknown-licenses [EXPERIMENTAL] Detect unknown licenses.
output formats:
diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt
index 6794b19d60..afb8decbb5 100644
--- a/tests/scancode/data/help/help_linux.txt
+++ b/tests/scancode/data/help/help_linux.txt
@@ -21,6 +21,7 @@ Options:
-c, --copyright Scan for copyrights.
--go-symbol Collect Go symbols.
--rust-symbol Collect Rust symbols from rust binaries.
+ --patent Scan for patent references and patent numbers.
other scans:
-i, --info Scan for file information (size, checksums, etc).
@@ -47,7 +48,10 @@ Options:
0 for no limit. [default: 50]
--max-url INT Report only up to INT urls found in a file. Use 0
for no limit. [default: 50]
+ --max-patent INT Report only up to INT patent references found in a
+ file. Use 0 for no limit. [default: 50]
--unknown-licenses [EXPERIMENTAL] Detect unknown licenses.
+
output formats:
--json FILE Write scan output as compact JSON to FILE.