Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,9 @@ scancode_scan =
packages = packagedcode.plugin_package:PackageScanner
emails = cluecode.plugin_email:EmailScanner
urls = cluecode.plugin_url:UrlScanner
patents = cluecode.plugin_patent:PatentScanner
generated = summarycode.generated:GeneratedCodeDetector


# scancode_post_scan is the entry point for post_scan plugins executed after the
# scan plugins and before the output plugins. See also plugincode.post_scan
# module for details and doc.
Expand Down Expand Up @@ -237,3 +237,5 @@ scancode_output =
yaml = formattedcode.output_yaml:YamlOutput
cyclonedx = formattedcode.output_cyclonedx:CycloneDxJsonOutput
cyclonedx-xml = formattedcode.output_cyclonedx:CycloneDxXmlOutput


66 changes: 66 additions & 0 deletions src/cluecode/patents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import re

# Phrases that signal a patent-related statement even when no patent number
# is present.
PATENT_KEYWORDS = [
    "patent pending", "patented", "patent application", "patent number",
]

# One case-insensitive, whole-phrase pattern per entry of PATENT_KEYWORDS,
# compiled once at import time and kept in the same order.
KEYWORD_REGEXES = [
    re.compile(r"\b" + re.escape(phrase) + r"\b", flags=re.IGNORECASE)
    for phrase in PATENT_KEYWORDS
]

# Regex for patent numbers in US and international formats.
#
# Two forms are recognized:
#   * "<country> Patent [No.] <number>" -- the word "Patent" makes the intent
#     unambiguous, so the country code is accepted in any letter case.
#   * "<COUNTRY><number>" -- with no "Patent" keyword the country code must be
#     upper-case; otherwise ordinary words such as "in 2019" or "us 24" would
#     be reported as Indian/US patent numbers.
PATENT_NUMBER_REGEX = re.compile(
    r"""
    \b
    (?:
        (?:US|EP|WO|JP|CN|KR|GB|IN)       # Country code, any case
        \s*
        Patent(?:\s+No\.?)?\s*            # Mandatory 'Patent' or 'Patent No.'
      |
        (?-i:US|EP|WO|JP|CN|KR|GB|IN)     # Country code, upper-case only
        \s*
    )
    \d+(?:[,/]\d+)*                       # Number part (allow commas/slashes)
    \s*(?:A1|A2|B1|B2)?                   # Optional kind codes
    \b
    """,
    re.IGNORECASE | re.VERBOSE,
)


def find_patents(location):
    """
    Detect patent references and patent-related keywords in a file.

    Return a list of tuples:
        (kind, value, line_number)
    where:
        kind: "number" or "keyword"
        value: matched text (original casing preserved)
        line_number: 1-based line where the match occurred

    For each line, all patent-number matches are listed first, followed by
    at most one match per keyword pattern.

    The scan is best-effort: if the file at ``location`` cannot be opened or
    read, the matches collected so far (possibly none) are returned instead
    of raising.
    """
    results = []

    try:
        # An explicit encoding keeps decoding independent of the platform
        # locale; undecodable bytes are dropped rather than aborting the scan.
        with open(location, "r", encoding="utf-8", errors="ignore") as f:
            # Stream the file line by line so large files are never held in
            # memory as a whole.
            for line_num, line in enumerate(f, start=1):

                # Detect patent numbers (every occurrence on the line)
                for match in PATENT_NUMBER_REGEX.finditer(line):
                    results.append(("number", match.group().strip(), line_num))

                # Detect keyword references (first occurrence per keyword)
                for regex in KEYWORD_REGEXES:
                    match = regex.search(line)
                    if match:
                        results.append(("keyword", match.group(), line_num))
    except (OSError, ValueError):
        # OSError: missing/unreadable file or directory; ValueError: invalid
        # path such as one with an embedded NUL. Other exceptions indicate a
        # programming error and are allowed to propagate.
        pass

    return results

54 changes: 54 additions & 0 deletions src/cluecode/plugin_patent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) nexB Inc. and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from functools import partial
import attr

from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import OTHER_SCAN_GROUP
from commoncode.cliutils import SCAN_OPTIONS_GROUP
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl


@scan_impl
class PatentScanner(ScanPlugin):
    """
    Scan plugin reporting patent references and patent numbers found in a
    Resource. It is turned on with the ``--patent`` command line option.
    """

    run_order = 8
    sort_order = 8

    # Extra Resource attribute declared by this plugin: a list that defaults
    # to empty.
    resource_attributes = {
        'patent_detections': attr.ib(default=attr.Factory(list)),
    }

    options = [
        PluggableCommandLineOption(
            ('--patent',),
            is_flag=True,
            default=False,
            help_group=OTHER_SCAN_GROUP,
            help='Scan <input> for patent references and patent numbers.',
        ),
        PluggableCommandLineOption(
            ('--max-patent',),
            type=int,
            default=50,
            metavar='INT',
            show_default=True,
            # Only meaningful together with --patent.
            required_options=['patent'],
            help_group=SCAN_OPTIONS_GROUP,
            help='Report only up to INT patent references found in a file. Use 0 for no limit.',
        ),
    ]

    def is_enabled(self, patent, **kwargs):
        """
        Return the value of the ``--patent`` option, which enables this plugin.
        """
        return patent

    def get_scanner(self, max_patent=50, **kwargs):
        """
        Return the ``get_patents`` scanner callable with its ``threshold``
        argument bound to ``max_patent``.
        """
        # Imported at call time rather than at module import time.
        from scancode.api import get_patents

        scanner = partial(get_patents, threshold=max_patent)
        return scanner
30 changes: 30 additions & 0 deletions src/scancode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,3 +366,33 @@ def get_file_info(location, **kwargs):
result['is_source'] = bool(collector.is_source)
result['is_script'] = bool(collector.is_script)
return result

def get_patents(location, threshold=50, **kwargs):
    """
    Return a mapping with a single "patent_detections" key listing the patent
    references found in the file at `location`.

    Each detection is a mapping with the keys "type", "patent_reference",
    "start_line" and "end_line"; start and end lines are always equal.
    Exact duplicates (same kind, value and line) are reported once, in
    first-seen order. When `threshold` is a positive number, at most that
    many detections are returned; any other value (0, None, negative) means
    no limit.
    """
    from itertools import islice
    from cluecode.patents import find_patents

    # Dict keys preserve insertion order, so they hold the unique matches in
    # the order they were first seen.
    unique = dict.fromkeys(
        (kind, value, line_num)
        for kind, value, line_num in find_patents(location)
    )

    if threshold and threshold > 0:
        selected = list(islice(unique, threshold))
    else:
        selected = list(unique)

    detections = [
        {
            "type": kind,
            "patent_reference": value,
            "start_line": line_num,
            "end_line": line_num,
        }
        for kind, value, line_num in selected
    ]

    return dict(patent_detections=detections)


108 changes: 108 additions & 0 deletions tests/cluecode/test_plugin_patent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import json
from scancode.cli_test_utils import run_scan_click
from scancode.cli_test_utils import load_json_result


def test_patent_detection_basic(tmp_path):
    """
    A US patent number and a "patent pending" phrase on the same line are
    both reported, each with its type and line span.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("US Patent 8,123,456 B2 and patent pending.")
    output = tmp_path / "result.json"

    run_scan_click(["--patent", "--json", str(output), str(scanned)])

    scan = load_json_result(output)
    detections = scan["files"][0].get("patent_detections", [])
    assert len(detections) == 2

    references = [entry["patent_reference"] for entry in detections]
    assert "US Patent 8,123,456 B2" in references
    assert "patent pending" in references

    # Every detection carries its kind and its line span.
    for entry in detections:
        for field in ("type", "start_line", "end_line"):
            assert field in entry


def test_patent_detection_none(tmp_path):
    """
    The bare word "patent" is neither a keyword phrase nor a patent number,
    so nothing is reported.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("This file has no patent reference.")
    output = tmp_path / "result.json"

    run_scan_click(["--patent", "--json", str(output), str(scanned)])

    scan = load_json_result(output)
    assert scan["files"][0].get("patent_detections", []) == []


def test_patent_international_formats(tmp_path):
    """
    EP, WO and US publication numbers, one per line, are each found in the
    reported references.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("EP1234567B1\nWO 2019/123456\nUS20190012345A1")
    output = tmp_path / "result.json"

    run_scan_click(["--patent", "--json", str(output), str(scanned)])

    scan = load_json_result(output)
    detections = scan["files"][0].get("patent_detections", [])
    references = [entry["patent_reference"] for entry in detections]

    # Each expected number must appear inside at least one reported reference.
    assert any("EP1234567B1" in ref for ref in references)
    assert any("WO 2019/123456" in ref for ref in references)
    assert any("US20190012345A1" in ref for ref in references)


def test_patent_no_false_positive(tmp_path):
    """
    "unpatented" contains "patented" but not as a whole word, so it must not
    be reported.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("This is unpatented technology.")
    output = tmp_path / "result.json"

    run_scan_click(["--patent", "--json", str(output), str(scanned)])

    scan = load_json_result(output)
    assert scan["files"][0].get("patent_detections", []) == []


def test_patent_threshold(tmp_path):
    """
    With ``--max-patent 1`` only the first of three patent references is
    reported.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("US Patent 1\nUS Patent 2\nUS Patent 3")
    output = tmp_path / "result.json"

    cli_args = [
        "--patent",
        "--max-patent", "1",
        "--json", str(output),
        str(scanned),
    ]
    run_scan_click(cli_args)

    scan = load_json_result(output)
    detections = scan["files"][0].get("patent_detections", [])

    assert len(detections) == 1
    first = detections[0]
    assert first["patent_reference"] == "US Patent 1"

3 changes: 3 additions & 0 deletions tests/scancode/data/help/help.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Options:
license/copyright detection and top-level package
creation.
-c, --copyright Scan <input> for copyrights.
--patent Scan <input> for patent references and patent numbers.

other scans:
-i, --info Scan <input> for file information (size, checksums, etc).
Expand All @@ -45,6 +46,8 @@ Options:
0 for no limit. [default: 50]
--max-url INT Report only up to INT urls found in a file. Use 0
for no limit. [default: 50]
--max-patent INT Report only up to INT patent references found in a
file. Use 0 for no limit. [default: 50]
--unknown-licenses [EXPERIMENTAL] Detect unknown licenses.

output formats:
Expand Down
4 changes: 4 additions & 0 deletions tests/scancode/data/help/help_linux.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Options:
-c, --copyright Scan <input> for copyrights.
--go-symbol Collect Go symbols.
--rust-symbol Collect Rust symbols from rust binaries.
--patent Scan <input> for patent references and patent numbers.

other scans:
-i, --info Scan <input> for file information (size, checksums, etc).
Expand All @@ -47,7 +48,10 @@ Options:
0 for no limit. [default: 50]
--max-url INT Report only up to INT urls found in a file. Use 0
for no limit. [default: 50]
--max-patent INT Report only up to INT patent references found in a
file. Use 0 for no limit. [default: 50]
--unknown-licenses [EXPERIMENTAL] Detect unknown licenses.


output formats:
--json FILE Write scan output as compact JSON to FILE.
Expand Down
Loading