Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions scanpipe/pipelines/analyze_nixpkgs_licenses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from django.db import models

from scanpipe.pipelines.scan_codebase import ScanCodebase
from scanpipe.pipes import nixpkgs
from scanpipe.pipes import scancode


class AnalyzeNixpkgsLicenses(ScanCodebase):
    """
    Analyze Nixpkgs packages for license clarity and correctness.

    This pipeline automatically finds and reports issues for licenses in Nixpkgs,
    including:
    - Detecting when a nixpkgs license declaration is incorrect
    - Determining what the correct license should be
    - Identifying ambiguous or unclear license declarations
    - Reporting license inconsistencies between declared and detected

    This pipeline applies specific rules for nixpkgs, given the large diversity
    of nixpkgs tech stacks and upstreams.

    Key features:
    - Scans package source code for license detections
    - Compares declared vs detected licenses
    - Checks for ecosystem-specific license patterns (Python, Rust, Node.js, etc.)
    - Detects copyleft compliance issues
    - Validates license file presence and consistency
    - Generates comprehensive license report with severity levels
    - Provides suggested license corrections with confidence scores

    Output:
    - Stores issues in package notes for review
    - Flags license detections needing manual review
    - Generates license report in project extra_data:
        * nixpkgs_license_issues: Dict of issues by package
        * nixpkgs_license_report: Comprehensive report with summary and grouping

    Example workflow:
    1. Copy inputs and extract archives
    2. Scan codebase for packages and licenses
    3. Analyze packages for license issues
    4. Flag packages and detections needing review
    5. Generate comprehensive report

    See scanpipe.pipes.nixpkgs for detailed license analysis functions.
    """

    # First line of the generated section in a package's notes. Everything from
    # this line onward is regenerated on each run; text before it is preserved.
    _LICENSE_ISSUES_HEADER = "=== License Issues ==="

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: codebase scan, then nixpkgs analysis."""
        return (
            cls.copy_inputs_to_codebase_directory,
            cls.extract_archives,
            cls.collect_and_create_codebase_resources,
            cls.flag_empty_files,
            cls.flag_ignored_resources,
            cls.scan_for_application_packages,
            cls.scan_for_files,
            cls.collect_and_create_license_detections,
            cls.analyze_nixpkgs_license_issues,
            cls.flag_packages_with_license_issues,
            cls.flag_license_detections_needing_review,
            cls.generate_nixpkgs_license_report,
        )

    def analyze_nixpkgs_license_issues(self):
        """
        Analyze all packages in the project to detect license issues specific
        to nixpkgs packages.

        The result of ``nixpkgs.analyze_license_issues`` is stored under the
        ``nixpkgs_license_issues`` key of the project ``extra_data``, where
        ``flag_packages_with_license_issues`` reads it back.
        """
        self.log("Analyzing nixpkgs packages for license issues")

        issues = nixpkgs.analyze_license_issues(self.project) or {}

        # Always overwrite the stored value, even when it is empty: otherwise a
        # re-run that finds nothing would leave the previous run's issues in
        # place and the next step would flag packages from stale data.
        self.project.update_extra_data({"nixpkgs_license_issues": issues})

        if issues:
            self.log(f"Found license issues in {len(issues)} package(s)")
        else:
            self.log("No license issues detected")

    def flag_packages_with_license_issues(self):
        """
        Flag discovered packages that have license issues for review.

        For every entry of the ``nixpkgs_license_issues`` extra data, the
        matching packages get their ``notes`` rewritten with a generated
        section listing the issues and, when available, a suggested license
        correction. The notes are built in full and saved with one update per
        package.
        """
        self.log("Flagging packages with license issues")

        issues = self.project.extra_data.get("nixpkgs_license_issues", {})

        for package_str, package_issues in issues.items():
            # NOTE(review): the key is matched against both ``package_url`` and
            # ``name``, so a key equal to a bare package name flags every
            # package sharing that name. The key format is produced by
            # ``nixpkgs.analyze_license_issues``; a unique identifier there
            # would make this lookup exact - confirm and tighten.
            packages = self.project.discoveredpackages.filter(
                models.Q(package_url=package_str) | models.Q(name=package_str)
            )

            for package in packages:
                issue_messages = [
                    f"{issue['severity'].upper()}: {issue['message']}"
                    for issue in package_issues
                ]
                suggestion_lines = self._get_license_suggestion_lines(package)
                new_notes = self._build_license_notes(
                    package.notes,
                    issue_messages,
                    suggestion_lines,
                )
                package.update(notes=new_notes)

    def _get_license_suggestion_lines(self, package):
        """
        Return the note lines describing a suggested license correction for
        ``package``, or an empty list when there is nothing to suggest.

        A suggestion is reported only when licenses were detected, the package
        has a declared license expression, and the suggested license differs
        from that declared expression. The suggestion is also logged.
        """
        detected_licenses = nixpkgs.get_detected_licenses_for_package(package)
        if not detected_licenses:
            return []

        suggestion = nixpkgs.suggest_license_correction(package, detected_licenses)
        declared_license = package.declared_license_expression
        if not suggestion or not declared_license:
            return []

        suggested_license = suggestion["suggested_license"]
        if suggested_license == declared_license:
            return []

        confidence = suggestion["confidence"]
        self.log(
            f"Package {package}: suggested license "
            f"'{suggested_license}' "
            f"(confidence: {confidence})"
        )
        return [
            f"Suggested license: {suggested_license} (confidence: {confidence})",
            f"Reason: {suggestion['reason']}",
        ]

    @classmethod
    def _build_license_notes(cls, existing_notes, issue_messages, suggestion_lines=()):
        """
        Return the notes text made of the preserved part of ``existing_notes``
        followed by a freshly generated license issues section.

        The generated section starts at the header line and runs to the end of
        the notes, so anything from a previous header onward is dropped and
        rebuilt. This makes repeated runs produce the same notes instead of
        appending a new section each time.
        """
        header = cls._LICENSE_ISSUES_HEADER
        notes = existing_notes or ""

        if notes.startswith(header):
            # The section was generated on top of empty notes: nothing to keep.
            # Searching only for "\n" + header would miss this case and
            # duplicate the whole section on every run.
            preserved = ""
        else:
            preserved = notes.split("\n" + header, 1)[0].rstrip()

        lines = []
        if preserved:
            # Keep the existing notes, separated from the section by a blank line.
            lines.extend([preserved, ""])
        lines.append(header)
        lines.extend(issue_messages)
        lines.extend(suggestion_lines)

        return "\n".join(lines).strip()

    def flag_license_detections_needing_review(self):
        """
        Automatically check all license detections for issues and flag them
        for review when needed.

        Detections already marked ``needs_review`` are not checked again. The
        logged count includes every detection whose ``needs_review`` is set
        once the loop is done with it, including those flagged before this run.
        """
        self.log("Checking license detections for issues")

        flagged_count = 0
        for detection in self.project.discoveredlicenses.all():
            if not detection.needs_review:
                # NOTE(review): presumably this sets ``needs_review`` on the
                # passed instance when it finds issues; the flag is re-read
                # below on that assumption - confirm against scanpipe.pipes.
                scancode.check_license_detection_for_issues(detection)
            if detection.needs_review:
                flagged_count += 1

        self.log(f"Flagged {flagged_count} license detections for review")

    def generate_nixpkgs_license_report(self):
        """
        Generate a comprehensive license report for all nixpkgs packages.

        The report returned by ``nixpkgs.generate_license_report`` is stored
        under the ``nixpkgs_license_report`` key of the project ``extra_data``
        and its summary figures are logged.
        """
        self.log("Generating nixpkgs license report")

        report = nixpkgs.generate_license_report(self.project)

        # Store report in project extra_data
        self.project.update_extra_data({"nixpkgs_license_report": report})

        # The figures below are only logged, so a missing entry (for example a
        # severity with no issue) is shown as 0 rather than failing the run
        # after the report has already been stored.
        summary = report.get("summary", {})
        by_severity = report.get("by_severity", {})
        self.log(
            f"License Report Summary:\n"
            f" Total packages: {summary.get('total_packages', 0)}\n"
            f" Packages with issues: {summary.get('packages_with_issues', 0)}\n"
            f" Total issues: {summary.get('total_issues', 0)}\n"
            f" Errors: {by_severity.get('error', 0)}\n"
            f" Warnings: {by_severity.get('warning', 0)}\n"
            f" Info: {by_severity.get('info', 0)}"
        )
Loading