# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from django.db import models

from scanpipe.pipelines.scan_codebase import ScanCodebase
from scanpipe.pipes import nixpkgs
from scanpipe.pipes import scancode


class AnalyzeNixpkgsLicenses(ScanCodebase):
    """
    Analyze Nixpkgs packages for license clarity and correctness.

    This pipeline automatically finds and reports issues for licenses in Nixpkgs,
    including:
    - Detecting when a nixpkgs license declaration is incorrect
    - Determining what the correct license should be
    - Identifying ambiguous or unclear license declarations
    - Reporting license inconsistencies between declared and detected

    This pipeline applies specific rules for nixpkgs, given the large diversity
    of nixpkgs tech stacks and upstreams.

    Key features:
    - Scans package source code for license detections
    - Compares declared vs detected licenses
    - Checks for ecosystem-specific license patterns (Python, Rust, Node.js, etc.)
    - Detects copyleft compliance issues
    - Validates license file presence and consistency
    - Generates comprehensive license report with severity levels
    - Provides suggested license corrections with confidence levels

    Output:
    - Stores issues in package notes for review
    - Flags license detections needing manual review
    - Generates license report in project extra_data:
      * nixpkgs_license_issues: Dict of issues by package
      * nixpkgs_license_report: Comprehensive report with summary and grouping

    Example workflow:
    1. Copy inputs and extract archives
    2. Scan codebase for packages and licenses
    3. Analyze packages for license issues
    4. Flag packages and detections needing review
    5. Generate comprehensive report

    See scanpipe.pipes.nixpkgs for detailed license analysis functions.
    """

    # Marks the start of the generated section in a package's notes. Everything
    # from this header to the end of the notes is owned by this pipeline and is
    # rewritten on each run.
    LICENSE_NOTES_HEADER = "=== License Issues ==="

    @classmethod
    def steps(cls):
        return (
            cls.copy_inputs_to_codebase_directory,
            cls.extract_archives,
            cls.collect_and_create_codebase_resources,
            cls.flag_empty_files,
            cls.flag_ignored_resources,
            cls.scan_for_application_packages,
            cls.scan_for_files,
            cls.collect_and_create_license_detections,
            cls.analyze_nixpkgs_license_issues,
            cls.flag_packages_with_license_issues,
            cls.flag_license_detections_needing_review,
            cls.generate_nixpkgs_license_report,
        )

    def analyze_nixpkgs_license_issues(self):
        """
        Analyze all packages in the project to detect license issues specific
        to nixpkgs packages and store them in the project ``extra_data`` under
        the ``nixpkgs_license_issues`` key.
        """
        self.log("Analyzing nixpkgs packages for license issues")

        issues = nixpkgs.analyze_license_issues(self.project)

        # Always persist the result, even when empty: on a re-run this clears
        # the issues of a previous run instead of letting the next step flag
        # packages from stale data.
        self.project.update_extra_data({"nixpkgs_license_issues": issues})

        if issues:
            self.log(f"Found license issues in {len(issues)} package(s)")
        else:
            self.log("No license issues detected")

    def flag_packages_with_license_issues(self):
        """
        Write the detected license issues, and a license suggestion when one
        differs from the declared license, into the notes of each affected
        package.
        """
        self.log("Flagging packages with license issues")

        issues = self.project.extra_data.get("nixpkgs_license_issues", {})
        if not issues:
            return

        # The issues are keyed by ``str(package)`` (see
        # ``nixpkgs.analyze_license_issues``), so packages are matched on that
        # same value. Looking packages up by name would also hit every other
        # version of a package, and this does not depend on which package
        # attributes are queryable database columns.
        for package in self.project.discoveredpackages.all():
            package_issues = issues.get(str(package))
            if not package_issues:
                continue

            issue_messages = [
                f"{issue['severity'].upper()}: {issue['message']}"
                for issue in package_issues
            ]
            notes = self._compose_license_notes(
                current_notes=package.notes,
                issue_messages=issue_messages,
                suggestion_note=self._get_suggestion_note(package),
            )
            # Single write per package: issues and suggestion are saved together.
            package.update(notes=notes)

    @classmethod
    def _compose_license_notes(cls, current_notes, issue_messages, suggestion_note=""):
        """
        Return the package notes with the generated license section replaced.

        Any previously generated section is dropped first so that running the
        pipeline several times does not duplicate it. Text located before the
        header is preserved.
        """
        header = cls.LICENSE_NOTES_HEADER
        preserved_notes = current_notes or ""

        # Locate the header itself rather than "\n" + header: when the
        # generated section is the only content, the notes start with the
        # header and there is no leading newline to split on.
        header_index = preserved_notes.find(header)
        if header_index != -1:
            preserved_notes = preserved_notes[:header_index]
        preserved_notes = preserved_notes.rstrip()

        sections = []
        if preserved_notes:
            sections.append(preserved_notes)
        # The leading newline leaves a blank line between existing notes and
        # the generated section.
        sections.append("\n" + header)
        sections.extend(issue_messages)

        return "\n".join(sections).strip() + suggestion_note

    def _get_suggestion_note(self, package):
        """
        Return the "Suggested license" note for ``package``, or an empty string
        when there is nothing to suggest.

        A suggestion is only made when the package has a declared license and
        the suggested expression differs from it.
        """
        declared_license = package.declared_license_expression
        detected_licenses = nixpkgs.get_detected_licenses_for_package(package)
        if not declared_license or not detected_licenses:
            return ""

        suggestion = nixpkgs.suggest_license_correction(package, detected_licenses)
        if not suggestion or suggestion["suggested_license"] == declared_license:
            return ""

        self.log(
            f"Package {package}: suggested license "
            f"'{suggestion['suggested_license']}' "
            f"(confidence: {suggestion['confidence']})"
        )
        return (
            f"\nSuggested license: {suggestion['suggested_license']} "
            f"(confidence: {suggestion['confidence']})\n"
            f"Reason: {suggestion['reason']}"
        )

    def flag_license_detections_needing_review(self):
        """
        Automatically check all license detections for issues and flag them
        for review when needed.
        """
        self.log("Checking license detections for issues")

        # Only detections not yet flagged are checked, so the reported count
        # covers the detections flagged by this step and not the ones that
        # were already flagged beforehand.
        unreviewed_detections = self.project.discoveredlicenses.filter(
            needs_review=False
        )

        flagged_count = 0
        for detection in unreviewed_detections:
            # NOTE(review): relies on check_license_detection_for_issues
            # setting ``needs_review`` on the instance it is given - confirm.
            scancode.check_license_detection_for_issues(detection)
            if detection.needs_review:
                flagged_count += 1

        self.log(f"Flagged {flagged_count} license detections for review")

    def generate_nixpkgs_license_report(self):
        """
        Generate a comprehensive license report for all nixpkgs packages and
        store it in the project ``extra_data`` under ``nixpkgs_license_report``.
        """
        self.log("Generating nixpkgs license report")

        report = nixpkgs.generate_license_report(self.project)
        self.project.update_extra_data({"nixpkgs_license_report": report})

        summary = report["summary"]
        by_severity = report["by_severity"]
        self.log(
            f"License Report Summary:\n"
            f"  Total packages: {summary['total_packages']}\n"
            f"  Packages with issues: {summary['packages_with_issues']}\n"
            f"  Total issues: {summary['total_issues']}\n"
            f"  Errors: {by_severity['error']}\n"
            f"  Warnings: {by_severity['warning']}\n"
            f"  Info: {by_severity['info']}"
        )
+# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import logging +import re +from collections import defaultdict + +from licensedcode.cache import get_licensing + +logger = logging.getLogger(__name__) + +""" +Utilities for analyzing and detecting license issues in Nixpkgs packages. +This module provides specific rules for nixpkgs given the large diversity +of tech stacks and upstreams. + +Main features: +- Automatically detect when a nixpkgs license declaration is incorrect +- Determine what the correct license should be based on detected licenses +- Identify ambiguous or unclear license declarations +- Report license inconsistencies between declared and detected +- Apply nixpkgs-specific rules for different ecosystems (Python, Rust, etc.) 
# Utilities for analyzing and detecting license issues in Nixpkgs packages.
#
# Main features:
# - Check for copyleft compliance issues
# - Validate license file presence and consistency
#
# Usage:
#     from scanpipe.pipes import nixpkgs
#
#     # Analyze all packages in a project
#     issues = nixpkgs.analyze_license_issues(project)
#
#     # Detect issues in a specific package
#     package_issues = nixpkgs.detect_package_license_issues(package)
#
#     # Generate comprehensive report
#     report = nixpkgs.generate_license_report(project)
#
#     # Get license correction suggestion
#     suggestion = nixpkgs.suggest_license_correction(package, detected_licenses)

# Header of the notes section generated by the AnalyzeNixpkgsLicenses pipeline.
# The text after it is machine-written and must not be analyzed as user notes.
_LICENSE_NOTES_HEADER = "=== License Issues ==="

# Bare file names that are license files on their own.
_LICENSE_FILE_STEMS = ("license", "licence", "copying", "copyright")
# Prefixes accepted for names such as "license.txt" or "license-mit".
_LICENSE_FILE_PREFIXES = tuple(
    f"{stem}{separator}"
    for stem in _LICENSE_FILE_STEMS
    for separator in (".", "-")
)
# Extensions accepted for prefixed names; "" means "no extension".
_LICENSE_FILE_EXTENSIONS = frozenset({"", ".txt", ".md", ".rst", ".html", ".pdf"})

# Terms signalling that a declared license is too generic to be useful.
_UNCLEAR_LICENSE_TERMS = (
    "unknown",
    "see-license",
    "other",
    "proprietary",
    "free",
    "open-source",
)

# Word boundaries avoid matching inside longer words, e.g. "gpl" in "lgpl".
_COPYLEFT_RE = re.compile(r"\b(?:gpl|agpl|lgpl|mpl|epl|cpl)\b")
_PROPRIETARY_RE = re.compile(r"\b(?:proprietary|commercial|closed)\b")


def _is_license_file(resource):
    """
    Return True if ``resource`` looks like a license file.

    The ``is_legal`` flag is trusted when the resource has one. Otherwise the
    lower-cased name must be a bare license name (e.g. "copying") or start with
    a license prefix and carry a known documentation extension, so that
    "license.txt" and "license-mit" match while "license.py" does not.
    """
    if getattr(resource, "is_legal", False):
        return True

    name = (resource.name or "").lower()
    if name in _LICENSE_FILE_STEMS:
        return True
    if not name.startswith(_LICENSE_FILE_PREFIXES):
        return False

    _, dot, suffix = name.rpartition(".")
    extension = f".{suffix}" if dot else ""
    return extension in _LICENSE_FILE_EXTENSIONS


def analyze_license_issues(project):
    """
    Analyze all packages in the project and detect license inconsistencies.
    Returns a dict mapping ``str(package)`` to the list of issues detected for
    that package. Packages without issues are left out.
    """
    # Prefetch the resources: most checks iterate over them for each package.
    packages = project.discoveredpackages.all().prefetch_related(
        "codebase_resources",
    )

    issues = {}
    for package in packages:
        package_issues = detect_package_license_issues(package)
        if package_issues:
            issues[str(package)] = package_issues

    return issues


def detect_package_license_issues(package):
    """
    Detect licensing issues in a single package.
    Returns a list of issue dicts, each with at least the "type", "severity",
    "message" and "suggestion" keys.
    """
    issues = []
    declared_license = package.declared_license_expression

    if not declared_license:
        issues.append({
            "type": "missing_declared_license",
            "severity": "error",
            "message": "Package has no declared license expression",
            "suggestion": "Review package metadata and source files to determine correct license"
        })
    else:
        # Declared and detected licenses can only be compared when both exist.
        detected_licenses = get_detected_licenses_for_package(package)
        if detected_licenses:
            issues.extend(
                find_license_inconsistencies(declared_license, detected_licenses)
            )

    issues.extend(check_ambiguous_detections(package))
    issues.extend(check_license_clarity(package))

    # Nixpkgs-specific checks
    ecosystem_issue = check_nixpkgs_ecosystem_license(package)
    if ecosystem_issue:
        issues.append(ecosystem_issue)

    issues.extend(detect_copyleft_compliance_issues(package))
    issues.extend(detect_license_file_issues(package))

    return issues


def get_detected_licenses_for_package(package):
    """
    Extract all detected license expressions from the package's codebase resources.
    Returns a sorted list of the unique license expressions found in the
    package files, so that the result is stable from one run to the next.
    """
    detected = {
        resource.detected_license_expression
        for resource in package.codebase_resources.all()
        if resource.detected_license_expression
    }
    return sorted(detected)


def find_license_inconsistencies(declared_license, detected_licenses):
    """
    Compare declared license with detected licenses to find inconsistencies.
    Returns a list of issues: a single "invalid_declared_license" issue when the
    declared expression cannot be parsed, otherwise one "license_mismatch" issue
    for each detected expression that is not compatible with the declared one.
    Detected expressions that cannot be parsed are skipped.
    """
    licensing = get_licensing()

    try:
        declared_parsed = licensing.parse(declared_license, validate=True)
    except Exception as error:
        return [{
            "type": "invalid_declared_license",
            "severity": "error",
            "message": f"Invalid declared license expression: {declared_license}",
            "details": str(error),
            "suggestion": "Fix the license expression syntax"
        }]

    issues = []
    for detected in detected_licenses:
        # Only the parsing is guarded: an unparsable detection is not an issue
        # with the declared license.
        try:
            detected_parsed = licensing.parse(detected, validate=True)
        except Exception as error:
            logger.debug(
                "Invalid detected license expression '%s': %s", detected, error
            )
            continue

        if are_licenses_compatible(declared_parsed, detected_parsed, licensing):
            continue

        issues.append({
            "type": "license_mismatch",
            "severity": "warning",
            "message": f"Detected license '{detected}' differs from declared '{declared_license}'",
            "declared": declared_license,
            "detected": detected,
            "suggestion": "Review source files to determine correct license. Consider if this is dual-licensing or incorrect declaration."
        })

    return issues


def are_licenses_compatible(declared, detected, licensing):
    """
    Check if detected license is compatible with declared license.

    This is a simplified, case-insensitive check: the expressions are compatible
    when they are equal, or when the license keys of one are all present in the
    other. ``licensing`` is accepted for interface compatibility and is not used.
    """
    declared_str = str(declared).lower()
    detected_str = str(detected).lower()

    if declared_str == detected_str:
        return True

    # Split on whitespace and parentheses only: hyphens are part of the keys.
    ignored_tokens = {"or", "and", "with", ""}
    declared_keys = set(re.split(r'[\s()]+', declared_str)) - ignored_tokens
    detected_keys = set(re.split(r'[\s()]+', detected_str)) - ignored_tokens

    if not declared_keys or not detected_keys:
        return False

    return detected_keys <= declared_keys or declared_keys <= detected_keys


def check_ambiguous_detections(package):
    """
    Check for ambiguous license detections that need review.
    Returns one "ambiguous_detection" issue for each project license detection
    flagged ``needs_review`` that has a file region in one of the package files.
    """
    # Iterating over all() uses the prefetched resources when available.
    package_paths = {resource.path for resource in package.codebase_resources.all()}
    if not package_paths:
        return []

    flagged_detections = package.project.discoveredlicenses.filter(
        needs_review=True
    )

    issues = []
    for detection in flagged_detections:
        detection_paths = {
            region.get("path")
            for region in (detection.file_regions or [])
            if isinstance(region, dict)
        }
        detection_paths.discard(None)

        if package_paths.isdisjoint(detection_paths):
            continue

        issues.append({
            "type": "ambiguous_detection",
            "severity": "warning",
            "message": f"Ambiguous license detection: {detection.license_expression}",
            "identifier": detection.identifier,
            "review_comments": detection.review_comments,
            "suggestion": "Manual review required for this detection"
        })

    return issues


def check_license_clarity(package):
    """
    Check for license clarity issues in the package.
    Returns a list with at most one "unclear_license" issue and at most one
    "missing_license_file" issue. Nothing is reported for a package without a
    declared license.
    """
    declared = package.declared_license_expression
    if not declared:
        return []

    issues = []
    declared_lower = declared.lower()

    for term in _UNCLEAR_LICENSE_TERMS:
        # Word boundaries avoid false positives such as "free" in "freebsd".
        if re.search(r'\b' + re.escape(term) + r'\b', declared_lower):
            issues.append({
                "type": "unclear_license",
                "severity": "warning",
                "message": f"License expression contains unclear term: '{term}'",
                "declared": declared,
                "suggestion": f"Review package to determine specific license instead of '{term}'"
            })
            # A single report is enough, whatever the number of unclear terms.
            break

    # Same definition of a license file as detect_license_file_issues(), so
    # that the two checks cannot contradict each other.
    has_license_file = any(
        _is_license_file(resource)
        for resource in package.codebase_resources.all()
    )
    if not has_license_file:
        issues.append({
            "type": "missing_license_file",
            "severity": "info",
            "message": "No LICENSE file found in package",
            "suggestion": "Consider including a LICENSE file for clarity"
        })

    return issues


def generate_license_report(project):
    """
    Generate a comprehensive license report for all packages in the project.
    Returns a dict with a "summary", issue counts "by_severity" and "by_type",
    and the issues grouped by package, by type and by severity.
    """
    issues_by_package = analyze_license_issues(project)

    by_type = defaultdict(list)
    by_severity = defaultdict(list)
    for package_id, package_issues in issues_by_package.items():
        for issue in package_issues:
            issue_with_package = {**issue, "package": package_id}
            by_type[issue["type"]].append(issue_with_package)
            by_severity[issue["severity"]].append(issue_with_package)

    total_packages = project.discoveredpackages.count()
    packages_with_issues = len(issues_by_package)

    return {
        "summary": {
            "total_packages": total_packages,
            "packages_with_issues": packages_with_issues,
            "packages_without_issues": total_packages - packages_with_issues,
            "total_issues": sum(map(len, issues_by_package.values())),
        },
        # The three severity levels are always present, even with a zero count.
        "by_severity": {
            severity: len(by_severity.get(severity, []))
            for severity in ("error", "warning", "info")
        },
        "by_type": {
            issue_type: len(type_issues)
            for issue_type, type_issues in by_type.items()
        },
        "issues_by_package": issues_by_package,
        "issues_by_type": dict(by_type),
        "issues_by_severity": dict(by_severity),
    }


def suggest_license_correction(package, detected_licenses):
    """
    Suggest the correct license for a package based on detected licenses.
    Returns a dict with the "suggested_license", "confidence" and "reason" keys,
    or None when ``detected_licenses`` is empty. ``package`` is not used.

    The confidence is "high" when a single expression was detected, "medium"
    when the detected expressions can be combined with OR, and "low" when the
    most frequently detected expression is used as a fallback.
    """
    if not detected_licenses:
        return None

    unique_detected = sorted(set(detected_licenses))
    if len(unique_detected) == 1:
        return {
            "suggested_license": unique_detected[0],
            "confidence": "high",
            "reason": "All detected licenses are identical"
        }

    licensing = get_licensing()

    # Most common pattern: dual licensing with OR. Compound expressions are
    # wrapped in parentheses, otherwise "a AND b" joined with "c" would read as
    # "a AND b OR c", which does not mean the same thing.
    combined_or = " OR ".join(
        f"({expression})" if " " in expression.strip() else expression
        for expression in unique_detected
    )
    try:
        licensing.parse(combined_or, validate=True)
    except Exception as error:
        logger.debug(
            "Cannot combine detected licenses as '%s': %s", combined_or, error
        )
    else:
        return {
            "suggested_license": combined_or,
            "confidence": "medium",
            "reason": "Multiple licenses detected - may be dual-licensed"
        }

    # Fall back to the most frequent expression. Ties are resolved
    # alphabetically to keep the suggestion stable between runs.
    occurrences = defaultdict(int)
    for expression in detected_licenses:
        occurrences[expression] += 1
    most_common = min(
        occurrences, key=lambda expression: (-occurrences[expression], expression)
    )
    return {
        "suggested_license": most_common,
        "confidence": "low",
        "reason": f"Most frequently detected license (appears {occurrences[most_common]} times)"
    }


# Nixpkgs-specific license mappings and rules
NIXPKGS_LICENSE_MAPPINGS = {
    # Common license name variations in nixpkgs
    "gpl": "GPL-1.0-or-later",
    "gpl2": "GPL-2.0-only",
    "gpl2+": "GPL-2.0-or-later",
    "gpl3": "GPL-3.0-only",
    "gpl3+": "GPL-3.0-or-later",
    "lgpl": "LGPL-2.1-or-later",
    "lgpl2": "LGPL-2.1-only",
    "lgpl21": "LGPL-2.1-only",
    "lgpl21+": "LGPL-2.1-or-later",
    "lgpl3": "LGPL-3.0-only",
    "lgpl3+": "LGPL-3.0-or-later",
    "bsd": "BSD-3-Clause",
    "bsd2": "BSD-2-Clause",
    "bsd3": "BSD-3-Clause",
    "apache": "Apache-2.0",
    "apache2": "Apache-2.0",
    "apache-2": "Apache-2.0",
    "mpl": "MPL-2.0",
    "mpl2": "MPL-2.0",
    "mit": "MIT",
    "isc": "ISC",
    "zlib": "Zlib",
    "artistic": "Artistic-2.0",
    "artistic2": "Artistic-2.0",
}


# Package type to ecosystem mapping.
# A type missing from this mapping is used as the ecosystem name as-is.
PACKAGE_TYPE_TO_ECOSYSTEM = {
    "pypi": "python",
    "npm": "nodejs",
    "cargo": "rust",
    "gem": "ruby",
    "cpan": "perl",
    "maven": "java",
    "nuget": "dotnet",
    "hackage": "haskell",
    # Go packages use the "golang" package URL type.
    "golang": "go",
}

# Known nixpkgs package ecosystems and their typical licenses
NIXPKGS_ECOSYSTEM_LICENSE_PATTERNS = {
    "python": ["MIT", "Apache-2.0", "BSD-3-Clause", "GPL-3.0-or-later"],
    "rust": ["MIT", "Apache-2.0", "MIT OR Apache-2.0"],
    "nodejs": ["MIT", "ISC", "BSD-3-Clause"],
    "go": ["MIT", "Apache-2.0", "BSD-3-Clause"],
    "haskell": ["BSD-3-Clause", "MIT"],
    "ruby": ["MIT", "GPL-2.0-or-later"],
    "perl": ["Artistic-2.0", "GPL-1.0-or-later"],
    "java": ["Apache-2.0", "MIT", "LGPL-2.1-or-later"],
    "dotnet": ["MIT", "Apache-2.0"],
}


def normalize_nixpkgs_license(license_str):
    """
    Normalize a nixpkgs license string to SPDX identifier.
    The lookup ignores case and surrounding whitespace. Returns the normalized
    license, or ``license_str`` unchanged when it is empty or has no mapping.
    """
    if not license_str:
        return license_str

    lookup_key = license_str.strip().lower()
    return NIXPKGS_LICENSE_MAPPINGS.get(lookup_key, license_str)


def check_nixpkgs_ecosystem_license(package):
    """
    Check if the package license is typical for its ecosystem.
    Returns an "unusual_ecosystem_license" issue dict when none of the licenses
    typical for the ecosystem appears in the declared expression. Returns None
    otherwise, including when the package has no type, no declared license or
    belongs to an ecosystem without known patterns.
    """
    declared = package.declared_license_expression
    if not package.type or not declared:
        return None

    package_type = package.type.lower()
    # Map package type to ecosystem (e.g., pypi -> python)
    ecosystem = PACKAGE_TYPE_TO_ECOSYSTEM.get(package_type, package_type)
    expected_licenses = NIXPKGS_ECOSYSTEM_LICENSE_PATTERNS.get(ecosystem)
    if not expected_licenses:
        return None

    declared_lower = declared.lower()
    # Word boundaries avoid false positives like "mit" in "limited".
    is_typical = any(
        re.search(r'\b' + re.escape(expected.lower()) + r'\b', declared_lower)
        for expected in expected_licenses
    )
    if is_typical:
        return None

    return {
        "type": "unusual_ecosystem_license",
        "severity": "info",
        "message": f"License '{declared}' is unusual for {ecosystem} packages",
        "expected_licenses": expected_licenses,
        "suggestion": f"Verify this is correct. Common {ecosystem} licenses: {', '.join(expected_licenses)}"
    }


def detect_copyleft_compliance_issues(package):
    """
    Detect potential copyleft compliance issues.
    Returns a list of issues, empty unless the declared license contains a
    copyleft indicator: "copyleft_with_dependencies" when the package has
    dependencies, and "copyleft_proprietary_conflict" when the description or
    the notes mention a proprietary indicator.
    """
    declared = package.declared_license_expression
    if not declared or not _COPYLEFT_RE.search(declared.lower()):
        return []

    issues = []

    dependencies = package.project.discovereddependencies.filter(
        for_package=package
    )
    if dependencies.exists():
        issues.append({
            "type": "copyleft_with_dependencies",
            "severity": "warning",
            "message": f"Copyleft license {declared} with dependencies",
            "suggestion": "Review dependencies for license compatibility"
        })

    description = (package.description or "").lower()
    # Ignore the section generated by the license pipeline: its own messages
    # contain the word "proprietary" and would re-trigger this check on the
    # next analysis of the same package.
    user_notes = (package.notes or "").split(_LICENSE_NOTES_HEADER, 1)[0].lower()

    if _PROPRIETARY_RE.search(description) or _PROPRIETARY_RE.search(user_notes):
        issues.append({
            "type": "copyleft_proprietary_conflict",
            "severity": "error",
            "message": "Copyleft license declared but proprietary indicators found",
            "suggestion": "Verify license - copyleft and proprietary are incompatible"
        })

    return issues


def detect_license_file_issues(package):
    """
    Detect issues with license files in the package.
    Returns a list of issues: "multiple_license_files" when more than one
    license file is found, and "license_file_without_declaration" when a license
    file exists but the package has no declared license. Both carry the paths of
    the license files under the "files" key.
    """
    license_paths = [
        resource.path
        for resource in package.codebase_resources.all()
        if _is_license_file(resource)
    ]

    issues = []

    # Multiple license files might indicate multiple licenses
    if len(license_paths) > 1:
        issues.append({
            "type": "multiple_license_files",
            "severity": "info",
            "message": f"Found {len(license_paths)} license-related files",
            "files": list(license_paths),
            "suggestion": "Review all license files to ensure declared license is complete"
        })

    if license_paths and not package.declared_license_expression:
        issues.append({
            "type": "license_file_without_declaration",
            "severity": "warning",
            "message": "License file exists but no license declared",
            "files": list(license_paths),
            "suggestion": "Extract license from license file and update declaration"
        })

    return issues
a/scanpipe/tests/pipes/test_nixpkgs.py b/scanpipe/tests/pipes/test_nixpkgs.py new file mode 100644 index 0000000000..5e82994161 --- /dev/null +++ b/scanpipe/tests/pipes/test_nixpkgs.py @@ -0,0 +1,332 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ +from django.test import TestCase + +from scanpipe.models import CodebaseResource +from scanpipe.models import DiscoveredPackage +from scanpipe.models import Project +from scanpipe.pipes import nixpkgs + + +class NixpkgsLicenseAnalysisTest(TestCase): + def setUp(self): + self.project = Project.objects.create(name="Test Nixpkgs Project") + + def test_detect_missing_declared_license(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + ) + + issues = nixpkgs.detect_package_license_issues(package) + + self.assertEqual(len(issues), 1) + self.assertEqual(issues[0]["type"], "missing_declared_license") + self.assertEqual(issues[0]["severity"], "error") + + def test_detect_license_mismatch(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="MIT", + ) + + # Create a resource with different detected license + resource = CodebaseResource.objects.create( + project=self.project, + path="test.py", + detected_license_expression="GPL-3.0-only", + ) + package.codebase_resources.add(resource) + + issues = nixpkgs.detect_package_license_issues(package) + + # Should detect mismatch + mismatch_issues = [i for i in issues if i["type"] == "license_mismatch"] + self.assertGreater(len(mismatch_issues), 0) + self.assertEqual(mismatch_issues[0]["severity"], "warning") + + def test_normalize_nixpkgs_license(self): + # Test common nixpkgs license mappings + self.assertEqual( + nixpkgs.normalize_nixpkgs_license("gpl2+"), + "GPL-2.0-or-later" + ) + self.assertEqual( + nixpkgs.normalize_nixpkgs_license("apache2"), + "Apache-2.0" + ) + self.assertEqual( + nixpkgs.normalize_nixpkgs_license("mit"), + "MIT" + ) + + # Test unknown license passes through + self.assertEqual( + nixpkgs.normalize_nixpkgs_license("CustomLicense"), + "CustomLicense" + ) + + def test_check_nixpkgs_ecosystem_license(self): + # 
Python package with unusual license + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="Artistic-2.0", + ) + + issue = nixpkgs.check_nixpkgs_ecosystem_license(package) + + self.assertIsNotNone(issue) + self.assertEqual(issue["type"], "unusual_ecosystem_license") + self.assertEqual(issue["severity"], "info") + + def test_check_nixpkgs_ecosystem_license_typical(self): + # Python package with typical license + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="MIT", + ) + + issue = nixpkgs.check_nixpkgs_ecosystem_license(package) + + # No issue expected for typical license + self.assertIsNone(issue) + + def test_detect_copyleft_with_dependencies(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="GPL-3.0-only", + ) + + issues = nixpkgs.detect_copyleft_compliance_issues(package) + + # Note: This test would need dependencies to trigger the issue + # For now, it should return empty list + self.assertIsInstance(issues, list) + + def test_suggest_license_correction_single_detected(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="GPL-3.0-only", + ) + + detected_licenses = ["MIT"] + suggestion = nixpkgs.suggest_license_correction(package, detected_licenses) + + self.assertIsNotNone(suggestion) + self.assertEqual(suggestion["suggested_license"], "MIT") + self.assertEqual(suggestion["confidence"], "high") + + def test_suggest_license_correction_multiple_detected(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="GPL-3.0-only", + ) + 
+ detected_licenses = ["MIT", "Apache-2.0"] + suggestion = nixpkgs.suggest_license_correction(package, detected_licenses) + + self.assertIsNotNone(suggestion) + self.assertIn("OR", suggestion["suggested_license"]) + self.assertEqual(suggestion["confidence"], "medium") + + def test_suggest_license_correction_no_detected(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="MIT", + ) + + suggestion = nixpkgs.suggest_license_correction(package, []) + + self.assertIsNone(suggestion) + + def test_analyze_license_issues(self): + # Create package with issue + DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="package-with-issue", + version="1.0.0", + # No declared license - should trigger issue + ) + + # Create package without issue + DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="package-without-issue", + version="1.0.0", + declared_license_expression="MIT", + ) + + issues = nixpkgs.analyze_license_issues(self.project) + + # Should have issues for one package + self.assertEqual(len(issues), 1) + + def test_generate_license_report(self): + # Create package with issue + DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="package1", + version="1.0.0", + ) + + # Create package without issue + DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="package2", + version="1.0.0", + declared_license_expression="MIT", + ) + + report = nixpkgs.generate_license_report(self.project) + + self.assertIn("summary", report) + self.assertIn("by_severity", report) + self.assertIn("by_type", report) + self.assertIn("issues_by_package", report) + + summary = report["summary"] + self.assertEqual(summary["total_packages"], 2) + self.assertEqual(summary["packages_with_issues"], 1) + self.assertEqual(summary["packages_without_issues"], 1) + + def 
test_check_license_clarity_unclear_license(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package-unclear", + version="1.0.0", + declared_license_expression="Unknown", + ) + + issues = nixpkgs.check_license_clarity(package) + + unclear_issues = [i for i in issues if i["type"] == "unclear_license"] + self.assertGreater(len(unclear_issues), 0) + self.assertEqual(unclear_issues[0]["severity"], "warning") + + def test_get_detected_licenses_for_package(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package-detected", + version="1.0.0", + ) + + # Create resources with licenses + resource1 = CodebaseResource.objects.create( + project=self.project, + path="file1.py", + detected_license_expression="MIT", + ) + resource2 = CodebaseResource.objects.create( + project=self.project, + path="file2.py", + detected_license_expression="Apache-2.0", + ) + + package.codebase_resources.add(resource1, resource2) + + detected = nixpkgs.get_detected_licenses_for_package(package) + + self.assertEqual(len(detected), 2) + self.assertIn("MIT", detected) + self.assertIn("Apache-2.0", detected) + + def test_detect_license_file_issues_multiple_files(self): + package = DiscoveredPackage.objects.create( + project=self.project, + type="pypi", + name="test-package", + version="1.0.0", + declared_license_expression="MIT", + ) + + # Create multiple license files + license1 = CodebaseResource.objects.create( + project=self.project, + path="LICENSE", + name="LICENSE", + ) + license2 = CodebaseResource.objects.create( + project=self.project, + path="COPYING", + name="COPYING", + ) + + package.codebase_resources.add(license1, license2) + + issues = nixpkgs.detect_license_file_issues(package) + + multiple_file_issues = [ + i for i in issues if i["type"] == "multiple_license_files" + ] + self.assertGreater(len(multiple_file_issues), 0) + + def test_are_licenses_compatible_exact_match(self): + 
from licensedcode.cache import get_licensing + + licensing = get_licensing() + lic1 = licensing.parse("MIT", validate=True) + lic2 = licensing.parse("MIT", validate=True) + + result = nixpkgs.are_licenses_compatible(lic1, lic2, licensing) + + self.assertTrue(result) + + def test_are_licenses_compatible_different(self): + from licensedcode.cache import get_licensing + + licensing = get_licensing() + lic1 = licensing.parse("MIT", validate=True) + lic2 = licensing.parse("GPL-3.0-only", validate=True) + + result = nixpkgs.are_licenses_compatible(lic1, lic2, licensing) + + self.assertFalse(result)