class ScanFetchError(APIException):
    """
    API error raised when the scan data request to ScanCode.io fails.

    Rendered as an HTTP 400 response carrying the fixed detail message below.
    """

    # NOTE(review): an upstream fetch failure is reported here as a client error
    # (400). A 502 may describe the situation better, but the API tests assert
    # 400 -- confirm with API consumers before changing.
    status_code = status.HTTP_400_BAD_REQUEST
    default_detail = "Could not fetch scan data"
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.get_project_info")
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.stream_scan_data")
@mock.patch("dejacode_toolkit.scancodeio.ScanCodeIO.is_available")
def test_api_package_viewset_scan_results_action(
    self, mock_is_available, mock_stream_scan_data, mock_get_project_info
):
    """
    Exercise the ``scan-results`` action on both of its outcomes.

    First the upstream request fails and the API answers 400 with the fixed
    error detail; then the upstream answers normally and its body and
    Content-Type are relayed unchanged through the streaming response.
    """
    # Shared setup: authenticated client, available service, known project.
    mock_get_project_info.return_value = {"uuid": "abcdef"}
    mock_is_available.return_value = True
    self.client.login(username=self.base_user.username, password="secret")
    url = reverse("api_v2:package-scan-results", args=[self.package1.uuid])

    # Failure path: the upstream request raises before any data is streamed.
    mock_stream_scan_data.side_effect = requests.RequestException
    failed = self.client.get(url)
    self.assertEqual(400, failed.status_code)
    expected_error = {
        "detail": ErrorDetail(string="Could not fetch scan data", code="error"),
    }
    self.assertEqual(expected_error, failed.data)

    # Success path: swap the failure for a fake upstream response that yields
    # a single chunk. The side effect must be cleared for return_value to apply.
    upstream = mock.Mock(headers={"Content-Type": "application/json"})
    upstream.iter_content.return_value = iter([b'{"results": ""}'])
    mock_stream_scan_data.side_effect = None
    mock_stream_scan_data.return_value = upstream

    streamed = self.client.get(url)
    self.assertEqual(200, streamed.status_code)
    self.assertEqual(b'{"results": ""}', b"".join(streamed.streaming_content))
    self.assertEqual("application/json", streamed.headers["Content-Type"])
def stream_scan_data(self, data_url):
    """
    Open a streaming GET request on ``data_url`` and return the response.

    With ``stream=True`` only the status line and headers are fetched up front,
    so ``raise_for_status()`` can fail fast on errors before any body content
    is downloaded. The body is left for the caller to consume, for example
    with ``iter_content()``.

    Exceptions raised by ``session.get()`` or ``raise_for_status()`` propagate
    to the caller. When the status check fails, the response is closed before
    the exception is re-raised.
    """
    logger.debug(f"{self.label}: stream scan data data_url={data_url}")

    # Calling the session directly applies no timeout unless one is passed, and
    # requests then waits indefinitely on a stalled server.
    # NOTE(review): `default_timeout` is presumably inherited from BaseService.
    # It is filled from get_settings(), which may hand back an environment
    # variable string -- confirm the value is numeric.
    response = self.session.get(
        url=data_url,
        stream=True,
        timeout=self.default_timeout,
    )

    try:
        response.raise_for_status()
    except Exception:
        # A streamed response holds its connection until the body is consumed
        # or the response is closed; nobody will read the body of a failure.
        response.close()
        raise

    return response