From 453198b40859a38decc63cc3c89d4c92745237a6 Mon Sep 17 00:00:00 2001 From: Aidajafarbigloo <143706305+Aidajafarbigloo@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:21:36 +0200 Subject: [PATCH 01/19] Issue #276 - Add a new argument to accept a URL for harvesting --- src/hermes/commands/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py index 82692975..a4fb6410 100644 --- a/src/hermes/commands/base.py +++ b/src/hermes/commands/base.py @@ -104,6 +104,13 @@ def init_common_parser(self, parser: argparse.ArgumentParser) -> None: help="Configuration file in TOML format", ) + # Add a new argument to accept a URL for harvesting + parser.add_argument( + "--url", + type=str, + help="URL from which to extract metadata" + ) + plugin_args = parser.add_argument_group("Extra options") plugin_args.add_argument( "-O", From 3a7c9ad90191e288bb3b256996452e58d6a844a3 Mon Sep 17 00:00:00 2001 From: Aidajafarbigloo <143706305+Aidajafarbigloo@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:50:08 +0200 Subject: [PATCH 02/19] Issue #276 - Harvest metadata from the provided URL --- src/hermes/commands/harvest/base.py | 127 +++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 4 deletions(-) diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py index 4d2a1731..937b5e62 100644 --- a/src/hermes/commands/harvest/base.py +++ b/src/hermes/commands/harvest/base.py @@ -5,15 +5,25 @@ # SPDX-FileContributor: Michael Meinel import argparse -import typing as t +import json +import logging from datetime import datetime +import typing as t +from typing import Optional, Dict, Tuple +import requests from pydantic import BaseModel +from ruamel.yaml import YAML +from cffconvert import Citation from hermes.commands.base import HermesCommand, HermesPlugin from hermes.model.context import HermesContext, HermesHarvestContext from hermes.model.errors import HermesValidationError, MergeError +CITATION_FILE = "CITATION.cff" +CODEMETA_FILE = "codemeta.json" + +logger = logging.getLogger(__name__) class HermesHarvestPlugin(HermesPlugin): """Base plugin that does harvesting. @@ -32,18 +42,60 @@ class HarvestSettings(BaseModel): class HermesHarvestCommand(HermesCommand): - """ Harvest metadata from configured sources. """ + """Harvest metadata from the provided URL or configured sources.""" command_name = "harvest" settings_class = HarvestSettings + def add_arguments(self, parser: argparse.ArgumentParser) -> None: + """Adds arguments for the harvest command to harvest metadata from the specific URL.""" + parser.add_argument('url', nargs='?', default=None, help="Optional URL to harvest from") + def __call__(self, args: argparse.Namespace) -> None: + """Execute the harvesting command based on the provided arguments.""" self.args = args ctx = HermesContext() - - # Initialize the harvest cache directory here to indicate the step ran ctx.init_cache("harvest") + if hasattr(args, 'url') and args.url: + result = self._process_url(args.url, ctx) + if result is None: + logger.error("Failed to process URL: %s", args.url) + else: + self._harvest_locally(ctx) + + def _process_url(self, url: str, ctx: HermesContext) -> Optional[Dict[str, Dict]]: + """Process the provided URL for metadata harvesting.""" + try: + files_to_search = [CITATION_FILE, CODEMETA_FILE] + found_files = self._search_repo_for_metadata(url, files_to_search) + + if not found_files: + raise FileNotFoundError("Neither CITATION.cff nor codemeta.json found in the repository.") + + cff_dict = self._handle_citation_file(found_files) + codemeta_dict = self._handle_codemeta_file(found_files) + + logger.info("Harvesting successful from URL: %s", url) + print('**********************************************************') + print("Original CodeMeta from codemeta.json:") + print(json.dumps(codemeta_dict, indent=4)) + + print('**********************************************************') + print("CFF converted to CodeMeta:") + print(json.dumps(cff_dict, indent=4)) + + return { + "codemeta_from_cff": cff_dict, + "codemeta_json": codemeta_dict + } + + except (FileNotFoundError, ValueError) as e: + logger.error(f"Error processing URL: {e}") + return None + + def _harvest_locally(self, ctx: HermesContext) -> None: + """Harvest metadata from configured sources.""" for plugin_name in self.settings.sources: try: plugin_func = self.plugins[plugin_name]() @@ -64,3 +116,70 @@ def __call__(self, args: argparse.Namespace) -> None: except HermesValidationError as e: self.log.error("Error while executing %s: %s", plugin_name, e) self.errors.append(e) + + def _search_repo_for_metadata(self, repo_url: str, files_to_search: list) -> Dict[str, str]: + """Search for metadata files in the given GitHub repository and return their URLs.""" + repo_api_url = repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/') + '/contents' + + try: + response = requests.get(repo_api_url) + response.raise_for_status() + + repo_files = response.json() + found_files = {file_entry["name"]: file_entry["download_url"] for file_entry in repo_files + if file_entry["name"] in files_to_search} + + return found_files + + except requests.RequestException as e: + if e.response and e.response.status_code == 404: + logger.error(f"Repository not found: {repo_url}") + raise FileNotFoundError(f"Repository {repo_url} not found or is private.") + else: + logger.error(f"Failed to list repository contents: {e}") + raise + + def _fetch_file_from_url(self, file_url: str) -> str: + """Fetch the content of a file from its URL.""" + try: + response = requests.get(file_url) + response.raise_for_status() + return response.text + except requests.RequestException as e: + logger.error(f"Failed to fetch file from {file_url}: {e}") + raise FileNotFoundError(f"Unable to fetch file from {file_url}") + + def _load_cff_from_file(self, cff_data: str) -> dict: + """Load and parse CFF data from a file.""" + yaml = YAML(typ='safe') + yaml.constructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = yaml.constructor.yaml_constructors[ + u'tag:yaml.org,2002:str'] + return yaml.load(cff_data) + + def _convert_cff_to_codemeta(self, cff_data: str) -> dict: + """Convert metadata from CFF to CodeMeta format.""" + codemeta_str = Citation(cff_data).as_codemeta() + return json.loads(codemeta_str) + + def _patch_author_emails(self, cff: dict, codemeta: dict) -> dict: + """Patch author emails from CFF into CodeMeta.""" + cff_authors = cff["authors"] + for i, author in enumerate(cff_authors): + if "email" in author: + codemeta["author"][i]["email"] = author["email"] + return codemeta + + def _handle_citation_file(self, found_files: dict) -> Optional[dict]: + """Handle the CITATION.cff file if found.""" + if CITATION_FILE in found_files: + cff_content = self._fetch_file_from_url(found_files[CITATION_FILE]) + cff_dict = self._load_cff_from_file(cff_content) + return self._convert_cff_to_codemeta(cff_content) + return None + + def _handle_codemeta_file(self, found_files: dict) -> Optional[dict]: + """Handle the codemeta.json file if found.""" + if CODEMETA_FILE in found_files: + codemeta_content = self._fetch_file_from_url(found_files[CODEMETA_FILE]) + return json.loads(codemeta_content) + return None \ No newline at end of file From 153e67649b11b530bc1cae6c5bfc9e1ebc153963 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Sat, 26 Oct 2024 16:43:50 +0200 Subject: [PATCH 03/19] Issue #276 - Store harvested data from URL --- src/hermes/commands/harvest/base.py | 190 +++++++++++++++------------- 1 file changed, 103 insertions(+), 87 deletions(-) diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py index 937b5e62..f0f23546 100644 --- a/src/hermes/commands/harvest/base.py +++ b/src/hermes/commands/harvest/base.py @@ -9,7 +9,9 @@ import logging from datetime import datetime import typing as t -from typing import Optional, Dict, Tuple +from typing import Optional, Dict, Tuple, List +import yaml +from urllib.parse import quote import requests from pydantic import BaseModel @@ -42,102 +44,110 @@ class HarvestSettings(BaseModel): class HermesHarvestCommand(HermesCommand): - """Harvest metadata from the provided URL or configured sources.""" + """ Harvest metadata from configured sources. """ command_name = "harvest" settings_class = HarvestSettings - - def add_arguments(self, parser: argparse.ArgumentParser) -> None: - """Adds arguments for the harvest command to harvest metadata from the specific URL.""" - parser.add_argument('url', nargs='?', default=None, help="Optional URL to harvest from") - - def __call__(self, args: argparse.Namespace) -> None: - """Execute the harvesting command based on the provided arguments.""" + + def __call__(self, args) -> None: self.args = args ctx = HermesContext() ctx.init_cache("harvest") - - if hasattr(args, 'url') and args.url: - result = self._process_url(args.url, ctx) - if result is None: - logger.error("Failed to process URL: %s", args.url) + + if args.url: + self._process_url(args.url, ctx) else: self._harvest_locally(ctx) - def _process_url(self, url: str, ctx: HermesContext) -> Optional[Dict[str, Dict]]: + def _process_url(self, url: str, ctx: HermesContext) -> Optional[Tuple[Dict, Dict]]: """Process the provided URL for metadata harvesting.""" try: files_to_search = [CITATION_FILE, CODEMETA_FILE] - found_files = self._search_repo_for_metadata(url, files_to_search) - + if "github.com" in url: + found_files = self._search_github_repo_for_metadata(url, files_to_search) + elif "gitlab.com" in url: + found_files = self._search_gitlab_repo_for_metadata(url, files_to_search) + else: + raise ValueError("Unsupported repository provider. Only GitHub and GitLab are supported.") if not found_files: - raise FileNotFoundError("Neither CITATION.cff nor codemeta.json found in the repository.") - - cff_dict = self._handle_citation_file(found_files) - codemeta_dict = self._handle_codemeta_file(found_files) - - logger.info("Harvesting successful from URL: %s", url) - print('**********************************************************') - print("Original CodeMeta from codemeta.json:") - print(json.dumps(codemeta_dict, indent=4)) - - print('**********************************************************') - print("CFF converted to CodeMeta:") - print(json.dumps(cff_dict, indent=4)) - - return { - "codemeta_from_cff": cff_dict, - "codemeta_json": codemeta_dict - } - + raise FileNotFoundError(f"Neither {CITATION_FILE} nor {CODEMETA_FILE} found in repository.") + # Process and store metadata from files + self._process_found_files(found_files, ctx) + return None, None except (FileNotFoundError, ValueError) as e: logger.error(f"Error processing URL: {e}") - return None + return None, None + + def _search_github_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]: + """Search for metadata files in a GitHub repository.""" + repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents" + try: + response = requests.get(repo_api_url) + response.raise_for_status() + repo_files = response.json() + return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search} + except requests.HTTPError as e: + logger.error(f"HTTP Error accessing GitHub repository: {repo_url}, {e}") + raise FileNotFoundError(f"GitHub repository {repo_url} not found or is private.") + except requests.RequestException as e: + logger.error(f"Failed to list GitHub repository contents: {e}") + raise + def _search_gitlab_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]: + """Search for metadata files in a GitLab repository.""" + try: + project_path = repo_url.rstrip('/').split('gitlab.com/')[1] + encoded_project = quote(project_path, safe='') + found_files = {} + for file_name in files_to_search: + file_api_url = f"https://gitlab.com/api/v4/projects/{encoded_project}/repository/files/{quote(file_name)}/raw?ref=main" + + response = requests.get(file_api_url) + if response.status_code == 200: + found_files[file_name] = file_api_url + elif response.status_code != 404: + logger.error(f"Error accessing GitLab repository: {repo_url}, {response.status_code}") + raise FileNotFoundError(f"GitLab repository {repo_url} not found or is private.") + return found_files + except requests.RequestException as e: + logger.error(f"Failed to list GitLab repository contents: {e}") + raise + def _harvest_locally(self, ctx: HermesContext) -> None: - """Harvest metadata from configured sources.""" + """Harvest metadata from configured sources using plugins.""" for plugin_name in self.settings.sources: try: plugin_func = self.plugins[plugin_name]() harvested_data, tags = plugin_func(self) - - with HermesHarvestContext(ctx, plugin_name) as harvest_ctx: - harvest_ctx.update_from(harvested_data, - plugin=plugin_name, - timestamp=datetime.now().isoformat(), **tags) - for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items(): - if any(v != _value and t == _tag for v, t in _trace): - raise MergeError(_key, None, _value) - + self.store_harvested_data(ctx, harvested_data, tags, plugin_name) except KeyError as e: - self.log.error("Plugin '%s' not found.", plugin_name) - self.errors.append(e) - + logger.error(f"Plugin '{plugin_name}' not found. Error: {e}") except HermesValidationError as e: - self.log.error("Error while executing %s: %s", plugin_name, e) - self.errors.append(e) + logger.error(f"Error while executing '{plugin_name}': {e}") - def _search_repo_for_metadata(self, repo_url: str, files_to_search: list) -> Dict[str, str]: + def _search_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]: """Search for metadata files in the given GitHub repository and return their URLs.""" - repo_api_url = repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/') + '/contents' - + repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents" try: response = requests.get(repo_api_url) response.raise_for_status() - repo_files = response.json() - found_files = {file_entry["name"]: file_entry["download_url"] for file_entry in repo_files - if file_entry["name"] in files_to_search} - - return found_files - + return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search} + except requests.HTTPError as e: + logger.error(f"HTTP Error accessing repository: {repo_url}, {e}") + raise FileNotFoundError(f"Repository {repo_url} not found or is private.") except requests.RequestException as e: - if e.response and e.response.status_code == 404: - logger.error(f"Repository not found: {repo_url}") - raise FileNotFoundError(f"Repository {repo_url} not found or is private.") - else: - logger.error(f"Failed to list repository contents: {e}") - raise + logger.error(f"Failed to list repository contents: {e}") + raise + + def _process_found_files(self, found_files: Dict[str, str], ctx: HermesContext) -> None: + """Process and store metadata from CFF and CodeMeta files.""" + cff_data = self._handle_citation_file(found_files) + codemeta_data = self._handle_codemeta_file(found_files) + if cff_data: + self.store_harvested_data(ctx, cff_data, {"source_type": "CFF"}, "cff") + if codemeta_data: + self.store_harvested_data(ctx, codemeta_data, {"source_type": "CodeMeta"}, "codemeta") def _fetch_file_from_url(self, file_url: str) -> str: """Fetch the content of a file from its URL.""" @@ -148,38 +158,44 @@ def _fetch_file_from_url(self, file_url: str) -> str: except requests.RequestException as e: logger.error(f"Failed to fetch file from {file_url}: {e}") raise FileNotFoundError(f"Unable to fetch file from {file_url}") - - def _load_cff_from_file(self, cff_data: str) -> dict: - """Load and parse CFF data from a file.""" - yaml = YAML(typ='safe') - yaml.constructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = yaml.constructor.yaml_constructors[ - u'tag:yaml.org,2002:str'] - return yaml.load(cff_data) - - def _convert_cff_to_codemeta(self, cff_data: str) -> dict: - """Convert metadata from CFF to CodeMeta format.""" - codemeta_str = Citation(cff_data).as_codemeta() - return json.loads(codemeta_str) - + def _patch_author_emails(self, cff: dict, codemeta: dict) -> dict: - """Patch author emails from CFF into CodeMeta.""" cff_authors = cff["authors"] for i, author in enumerate(cff_authors): if "email" in author: codemeta["author"][i]["email"] = author["email"] return codemeta - def _handle_citation_file(self, found_files: dict) -> Optional[dict]: + def _handle_citation_file(self, found_files: Dict[str, str]) -> Optional[Dict]: """Handle the CITATION.cff file if found.""" if CITATION_FILE in found_files: - cff_content = self._fetch_file_from_url(found_files[CITATION_FILE]) - cff_dict = self._load_cff_from_file(cff_content) - return self._convert_cff_to_codemeta(cff_content) + cff_content_str = self._fetch_file_from_url(found_files[CITATION_FILE]) + cff_content = yaml.safe_load(cff_content_str) + cff_codemeta_dict = self._convert_cff_to_codemeta(cff_content_str) + cff_codemeta_dict = self._patch_author_emails(cff_content, cff_codemeta_dict) + return cff_codemeta_dict return None - def _handle_codemeta_file(self, found_files: dict) -> Optional[dict]: + def _handle_codemeta_file(self, found_files: Dict[str, str]) -> Optional[Dict]: """Handle the codemeta.json file if found.""" if CODEMETA_FILE in found_files: codemeta_content = self._fetch_file_from_url(found_files[CODEMETA_FILE]) return json.loads(codemeta_content) - return None \ No newline at end of file + return None + + def _convert_cff_to_codemeta(self, cff_data: str) -> Dict: + """Convert metadata from CFF to CodeMeta format.""" + codemeta_str = Citation(cff_data).as_codemeta() + return json.loads(codemeta_str) + + def store_harvested_data(self, ctx: HermesContext, harvested_data: Dict, tags: Dict, source_name: str) -> None: + """Store harvested data into Hermes context.""" + with HermesHarvestContext(ctx, source_name) as harvest_ctx: + harvest_ctx.update_from(harvested_data, plugin=source_name, timestamp=datetime.now().isoformat(), **tags) + self._check_for_merge_conflicts(harvest_ctx) + + def _check_for_merge_conflicts(self, harvest_ctx: HermesHarvestContext) -> None: + """Check for merge conflicts after updating harvest context.""" + for key, ((value, tag), *trace) in harvest_ctx._data.items(): + if any(v != value and t == tag for v, t in trace): + raise MergeError(key, None, value) From 16cba5d80bed0ead3369026a903eeed920dba6da Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Thu, 30 Jan 2025 16:15:18 +0100 Subject: [PATCH 04/19] Issue #276 - Harvest metadata from CFF via path --- src/hermes/commands/harvest/cff.py | 36 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py index 4cc879b0..1d14c10b 100644 --- a/src/hermes/commands/harvest/cff.py +++ b/src/hermes/commands/harvest/cff.py @@ -19,6 +19,7 @@ from hermes.model.context import ContextPath from hermes.model.errors import HermesValidationError from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand +from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo # TODO: should this be configurable via a CLI option? @@ -106,18 +107,23 @@ def _validate(self, cff_file: pathlib.Path, cff_dict: t.Dict) -> bool: return True def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: - # Find CFF files in directories and subdirectories - cff_file = path / 'CITATION.cff' - if cff_file.exists(): - return cff_file - - # TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir, - # which is given via the --path arg. Maybe add another option to enable pointing to a single file? - # (So this stays "convention over configuration") - files = list(path.rglob('**/CITATION.cff')) - if len(files) == 1: - return pathlib.Path(files[0]) - # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup? - # TODO: Do we want to hand down a logging instance via Hermes context or just encourage - # peeps to use the Click context? - return None + if str(path).startswith("http:") or str(path).startswith("https:"): + # Find CFF files from the provided URL repository + normalized_url = normalize_url(str(path)) + return fetch_metadata_from_repo(normalized_url, "CITATION.cff") + else: + # Find CFF files in directories and subdirectories + cff_file = path / 'CITATION.cff' + if cff_file.exists(): + return cff_file + + # TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir, + # which is given via the --path arg. Maybe add another option to enable pointing to a single file? + # (So this stays "convention over configuration") + files = list(path.rglob('**/CITATION.cff')) + if len(files) == 1: + return pathlib.Path(files[0]) + # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup? + # TODO: Do we want to hand down a logging instance via Hermes context or just encourage + # peeps to use the Click context? + return None From afb818932652654ebc5407e1b9b4416f4f8a9bbf Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Thu, 30 Jan 2025 16:42:17 +0100 Subject: [PATCH 05/19] Issue #276 - Harvest metadata from CodeMeta via path --- src/hermes/commands/harvest/codemeta.py | 27 +++++++++++++++---------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py index b75bb002..dd0143ee 100644 --- a/src/hermes/commands/harvest/codemeta.py +++ b/src/hermes/commands/harvest/codemeta.py @@ -13,7 +13,7 @@ from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin from hermes.commands.harvest.util.validate_codemeta import validate_codemeta from hermes.model.errors import HermesValidationError - +from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo class CodeMetaHarvestPlugin(HermesHarvestPlugin): def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: @@ -56,13 +56,18 @@ def _validate(self, codemeta_file: pathlib.Path) -> bool: return True def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: - # Find CodeMeta files in directories and subdirectories - # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file? - # (So this stays "convention over configuration") - files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True) - if len(files) == 1: - return pathlib.Path(files[0]) - # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup? - # TODO: Do we want to hand down a logging instance via Hermes context or just encourage - # peeps to use the Click context? - return None + if str(path).startswith("http:") or str(path).startswith("https:"): + # Find CodeMeta files from the provided URL repository + normalized_url = normalize_url(str(path)) + return fetch_metadata_from_repo(normalized_url, "codemeta.json") + else: + # Find CodeMeta files in directories and subdirectories + # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file? + # (So this stays "convention over configuration") + files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True) + if len(files) == 1: + return pathlib.Path(files[0]) + # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup? + # TODO: Do we want to hand down a logging instance via Hermes context or just encourage + # peeps to use the Click context? + return None From 09401ed98bc6b97cb04ac5eaecc688366f47a9f4 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Thu, 30 Jan 2025 16:55:03 +0100 Subject: [PATCH 06/19] Issue #276 - Refactor functions for harvesting CFF/CodeMeta via path --- .../harvest/util/remote_harvesting.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 src/hermes/commands/harvest/util/remote_harvesting.py diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py new file mode 100644 index 00000000..fcc3ed32 --- /dev/null +++ b/src/hermes/commands/harvest/util/remote_harvesting.py @@ -0,0 +1,75 @@ +import pathlib +import re +import requests +import tempfile +import typing as t + + +def normalize_url(path: str) -> str: + """Normalize a given URL by correcting backslashes and fixing malformed HTTPS.""" + corrected_url = path.replace("\\", "/") + return corrected_url.replace("https:/", "https://") + + +def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]: + """ + Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository. + + :param repo_url: The repository URL. + :param filename: The name of the metadata file to fetch. + :return: Path to the temporary file containing the downloaded metadata, or None. + """ + try: + if "github.com" in repo_url: + # GitHub API + api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents" + response = requests.get(api_url) + if response.status_code == 200: + for file_info in response.json(): + if file_info["name"] == filename: + return _download_to_tempfile(file_info["download_url"], filename) + elif "gitlab.com" in repo_url: + # GitLab API + match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url) + if match: + base_domain = match.group(1) + group_or_user = match.group(2) + project_name = match.group(3).split('/')[0] + project_path = f"{group_or_user}/{project_name}" + api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree" + + response = requests.get(api_url) + if response.status_code == 200: + for file_info in response.json(): + if file_info["name"] == filename: + file_url = ( + f"https://{base_domain}/api/v4/projects/" + f"{requests.utils.quote(project_path, safe='')}/repository/files/" + f"{requests.utils.quote(filename, safe='')}/raw" + ) + return _download_to_tempfile(file_url, filename) + else: + print(f"Unsupported repository URL: {repo_url}") + return None + except Exception as e: + print(f"Error fetching metadata from repository: {e}") + return None + + +def _download_to_tempfile(url: str, filename: str) -> pathlib.Path: + """ + Download a file from a URL and save it to a temporary file. + + :param url: The URL to download from. + :param filename: The name of the file to save. + :return: Path to the temporary file. + """ + try: + content = requests.get(url).text + with tempfile.NamedTemporaryFile(delete=False, suffix=f".{filename.split('.')[-1]}") as temp_file: + temp_file.write(content.encode("utf-8")) + print(f"Downloaded {filename} to {temp_file.name}") + return pathlib.Path(temp_file.name) + except Exception as e: + print(f"Error downloading {filename}: {e}") + return None From f193cc96590c426a47dacef63b6389582bbb3a5e Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Thu, 30 Jan 2025 17:13:42 +0100 Subject: [PATCH 07/19] Issue #276 - Revert to original base.py --- src/hermes/commands/harvest/base.py | 179 ++++------------------------ 1 file changed, 22 insertions(+), 157 deletions(-) diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py index f0f23546..460345ea 100644 --- a/src/hermes/commands/harvest/base.py +++ b/src/hermes/commands/harvest/base.py @@ -5,27 +5,15 @@ # SPDX-FileContributor: Michael Meinel import argparse -import json -import logging -from datetime import datetime import typing as t -from typing import Optional, Dict, Tuple, List -import yaml -from urllib.parse import quote +from datetime import datetime -import requests from pydantic import BaseModel -from ruamel.yaml import YAML -from cffconvert import Citation from hermes.commands.base import HermesCommand, HermesPlugin from hermes.model.context import HermesContext, HermesHarvestContext from hermes.model.errors import HermesValidationError, MergeError -CITATION_FILE = "CITATION.cff" -CODEMETA_FILE = "codemeta.json" - -logger = logging.getLogger(__name__) class HermesHarvestPlugin(HermesPlugin): """Base plugin that does harvesting. @@ -37,7 +25,7 @@ def __call__(self, command: HermesCommand) -> t.Tuple[t.Dict, t.Dict]: pass -class HarvestSettings(BaseModel): +class _HarvestSettings(BaseModel): """Generic harvesting settings.""" sources: list[str] = [] @@ -47,155 +35,32 @@ class HermesHarvestCommand(HermesCommand): """ Harvest metadata from configured sources. """ command_name = "harvest" - settings_class = HarvestSettings - - def __call__(self, args) -> None: + settings_class = _HarvestSettings + + def __call__(self, args: argparse.Namespace) -> None: self.args = args ctx = HermesContext() + + # Initialize the harvest cache directory here to indicate the step ran ctx.init_cache("harvest") - - if args.url: - self._process_url(args.url, ctx) - else: - self._harvest_locally(ctx) - - def _process_url(self, url: str, ctx: HermesContext) -> Optional[Tuple[Dict, Dict]]: - """Process the provided URL for metadata harvesting.""" - try: - files_to_search = [CITATION_FILE, CODEMETA_FILE] - if "github.com" in url: - found_files = self._search_github_repo_for_metadata(url, files_to_search) - elif "gitlab.com" in url: - found_files = self._search_gitlab_repo_for_metadata(url, files_to_search) - else: - raise ValueError("Unsupported repository provider. Only GitHub and GitLab are supported.") - if not found_files: - raise FileNotFoundError(f"Neither {CITATION_FILE} nor {CODEMETA_FILE} found in repository.") - # Process and store metadata from files - self._process_found_files(found_files, ctx) - return None, None - except (FileNotFoundError, ValueError) as e: - logger.error(f"Error processing URL: {e}") - return None, None - - def _search_github_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]: - """Search for metadata files in a GitHub repository.""" - repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents" - try: - response = requests.get(repo_api_url) - response.raise_for_status() - repo_files = response.json() - return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search} - except requests.HTTPError as e: - logger.error(f"HTTP Error accessing GitHub repository: {repo_url}, {e}") - raise FileNotFoundError(f"GitHub repository {repo_url} not found or is private.") - except requests.RequestException as e: - logger.error(f"Failed to list GitHub repository contents: {e}") - raise - - def _search_gitlab_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]: - """Search for metadata files in a GitLab repository.""" - try: - project_path = repo_url.rstrip('/').split('gitlab.com/')[1] - encoded_project = quote(project_path, safe='') - found_files = {} - for file_name in files_to_search: - file_api_url = f"https://gitlab.com/api/v4/projects/{encoded_project}/repository/files/{quote(file_name)}/raw?ref=main" - - response = requests.get(file_api_url) - if response.status_code == 200: - found_files[file_name] = file_api_url - elif response.status_code != 404: - logger.error(f"Error accessing GitLab repository: {repo_url}, {response.status_code}") - raise FileNotFoundError(f"GitLab repository {repo_url} not found or is private.") - return found_files - except requests.RequestException as e: - logger.error(f"Failed to list GitLab repository contents: {e}") - raise - - def _harvest_locally(self, ctx: HermesContext) -> None: - """Harvest metadata from configured sources using plugins.""" + for plugin_name in self.settings.sources: try: plugin_func = self.plugins[plugin_name]() harvested_data, tags = plugin_func(self) - self.store_harvested_data(ctx, harvested_data, tags, plugin_name) + + with HermesHarvestContext(ctx, plugin_name) as harvest_ctx: + harvest_ctx.update_from(harvested_data, + plugin=plugin_name, + timestamp=datetime.now().isoformat(), **tags) + for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items(): + if any(v != _value and t == _tag for v, t in _trace): + raise MergeError(_key, None, _value) + except KeyError as e: - logger.error(f"Plugin '{plugin_name}' not found. Error: {e}") + self.log.error("Plugin '%s' not found.", plugin_name) + self.errors.append(e) + except HermesValidationError as e: - logger.error(f"Error while executing '{plugin_name}': {e}") - - def _search_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]: - """Search for metadata files in the given GitHub repository and return their URLs.""" - repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents" - try: - response = requests.get(repo_api_url) - response.raise_for_status() - repo_files = response.json() - return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search} - except requests.HTTPError as e: - logger.error(f"HTTP Error accessing repository: {repo_url}, {e}") - raise FileNotFoundError(f"Repository {repo_url} not found or is private.") - except requests.RequestException as e: - logger.error(f"Failed to list repository contents: {e}") - raise - - def _process_found_files(self, found_files: Dict[str, str], ctx: HermesContext) -> None: - """Process and store metadata from CFF and CodeMeta files.""" - cff_data = self._handle_citation_file(found_files) - codemeta_data = self._handle_codemeta_file(found_files) - if cff_data: - self.store_harvested_data(ctx, cff_data, {"source_type": "CFF"}, "cff") - if codemeta_data: - self.store_harvested_data(ctx, codemeta_data, {"source_type": "CodeMeta"}, "codemeta") - - def _fetch_file_from_url(self, file_url: str) -> str: - """Fetch the content of a file from its URL.""" - try: - response = requests.get(file_url) - response.raise_for_status() - return response.text - except requests.RequestException as e: - logger.error(f"Failed to fetch file from {file_url}: {e}") - raise FileNotFoundError(f"Unable to fetch file from {file_url}") - - def _patch_author_emails(self, cff: dict, codemeta: dict) -> dict: - cff_authors = cff["authors"] - for i, author in enumerate(cff_authors): - if "email" in author: - codemeta["author"][i]["email"] = author["email"] - return codemeta - - def _handle_citation_file(self, found_files: Dict[str, str]) -> Optional[Dict]: - """Handle the CITATION.cff file if found.""" - if CITATION_FILE in found_files: - cff_content_str = self._fetch_file_from_url(found_files[CITATION_FILE]) - cff_content = yaml.safe_load(cff_content_str) - cff_codemeta_dict = self._convert_cff_to_codemeta(cff_content_str) - cff_codemeta_dict = self._patch_author_emails(cff_content, cff_codemeta_dict) - return cff_codemeta_dict - return None - - def _handle_codemeta_file(self, found_files: Dict[str, str]) -> Optional[Dict]: - """Handle the codemeta.json file if found.""" - if CODEMETA_FILE in found_files: - codemeta_content = self._fetch_file_from_url(found_files[CODEMETA_FILE]) - return json.loads(codemeta_content) - return None - - def _convert_cff_to_codemeta(self, cff_data: str) -> Dict: - """Convert metadata from CFF to CodeMeta format.""" - codemeta_str = Citation(cff_data).as_codemeta() - return json.loads(codemeta_str) - - def store_harvested_data(self, ctx: HermesContext, harvested_data: Dict, tags: Dict, source_name: str) -> None: - """Store harvested data into Hermes context.""" - with HermesHarvestContext(ctx, source_name) as harvest_ctx: - harvest_ctx.update_from(harvested_data, plugin=source_name, timestamp=datetime.now().isoformat(), **tags) - self._check_for_merge_conflicts(harvest_ctx) - - def _check_for_merge_conflicts(self, harvest_ctx: HermesHarvestContext) -> None: - """Check for merge conflicts after updating harvest context.""" - for key, ((value, tag), *trace) in harvest_ctx._data.items(): - if any(v != value and t == tag for v, t in trace): - raise MergeError(key, None, value) + self.log.error("Error while executing %s: %s", plugin_name, e) + self.errors.append(e) From 1bab2c762039417dd11063e4330b3a757bb6ac0f Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Thu, 30 Jan 2025 17:41:46 +0100 Subject: [PATCH 08/19] Issue #276 - Update base.py --- src/hermes/commands/base.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py index a4fb6410..499f5e04 100644 --- a/src/hermes/commands/base.py +++ b/src/hermes/commands/base.py @@ -104,13 +104,6 @@ def init_common_parser(self, parser: argparse.ArgumentParser) -> None: help="Configuration file in TOML format", ) - # Add a new argument to accept a URL for harvesting - parser.add_argument( - "--url", - type=str, - help="URL from which to extract metadata" - ) - plugin_args = parser.add_argument_group("Extra options") plugin_args.add_argument( "-O", @@ -138,7 +131,7 @@ def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None: def load_settings(self, args: argparse.Namespace): """Load settings from the configuration file (passed in from command line).""" - toml_data = toml.load(args.path / args.config) + toml_data = toml.load("." / args.config) self.root_settings = HermesCommand.settings_class.model_validate(toml_data) self.settings = getattr(self.root_settings, self.command_name) From 25eec31146d7851532f5cfcd10d11eea3cc9ee7f Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Fri, 7 Feb 2025 14:24:33 +0100 Subject: [PATCH 09/19] Add functionality to remove temp files Add functionality to remove temp files generated during remote harvesting. --- .../commands/harvest/util/remote_harvesting.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py index fcc3ed32..2b504cde 100644 --- a/src/hermes/commands/harvest/util/remote_harvesting.py +++ b/src/hermes/commands/harvest/util/remote_harvesting.py @@ -3,7 +3,7 @@ import requests import tempfile import typing as t - +import os def normalize_url(path: str) -> str: """Normalize a given URL by correcting backslashes and fixing malformed HTTPS.""" @@ -73,3 +73,14 @@ def _download_to_tempfile(url: str, filename: str) -> pathlib.Path: except Exception as e: print(f"Error downloading {filename}: {e}") return None + + +def remove_temp_file(file_path: pathlib.Path, temp_dir: pathlib.Path = pathlib.Path("C:/Temp")): + """ + Removes a temporary file if it is inside the temp directory. + + :param file_path: The file path to check and remove. + :param temp_dir: The directory considered as temporary (default: "C:/Temp"). + """ + if str(file_path).startswith(str(temp_dir)): + os.remove(file_path) From 98814f4913726b2b442564f5411d23669ac8b090 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Fri, 7 Feb 2025 14:46:15 +0100 Subject: [PATCH 10/19] Remove temp files Remove temp files after harvesting CFF metadata --- src/hermes/commands/harvest/cff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py index 1d14c10b..9a39f5e4 100644 --- a/src/hermes/commands/harvest/cff.py +++ b/src/hermes/commands/harvest/cff.py @@ -19,7 +19,7 @@ from hermes.model.context import ContextPath from hermes.model.errors import HermesValidationError from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand -from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo +from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file # TODO: should this be configurable via a CLI option? @@ -46,6 +46,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: # Read the content cff_data = cff_file.read_text() + remove_temp_file(cff_file) # Validate the content to be correct CFF cff_dict = self._load_cff_from_file(cff_data) if command.settings.cff.enable_validation and not self._validate(cff_file, cff_dict): From dd56827aadf446aa6e86ef2760cc992d6af06c0c Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Fri, 7 Feb 2025 14:51:41 +0100 Subject: [PATCH 11/19] Remove temp files Remove temp files after harvesting CodeMeta metadata --- src/hermes/commands/harvest/codemeta.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py index dd0143ee..7e6f2113 100644 --- a/src/hermes/commands/harvest/codemeta.py +++ b/src/hermes/commands/harvest/codemeta.py @@ -13,7 +13,7 @@ from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin from hermes.commands.harvest.util.validate_codemeta import validate_codemeta from hermes.model.errors import HermesValidationError -from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo +from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file class CodeMetaHarvestPlugin(HermesHarvestPlugin): def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: @@ -38,6 +38,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: if not self._validate(codemeta_file): raise HermesValidationError(codemeta_file) + remove_temp_file(codemeta_file) codemeta = json.loads(codemeta_str) return codemeta, {'local_path': str(codemeta_file)} From 5f75ad1c4ceef912c864d6344c6dd21777713f53 Mon Sep 17 00:00:00 2001 From: Aidajafarbigloo Date: Thu, 13 Feb 2025 16:05:50 +0100 Subject: [PATCH 12/19] Issue #276 - Add SPDX headers --- src/hermes/commands/harvest/util/remote_harvesting.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py index 2b504cde..f57c766f 100644 --- a/src/hermes/commands/harvest/util/remote_harvesting.py +++ b/src/hermes/commands/harvest/util/remote_harvesting.py @@ -1,3 +1,10 @@ +# SPDX-FileCopyrightText: 2025 OFFIS e.V. +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Stephan Ferenz +# SPDX-FileContributor: Aida Jafarbigloo + import pathlib import re import requests From 4d901fc7e385ba0f0d06bbed04ec51c19ca9be2f Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Mon, 14 Apr 2025 14:34:36 +0200 Subject: [PATCH 13/19] Update base.py To support repository URL as a path --- src/hermes/commands/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py index 3ae9030b..5a242a76 100644 --- a/src/hermes/commands/base.py +++ b/src/hermes/commands/base.py @@ -132,7 +132,7 @@ def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None: def load_settings(self, args: argparse.Namespace): """Load settings from the configuration file (passed in from command line).""" try: - toml_data = toml.load(args.path / args.config) + toml_data = toml.load("." / args.config) self.root_settings = HermesCommand.settings_class.model_validate(toml_data) self.settings = getattr(self.root_settings, self.command_name) except FileNotFoundError as e: From 3aa06a0f04576a4e5326b53c2694a31ddba8c7b4 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Wed, 14 May 2025 08:59:19 +0000 Subject: [PATCH 14/19] Fix issues: HERMES user agent and temporary files --- src/hermes/commands/harvest/cff.py | 20 ++++--- src/hermes/commands/harvest/codemeta.py | 15 +++-- .../harvest/util/remote_harvesting.py | 56 ++++++++++--------- 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py index e07dbb14..48e9a8a9 100644 --- a/src/hermes/commands/harvest/cff.py +++ b/src/hermes/commands/harvest/cff.py @@ -19,7 +19,7 @@ from hermes.model.context import ContextPath from hermes.model.errors import HermesValidationError from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand -from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file +from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo # TODO: should this be configurable via a CLI option? @@ -38,7 +38,9 @@ class CffHarvestPlugin(HermesHarvestPlugin): def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: # Get source files - cff_file = self._get_single_cff(command.args.path) + + cff_file, temp_dir_obj = self._get_single_cff(command.args.path) + if not cff_file: raise HermesValidationError(f'{command.args.path} contains either no or more than 1 CITATION.cff file. ' 'Aborting harvesting for this metadata source.') @@ -46,7 +48,10 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: # Read the content cff_data = cff_file.read_text() - remove_temp_file(cff_file) + # clean up the temp + if temp_dir_obj: + temp_dir_obj.cleanup() + # Validate the content to be correct CFF cff_dict = self._load_cff_from_file(cff_data) @@ -114,20 +119,21 @@ def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: if str(path).startswith("http:") or str(path).startswith("https:"): # Find CFF files from the provided URL repository normalized_url = normalize_url(str(path)) - return fetch_metadata_from_repo(normalized_url, "CITATION.cff") + file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff") + return file_info else: # Find CFF files in directories and subdirectories cff_file = path / 'CITATION.cff' if cff_file.exists(): - return cff_file + return cff_file, None # TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir, # which is given via the --path arg. Maybe add another option to enable pointing to a single file? # (So this stays "convention over configuration") files = list(path.rglob('**/CITATION.cff')) if len(files) == 1: - return pathlib.Path(files[0]) + return pathlib.Path(files[0]), None # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup? # TODO: Do we want to hand down a logging instance via Hermes context or just encourage # peeps to use the Click context? - return None + return None, None \ No newline at end of file diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py index 7e6f2113..bb37b097 100644 --- a/src/hermes/commands/harvest/codemeta.py +++ b/src/hermes/commands/harvest/codemeta.py @@ -13,7 +13,7 @@ from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin from hermes.commands.harvest.util.validate_codemeta import validate_codemeta from hermes.model.errors import HermesValidationError -from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file +from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo class CodeMetaHarvestPlugin(HermesHarvestPlugin): def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: @@ -25,7 +25,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: :param ctx: The harvesting context that should contain the provided metadata. """ # Get source files - codemeta_file = self._get_single_codemeta(command.args.path) + codemeta_file, temp_dir_obj = self._get_single_codemeta(command.args.path) if not codemeta_file: raise HermesValidationError( f"{command.args.path} contains either no or more than 1 codemeta.json file. Aborting harvesting " @@ -38,7 +38,9 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: if not self._validate(codemeta_file): raise HermesValidationError(codemeta_file) - remove_temp_file(codemeta_file) + if temp_dir_obj: + temp_dir_obj.cleanup() + codemeta = json.loads(codemeta_str) return codemeta, {'local_path': str(codemeta_file)} @@ -60,15 +62,16 @@ def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: if str(path).startswith("http:") or str(path).startswith("https:"): # Find CodeMeta files from the provided URL repository normalized_url = normalize_url(str(path)) - return fetch_metadata_from_repo(normalized_url, "codemeta.json") + file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json") + return file_info else: # Find CodeMeta files in directories and subdirectories # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file? # (So this stays "convention over configuration") files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True) if len(files) == 1: - return pathlib.Path(files[0]) + return pathlib.Path(files[0]), None # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup? # TODO: Do we want to hand down a logging instance via Hermes context or just encourage # peeps to use the Click context? - return None + return None, None \ No newline at end of file diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py index f57c766f..60c1d928 100644 --- a/src/hermes/commands/harvest/util/remote_harvesting.py +++ b/src/hermes/commands/harvest/util/remote_harvesting.py @@ -12,29 +12,38 @@ import typing as t import os +from hermes.utils import hermes_user_agent + +session = requests.Session() +session.headers.update({"User-Agent": hermes_user_agent}) + def normalize_url(path: str) -> str: """Normalize a given URL by correcting backslashes and fixing malformed HTTPS.""" corrected_url = path.replace("\\", "/") return corrected_url.replace("https:/", "https://") -def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]: +def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[t.Tuple[pathlib.Path, tempfile.TemporaryDirectory]]: """ Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository. :param repo_url: The repository URL. :param filename: The name of the metadata file to fetch. - :return: Path to the temporary file containing the downloaded metadata, or None. + :return: Tuple of (Path to the temporary file, TemporaryDirectory object) or None. """ try: + temp_dir_obj = tempfile.TemporaryDirectory() + temp_dir = pathlib.Path(temp_dir_obj.name) + if "github.com" in repo_url: # GitHub API api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents" - response = requests.get(api_url) + response = session.get(api_url) if response.status_code == 200: for file_info in response.json(): if file_info["name"] == filename: - return _download_to_tempfile(file_info["download_url"], filename) + temp_file = _download_to_tempfile(file_info["download_url"], filename, temp_dir) + return temp_file, temp_dir_obj elif "gitlab.com" in repo_url: # GitLab API match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url) @@ -45,7 +54,7 @@ def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib project_path = f"{group_or_user}/{project_name}" api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree" - response = requests.get(api_url) + response = session.get(api_url) if response.status_code == 200: for file_info in response.json(): if file_info["name"] == filename: @@ -54,40 +63,37 @@ def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib f"{requests.utils.quote(project_path, safe='')}/repository/files/" f"{requests.utils.quote(filename, safe='')}/raw" ) - return _download_to_tempfile(file_url, filename) + temp_file = _download_to_tempfile(file_url, filename, temp_dir) + return temp_file, temp_dir_obj else: print(f"Unsupported repository URL: {repo_url}") + temp_dir_obj.cleanup() return None except Exception as e: print(f"Error fetching metadata from repository: {e}") return None -def _download_to_tempfile(url: str, filename: str) -> pathlib.Path: +def _download_to_tempfile(url: str, filename: str, temp_dir: tempfile.TemporaryDirectory) -> pathlib.Path: """ - Download a file from a URL and save it to a temporary file. + Download a file from a URL and save it to a temporary directory. :param url: The URL to download from. :param filename: The name of the file to save. + :param temp_dir: TemporaryDirectory where the file will be saved. :return: Path to the temporary file. """ try: - content = requests.get(url).text - with tempfile.NamedTemporaryFile(delete=False, suffix=f".{filename.split('.')[-1]}") as temp_file: - temp_file.write(content.encode("utf-8")) - print(f"Downloaded {filename} to {temp_file.name}") - return pathlib.Path(temp_file.name) + response = session.get(url) + if response.status_code == 200: + content = requests.get(url).text + file_path = temp_dir / filename + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + return pathlib.Path(file_path) + else: + print(f"Failed to download {filename}: {response.status_code}") + return None except Exception as e: print(f"Error downloading {filename}: {e}") - return None - - -def remove_temp_file(file_path: pathlib.Path, temp_dir: pathlib.Path = pathlib.Path("C:/Temp")): - """ - Removes a temporary file if it is inside the temp directory. - - :param file_path: The file path to check and remove. - :param temp_dir: The directory considered as temporary (default: "C:/Temp"). - """ - if str(file_path).startswith(str(temp_dir)): - os.remove(file_path) + return None \ No newline at end of file From 1bd4d1f8e17e6e5a5e39c385abfb400323d824cc Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Wed, 14 May 2025 13:44:24 +0000 Subject: [PATCH 15/19] Fix hermes clean command --- src/hermes/commands/clean/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hermes/commands/clean/base.py b/src/hermes/commands/clean/base.py index b588faf5..4e5e4ea2 100644 --- a/src/hermes/commands/clean/base.py +++ b/src/hermes/commands/clean/base.py @@ -6,6 +6,7 @@ import argparse import shutil +import logging from pydantic import BaseModel @@ -27,6 +28,7 @@ def __call__(self, args: argparse.Namespace) -> None: self.log.info("Removing HERMES caches...") # Naive implementation for now... check errors, validate directory, don't construct the path ourselves, etc. + logging.shutdown() shutil.rmtree(args.path / '.hermes') def load_settings(self, args: argparse.Namespace): From 3918954c463cf50916e824ac63d4e4e15e3c5277 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Sat, 17 May 2025 08:00:16 +0000 Subject: [PATCH 16/19] Small fix --- src/hermes/commands/harvest/cff.py | 5 ++++- src/hermes/commands/harvest/codemeta.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py index 48e9a8a9..4e6b0042 100644 --- a/src/hermes/commands/harvest/cff.py +++ b/src/hermes/commands/harvest/cff.py @@ -120,7 +120,10 @@ def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: # Find CFF files from the provided URL repository normalized_url = normalize_url(str(path)) file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff") - return file_info + if not file_info: + return {} + else: + return file_info else: # Find CFF files in directories and subdirectories cff_file = path / 'CITATION.cff' diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py index bb37b097..661cc4c0 100644 --- a/src/hermes/commands/harvest/codemeta.py +++ b/src/hermes/commands/harvest/codemeta.py @@ -63,7 +63,10 @@ def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: # Find CodeMeta files from the provided URL repository normalized_url = normalize_url(str(path)) file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json") - return file_info + if not file_info: + return None, None + else: + return file_info else: # Find CodeMeta files in directories and subdirectories # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file? From f170481869cfff8496fa4612da5d546911d020de Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Fri, 6 Jun 2025 09:19:22 +0000 Subject: [PATCH 17/19] Load token from toml file --- src/hermes/commands/harvest/util/token.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 src/hermes/commands/harvest/util/token.py diff --git a/src/hermes/commands/harvest/util/token.py b/src/hermes/commands/harvest/util/token.py new file mode 100644 index 00000000..a3539d4f --- /dev/null +++ b/src/hermes/commands/harvest/util/token.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2025 OFFIS e.V. +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Stephan Ferenz +# SPDX-FileContributor: Aida Jafarbigloo + +import toml +import base64 + + +def load_token_from_toml(config_path: str = "hermes.toml") -> str: + """ + Loads and decodes the token from the HERMES TOML configuration file. + + Args: + config_path (str): Path to the TOML config file. + + Returns: + str: The decoded token. + """ + with open(config_path, "r") as f: + config = toml.load(f) + + encoded_token = config.get('harvest', {}).get('token') + if encoded_token: + return base64.b64decode(encoded_token.encode()).decode() + else: + return None From a89425929db1ac2fe674a6012df6b31fd6206f49 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Fri, 6 Jun 2025 09:22:35 +0000 Subject: [PATCH 18/19] Use token --- src/hermes/commands/harvest/cff.py | 9 +- src/hermes/commands/harvest/codemeta.py | 8 +- .../harvest/util/remote_harvesting.py | 144 ++++++++++++------ 3 files changed, 114 insertions(+), 47 deletions(-) diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py index 4e6b0042..8dd36ef2 100644 --- a/src/hermes/commands/harvest/cff.py +++ b/src/hermes/commands/harvest/cff.py @@ -20,6 +20,7 @@ from hermes.model.errors import HermesValidationError from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo +from hermes.commands.harvest.util.token import load_token_from_toml # TODO: should this be configurable via a CLI option? @@ -31,12 +32,16 @@ class CffHarvestSettings(BaseModel): """Custom settings for CFF harvester.""" enable_validation: bool = True + token: str = '' class CffHarvestPlugin(HermesHarvestPlugin): settings_class = CffHarvestSettings def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: + + self.token = load_token_from_toml('hermes.toml') + # Get source files cff_file, temp_dir_obj = self._get_single_cff(command.args.path) @@ -46,7 +51,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: 'Aborting harvesting for this metadata source.') # Read the content - cff_data = cff_file.read_text() + cff_data = cff_file.read_text(encoding='utf-8') # clean up the temp if temp_dir_obj: @@ -119,7 +124,7 @@ def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: if str(path).startswith("http:") or str(path).startswith("https:"): # Find CFF files from the provided URL repository normalized_url = normalize_url(str(path)) - file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff") + file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff", token=self.token) if not file_info: return {} else: diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py index 661cc4c0..56db9b55 100644 --- a/src/hermes/commands/harvest/codemeta.py +++ b/src/hermes/commands/harvest/codemeta.py @@ -14,9 +14,13 @@ from hermes.commands.harvest.util.validate_codemeta import validate_codemeta from hermes.model.errors import HermesValidationError from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo +from hermes.commands.harvest.util.token import load_token_from_toml class CodeMetaHarvestPlugin(HermesHarvestPlugin): def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: + + self.token = load_token_from_toml('hermes.toml') + """ Implementation of a harvester that provides data from a codemeta.json file format. @@ -33,7 +37,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]: ) # Read the content - codemeta_str = codemeta_file.read_text() + codemeta_str = codemeta_file.read_text(encoding='utf-8') if not self._validate(codemeta_file): raise HermesValidationError(codemeta_file) @@ -62,7 +66,7 @@ def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]: if str(path).startswith("http:") or str(path).startswith("https:"): # Find CodeMeta files from the provided URL repository normalized_url = normalize_url(str(path)) - file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json") + file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json", token=self.token) if not file_info: return None, None else: diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py index 60c1d928..114088ed 100644 --- a/src/hermes/commands/harvest/util/remote_harvesting.py +++ b/src/hermes/commands/harvest/util/remote_harvesting.py @@ -6,16 +6,13 @@ # SPDX-FileContributor: Aida Jafarbigloo import pathlib -import re import requests import tempfile import typing as t -import os +from urllib.parse import urlparse, quote from hermes.utils import hermes_user_agent -session = requests.Session() -session.headers.update({"User-Agent": hermes_user_agent}) def normalize_url(path: str) -> str: """Normalize a given URL by correcting backslashes and fixing malformed HTTPS.""" @@ -23,77 +20,138 @@ def normalize_url(path: str) -> str: return corrected_url.replace("https:/", "https://") -def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[t.Tuple[pathlib.Path, tempfile.TemporaryDirectory]]: +def fetch_metadata_from_repo(repo_url: str, filename: str, token: str = None) -> t.Optional[t.Tuple[pathlib.Path, tempfile.TemporaryDirectory]]: """ Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository. :param repo_url: The repository URL. :param filename: The name of the metadata file to fetch. - :return: Tuple of (Path to the temporary file, TemporaryDirectory object) or None. + :param token: (Optional) Access token for authentication (GitHub token or GitLab private token). + :return: A tuple containing: + - Path to the downloaded metadata file. + - TemporaryDirectory object (caller is responsible for cleanup). + Returns None if the file could not be fetched. """ try: + session = requests.Session() + session.headers.update({"User-Agent": hermes_user_agent}) + if token: + if "github" in repo_url: + session.headers.update({"Authorization": f"token {token}"}) + elif "gitlab" in repo_url: + session.headers.update({"PRIVATE-TOKEN": token}) + temp_dir_obj = tempfile.TemporaryDirectory() temp_dir = pathlib.Path(temp_dir_obj.name) - + + parsed_url = urlparse(repo_url) + if "github.com" in repo_url: - # GitHub API + # GitHub API: List repository contents api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents" response = session.get(api_url) if response.status_code == 200: for file_info in response.json(): if file_info["name"] == filename: - temp_file = _download_to_tempfile(file_info["download_url"], filename, temp_dir) + temp_file = _download_to_tempfile(file_info["download_url"], filename, temp_dir, session) return temp_file, temp_dir_obj - elif "gitlab.com" in repo_url: - # GitLab API - match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url) - if match: - base_domain = match.group(1) - group_or_user = match.group(2) - project_name = match.group(3).split('/')[0] - project_path = f"{group_or_user}/{project_name}" - api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree" - - response = session.get(api_url) - if response.status_code == 200: - for file_info in response.json(): - if file_info["name"] == filename: - file_url = ( - f"https://{base_domain}/api/v4/projects/" - f"{requests.utils.quote(project_path, safe='')}/repository/files/" - f"{requests.utils.quote(filename, safe='')}/raw" - ) - temp_file = _download_to_tempfile(file_url, filename, temp_dir) - return temp_file, temp_dir_obj + elif "gitlab" in parsed_url.netloc: + # GitLab API + temp_file, temp_dir = _fetch_from_gitlab(parsed_url, filename, temp_dir, session) + if temp_file: + return temp_file, temp_dir_obj else: print(f"Unsupported repository URL: {repo_url}") temp_dir_obj.cleanup() return None + except Exception as e: print(f"Error fetching metadata from repository: {e}") return None -def _download_to_tempfile(url: str, filename: str, temp_dir: tempfile.TemporaryDirectory) -> pathlib.Path: +def _fetch_from_gitlab(parsed_url, filename, temp_dir, session): """ - Download a file from a URL and save it to a temporary directory. - - :param url: The URL to download from. - :param filename: The name of the file to save. - :param temp_dir: TemporaryDirectory where the file will be saved. - :return: Path to the temporary file. + Helper function to fetch a file from GitLab. """ + base_domain = parsed_url.netloc + project_path = parsed_url.path.lstrip('/') + encoded_project_path = quote(project_path, safe='') + + # Step 1: Detect default branch + project_api_url = f"https://{base_domain}/api/v4/projects/{encoded_project_path}" + project_resp = session.get(project_api_url) + if project_resp.status_code != 200: + print(f"Failed to fetch project info: {project_resp.status_code}") + return None, None + + project_info = project_resp.json() + default_branch = project_info.get('default_branch', 'main') # fallback to 'main' if not found + + # Step 2: Search for the file recursively + page = 1 + per_page = 100 + found_file = None + + while True: + api_url = ( + f"https://{base_domain}/api/v4/projects/{encoded_project_path}/repository/tree" + f"?recursive=true&per_page={per_page}&page={page}" + ) + response = session.get(api_url) + if response.status_code != 200: + print(f"Failed to fetch repo tree: {response.status_code}") + break + + files_list = response.json() + if not files_list: + break + + for file_info in files_list: + if file_info.get("type") == "blob" and file_info.get("name", "").lower() == filename.lower(): + found_file = file_info + break + + if found_file: + break + + page += 1 + + # Step 3: Download the file + if found_file: + file_path_in_repo = found_file["path"] + file_url = ( + f"https://{base_domain}/api/v4/projects/" + f"{encoded_project_path}/repository/files/" + f"{quote(file_path_in_repo, safe='')}/raw?ref={default_branch}" + ) + temp_file = _download_to_tempfile(file_url, filename, temp_dir, session) + if temp_file: + print(f"Downloaded file: {temp_file}") + return temp_file, temp_dir + + print(f"{filename} not found in repository.") + return None, None + + + +def _download_to_tempfile(url: str, filename: str, temp_dir: pathlib.Path, session: requests.Session) -> pathlib.Path: try: - response = session.get(url) + response = session.get(url) if response.status_code == 200: - content = requests.get(url).text - file_path = temp_dir / filename - with open(file_path, 'w', encoding='utf-8') as f: - f.write(content) + file_path = temp_dir / filename + + try: + text = response.content.decode('utf-8') + with open(file_path, 'w', encoding='utf-8') as f: + f.write(text) + except UnicodeDecodeError: + with open(file_path, 'wb') as f: + f.write(response.content) return pathlib.Path(file_path) else: print(f"Failed to download {filename}: {response.status_code}") return None except Exception as e: print(f"Error downloading {filename}: {e}") - return None \ No newline at end of file + return None From 14fc040afd4b7f7268b937477dce56bd7695fbf3 Mon Sep 17 00:00:00 2001 From: Aida Jafarbigloo Date: Fri, 6 Jun 2025 09:26:36 +0000 Subject: [PATCH 19/19] Small fix --- src/hermes/commands/harvest/cff.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py index 8dd36ef2..57fcf19f 100644 --- a/src/hermes/commands/harvest/cff.py +++ b/src/hermes/commands/harvest/cff.py @@ -32,7 +32,6 @@ class CffHarvestSettings(BaseModel): """Custom settings for CFF harvester.""" enable_validation: bool = True - token: str = '' class CffHarvestPlugin(HermesHarvestPlugin):