From 453198b40859a38decc63cc3c89d4c92745237a6 Mon Sep 17 00:00:00 2001
From: Aidajafarbigloo <143706305+Aidajafarbigloo@users.noreply.github.com>
Date: Thu, 10 Oct 2024 13:21:36 +0200
Subject: [PATCH 01/19] Issue #276 - Add a new argument to accept a URL for
 harvesting

---
 src/hermes/commands/base.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py
index 82692975..a4fb6410 100644
--- a/src/hermes/commands/base.py
+++ b/src/hermes/commands/base.py
@@ -104,6 +104,13 @@ def init_common_parser(self, parser: argparse.ArgumentParser) -> None:
             help="Configuration file in TOML format",
         )
 
+        # Add a new argument to accept a URL for harvesting
+        parser.add_argument(
+            "--url",
+            type=str,
+            help="URL from which to extract metadata"
+        )
+        
         plugin_args = parser.add_argument_group("Extra options")
         plugin_args.add_argument(
             "-O",

From 3a7c9ad90191e288bb3b256996452e58d6a844a3 Mon Sep 17 00:00:00 2001
From: Aidajafarbigloo <143706305+Aidajafarbigloo@users.noreply.github.com>
Date: Thu, 10 Oct 2024 13:50:08 +0200
Subject: [PATCH 02/19] Issue #276 - Harvest metadata from the provided URL

---
 src/hermes/commands/harvest/base.py | 127 +++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 4 deletions(-)

diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py
index 4d2a1731..937b5e62 100644
--- a/src/hermes/commands/harvest/base.py
+++ b/src/hermes/commands/harvest/base.py
@@ -5,15 +5,25 @@
 # SPDX-FileContributor: Michael Meinel
 
 import argparse
-import typing as t
+import json
+import logging
 from datetime import datetime
+import typing as t
+from typing import Optional, Dict, Tuple
 
+import requests
 from pydantic import BaseModel
+from ruamel.yaml import YAML
+from cffconvert import Citation
 
 from hermes.commands.base import HermesCommand, HermesPlugin
 from hermes.model.context import HermesContext, HermesHarvestContext
 from hermes.model.errors import HermesValidationError, MergeError
 
+CITATION_FILE = "CITATION.cff"
+CODEMETA_FILE = "codemeta.json"
+
+logger = logging.getLogger(__name__)
 
 class HermesHarvestPlugin(HermesPlugin):
     """Base plugin that does harvesting.
@@ -32,18 +42,60 @@ class HarvestSettings(BaseModel):
 
 
 class HermesHarvestCommand(HermesCommand):
-    """ Harvest metadata from configured sources. """
+    """Harvest metadata from the provided URL or configured sources."""
 
     command_name = "harvest"
     settings_class = HarvestSettings
 
+    def add_arguments(self, parser: argparse.ArgumentParser) -> None:
+        """Adds arguments for the harvest command to harvest metadata from the specific URL."""
+        parser.add_argument('url', nargs='?', default=None, help="Optional URL to harvest from")
+
     def __call__(self, args: argparse.Namespace) -> None:
+        """Execute the harvesting command based on the provided arguments."""
         self.args = args
         ctx = HermesContext()
-
-        # Initialize the harvest cache directory here to indicate the step ran
         ctx.init_cache("harvest")
 
+        if hasattr(args, 'url') and args.url:
+            result = self._process_url(args.url, ctx)
+            if result is None:
+                logger.error("Failed to process URL: %s", args.url)
+        else:
+            self._harvest_locally(ctx)
+
+    def _process_url(self, url: str, ctx: HermesContext) -> Optional[Dict[str, Dict]]:
+        """Process the provided URL for metadata harvesting."""
+        try:
+            files_to_search = [CITATION_FILE, CODEMETA_FILE]
+            found_files = self._search_repo_for_metadata(url, files_to_search)
+
+            if not found_files:
+                raise FileNotFoundError("Neither CITATION.cff nor codemeta.json found in the repository.")
+
+            cff_dict = self._handle_citation_file(found_files)
+            codemeta_dict = self._handle_codemeta_file(found_files)
+
+            logger.info("Harvesting successful from URL: %s", url)
+            print('**********************************************************')
+            print("Original CodeMeta from codemeta.json:")
+            print(json.dumps(codemeta_dict, indent=4))
+
+            print('**********************************************************')
+            print("CFF converted to CodeMeta:")
+            print(json.dumps(cff_dict, indent=4))
+
+            return {
+                "codemeta_from_cff": cff_dict,
+                "codemeta_json": codemeta_dict
+            }
+
+        except (FileNotFoundError, ValueError) as e:
+            logger.error(f"Error processing URL: {e}")
+            return None
+
+    def _harvest_locally(self, ctx: HermesContext) -> None:
+        """Harvest metadata from configured sources."""
         for plugin_name in self.settings.sources:
             try:
                 plugin_func = self.plugins[plugin_name]()
@@ -64,3 +116,70 @@ def __call__(self, args: argparse.Namespace) -> None:
             except HermesValidationError as e:
                 self.log.error("Error while executing %s: %s", plugin_name, e)
                 self.errors.append(e)
+
+    def _search_repo_for_metadata(self, repo_url: str, files_to_search: list) -> Dict[str, str]:
+        """Search for metadata files in the given GitHub repository and return their URLs."""
+        repo_api_url = repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/') + '/contents'
+
+        try:
+            response = requests.get(repo_api_url)
+            response.raise_for_status()
+
+            repo_files = response.json()
+            found_files = {file_entry["name"]: file_entry["download_url"] for file_entry in repo_files
+                           if file_entry["name"] in files_to_search}
+
+            return found_files
+
+        except requests.RequestException as e:
+            if e.response and e.response.status_code == 404:
+                logger.error(f"Repository not found: {repo_url}")
+                raise FileNotFoundError(f"Repository {repo_url} not found or is private.")
+            else:
+                logger.error(f"Failed to list repository contents: {e}")
+                raise
+
+    def _fetch_file_from_url(self, file_url: str) -> str:
+        """Fetch the content of a file from its URL."""
+        try:
+            response = requests.get(file_url)
+            response.raise_for_status()
+            return response.text
+        except requests.RequestException as e:
+            logger.error(f"Failed to fetch file from {file_url}: {e}")
+            raise FileNotFoundError(f"Unable to fetch file from {file_url}")
+
+    def _load_cff_from_file(self, cff_data: str) -> dict:
+        """Load and parse CFF data from a file."""
+        yaml = YAML(typ='safe')
+        yaml.constructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = yaml.constructor.yaml_constructors[
+            u'tag:yaml.org,2002:str']
+        return yaml.load(cff_data)
+
+    def _convert_cff_to_codemeta(self, cff_data: str) -> dict:
+        """Convert metadata from CFF to CodeMeta format."""
+        codemeta_str = Citation(cff_data).as_codemeta()
+        return json.loads(codemeta_str)
+
+    def _patch_author_emails(self, cff: dict, codemeta: dict) -> dict:
+        """Patch author emails from CFF into CodeMeta."""
+        cff_authors = cff["authors"]
+        for i, author in enumerate(cff_authors):
+            if "email" in author:
+                codemeta["author"][i]["email"] = author["email"]
+        return codemeta
+
+    def _handle_citation_file(self, found_files: dict) -> Optional[dict]:
+        """Handle the CITATION.cff file if found."""
+        if CITATION_FILE in found_files:
+            cff_content = self._fetch_file_from_url(found_files[CITATION_FILE])
+            cff_dict = self._load_cff_from_file(cff_content)
+            return self._convert_cff_to_codemeta(cff_content)
+        return None
+
+    def _handle_codemeta_file(self, found_files: dict) -> Optional[dict]:
+        """Handle the codemeta.json file if found."""
+        if CODEMETA_FILE in found_files:
+            codemeta_content = self._fetch_file_from_url(found_files[CODEMETA_FILE])
+            return json.loads(codemeta_content)
+        return None
\ No newline at end of file

From 153e67649b11b530bc1cae6c5bfc9e1ebc153963 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Sat, 26 Oct 2024 16:43:50 +0200
Subject: [PATCH 03/19] Issue #276 - Store harvested data from URL

---
 src/hermes/commands/harvest/base.py | 190 +++++++++++++++-------------
 1 file changed, 103 insertions(+), 87 deletions(-)

diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py
index 937b5e62..f0f23546 100644
--- a/src/hermes/commands/harvest/base.py
+++ b/src/hermes/commands/harvest/base.py
@@ -9,7 +9,9 @@
 import logging
 from datetime import datetime
 import typing as t
-from typing import Optional, Dict, Tuple
+from typing import Optional, Dict, Tuple, List
+import yaml
+from urllib.parse import quote
 
 import requests
 from pydantic import BaseModel
@@ -42,102 +44,110 @@ class HarvestSettings(BaseModel):
 
 
 class HermesHarvestCommand(HermesCommand):
-    """Harvest metadata from the provided URL or configured sources."""
+    """ Harvest metadata from configured sources. """
 
     command_name = "harvest"
     settings_class = HarvestSettings
-
-    def add_arguments(self, parser: argparse.ArgumentParser) -> None:
-        """Adds arguments for the harvest command to harvest metadata from the specific URL."""
-        parser.add_argument('url', nargs='?', default=None, help="Optional URL to harvest from")
-
-    def __call__(self, args: argparse.Namespace) -> None:
-        """Execute the harvesting command based on the provided arguments."""
+    
+    def __call__(self, args) -> None:
         self.args = args
         ctx = HermesContext()
         ctx.init_cache("harvest")
-
-        if hasattr(args, 'url') and args.url:
-            result = self._process_url(args.url, ctx)
-            if result is None:
-                logger.error("Failed to process URL: %s", args.url)
+        
+        if args.url:
+            self._process_url(args.url, ctx)
         else:
             self._harvest_locally(ctx)
 
-    def _process_url(self, url: str, ctx: HermesContext) -> Optional[Dict[str, Dict]]:
+    def _process_url(self, url: str, ctx: HermesContext) -> Optional[Tuple[Dict, Dict]]:
         """Process the provided URL for metadata harvesting."""
         try:
             files_to_search = [CITATION_FILE, CODEMETA_FILE]
-            found_files = self._search_repo_for_metadata(url, files_to_search)
-
+            if "github.com" in url:
+                found_files = self._search_github_repo_for_metadata(url, files_to_search)
+            elif "gitlab.com" in url:
+                found_files = self._search_gitlab_repo_for_metadata(url, files_to_search)
+            else:
+                raise ValueError("Unsupported repository provider. Only GitHub and GitLab are supported.")
             if not found_files:
-                raise FileNotFoundError("Neither CITATION.cff nor codemeta.json found in the repository.")
-
-            cff_dict = self._handle_citation_file(found_files)
-            codemeta_dict = self._handle_codemeta_file(found_files)
-
-            logger.info("Harvesting successful from URL: %s", url)
-            print('**********************************************************')
-            print("Original CodeMeta from codemeta.json:")
-            print(json.dumps(codemeta_dict, indent=4))
-
-            print('**********************************************************')
-            print("CFF converted to CodeMeta:")
-            print(json.dumps(cff_dict, indent=4))
-
-            return {
-                "codemeta_from_cff": cff_dict,
-                "codemeta_json": codemeta_dict
-            }
-
+                raise FileNotFoundError(f"Neither {CITATION_FILE} nor {CODEMETA_FILE} found in repository.")
+            # Process and store metadata from files
+            self._process_found_files(found_files, ctx)
+            return None, None
         except (FileNotFoundError, ValueError) as e:
             logger.error(f"Error processing URL: {e}")
-            return None
+            return None, None
+        
+    def _search_github_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]:
+        """Search for metadata files in a GitHub repository."""
+        repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents"
+        try:
+            response = requests.get(repo_api_url)
+            response.raise_for_status()
+            repo_files = response.json()
+            return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search}
+        except requests.HTTPError as e:
+            logger.error(f"HTTP Error accessing GitHub repository: {repo_url}, {e}")
+            raise FileNotFoundError(f"GitHub repository {repo_url} not found or is private.")
+        except requests.RequestException as e:
+            logger.error(f"Failed to list GitHub repository contents: {e}")
+            raise
 
+    def _search_gitlab_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]:
+        """Search for metadata files in a GitLab repository."""
+        try:
+            project_path = repo_url.rstrip('/').split('gitlab.com/')[1]
+            encoded_project = quote(project_path, safe='')
+            found_files = {}
+            for file_name in files_to_search:
+                file_api_url = f"https://gitlab.com/api/v4/projects/{encoded_project}/repository/files/{quote(file_name)}/raw?ref=main"
+                
+                response = requests.get(file_api_url)
+                if response.status_code == 200:
+                    found_files[file_name] = file_api_url
+                elif response.status_code != 404:
+                    logger.error(f"Error accessing GitLab repository: {repo_url}, {response.status_code}")
+                    raise FileNotFoundError(f"GitLab repository {repo_url} not found or is private.")
+            return found_files
+        except requests.RequestException as e:
+            logger.error(f"Failed to list GitLab repository contents: {e}")
+            raise
+            
     def _harvest_locally(self, ctx: HermesContext) -> None:
-        """Harvest metadata from configured sources."""
+        """Harvest metadata from configured sources using plugins."""
         for plugin_name in self.settings.sources:
             try:
                 plugin_func = self.plugins[plugin_name]()
                 harvested_data, tags = plugin_func(self)
-
-                with HermesHarvestContext(ctx, plugin_name) as harvest_ctx:
-                    harvest_ctx.update_from(harvested_data,
-                                            plugin=plugin_name,
-                                            timestamp=datetime.now().isoformat(), **tags)
-                    for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items():
-                        if any(v != _value and t == _tag for v, t in _trace):
-                            raise MergeError(_key, None, _value)
-
+                self.store_harvested_data(ctx, harvested_data, tags, plugin_name)
             except KeyError as e:
-                self.log.error("Plugin '%s' not found.", plugin_name)
-                self.errors.append(e)
-
+                logger.error(f"Plugin '{plugin_name}' not found. Error: {e}")
             except HermesValidationError as e:
-                self.log.error("Error while executing %s: %s", plugin_name, e)
-                self.errors.append(e)
+                logger.error(f"Error while executing '{plugin_name}': {e}")
 
-    def _search_repo_for_metadata(self, repo_url: str, files_to_search: list) -> Dict[str, str]:
+    def _search_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]:
         """Search for metadata files in the given GitHub repository and return their URLs."""
-        repo_api_url = repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/') + '/contents'
-
+        repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents"
         try:
             response = requests.get(repo_api_url)
             response.raise_for_status()
-
             repo_files = response.json()
-            found_files = {file_entry["name"]: file_entry["download_url"] for file_entry in repo_files
-                           if file_entry["name"] in files_to_search}
-
-            return found_files
-
+            return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search}
+        except requests.HTTPError as e:
+            logger.error(f"HTTP Error accessing repository: {repo_url}, {e}")
+            raise FileNotFoundError(f"Repository {repo_url} not found or is private.")
         except requests.RequestException as e:
-            if e.response and e.response.status_code == 404:
-                logger.error(f"Repository not found: {repo_url}")
-                raise FileNotFoundError(f"Repository {repo_url} not found or is private.")
-            else:
-                logger.error(f"Failed to list repository contents: {e}")
-                raise
+            logger.error(f"Failed to list repository contents: {e}")
+            raise
+
+    def _process_found_files(self, found_files: Dict[str, str], ctx: HermesContext) -> None:
+        """Process and store metadata from CFF and CodeMeta files."""
+        cff_data = self._handle_citation_file(found_files)
+        codemeta_data = self._handle_codemeta_file(found_files)
+        if cff_data:
+            self.store_harvested_data(ctx, cff_data, {"source_type": "CFF"}, "cff")
+        if codemeta_data:
+            self.store_harvested_data(ctx, codemeta_data, {"source_type": "CodeMeta"}, "codemeta")
 
     def _fetch_file_from_url(self, file_url: str) -> str:
         """Fetch the content of a file from its URL."""
@@ -148,38 +158,44 @@ def _fetch_file_from_url(self, file_url: str) -> str:
         except requests.RequestException as e:
             logger.error(f"Failed to fetch file from {file_url}: {e}")
             raise FileNotFoundError(f"Unable to fetch file from {file_url}")
-
-    def _load_cff_from_file(self, cff_data: str) -> dict:
-        """Load and parse CFF data from a file."""
-        yaml = YAML(typ='safe')
-        yaml.constructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = yaml.constructor.yaml_constructors[
-            u'tag:yaml.org,2002:str']
-        return yaml.load(cff_data)
-
-    def _convert_cff_to_codemeta(self, cff_data: str) -> dict:
-        """Convert metadata from CFF to CodeMeta format."""
-        codemeta_str = Citation(cff_data).as_codemeta()
-        return json.loads(codemeta_str)
-
+        
     def _patch_author_emails(self, cff: dict, codemeta: dict) -> dict:
-        """Patch author emails from CFF into CodeMeta."""
         cff_authors = cff["authors"]
         for i, author in enumerate(cff_authors):
             if "email" in author:
                 codemeta["author"][i]["email"] = author["email"]
         return codemeta
 
-    def _handle_citation_file(self, found_files: dict) -> Optional[dict]:
+    def _handle_citation_file(self, found_files: Dict[str, str]) -> Optional[Dict]:
         """Handle the CITATION.cff file if found."""
         if CITATION_FILE in found_files:
-            cff_content = self._fetch_file_from_url(found_files[CITATION_FILE])
-            cff_dict = self._load_cff_from_file(cff_content)
-            return self._convert_cff_to_codemeta(cff_content)
+            cff_content_str = self._fetch_file_from_url(found_files[CITATION_FILE])
+            cff_content = yaml.safe_load(cff_content_str)
+            cff_codemeta_dict = self._convert_cff_to_codemeta(cff_content_str)
+            cff_codemeta_dict = self._patch_author_emails(cff_content, cff_codemeta_dict)
+            return cff_codemeta_dict
         return None
 
-    def _handle_codemeta_file(self, found_files: dict) -> Optional[dict]:
+    def _handle_codemeta_file(self, found_files: Dict[str, str]) -> Optional[Dict]:
         """Handle the codemeta.json file if found."""
         if CODEMETA_FILE in found_files:
             codemeta_content = self._fetch_file_from_url(found_files[CODEMETA_FILE])
             return json.loads(codemeta_content)
-        return None
\ No newline at end of file
+        return None
+
+    def _convert_cff_to_codemeta(self, cff_data: str) -> Dict:
+        """Convert metadata from CFF to CodeMeta format."""
+        codemeta_str = Citation(cff_data).as_codemeta()
+        return json.loads(codemeta_str)
+
+    def store_harvested_data(self, ctx: HermesContext, harvested_data: Dict, tags: Dict, source_name: str) -> None:
+        """Store harvested data into Hermes context."""
+        with HermesHarvestContext(ctx, source_name) as harvest_ctx:
+            harvest_ctx.update_from(harvested_data, plugin=source_name, timestamp=datetime.now().isoformat(), **tags)
+            self._check_for_merge_conflicts(harvest_ctx)
+
+    def _check_for_merge_conflicts(self, harvest_ctx: HermesHarvestContext) -> None:
+        """Check for merge conflicts after updating harvest context."""
+        for key, ((value, tag), *trace) in harvest_ctx._data.items():
+            if any(v != value and t == tag for v, t in trace):
+                raise MergeError(key, None, value)

From 16cba5d80bed0ead3369026a903eeed920dba6da Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Thu, 30 Jan 2025 16:15:18 +0100
Subject: [PATCH 04/19] Issue #276 - Harvest metadata from CFF via path

---
 src/hermes/commands/harvest/cff.py | 36 +++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
index 4cc879b0..1d14c10b 100644
--- a/src/hermes/commands/harvest/cff.py
+++ b/src/hermes/commands/harvest/cff.py
@@ -19,6 +19,7 @@
 from hermes.model.context import ContextPath
 from hermes.model.errors import HermesValidationError
 from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
+from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
 
 
 # TODO: should this be configurable via a CLI option?
@@ -106,18 +107,23 @@ def _validate(self, cff_file: pathlib.Path, cff_dict: t.Dict) -> bool:
             return True
 
     def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
-        # Find CFF files in directories and subdirectories
-        cff_file = path / 'CITATION.cff'
-        if cff_file.exists():
-            return cff_file
-
-        # TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
-        #       which is given via the --path arg. Maybe add another option to enable pointing to a single file?
-        #       (So this stays "convention over configuration")
-        files = list(path.rglob('**/CITATION.cff'))
-        if len(files) == 1:
-            return pathlib.Path(files[0])
-        # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
-        # TODO: Do we want to hand down a logging instance via Hermes context or just encourage
-        #       peeps to use the Click context?
-        return None
+        location = str(path)
+        if location.startswith(("http:", "https:")):
+            # Remote repository: fetch CITATION.cff into a temporary file
+            return fetch_metadata_from_repo(normalize_url(location), "CITATION.cff")
+        else:
+            # Local directory: prefer the CITATION.cff directly inside the given path
+            candidate = path / 'CITATION.cff'
+            if candidate.exists():
+                return candidate
+
+            # TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
+            #       which is given via the --path arg. Maybe add another option to enable pointing to a single file?
+            #       (So this stays "convention over configuration")
+            matches = list(path.rglob('**/CITATION.cff'))
+            if len(matches) == 1:
+                return pathlib.Path(matches[0])
+            # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
+            # TODO: Do we want to hand down a logging instance via Hermes context or just encourage
+            #       peeps to use the Click context?
+            return None

From afb818932652654ebc5407e1b9b4416f4f8a9bbf Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Thu, 30 Jan 2025 16:42:17 +0100
Subject: [PATCH 05/19] Issue #276 - Harvest metadata from CodeMeta via path

---
 src/hermes/commands/harvest/codemeta.py | 27 +++++++++++++++----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py
index b75bb002..dd0143ee 100644
--- a/src/hermes/commands/harvest/codemeta.py
+++ b/src/hermes/commands/harvest/codemeta.py
@@ -13,7 +13,7 @@
 from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin
 from hermes.commands.harvest.util.validate_codemeta import validate_codemeta
 from hermes.model.errors import HermesValidationError
-
+from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
 
 class CodeMetaHarvestPlugin(HermesHarvestPlugin):
     def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
@@ -56,13 +56,18 @@ def _validate(self, codemeta_file: pathlib.Path) -> bool:
         return True
 
     def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
-        # Find CodeMeta files in directories and subdirectories
-        # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
-        #       (So this stays "convention over configuration")
-        files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
-        if len(files) == 1:
-            return pathlib.Path(files[0])
-        # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
-        # TODO: Do we want to hand down a logging instance via Hermes context or just encourage
-        #       peeps to use the Click context?
-        return None
+        location = str(path)
+        if location.startswith(("http:", "https:")):
+            # Remote repository: fetch codemeta.json into a temporary file
+            return fetch_metadata_from_repo(normalize_url(location), "codemeta.json")
+        else:
+            # Local directory: search the given path and all of its subdirectories
+            # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
+            #       (So this stays "convention over configuration")
+            matches = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
+            if len(matches) == 1:
+                return pathlib.Path(matches[0])
+            # TODO: Shouldn't we log/echo the found CodeMeta files so a user can debug/cleanup?
+            # TODO: Do we want to hand down a logging instance via Hermes context or just encourage
+            #       peeps to use the Click context?
+            return None

From 09401ed98bc6b97cb04ac5eaecc688366f47a9f4 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Thu, 30 Jan 2025 16:55:03 +0100
Subject: [PATCH 06/19] Issue #276 - Refactor functions for harvesting
 CFF/CodeMeta via path

---
 .../harvest/util/remote_harvesting.py         | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 src/hermes/commands/harvest/util/remote_harvesting.py

diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py
new file mode 100644
index 00000000..fcc3ed32
--- /dev/null
+++ b/src/hermes/commands/harvest/util/remote_harvesting.py
@@ -0,0 +1,75 @@
+import pathlib
+import re
+import requests
+import tempfile
+import typing as t
+
+
+def normalize_url(path: str) -> str:
+    """Normalize a URL: turn backslashes into slashes and restore the "//" after a leading "http(s):"."""
+    corrected_url = path.replace("\\", "/")
+    return re.sub(r"^(https?):/+", r"\1://", corrected_url)
+
+
+def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]:
+    """
+    Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.
+
+    :param repo_url: The repository URL; the host is detected by the substring "github.com" / "gitlab.com".
+    :param filename: The name of the metadata file to look for in the repository listing.
+    :return: Path to the temporary file containing the downloaded metadata, or None if nothing was fetched.
+    """
+    try:
+        if "github.com" in repo_url:
+            # GitHub contents API: list the repository and download the entry named like the file
+            api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents"
+            response = requests.get(api_url, timeout=30)
+            if response.status_code == 200:
+                for file_info in response.json():
+                    if file_info["name"] == filename:
+                        return _download_to_tempfile(file_info["download_url"], filename)
+        elif "gitlab.com" in repo_url:
+            # GitLab API: only the first two path segments (namespace/project) identify the project
+            match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url)
+            if match:
+                base_domain = match.group(1)
+                group_or_user = match.group(2)
+                project_name = match.group(3)  # "[^/]+" cannot contain "/", so no further splitting is needed
+                project_path = f"{group_or_user}/{project_name}"
+                api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree"
+
+                response = requests.get(api_url, params={"per_page": 100}, timeout=30)  # listing is paginated
+                if response.status_code == 200:
+                    for file_info in response.json():
+                        if file_info["name"] == filename:
+                            file_url = (
+                                f"https://{base_domain}/api/v4/projects/"
+                                f"{requests.utils.quote(project_path, safe='')}/repository/files/"
+                                f"{requests.utils.quote(filename, safe='')}/raw"
+                            )
+                            return _download_to_tempfile(file_url, filename)
+        else:
+            print(f"Unsupported repository URL: {repo_url}")
+            return None
+    except Exception as e:
+        print(f"Error fetching metadata from repository: {e}")
+        return None
+
+
+def _download_to_tempfile(url: str, filename: str) -> t.Optional[pathlib.Path]:
+    """Download a file from a URL and save it to a temporary file.
+
+    :param url: The URL to download from.
+    :param filename: Remote file name; its extension becomes the temporary file's suffix.
+    :return: Path to the temporary file, or None if the download failed (including HTTP error statuses).
+    """
+    try:
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()  # never store an HTTP error page as if it were the metadata file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{filename.split('.')[-1]}") as temp_file:
+            temp_file.write(response.text.encode("utf-8"))
+            print(f"Downloaded {filename} to {temp_file.name}")
+            return pathlib.Path(temp_file.name)
+    except Exception as e:
+        print(f"Error downloading {filename}: {e}")
+        return None

From f193cc96590c426a47dacef63b6389582bbb3a5e Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Thu, 30 Jan 2025 17:13:42 +0100
Subject: [PATCH 07/19] Issue #276 - Revert to original base.py

---
 src/hermes/commands/harvest/base.py | 179 ++++------------------------
 1 file changed, 22 insertions(+), 157 deletions(-)

diff --git a/src/hermes/commands/harvest/base.py b/src/hermes/commands/harvest/base.py
index f0f23546..460345ea 100644
--- a/src/hermes/commands/harvest/base.py
+++ b/src/hermes/commands/harvest/base.py
@@ -5,27 +5,15 @@
 # SPDX-FileContributor: Michael Meinel
 
 import argparse
-import json
-import logging
-from datetime import datetime
 import typing as t
-from typing import Optional, Dict, Tuple, List
-import yaml
-from urllib.parse import quote
+from datetime import datetime
 
-import requests
 from pydantic import BaseModel
-from ruamel.yaml import YAML
-from cffconvert import Citation
 
 from hermes.commands.base import HermesCommand, HermesPlugin
 from hermes.model.context import HermesContext, HermesHarvestContext
 from hermes.model.errors import HermesValidationError, MergeError
 
-CITATION_FILE = "CITATION.cff"
-CODEMETA_FILE = "codemeta.json"
-
-logger = logging.getLogger(__name__)
 
 class HermesHarvestPlugin(HermesPlugin):
     """Base plugin that does harvesting.
@@ -37,7 +25,7 @@ def __call__(self, command: HermesCommand) -> t.Tuple[t.Dict, t.Dict]:
         pass
 
 
-class HarvestSettings(BaseModel):
+class _HarvestSettings(BaseModel):
     """Generic harvesting settings."""
 
     sources: list[str] = []
@@ -47,155 +35,32 @@ class HermesHarvestCommand(HermesCommand):
     """ Harvest metadata from configured sources. """
 
     command_name = "harvest"
-    settings_class = HarvestSettings
-    
-    def __call__(self, args) -> None:
+    settings_class = _HarvestSettings
+
+    def __call__(self, args: argparse.Namespace) -> None:
         self.args = args
         ctx = HermesContext()
+
+        # Initialize the harvest cache directory here to indicate the step ran
         ctx.init_cache("harvest")
-        
-        if args.url:
-            self._process_url(args.url, ctx)
-        else:
-            self._harvest_locally(ctx)
-
-    def _process_url(self, url: str, ctx: HermesContext) -> Optional[Tuple[Dict, Dict]]:
-        """Process the provided URL for metadata harvesting."""
-        try:
-            files_to_search = [CITATION_FILE, CODEMETA_FILE]
-            if "github.com" in url:
-                found_files = self._search_github_repo_for_metadata(url, files_to_search)
-            elif "gitlab.com" in url:
-                found_files = self._search_gitlab_repo_for_metadata(url, files_to_search)
-            else:
-                raise ValueError("Unsupported repository provider. Only GitHub and GitLab are supported.")
-            if not found_files:
-                raise FileNotFoundError(f"Neither {CITATION_FILE} nor {CODEMETA_FILE} found in repository.")
-            # Process and store metadata from files
-            self._process_found_files(found_files, ctx)
-            return None, None
-        except (FileNotFoundError, ValueError) as e:
-            logger.error(f"Error processing URL: {e}")
-            return None, None
-        
-    def _search_github_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]:
-        """Search for metadata files in a GitHub repository."""
-        repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents"
-        try:
-            response = requests.get(repo_api_url)
-            response.raise_for_status()
-            repo_files = response.json()
-            return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search}
-        except requests.HTTPError as e:
-            logger.error(f"HTTP Error accessing GitHub repository: {repo_url}, {e}")
-            raise FileNotFoundError(f"GitHub repository {repo_url} not found or is private.")
-        except requests.RequestException as e:
-            logger.error(f"Failed to list GitHub repository contents: {e}")
-            raise
-
-    def _search_gitlab_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]:
-        """Search for metadata files in a GitLab repository."""
-        try:
-            project_path = repo_url.rstrip('/').split('gitlab.com/')[1]
-            encoded_project = quote(project_path, safe='')
-            found_files = {}
-            for file_name in files_to_search:
-                file_api_url = f"https://gitlab.com/api/v4/projects/{encoded_project}/repository/files/{quote(file_name)}/raw?ref=main"
-                
-                response = requests.get(file_api_url)
-                if response.status_code == 200:
-                    found_files[file_name] = file_api_url
-                elif response.status_code != 404:
-                    logger.error(f"Error accessing GitLab repository: {repo_url}, {response.status_code}")
-                    raise FileNotFoundError(f"GitLab repository {repo_url} not found or is private.")
-            return found_files
-        except requests.RequestException as e:
-            logger.error(f"Failed to list GitLab repository contents: {e}")
-            raise
-            
-    def _harvest_locally(self, ctx: HermesContext) -> None:
-        """Harvest metadata from configured sources using plugins."""
+
         for plugin_name in self.settings.sources:
             try:
                 plugin_func = self.plugins[plugin_name]()
                 harvested_data, tags = plugin_func(self)
-                self.store_harvested_data(ctx, harvested_data, tags, plugin_name)
+
+                with HermesHarvestContext(ctx, plugin_name) as harvest_ctx:
+                    harvest_ctx.update_from(harvested_data,
+                                            plugin=plugin_name,
+                                            timestamp=datetime.now().isoformat(), **tags)
+                    for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items():
+                        if any(v != _value and t == _tag for v, t in _trace):
+                            raise MergeError(_key, None, _value)
+
             except KeyError as e:
-                logger.error(f"Plugin '{plugin_name}' not found. Error: {e}")
+                self.log.error("Plugin '%s' not found.", plugin_name)
+                self.errors.append(e)
+
             except HermesValidationError as e:
-                logger.error(f"Error while executing '{plugin_name}': {e}")
-
-    def _search_repo_for_metadata(self, repo_url: str, files_to_search: List[str]) -> Dict[str, str]:
-        """Search for metadata files in the given GitHub repository and return their URLs."""
-        repo_api_url = f"{repo_url.rstrip('/').replace('https://github.com/', 'https://api.github.com/repos/')}/contents"
-        try:
-            response = requests.get(repo_api_url)
-            response.raise_for_status()
-            repo_files = response.json()
-            return {file["name"]: file["download_url"] for file in repo_files if file["name"] in files_to_search}
-        except requests.HTTPError as e:
-            logger.error(f"HTTP Error accessing repository: {repo_url}, {e}")
-            raise FileNotFoundError(f"Repository {repo_url} not found or is private.")
-        except requests.RequestException as e:
-            logger.error(f"Failed to list repository contents: {e}")
-            raise
-
-    def _process_found_files(self, found_files: Dict[str, str], ctx: HermesContext) -> None:
-        """Process and store metadata from CFF and CodeMeta files."""
-        cff_data = self._handle_citation_file(found_files)
-        codemeta_data = self._handle_codemeta_file(found_files)
-        if cff_data:
-            self.store_harvested_data(ctx, cff_data, {"source_type": "CFF"}, "cff")
-        if codemeta_data:
-            self.store_harvested_data(ctx, codemeta_data, {"source_type": "CodeMeta"}, "codemeta")
-
-    def _fetch_file_from_url(self, file_url: str) -> str:
-        """Fetch the content of a file from its URL."""
-        try:
-            response = requests.get(file_url)
-            response.raise_for_status()
-            return response.text
-        except requests.RequestException as e:
-            logger.error(f"Failed to fetch file from {file_url}: {e}")
-            raise FileNotFoundError(f"Unable to fetch file from {file_url}")
-        
-    def _patch_author_emails(self, cff: dict, codemeta: dict) -> dict:
-        cff_authors = cff["authors"]
-        for i, author in enumerate(cff_authors):
-            if "email" in author:
-                codemeta["author"][i]["email"] = author["email"]
-        return codemeta
-
-    def _handle_citation_file(self, found_files: Dict[str, str]) -> Optional[Dict]:
-        """Handle the CITATION.cff file if found."""
-        if CITATION_FILE in found_files:
-            cff_content_str = self._fetch_file_from_url(found_files[CITATION_FILE])
-            cff_content = yaml.safe_load(cff_content_str)
-            cff_codemeta_dict = self._convert_cff_to_codemeta(cff_content_str)
-            cff_codemeta_dict = self._patch_author_emails(cff_content, cff_codemeta_dict)
-            return cff_codemeta_dict
-        return None
-
-    def _handle_codemeta_file(self, found_files: Dict[str, str]) -> Optional[Dict]:
-        """Handle the codemeta.json file if found."""
-        if CODEMETA_FILE in found_files:
-            codemeta_content = self._fetch_file_from_url(found_files[CODEMETA_FILE])
-            return json.loads(codemeta_content)
-        return None
-
-    def _convert_cff_to_codemeta(self, cff_data: str) -> Dict:
-        """Convert metadata from CFF to CodeMeta format."""
-        codemeta_str = Citation(cff_data).as_codemeta()
-        return json.loads(codemeta_str)
-
-    def store_harvested_data(self, ctx: HermesContext, harvested_data: Dict, tags: Dict, source_name: str) -> None:
-        """Store harvested data into Hermes context."""
-        with HermesHarvestContext(ctx, source_name) as harvest_ctx:
-            harvest_ctx.update_from(harvested_data, plugin=source_name, timestamp=datetime.now().isoformat(), **tags)
-            self._check_for_merge_conflicts(harvest_ctx)
-
-    def _check_for_merge_conflicts(self, harvest_ctx: HermesHarvestContext) -> None:
-        """Check for merge conflicts after updating harvest context."""
-        for key, ((value, tag), *trace) in harvest_ctx._data.items():
-            if any(v != value and t == tag for v, t in trace):
-                raise MergeError(key, None, value)
+                self.log.error("Error while executing %s: %s", plugin_name, e)
+                self.errors.append(e)

From 1bab2c762039417dd11063e4330b3a757bb6ac0f Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Thu, 30 Jan 2025 17:41:46 +0100
Subject: [PATCH 08/19] Issue #276 - Update base.py

---
 src/hermes/commands/base.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py
index a4fb6410..499f5e04 100644
--- a/src/hermes/commands/base.py
+++ b/src/hermes/commands/base.py
@@ -104,13 +104,6 @@ def init_common_parser(self, parser: argparse.ArgumentParser) -> None:
             help="Configuration file in TOML format",
         )
 
-        # Add a new argument to accept a URL for harvesting
-        parser.add_argument(
-            "--url",
-            type=str,
-            help="URL from which to extract metadata"
-        )
-        
         plugin_args = parser.add_argument_group("Extra options")
         plugin_args.add_argument(
             "-O",
@@ -138,7 +131,7 @@ def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None:
     def load_settings(self, args: argparse.Namespace):
         """Load settings from the configuration file (passed in from command line)."""
 
-        toml_data = toml.load(args.path / args.config)
+        toml_data = toml.load("." / args.config)
         self.root_settings = HermesCommand.settings_class.model_validate(toml_data)
         self.settings = getattr(self.root_settings, self.command_name)
 

From 25eec31146d7851532f5cfcd10d11eea3cc9ee7f Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Fri, 7 Feb 2025 14:24:33 +0100
Subject: [PATCH 09/19] Add functionality to remove temp files

Add functionality to remove temp files generated during remote harvesting.
---
 .../commands/harvest/util/remote_harvesting.py      | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py
index fcc3ed32..2b504cde 100644
--- a/src/hermes/commands/harvest/util/remote_harvesting.py
+++ b/src/hermes/commands/harvest/util/remote_harvesting.py
@@ -3,7 +3,7 @@
 import requests
 import tempfile
 import typing as t
-
+import os
 
 def normalize_url(path: str) -> str:
     """Normalize a given URL by correcting backslashes and fixing malformed HTTPS."""
@@ -73,3 +73,14 @@ def _download_to_tempfile(url: str, filename: str) -> pathlib.Path:
     except Exception as e:
         print(f"Error downloading {filename}: {e}")
         return None
+
+
+def remove_temp_file(file_path: pathlib.Path, temp_dir: pathlib.Path = pathlib.Path("C:/Temp")):
+    """
+    Removes a temporary file if it is inside the temp directory.
+
+    :param file_path: The file path to check and remove.
+    :param temp_dir: The directory considered as temporary (default: "C:/Temp").
+    """
+    if str(file_path).startswith(str(temp_dir)):
+        os.remove(file_path)

From 98814f4913726b2b442564f5411d23669ac8b090 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Fri, 7 Feb 2025 14:46:15 +0100
Subject: [PATCH 10/19] Remove temp files

Remove temp files after harvesting CFF metadata
---
 src/hermes/commands/harvest/cff.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
index 1d14c10b..9a39f5e4 100644
--- a/src/hermes/commands/harvest/cff.py
+++ b/src/hermes/commands/harvest/cff.py
@@ -19,7 +19,7 @@
 from hermes.model.context import ContextPath
 from hermes.model.errors import HermesValidationError
 from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
-from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
+from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file
 
 
 # TODO: should this be configurable via a CLI option?
@@ -46,6 +46,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         # Read the content
         cff_data = cff_file.read_text()
 
+        remove_temp_file(cff_file)
         # Validate the content to be correct CFF
         cff_dict = self._load_cff_from_file(cff_data)
         if command.settings.cff.enable_validation and not self._validate(cff_file, cff_dict):

From dd56827aadf446aa6e86ef2760cc992d6af06c0c Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Fri, 7 Feb 2025 14:51:41 +0100
Subject: [PATCH 11/19] Remove temp files

Remove temp files after harvesting CodeMeta metadata
---
 src/hermes/commands/harvest/codemeta.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py
index dd0143ee..7e6f2113 100644
--- a/src/hermes/commands/harvest/codemeta.py
+++ b/src/hermes/commands/harvest/codemeta.py
@@ -13,7 +13,7 @@
 from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin
 from hermes.commands.harvest.util.validate_codemeta import validate_codemeta
 from hermes.model.errors import HermesValidationError
-from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
+from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file
 
 class CodeMetaHarvestPlugin(HermesHarvestPlugin):
     def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
@@ -38,6 +38,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         if not self._validate(codemeta_file):
             raise HermesValidationError(codemeta_file)
 
+        remove_temp_file(codemeta_file)
         codemeta = json.loads(codemeta_str)
         return codemeta, {'local_path': str(codemeta_file)}
 

From 5f75ad1c4ceef912c864d6344c6dd21777713f53 Mon Sep 17 00:00:00 2001
From: Aidajafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Thu, 13 Feb 2025 16:05:50 +0100
Subject: [PATCH 12/19] Issue #276 - Add SPDX headers

---
 src/hermes/commands/harvest/util/remote_harvesting.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py
index 2b504cde..f57c766f 100644
--- a/src/hermes/commands/harvest/util/remote_harvesting.py
+++ b/src/hermes/commands/harvest/util/remote_harvesting.py
@@ -1,3 +1,10 @@
+# SPDX-FileCopyrightText: 2025 OFFIS e.V.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# SPDX-FileContributor: Stephan Ferenz
+# SPDX-FileContributor: Aida Jafarbigloo
+
 import pathlib
 import re
 import requests

From 4d901fc7e385ba0f0d06bbed04ec51c19ca9be2f Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Mon, 14 Apr 2025 14:34:36 +0200
Subject: [PATCH 13/19] Update base.py

To support repository URL as a path
---
 src/hermes/commands/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py
index 3ae9030b..5a242a76 100644
--- a/src/hermes/commands/base.py
+++ b/src/hermes/commands/base.py
@@ -132,7 +132,7 @@ def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None:
     def load_settings(self, args: argparse.Namespace):
         """Load settings from the configuration file (passed in from command line)."""
         try:
-            toml_data = toml.load(args.path / args.config)
+            toml_data = toml.load("." / args.config)
             self.root_settings = HermesCommand.settings_class.model_validate(toml_data)
             self.settings = getattr(self.root_settings, self.command_name)
         except FileNotFoundError as e:

From 3aa06a0f04576a4e5326b53c2694a31ddba8c7b4 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Wed, 14 May 2025 08:59:19 +0000
Subject: [PATCH 14/19] Fix issues: HERMES user agent and temporary files

---
 src/hermes/commands/harvest/cff.py            | 20 ++++---
 src/hermes/commands/harvest/codemeta.py       | 15 +++--
 .../harvest/util/remote_harvesting.py         | 56 ++++++++++---------
 3 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
index e07dbb14..48e9a8a9 100644
--- a/src/hermes/commands/harvest/cff.py
+++ b/src/hermes/commands/harvest/cff.py
@@ -19,7 +19,7 @@
 from hermes.model.context import ContextPath
 from hermes.model.errors import HermesValidationError
 from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
-from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file
+from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
 
 
 # TODO: should this be configurable via a CLI option?
@@ -38,7 +38,9 @@ class CffHarvestPlugin(HermesHarvestPlugin):
 
     def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         # Get source files
-        cff_file = self._get_single_cff(command.args.path)
+        
+        cff_file, temp_dir_obj = self._get_single_cff(command.args.path)
+
         if not cff_file:
             raise HermesValidationError(f'{command.args.path} contains either no or more than 1 CITATION.cff file. '
                                         'Aborting harvesting for this metadata source.')
@@ -46,7 +48,10 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         # Read the content
         cff_data = cff_file.read_text()
 
-        remove_temp_file(cff_file)
+        # Clean up the temporary download directory, if one was created
+        if temp_dir_obj:
+            temp_dir_obj.cleanup()
+        
         # Validate the content to be correct CFF
         cff_dict = self._load_cff_from_file(cff_data)
 
@@ -114,20 +119,21 @@ def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
         if str(path).startswith("http:") or str(path).startswith("https:"):
             # Find CFF files from the provided URL repository
             normalized_url = normalize_url(str(path))
-            return fetch_metadata_from_repo(normalized_url, "CITATION.cff")
+            file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff")
+            return file_info
         else:
             # Find CFF files in directories and subdirectories
             cff_file = path / 'CITATION.cff'
             if cff_file.exists():
-                return cff_file
+                return cff_file, None
 
             # TODO: Do we really want to search recursive? CFF convention is the file should be at the topmost dir,
             #       which is given via the --path arg. Maybe add another option to enable pointing to a single file?
             #       (So this stays "convention over configuration")
             files = list(path.rglob('**/CITATION.cff'))
             if len(files) == 1:
-                return pathlib.Path(files[0])
+                return pathlib.Path(files[0]), None
             # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
             # TODO: Do we want to hand down a logging instance via Hermes context or just encourage
             #       peeps to use the Click context?
-            return None
+            return None, None
\ No newline at end of file
diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py
index 7e6f2113..bb37b097 100644
--- a/src/hermes/commands/harvest/codemeta.py
+++ b/src/hermes/commands/harvest/codemeta.py
@@ -13,7 +13,7 @@
 from hermes.commands.harvest.base import HermesHarvestCommand, HermesHarvestPlugin
 from hermes.commands.harvest.util.validate_codemeta import validate_codemeta
 from hermes.model.errors import HermesValidationError
-from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo, remove_temp_file
+from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
 
 class CodeMetaHarvestPlugin(HermesHarvestPlugin):
     def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
@@ -25,7 +25,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         :param ctx: The harvesting context that should contain the provided metadata.
         """
         # Get source files
-        codemeta_file = self._get_single_codemeta(command.args.path)
+        codemeta_file, temp_dir_obj = self._get_single_codemeta(command.args.path)
         if not codemeta_file:
             raise HermesValidationError(
                 f"{command.args.path} contains either no or more than 1 codemeta.json file. Aborting harvesting "
@@ -38,7 +38,9 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
         if not self._validate(codemeta_file):
             raise HermesValidationError(codemeta_file)
 
-        remove_temp_file(codemeta_file)
+        if temp_dir_obj:
+            temp_dir_obj.cleanup()
+
         codemeta = json.loads(codemeta_str)
         return codemeta, {'local_path': str(codemeta_file)}
 
@@ -60,15 +62,16 @@ def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
         if str(path).startswith("http:") or str(path).startswith("https:"):
             # Find CodeMeta files from the provided URL repository
             normalized_url = normalize_url(str(path))
-            return fetch_metadata_from_repo(normalized_url, "codemeta.json")
+            file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json")
+            return file_info
         else:
             # Find CodeMeta files in directories and subdirectories
             # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?
             #       (So this stays "convention over configuration")
             files = glob.glob(str(path / "**" / "codemeta.json"), recursive=True)
             if len(files) == 1:
-                return pathlib.Path(files[0])
+                return pathlib.Path(files[0]), None
             # TODO: Shouldn't we log/echo the found CFF files so a user can debug/cleanup?
             # TODO: Do we want to hand down a logging instance via Hermes context or just encourage
             #       peeps to use the Click context?
-            return None
+            return None, None
\ No newline at end of file
diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py
index f57c766f..60c1d928 100644
--- a/src/hermes/commands/harvest/util/remote_harvesting.py
+++ b/src/hermes/commands/harvest/util/remote_harvesting.py
@@ -12,29 +12,38 @@
 import typing as t
 import os
 
+from hermes.utils import hermes_user_agent
+
+session = requests.Session()
+session.headers.update({"User-Agent": hermes_user_agent})
+
 def normalize_url(path: str) -> str:
     """Normalize a given URL by correcting backslashes and fixing malformed HTTPS."""
     corrected_url = path.replace("\\", "/")
     return corrected_url.replace("https:/", "https://")
 
 
-def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib.Path]:
+def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[t.Tuple[pathlib.Path, tempfile.TemporaryDirectory]]:
     """
     Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.
 
     :param repo_url: The repository URL.
     :param filename: The name of the metadata file to fetch.
-    :return: Path to the temporary file containing the downloaded metadata, or None.
+    :return: Tuple of (Path to the temporary file, TemporaryDirectory object) or None.
     """
     try:
+        temp_dir_obj = tempfile.TemporaryDirectory()
+        temp_dir = pathlib.Path(temp_dir_obj.name)
+
         if "github.com" in repo_url:
             # GitHub API
             api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents"
-            response = requests.get(api_url)
+            response = session.get(api_url)
             if response.status_code == 200:
                 for file_info in response.json():
                     if file_info["name"] == filename:
-                        return _download_to_tempfile(file_info["download_url"], filename)
+                        temp_file = _download_to_tempfile(file_info["download_url"], filename, temp_dir)
+                        return temp_file, temp_dir_obj
         elif "gitlab.com" in repo_url:
             # GitLab API
             match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url)
@@ -45,7 +54,7 @@ def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib
                 project_path = f"{group_or_user}/{project_name}"
                 api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree"
 
-                response = requests.get(api_url)
+                response = session.get(api_url)
                 if response.status_code == 200:
                     for file_info in response.json():
                         if file_info["name"] == filename:
@@ -54,40 +63,37 @@ def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[pathlib
                                 f"{requests.utils.quote(project_path, safe='')}/repository/files/"
                                 f"{requests.utils.quote(filename, safe='')}/raw"
                             )
-                            return _download_to_tempfile(file_url, filename)
+                            temp_file = _download_to_tempfile(file_url, filename, temp_dir)
+                            return temp_file, temp_dir_obj
         else:
             print(f"Unsupported repository URL: {repo_url}")
+            temp_dir_obj.cleanup()
             return None
     except Exception as e:
         print(f"Error fetching metadata from repository: {e}")
         return None
 
 
-def _download_to_tempfile(url: str, filename: str) -> pathlib.Path:
+def _download_to_tempfile(url: str, filename: str, temp_dir: pathlib.Path) -> t.Optional[pathlib.Path]:
     """
-    Download a file from a URL and save it to a temporary file.
+    Download a file from a URL and save it to a temporary directory.
 
     :param url: The URL to download from.
     :param filename: The name of the file to save.
+    :param temp_dir: Path of the temporary directory where the file will be saved.
     :return: Path to the temporary file.
     """
     try:
-        content = requests.get(url).text
-        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{filename.split('.')[-1]}") as temp_file:
-            temp_file.write(content.encode("utf-8"))
-            print(f"Downloaded {filename} to {temp_file.name}")
-            return pathlib.Path(temp_file.name)
+        response = session.get(url)
+        if response.status_code == 200:
+            # Reuse the body fetched via the shared session (single request, HERMES User-Agent).
+            file_path = temp_dir / filename
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(response.text)
+            return pathlib.Path(file_path)
+        else:
+            print(f"Failed to download {filename}: {response.status_code}")
+            return None
     except Exception as e:
         print(f"Error downloading {filename}: {e}")
-        return None
-
-
-def remove_temp_file(file_path: pathlib.Path, temp_dir: pathlib.Path = pathlib.Path("C:/Temp")):
-    """
-    Removes a temporary file if it is inside the temp directory.
-
-    :param file_path: The file path to check and remove.
-    :param temp_dir: The directory considered as temporary (default: "C:/Temp").
-    """
-    if str(file_path).startswith(str(temp_dir)):
-        os.remove(file_path)
+        return None
\ No newline at end of file

From 1bd4d1f8e17e6e5a5e39c385abfb400323d824cc Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Wed, 14 May 2025 13:44:24 +0000
Subject: [PATCH 15/19] Fix hermes clean command

---
 src/hermes/commands/clean/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/hermes/commands/clean/base.py b/src/hermes/commands/clean/base.py
index b588faf5..4e5e4ea2 100644
--- a/src/hermes/commands/clean/base.py
+++ b/src/hermes/commands/clean/base.py
@@ -6,6 +6,7 @@
 
 import argparse
 import shutil
+import logging
 
 from pydantic import BaseModel
 
@@ -27,6 +28,7 @@ def __call__(self, args: argparse.Namespace) -> None:
         self.log.info("Removing HERMES caches...")
 
         # Naive implementation for now... check errors, validate directory, don't construct the path ourselves, etc.
+        logging.shutdown()  # presumably releases log file handles under .hermes before rmtree; TODO confirm
         shutil.rmtree(args.path / '.hermes')
 
     def load_settings(self, args: argparse.Namespace):

From 3918954c463cf50916e824ac63d4e4e15e3c5277 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Sat, 17 May 2025 08:00:16 +0000
Subject: [PATCH 16/19] Small fix

---
 src/hermes/commands/harvest/cff.py      | 5 ++++-
 src/hermes/commands/harvest/codemeta.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
index 48e9a8a9..4e6b0042 100644
--- a/src/hermes/commands/harvest/cff.py
+++ b/src/hermes/commands/harvest/cff.py
@@ -120,7 +120,10 @@ def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
             # Find CFF files from the provided URL repository
             normalized_url = normalize_url(str(path))
             file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff")
-            return file_info
+            if not file_info:
+                return {}
+            else:
+                return file_info
         else:
             # Find CFF files in directories and subdirectories
             cff_file = path / 'CITATION.cff'
diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py
index bb37b097..661cc4c0 100644
--- a/src/hermes/commands/harvest/codemeta.py
+++ b/src/hermes/commands/harvest/codemeta.py
@@ -63,7 +63,10 @@ def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
             # Find CodeMeta files from the provided URL repository
             normalized_url = normalize_url(str(path))
             file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json")
-            return file_info
+            if not file_info:
+                return None, None 
+            else:
+                return file_info
         else:
             # Find CodeMeta files in directories and subdirectories
             # TODO: Do we really want to search recursive? Maybe add another option to enable pointing to a single file?

From f170481869cfff8496fa4612da5d546911d020de Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Fri, 6 Jun 2025 09:19:22 +0000
Subject: [PATCH 17/19] Load token from toml file

---
 src/hermes/commands/harvest/util/token.py | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 src/hermes/commands/harvest/util/token.py

diff --git a/src/hermes/commands/harvest/util/token.py b/src/hermes/commands/harvest/util/token.py
new file mode 100644
index 00000000..a3539d4f
--- /dev/null
+++ b/src/hermes/commands/harvest/util/token.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: 2025 OFFIS e.V.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# SPDX-FileContributor: Stephan Ferenz
+# SPDX-FileContributor: Aida Jafarbigloo
+
+import toml
+import base64
+
+
+def load_token_from_toml(config_path: str = "hermes.toml") -> "str | None":
+    """
+    Load and base64-decode the access token from the HERMES TOML configuration file.
+
+    :param config_path: Path to the TOML config file.
+    :return: The decoded token, or None if the file or the ``harvest.token`` key is missing.
+    """
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = toml.load(f)
+    except FileNotFoundError:
+        # No config file simply means no token is configured; harvesting must still work.
+        return None
+
+    encoded_token = config.get('harvest', {}).get('token')
+    if not encoded_token:
+        return None
+    return base64.b64decode(encoded_token.encode()).decode()

From a89425929db1ac2fe674a6012df6b31fd6206f49 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Fri, 6 Jun 2025 09:22:35 +0000
Subject: [PATCH 18/19] Use token

---
 src/hermes/commands/harvest/cff.py            |   9 +-
 src/hermes/commands/harvest/codemeta.py       |   8 +-
 .../harvest/util/remote_harvesting.py         | 144 ++++++++++++------
 3 files changed, 114 insertions(+), 47 deletions(-)

diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
index 4e6b0042..8dd36ef2 100644
--- a/src/hermes/commands/harvest/cff.py
+++ b/src/hermes/commands/harvest/cff.py
@@ -20,6 +20,7 @@
 from hermes.model.errors import HermesValidationError
 from hermes.commands.harvest.base import HermesHarvestPlugin, HermesHarvestCommand
 from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
+from hermes.commands.harvest.util.token import load_token_from_toml
 
 
 # TODO: should this be configurable via a CLI option?
@@ -31,12 +32,16 @@
 class CffHarvestSettings(BaseModel):
     """Custom settings for CFF harvester."""
     enable_validation: bool = True
+    token: str = ''
 
 
 class CffHarvestPlugin(HermesHarvestPlugin):
     settings_class = CffHarvestSettings
 
     def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
+
+        self.token = load_token_from_toml('hermes.toml')
+        
         # Get source files
         
         cff_file, temp_dir_obj = self._get_single_cff(command.args.path)
@@ -46,7 +51,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
                                         'Aborting harvesting for this metadata source.')
 
         # Read the content
-        cff_data = cff_file.read_text()
+        cff_data = cff_file.read_text(encoding='utf-8')
 
         # clean up the temp
         if temp_dir_obj:
@@ -119,7 +124,7 @@ def _get_single_cff(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
         if str(path).startswith("http:") or str(path).startswith("https:"):
             # Find CFF files from the provided URL repository
             normalized_url = normalize_url(str(path))
-            file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff")
+            file_info = fetch_metadata_from_repo(normalized_url, "CITATION.cff", token=self.token)
             if not file_info:
                 return {}
             else:
diff --git a/src/hermes/commands/harvest/codemeta.py b/src/hermes/commands/harvest/codemeta.py
index 661cc4c0..56db9b55 100644
--- a/src/hermes/commands/harvest/codemeta.py
+++ b/src/hermes/commands/harvest/codemeta.py
@@ -14,9 +14,13 @@
 from hermes.commands.harvest.util.validate_codemeta import validate_codemeta
 from hermes.model.errors import HermesValidationError
 from hermes.commands.harvest.util.remote_harvesting import normalize_url, fetch_metadata_from_repo
+from hermes.commands.harvest.util.token import load_token_from_toml
 
 class CodeMetaHarvestPlugin(HermesHarvestPlugin):
     def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
+        
+        self.token = load_token_from_toml('hermes.toml')
+        
         """
         Implementation of a harvester that provides data from a codemeta.json file format.
 
@@ -33,7 +37,7 @@ def __call__(self, command: HermesHarvestCommand) -> t.Tuple[t.Dict, t.Dict]:
             )
 
         # Read the content
-        codemeta_str = codemeta_file.read_text()
+        codemeta_str = codemeta_file.read_text(encoding='utf-8')
 
         if not self._validate(codemeta_file):
             raise HermesValidationError(codemeta_file)
@@ -62,7 +66,7 @@ def _get_single_codemeta(self, path: pathlib.Path) -> t.Optional[pathlib.Path]:
         if str(path).startswith("http:") or str(path).startswith("https:"):
             # Find CodeMeta files from the provided URL repository
             normalized_url = normalize_url(str(path))
-            file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json")
+            file_info = fetch_metadata_from_repo(normalized_url, "codemeta.json", token=self.token)
             if not file_info:
                 return None, None 
             else:
diff --git a/src/hermes/commands/harvest/util/remote_harvesting.py b/src/hermes/commands/harvest/util/remote_harvesting.py
index 60c1d928..114088ed 100644
--- a/src/hermes/commands/harvest/util/remote_harvesting.py
+++ b/src/hermes/commands/harvest/util/remote_harvesting.py
@@ -6,16 +6,13 @@
 # SPDX-FileContributor: Aida Jafarbigloo
 
 import pathlib
-import re
 import requests
 import tempfile
 import typing as t
-import os
+from urllib.parse import urlparse, quote
 
 from hermes.utils import hermes_user_agent
 
-session = requests.Session()
-session.headers.update({"User-Agent": hermes_user_agent})
 
 def normalize_url(path: str) -> str:
     """Normalize a given URL by correcting backslashes and fixing malformed HTTPS."""
@@ -23,77 +20,138 @@ def normalize_url(path: str) -> str:
     return corrected_url.replace("https:/", "https://")
 
 
-def fetch_metadata_from_repo(repo_url: str, filename: str) -> t.Optional[t.Tuple[pathlib.Path, tempfile.TemporaryDirectory]]:
+def fetch_metadata_from_repo(repo_url: str, filename: str, token: str = None) -> t.Optional[t.Tuple[pathlib.Path, tempfile.TemporaryDirectory]]:
     """
     Fetch a metadata file (e.g., CITATION.cff or codemeta.json) from a GitHub or GitLab repository.
 
     :param repo_url: The repository URL.
     :param filename: The name of the metadata file to fetch.
-    :return: Tuple of (Path to the temporary file, TemporaryDirectory object) or None.
+    :param token: (Optional) Access token for authentication (GitHub token or GitLab private token).
+    :return: Tuple of (path to the downloaded file, TemporaryDirectory the caller must clean
+             up), or None if the file was not found; the path may be None if a download failed.
     """
     try:
+        parsed_url = urlparse(repo_url)
+        host = parsed_url.hostname or ""
+        session = requests.Session()
+        session.headers.update({"User-Agent": hermes_user_agent})
+        # Attach the token only when the *host* is a known forge: matching on the whole URL
+        # would also send it to a look-alike such as https://evil.example/github.com/x.
+        if token and host == "github.com":
+            session.headers.update({"Authorization": f"token {token}"})
+        elif token and "gitlab" in host:
+            session.headers.update({"PRIVATE-TOKEN": token})
+
         temp_dir_obj = tempfile.TemporaryDirectory()
         temp_dir = pathlib.Path(temp_dir_obj.name)
-
+
         if "github.com" in repo_url:
-            # GitHub API
+            # GitHub API: List repository contents
             api_url = repo_url.replace("github.com", "api.github.com/repos").rstrip("/") + "/contents"
             response = session.get(api_url)
             if response.status_code == 200:
                 for file_info in response.json():
                     if file_info["name"] == filename:
-                        temp_file = _download_to_tempfile(file_info["download_url"], filename, temp_dir)
+                        temp_file = _download_to_tempfile(file_info["download_url"], filename, temp_dir, session)
                         return temp_file, temp_dir_obj
-        elif "gitlab.com" in repo_url:
-            # GitLab API
-            match = re.match(r"https://([^/]+)/([^/]+)/([^/]+)", repo_url)
-            if match:
-                base_domain = match.group(1)
-                group_or_user = match.group(2)
-                project_name = match.group(3).split('/')[0]
-                project_path = f"{group_or_user}/{project_name}"
-                api_url = f"https://{base_domain}/api/v4/projects/{requests.utils.quote(project_path, safe='')}/repository/tree"
-
-                response = session.get(api_url)
-                if response.status_code == 200:
-                    for file_info in response.json():
-                        if file_info["name"] == filename:
-                            file_url = (
-                                f"https://{base_domain}/api/v4/projects/"
-                                f"{requests.utils.quote(project_path, safe='')}/repository/files/"
-                                f"{requests.utils.quote(filename, safe='')}/raw"
-                            )
-                            temp_file = _download_to_tempfile(file_url, filename, temp_dir)
-                            return temp_file, temp_dir_obj
+        elif "gitlab" in host:
+            temp_file, temp_dir = _fetch_from_gitlab(parsed_url, filename, temp_dir, session)
+            if temp_file:
+                return temp_file, temp_dir_obj
         else:
             print(f"Unsupported repository URL: {repo_url}")
             temp_dir_obj.cleanup()
             return None
+        # Fall-through: file not listed or GitLab fetch failed; release the directory now.
+        temp_dir_obj.cleanup()
+        return None
     except Exception as e:
         print(f"Error fetching metadata from repository: {e}")
         return None
 
 
-def _download_to_tempfile(url: str, filename: str, temp_dir: tempfile.TemporaryDirectory) -> pathlib.Path:
+def _fetch_from_gitlab(parsed_url, filename, temp_dir, session):
     """
-    Download a file from a URL and save it to a temporary directory.
-
-    :param url: The URL to download from.
-    :param filename: The name of the file to save.
-    :param temp_dir: TemporaryDirectory where the file will be saved.
-    :return: Path to the temporary file.
+    Fetch ``filename`` from a GitLab project through the GitLab REST API (v4).
+
+    :param parsed_url: Result of ``urlparse`` for the project URL.
+    :param filename: Name of the file to look for (compared case-insensitively).
+    :param temp_dir: Directory the downloaded file is written to.
+    :param session: ``requests.Session`` used for all API calls.
+    :return: ``(path, temp_dir)`` once a matching file was located (``path`` is None if the
+             download failed), or ``(None, None)`` if the project or the file was not found.
     """
+    base_domain = parsed_url.netloc
+    # Tolerate a trailing slash and a clone-style ".git" suffix in the project URL.
+    project_path = parsed_url.path.strip('/')
+    if project_path.endswith('.git'):
+        project_path = project_path[:-len('.git')]
+    encoded_project_path = quote(project_path, safe='')
+    project_api_url = f"https://{base_domain}/api/v4/projects/{encoded_project_path}"
+
+    # Step 1: Detect default branch
+    project_resp = session.get(project_api_url)
+    if project_resp.status_code != 200:
+        print(f"Failed to fetch project info: {project_resp.status_code}")
+        return None, None
+    # "default_branch" may be missing or null (e.g. empty repository): fall back to 'main'.
+    default_branch = project_resp.json().get('default_branch') or 'main'
+
+    # Step 2: Search for the file recursively, one page of the tree at a time
+    wanted = filename.lower()
+    found_file = None
+    page = 1
+    while found_file is None:
+        api_url = f"{project_api_url}/repository/tree?recursive=true&per_page=100&page={page}"
+        response = session.get(api_url)
+        if response.status_code != 200:
+            print(f"Failed to fetch repo tree: {response.status_code}")
+            break
+
+        files_list = response.json()
+        if not files_list:
+            break
+
+        found_file = next(
+            (entry for entry in files_list
+             if entry.get("type") == "blob" and entry.get("name", "").lower() == wanted),
+            None,
+        )
+        page += 1
+
+    if found_file is None:
+        print(f"{filename} not found in repository.")
+        return None, None
+
+    # Step 3: Download the file (path and ref are URL-encoded: both may contain '/')
+    file_url = (
+        f"{project_api_url}/repository/files/"
+        f"{quote(found_file['path'], safe='')}/raw?ref={quote(default_branch, safe='')}"
+    )
+    temp_file = _download_to_tempfile(file_url, filename, temp_dir, session)
+    if temp_file:
+        print(f"Downloaded file: {temp_file}")
+    return temp_file, temp_dir
+
+
+
+def _download_to_tempfile(url: str, filename: str, temp_dir: pathlib.Path, session: requests.Session) -> t.Optional[pathlib.Path]:
+    """
+    Download ``url`` with ``session`` and store the response body as ``temp_dir / filename``.
+
+    :return: Path of the written file, or None on a non-200 response or any error.
+    """
     try:
-        response = session.get(url) 
+        response = session.get(url)
         if response.status_code == 200:
-            content = requests.get(url).text
-            file_path = temp_dir / filename 
-            with open(file_path, 'w', encoding='utf-8') as f:
-                f.write(content)
+            file_path = temp_dir / filename
+            # Write the raw bytes: a decode/re-encode round trip through a text-mode
+            # file would translate newlines on Windows and alter the content.
+            file_path.write_bytes(response.content)
             return pathlib.Path(file_path)
         else:
             print(f"Failed to download {filename}: {response.status_code}")
             return None
     except Exception as e:
         print(f"Error downloading {filename}: {e}")
-        return None
\ No newline at end of file
+        return None

From 14fc040afd4b7f7268b937477dce56bd7695fbf3 Mon Sep 17 00:00:00 2001
From: Aida Jafarbigloo <aida.jafarbigloo.274@gmail.com>
Date: Fri, 6 Jun 2025 09:26:36 +0000
Subject: [PATCH 19/19] Remove unused token field from CffHarvestSettings

---
 src/hermes/commands/harvest/cff.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/hermes/commands/harvest/cff.py b/src/hermes/commands/harvest/cff.py
index 8dd36ef2..57fcf19f 100644
--- a/src/hermes/commands/harvest/cff.py
+++ b/src/hermes/commands/harvest/cff.py
@@ -32,7 +32,6 @@
 class CffHarvestSettings(BaseModel):
     """Custom settings for CFF harvester."""
     enable_validation: bool = True
-    token: str = ''
 
 
 class CffHarvestPlugin(HermesHarvestPlugin):