Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ authors:
email: d.pape@hzdr.de
affiliation: Helmholtz-Zentrum Dresden-Rossendorf (HZDR)
orcid: 'https://orcid.org/0000-0002-3145-9880'
- given-names: Kernchen
family-names: Sophie
- given-names: Sophie
family-names: Kernchen
email: sophie.kernchen@dlr.de
affiliation: German Aerospace Center (DLR)
orcid: 'https://orcid.org/0009-0005-4430-6743'
Expand Down
1,598 changes: 972 additions & 626 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 4 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@ authors = [
{ name = "Oliver Bertuch", email = "o.bertuch@fz-juelich.de" },
{ name = "Oliver Knodel", email = "o.knodel@hzdr.de" },
{ name = "David Pape", email = "d.pape@hzdr.de" },
{ name = "Sophie Kernchen", email = "sohpie.kernchen@dlr.de" },
{ name = "Sophie Kernchen", email = "sophie.kernchen@dlr.de" },
{ name = "Nitai Heeb", email = "n.heeb@fz-juelich.de" },
]
maintainers = [
{ name = "Stephan Druskat", email = "stephan.druskat@dlr.de" },
]
readme = "README.md"
repository = "https://github.com/softwarepub/hermes"
keywords = ["software", "publication", "metadata", "automation"]

dependencies = [
Expand All @@ -39,6 +38,7 @@ dependencies = [
"pydantic-settings>=2.1.0, <3.0.0",
"requests-oauthlib>=2.0.0, <3.0.0",
"pynacl>=1.5.0, <2.0.0",
"easydataverse>=0.4.4, <0.5.0",
]
requires-python = ">=3.10, <4.0.0"

Expand All @@ -48,12 +48,10 @@ documentation = "https://hermes.software-metadata.pub"
repository = "https://github.com/softwarepub/hermes.git"
issues = "https://github.com/softwarepub/hermes/issues"


[project.scripts]
hermes = "hermes.commands.cli:main"
hermes-marketplace = "hermes.commands.marketplace:main"


[project.entry-points."hermes.harvest"]
cff = "hermes.commands.harvest.cff:CffHarvestPlugin"
codemeta = "hermes.commands.harvest.codemeta:CodeMetaHarvestPlugin"
Expand All @@ -62,15 +60,15 @@ file_exists = "hermes.commands.harvest.file_exists:FileExistsHarvestPlugin"
[project.entry-points."hermes.deposit"]
file = "hermes.commands.deposit.file:FileDepositPlugin"
invenio = "hermes.commands.deposit.invenio:InvenioDepositPlugin"
invenio_rdm = "hermes.commands.deposit.invenio_rdm:IvenioRDMDepositPlugin"
invenio_rdm = "hermes.commands.deposit.invenio_rdm:InvenioRDMDepositPlugin"
rodare = "hermes.commands.deposit.rodare:RodareDepositPlugin"
dataverse = "hermes.commands.deposit.dataverse:DataverseDepositPlugin"

[project.entry-points."hermes.postprocess"]
config_invenio_record_id = "hermes.commands.postprocess.invenio:config_record_id"
config_invenio_rdm_record_id = "hermes.commands.postprocess.invenio_rdm:config_record_id"
cff_doi = "hermes.commands.postprocess.invenio:cff_doi"


[tool.poetry.group.dev.dependencies]
pytest = "^7.1.1"
pytest-cov = "^3.0.0"
Expand Down Expand Up @@ -100,22 +98,19 @@ sphinx-togglebutton="^0.3.2"
reuse = "^1.1.2"
sphinxcontrib-datatemplates = "^0.11.0"


[tool.taskipy.tasks]
docs-build = "poetry run sphinx-build -M html docs/source docs/build -W"
docs-clean = "poetry run sphinx-build -M clean docs/source docs/build"
docs-live = "poetry run sphinx-autobuild docs/source docs/build"
flake8 = "poetry run flake8 ./test/ ./src/ --count --select=E9,F63,F7,F82 --statistics"


[tool.pytest.ini_options]
norecursedirs = "docs/*"
testpaths = [
"test"
]
addopts = "--cov=hermes --cov-report term"


[build-system]
requires = [
"poetry-core>=2.1.3, <3.0.0"
Expand Down
232 changes: 232 additions & 0 deletions src/hermes/commands/deposit/dataverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
# SPDX-FileCopyrightText: 2025 Forschungszentrum Jülich GmbH
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileContributor: Nitai Heeb

import json
import logging
from pathlib import Path

import requests
from easyDataverse import Dataset, Dataverse, License
from pydantic import BaseModel

from hermes.commands.deposit.base import BaseDepositPlugin
from hermes.commands.deposit.error import DepositionUnauthorizedError
from hermes.model.path import ContextPath
from hermes.utils import hermes_doi

_log = logging.getLogger("cli.deposit.dataverse")


class DataverseDepositSettings(BaseModel):
    """Settings required to deposit into Dataverse."""
    # Base URL of the target Dataverse installation, e.g. "https://demo.dataverse.org".
    site_url: str = ""
    # Alias of the collection (sub-Dataverse) to deposit into.
    target_collection: str = ""
    # API token used to authenticate against the Dataverse API.
    api_token: str = ""
    # Persistent identifier (DOI/Handle) of an existing dataset to update;
    # None for an initial deposit.
    # Fix: the field defaults to None, so the annotation must allow None —
    # a plain `str = None` default is rejected by pydantic validation.
    target_pid: str | None = None
    # Dataset type for the deposit (requires Dataverse dataset-types support).
    publication_type: str = "software"
    # Local files to upload as artifacts (pydantic copies mutable defaults per instance).
    files: list[Path] = []


class DataverseDepositPlugin(BaseDepositPlugin):
    """Deposit plugin that publishes metadata and artifacts to a Dataverse installation."""
    # Name used for settings lookup ("deposit.dataverse") and context paths.
    platform_name = "dataverse"
    # Settings model parsed from the hermes toml for this plugin.
    settings_class = DataverseDepositSettings
def __init__(self, command, ctx) -> None:
    """Set up the DataverseDepositPlugin with data from the hermes toml.

    Validates the configuration and creates an easyDataverse client.

    TODO(review): further config checks were suggested: note the server
    version and gate version-dependent features on it, vet/normalize the
    target_pid format (map to URI form), verify the dataset's publication
    type matches, and check whether the CodeMeta metadata block and
    incomplete-metadata deposition are available on the server.
    """
    super().__init__(command, ctx)
    platform = self.platform_name
    self.ctx_path = ContextPath.parse(f"deposit.{platform}")
    self.config = getattr(self.command.settings, platform)
    # Fail fast on invalid configuration before any deposit work starts.
    self.check_if_all_valid()
    self.client = Dataverse(server_url=self.config.site_url, api_token=self.config.api_token)

def check_if_all_valid(self) -> None:
    """Run every configuration sanity check before the deposit starts.

    Each check raises on failure, so returning normally means the
    configuration is usable.
    """
    for check in (
        self.check_version,
        self.check_api_token,
        self.check_target_collection,
        self.check_target_pid,
        self.check_publication_type,
    ):
        check()

def check_version(self) -> None:
    """Verify that ``site_url`` points at a reachable Dataverse instance.

    The reported Dataverse version is stored in the context in case it is
    needed later (e.g. to gate version-dependent features).
    """
    response = requests.get(f"{self.config.site_url}/api/info/version")
    if not response.ok:
        raise RuntimeError(f"Dataverse ({self.config.site_url}) not reachable.")
    version = response.json().get("data", {}).get("version", "")
    self.ctx.update(self.ctx_path["dataverse_version"], version)

def check_api_token(self) -> None:
    """Validate the configured API token against the Dataverse token endpoint.

    Raises:
        DepositionUnauthorizedError: if no token is configured, or the
            server rejects the token.
    """
    token = self.config.api_token
    if not token:
        raise DepositionUnauthorizedError("No api-token given for deposition platform (dataverse).")
    response = requests.get(
        f"{self.config.site_url}/api/users/token",
        headers={"X-Dataverse-key": token},
    )
    if not response.ok:
        raise DepositionUnauthorizedError("Given api-token for deposition platform (dataverse) is not valid.")

def check_target_collection(self) -> None:
    """Verify that the configured target collection exists on the server."""
    alias = self.config.target_collection
    response = requests.get(f"{self.config.site_url}/api/dataverses/{alias}")
    if not response.ok:
        raise RuntimeError(f"Dataverse collection '{alias}' not found.")

def check_target_pid(self) -> None:
    """Verify that the configured persistent identifier resolves to a dataset.

    A PID is optional; when absent the check is skipped. If the dataset
    exists but lives outside the target collection, only a warning is
    emitted (not a hard error).
    """
    pid = self.config.target_pid
    if not pid:
        return
    response = requests.get(
        f"{self.config.site_url}/api/datasets/:persistentId/?persistentId={pid}"
    )
    if not response.ok:
        raise RuntimeError(f"Dataset {pid} not found.")
    owner = response.json().get("data", {}).get("ownerAlias")
    if self.config.target_collection and owner != self.config.target_collection:
        _log.warning("Dataset is not located inside the target collection.")

def check_publication_type(self) -> None:
    """Verify the configured publication type (usually "software") is supported.

    Queries the server's dataset-type list and raises when the configured
    type is missing from it.
    """
    response = requests.get(f"{self.config.site_url}/api/datasets/datasetTypes")
    if not response.ok:
        # TBD what to do when showing supported datasetTypes does not work?
        # This is currently the case for https://data.fz-juelich.de/ &
        # https://data-beta.fz-juelich.de/ — treat as "cannot verify" for now.
        return
    supported = [entry["name"] for entry in response.json().get("data", [])]
    if self.config.publication_type not in supported:
        raise RuntimeError(
            f"Publication type '{self.config.publication_type}' not supported on target Dataverse.")

def map_metadata(self) -> None:
    """Store the harvested codemeta metadata so it is ready for deposit.

    easyDataverse sets metadata field by field later on, so no mapping is
    needed here; the codemeta document is kept verbatim in the context and
    mirrored to the deposit cache file.
    """
    metadata = self.ctx["codemeta"]
    self.ctx.update(self.ctx_path["depositionMetadata"], metadata)
    cache_path = self.ctx.get_cache("deposit", self.platform_name, create=True)
    with open(cache_path, 'w') as cache_file:
        json.dump(metadata, cache_file, indent=2)

def is_initial_publication(self) -> bool:
    """Return True when no existing dataset PID is configured, i.e. this is a brand-new deposit."""
    pid = self.config.target_pid
    return pid is None

def update_metadata_on_dataset(self, dataset: Dataset) -> None:
    """Set metadata on an easyDataverse ``Dataset`` using the depositionMetadata.

    Maps title, description, authors (the first author doubles as the
    dataset contact), deposit date, keywords and license. The "other
    references" field records that the metadata was compiled by HERMES.
    """
    metadata = self.ctx[self.ctx_path["depositionMetadata"]]
    dataset.citation.title = metadata.get("name", "")
    dataset.citation.subject = ["Other"]
    dataset.citation.add_ds_description(value=metadata.get("description", ""))

    # TODO(review): also map codemeta "contributor" entries (everyone who is
    # not an author). The Invenio/Rodare plugins do not map contributors yet
    # either; the CFF source currently only provides authors.
    authors = metadata.get("author", [])
    for i, author in enumerate(authors):
        # Prefer "familyName, givenName"; fall back to the combined "name" field.
        full_name = f"{author.get('familyName')}, {author.get('givenName')}" \
            if author.get("familyName") and author.get("givenName") \
            else author.get("name")
        affiliation_name = ""
        if affiliation_dict := author.get("affiliation"):
            affiliation_name = affiliation_dict.get("legalName", "")
        dataset.citation.add_author(name=full_name, affiliation=affiliation_name)
        if i == 0:
            # The first listed author serves as the dataset contact.
            dataset.citation.add_dataset_contact(name=full_name, email=author.get("email"))

    if date_published := metadata.get("datePublished"):
        dataset.citation.date_of_deposit = date_published
    # TODO look for "version" or something similar in dataverse
    # if version := metadata.get("version"):
    #     dataset.citation.softwareVersion = version
    keywords = metadata.get("keywords", [])
    # Fix: the original guard `if keywords is list:` compared the value
    # against the *type object* `list` (always False), so keywords were
    # never added. Use isinstance() to check for an actual list.
    if isinstance(keywords, list):
        for keyword in keywords:
            dataset.citation.add_keyword(keyword)
    if deposition_license := metadata.get("license"):
        try:
            dataverse_license = License.fetch_by_name(deposition_license, server_url=self.client.server_url)
            dataset.citation.license = dataverse_license
        except Exception as e:
            # Best effort: an unmatched license must not abort the deposit.
            _log.warning(f"Could not match license '{deposition_license}' to allowed licenses for deposition: {e}")
    dataset.citation.other_references = [f"Compiled by HERMES ({hermes_doi})"]

def create_initial_version(self) -> None:
    """Create the very first version of a publication.

    The generic DepositPlugin flow (create the record first, add metadata
    afterwards) does not fit easyDataverse, which requires certain metadata
    fields just to create a dataset. Therefore update_metadata_on_dataset()
    is applied both here and in update_metadata().
    """
    if not self.command.args.initial:
        raise RuntimeError("Please use `--initial` to make an initial deposition.")
    dataset = self.client.create_dataset()
    self.update_metadata_on_dataset(dataset)
    # Remember the new PID so later stages (upload, publish) can find the dataset.
    pid = dataset.upload(dataverse_name=self.config.target_collection)
    self.ctx.update(self.ctx_path["persistentId"], pid)

def create_new_version(self) -> None:
    """
    Creates a new version of an existing publication.

    Currently a stub: it validates that the dataset referenced by the
    context's persistentId exists and can be loaded, then only warns.
    TODO implement this
    """
    persistent_id = self.ctx[self.ctx_path["persistentId"]]
    if not persistent_id:
        raise RuntimeError("No persistent ID found in context. Cannot create new version.")
    dataset = self.client.load_dataset(persistent_id)
    if not dataset:
        raise RuntimeError(f"Could not load dataset for persistent ID {persistent_id}")
    # NOTE: intentionally does not modify the dataset — versioning is not implemented yet.
    _log.warning("Creating a new version of a dataset is not implemented right now.")

def update_metadata(self) -> None:
    """Update the dataset's metadata unless the dataset was just created.

    For an initial deposit, update_metadata_on_dataset() was already called
    inside create_initial_version(), so nothing needs to happen here.
    """
    if self.command.args.initial:
        return
    persistent_id = self.ctx[self.ctx_path["persistentId"]]
    dataset = self.client.load_dataset(persistent_id)
    self.update_metadata_on_dataset(dataset)
    res = dataset.update()
    if not res.ok:
        raise RuntimeError(f"Failed to update metadata: {res.status_code}: {res.text}")

def upload_artifacts(self) -> None:
    """Upload new artifacts to the current dataverse-dataset.

    Files come from two sources: the toml configuration and the `--file`
    command-line arguments (each CLI entry is a tuple whose first element
    is the path).
    """
    persistent_id = self.ctx[self.ctx_path["persistentId"]]
    dataset = self.client.load_dataset(persistent_id)
    configured = list(self.config.files)
    from_cli = [entry[0] for entry in self.command.args.file]
    for artifact in configured + from_cli:
        dataset.add_file(str(artifact))
    dataset.update()

def publish(self) -> None:
    """Publish the newly created dataset / publication as a major version.

    easyDataverse does not support publishing, so the native Dataverse API
    is called directly with requests.
    """
    persistent_id = self.ctx[self.ctx_path["persistentId"]]
    res = requests.post(
        f"{self.config.site_url}/api/datasets/:persistentId/actions/:publish",
        headers={"X-Dataverse-key": self.config.api_token},
        params={"type": "major"},
        data={"persistentId": persistent_id},
    )
    if not res.ok:
        raise RuntimeError(f"Publish failed: {res.status_code}: {res.text}")
15 changes: 0 additions & 15 deletions src/hermes/commands/deposit/invenio.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,21 +267,6 @@ def __init__(self, command: HermesDepositCommand, ctx: CodeMetaContext, client=N

if client is None:
auth_token = self.config.auth_token

# TODO reactivate this code again, once we use Zenodo OAuth again (once the refresh token works)
# If auth_token is a refresh-token, get the auth-token from that.
# if str(auth_token).startswith("REFRESH_TOKEN:"):
# _log.debug(f"Getting token from refresh_token {auth_token}")
# # TODO How do we know if this targets sandbox or not?
# # Now we assume it's sandbox
# connect_zenodo.setup(True)
# tokens = connect_zenodo.oauth_process() \
# .get_tokens_from_refresh_token(auth_token.split("REFRESH_TOKEN:")[1])
# _log.debug(f"Tokens: {str(tokens)}")
# auth_token = tokens.get("access_token", "")
# _log.debug(f"Auth Token: {auth_token}")
# # TODO Update the secret (github/lab token is needed)

if not auth_token:
raise DepositionUnauthorizedError("No valid auth token given for deposition platform")
self.client = self.invenio_client_class(self.config,
Expand Down
2 changes: 1 addition & 1 deletion src/hermes/commands/deposit/invenio_rdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def _search_license_info(self, _url: str, valid_licenses: dict) -> t.Optional[di
return None


class IvenioRDMDepositPlugin(InvenioDepositPlugin):
class InvenioRDMDepositPlugin(InvenioDepositPlugin):
platform_name = "invenio_rdm"
invenio_client_class = InvenioRDMClient
invenio_resolver_class = InvenioRDMResolver
Loading