diff --git a/docs/source/conf.py b/docs/source/conf.py index 9ee616da..7ab7b582 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -102,7 +102,7 @@ def read_version_from_pyproject(): 'sphinx_togglebutton', 'sphinxcontrib.datatemplates', # Custom extensions, see `_ext` directory. - 'plugin_markup', + # 'plugin_markup', ] language = 'en' diff --git a/docs/source/dev/data_model.md b/docs/source/dev/data_model.md index a5f4c727..8f008dc4 100644 --- a/docs/source/dev/data_model.md +++ b/docs/source/dev/data_model.md @@ -1,27 +1,314 @@ -# HERMES Data Model +# Data model -*hermes* uses an internal data model to store the output of the different stages. -All the data is collected in a directory called `.hermes` located in the root of the project directory. +`hermes`' internal data model acts like a contract between `hermes` and plugins. +It is based on [**JSON-LD (JSON Linked Data)**](https://json-ld.org/), and +the public API simplifies interaction with the data model through Python code. -You should not need to interact with this data directly. -Instead, use {class}`hermes.model.context.HermesContext` and respective subclasses to access the data in a consistent way. +Output of the different `hermes` commands consequently is valid JSON-LD, serialized as JSON, that is cached in +subdirectories of the `.hermes/` directory that is created in the root of the project directory. +The cache is purely for internal purposes, its data should not be interacted with. -## Harvest Data +Depending on whether you develop a plugin for `hermes`, or you develop `hermes` itself, you need to know either [_some_](#json-ld-for-plugin-developers), +or _quite a few_ things about JSON-LD. -The data of the havesters is cached in the sub-directory `.hermes/harvest`. -Each harvester has a separate cache file to allow parallel harvesting. -The cache file is encoded in JSON and stored in `.hermes/harvest/HARVESTER_NAME.json` -where `HARVESTER_NAME` corresponds to the entry point name. 
+The following sections provide documentation of the data model. +They aim to help you get started with `hermes` plugin and core development, +even if you have no previous experience with JSON-LD. -{class}`hermes.model.context.HermesHarvestContext` encapsulates these harvester caches. +## The data model for plugin developers + +If you develop a plugin for `hermes`, you will only need to work with a single Python class and the public API +it provides: {class}`hermes.model.SoftwareMetadata`. + +To work with this class, you need to know _some_ things about JSON-LD. + +### JSON-LD for plugin developers + +```{attention} +Work in progress. +``` + + +### Working with the `hermes` data model in plugins + +> **Goal** +> Understand how plugins access the `hermes` data model and interact with it. + +`hermes` aims to hide as much of the data model as possible behind a public API +so that plugin developers do not have to deal with some of the more complex features of JSON-LD. + +#### Model instances in different types of plugin + +You can extend `hermes` with plugins for three different commands: `harvest`, `curate`, `deposit`. + +The commands differ in how they work with instances of the data model. + +- `harvest` plugins _create_ a single new model instance and return it. +- `curate` plugins are passed a single existing model instance (the output of `process`), +and return a single model instance. +- `deposit` plugins are passed a single existing model instance (the output of `curate`), +and return a single model instance. + +#### How plugins work with the API + +```{important} +Plugins access the data model _exclusively_ through the API class {class}`hermes.model.SoftwareMetadata`. +``` + +The following sections show how this class works. + +##### Creating a data model instance + +Model instances are primarily created in `harvest` plugins, but may also be created in other plugins to map +existing data into. 
+ +To create a new model instance, initialize {class}`hermes.model.SoftwareMetadata`: + +```{code-block} python +:caption: Initializing a default data model instance +from hermes.model import SoftwareMetadata + +data = SoftwareMetadata() +``` + +`SoftwareMetadata` objects initialized without arguments provide the default _context_ +(see [_JSON-LD for plugin developers_](#json-ld-for-plugin-developers)). +This means that now, you can use terms from the schemas included in the default context to describe software metadata. + +Terms from [_CodeMeta_](https://codemeta.github.io/terms/) can be used without a prefix: + +```{code-block} python +:caption: Using terms from the default schema +data["readme"] = ... +``` + +Terms from [_Schema.org_](https://schema.org/) can be used with the prefix `schema`: + +```{code-block} python +:caption: Using terms from a non-default schema +data["schema:copyrightNotice"] = ... +``` + +You can also use other linked data vocabularies. To do this, you need to identify them with a prefix and register them +with the data model by passing it `extra_vocabs` as a `dict` mapping prefixes to URLs where the vocabularies are +provided as JSON-LD: + +```{code-block} python +:caption: Injecting additional schemas +from hermes.model import SoftwareMetadata + +# Contents served at https://bar.net/schema.jsonld: +# { +# "@context": +# { +# "baz": "https://schema.org/Thing" +# } +# } + +data = SoftwareMetadata(extra_vocabs={"foo": "https://bar.net/schema.jsonld"}) + +data["foo:baz"] = ... +``` + +##### Adding data + +Once you have an instance of {class}`hermes.model.SoftwareMetadata`, you can add data to it, +i.e., metadata that describes software: + +```{code-block} python +:caption: Setting data values +data["name"] = "My Research Software" # A simple "Text"-type value +# → Simplified model representation : { "name": [ "My Research Software" ] } +# Cf. 
"Accessing data" below +data["author"] = {"name": "Foo"} # An object value that uses terms available in the defined context +# → Simplified model representation : { "name": [ "My Research Software" ], "author": [ { "name": "Foo" } ] } +# Cf. "Accessing data" below +``` + +##### Accessing data + +You need to be able to access data in the data model instance to add, edit or remove data. +Data can be accessed by using term strings, similar to how values in Python `dict`s are accessed by keys. + +```{important} +When you access data from a data model instance, +it will always be returned in a **list**-like object! +``` + +The reason for providing data in list-like objects is that JSON-LD treats all property values as arrays. +Even if you add "single value" data to a `hermes` data model instance via the API, the underlying JSON-LD model +will treat it as an array, i.e., a list-like object: + +```{code-block} python +:caption: Internal data values are arrays +data["name"] = "My Research Software" # → [ "My Research Software" ] +data["author"] = {"name": "Foo"} # → [ { "name": [ "Foo" ] } ] +``` + +Therefore, you access data in the same way you would access data from a Python `list`: + +1. You access single values using indices, e.g., `data["name"][0]`. +2. You can use a list-like API to interact with data objects, e.g., +`data["name"].append("Bar")`, `data["name"].extend(["Bar", "Baz"])`, `for name in data["name"]: ...`, etc. + +##### Interacting with data + +The following longer example shows different ways that you can interact with `SoftwareMetadata` objects and the data API. + +```{code-block} python +:caption: Building the data model +from hermes.model import SoftwareMetadata + +# Create the model object with the default context +data = SoftwareMetadata() + +# Let's create author metadata for our software! +# Below each line of code, the value of `data["author"]` is given. 
+ +data["author"] = {"name": "Foo"} +# → [{'name': ['Foo']}] + +data["author"].append({"name": "Bar"}) +# [{'name': ['Foo']}, {'name': ['Bar']}] + +data["author"][0]["email"] = "foo@baz.net" +# [{'name': ['Foo'], 'email': ['foo@baz.net']}, {'name': ['Bar']}] + +data["author"][1]["email"].append("bar@baz.net") +# [{'name': ['Foo'], 'email': ['foo@baz.net']}, {'name': ['Bar'], 'email': ['bar@baz.net']}] + +data["author"][1]["email"].extend(["bar@spam.org", "bar@eggs.com"]) +# [ +# {'name': ['Foo'], 'email': ['foo@baz.net']}, +# {'name': ['Bar'], 'email': ['bar@baz.net', 'bar@spam.org', 'bar@eggs.com']} +# ] +``` + +The example continues to show how to iterate through data. + +```{code-block} python +:caption: for-loop, containment check +for i, author in enumerate(data["author"]): + if author["name"][0] in ["Foo", "Bar"]: + print(f"Author {i + 1} has expected name.") + else: + raise ValueError("Unexpected author name found!", author["name"][0]) + +# Mock output: +# $> Author 1 has expected name. +# $> Author 2 has expected name. +``` + +```{code-block} python +:caption: Value check +for email in data["author"][0]["email"]: + if email.endswith(".edu"): + print("Author has an email address at an educational institution.") + else: + print("Cannot confirm affiliation with educational institution for author.") + +# Mock output +# $> Cannot confirm affiliation with educational institution for author. +``` + +```{code-block} python +:caption: Value check with `all()` and a generator expression +if all("bar" in email for email in data["author"][1]["email"]): + print("Author has only emails with their name in it.") + +# Mock output +# $> Author has only emails with their name in it. +``` + +The example continues to show how to assert data values. + +As mentioned in the [introduction to the data model](#data-model), +`hermes` uses a JSON-LD-like internal data model. 
+The API class {class}`hermes.model.SoftwareMetadata` hides many +of the more complex aspects of JSON-LD and makes it easy to work +with the data model. + +Assertions, however, operate on the internal model objects. +Therefore, they may not work as you would expect from plain +Python data: + +```{code-block} python +:caption: Naive containment assertion that raises +:emphasize-lines: 5,13 +try: + assert ( + {'name': ['Foo'], 'email': ['foo@baz.net']} + in + data["author"] + ) + print("The author was found!") +except AssertionError: + print("The author could not be found.") + raise + +# Mock output +# $> The author could not be found. +# $> AssertionError: +# assert +# {'email': ['foo@baz.net'], 'name': ['Foo']} +# in +# _LDList( +# {'@list': [ +# { +# 'http://schema.org/name': [{'@value': 'Foo'}], +# 'http://schema.org/email': [{'@value': 'foo@baz.net'}] +# }, +# { +# 'http://schema.org/name': [{'@value': 'Bar'}], +# 'http://schema.org/email': [ +# {'@list': [ +# {'@value': 'bar@baz.net'}, {'@value': 'bar@spam.org'}, {'@value': 'bar@eggs.com'} +# ]} +# ] +# }] +# } +# ) +``` + +The mock output in the example above shows the inequality of the expected and the actual value. +The actual value is an internal data type wrapping the more complex JSON-LD data. + +The complex data structure of JSON-LD is internally constructed in the `hermes` data +model, and to make it possible to work with only the data that is important - the actual terms +and their values - the internal data model types provide a function `.to_python()`. +This function can be used in assertions to assert full data integrity: + +```{code-block} python +:caption: Containment assertion with `to_python()` +:emphasize-lines: 5,13 +try: + assert ( + {'name': ['Foo'], 'email': ['foo@baz.net']} + in + data["author"].to_python() + ) + print("The author was found!") +except AssertionError: + print("The author could not be found.") + raise + +# Mock output +# $> The author was found! 
+``` + +--- + +## See Also + +- API reference: {class}`hermes.model.SoftwareMetadata` diff --git a/poetry.lock b/poetry.lock index 0eaf104b..34fa117f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "accessible-pygments" @@ -700,6 +700,19 @@ files = [ {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] +[[package]] +name = "isodate" +version = "0.7.2" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version == \"3.10\"" +files = [ + {file = "isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15"}, + {file = "isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6"}, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -1139,6 +1152,24 @@ files = [ dev = ["abi3audit", "black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] test = ["pytest", "pytest-xdist", "setuptools"] +[[package]] +name = "pyaml" +version = "25.7.0" +description = "PyYAML-based module to produce a bit more pretty and readable YAML-serialized data" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pyaml-25.7.0-py3-none-any.whl", hash = "sha256:ce5d7867cc2b455efdb9b0448324ff7b9f74d99f64650f12ca570102db6b985f"}, + {file = "pyaml-25.7.0.tar.gz", hash = "sha256:e113a64ec16881bf2b092e2beb84b7dcf1bd98096ad17f5f14e8fb782a75d99b"}, +] + +[package.dependencies] +PyYAML = "*" + +[package.extras] +anchors = ["unidecode"] + [[package]] 
name = "pycodestyle" version = "2.9.1" @@ -1594,7 +1625,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -1651,6 +1682,29 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "rdflib" +version = "7.2.1" +description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information." +optional = false +python-versions = ">=3.8.1" +groups = ["main"] +files = [ + {file = "rdflib-7.2.1-py3-none-any.whl", hash = "sha256:1a175bc1386a167a42fbfaba003bfa05c164a2a3ca3cb9c0c97f9c9638ca6ac2"}, + {file = "rdflib-7.2.1.tar.gz", hash = "sha256:cf9b7fa25234e8925da8b1fb09700f8349b5f0f100e785fb4260e737308292ac"}, +] + +[package.dependencies] +isodate = {version = ">=0.7.2,<1.0.0", markers = "python_version < \"3.11\""} +pyparsing = ">=2.1.0,<4" + +[package.extras] +berkeleydb = ["berkeleydb (>=18.1.0,<19.0.0)"] +html = ["html5rdf (>=1.2,<2)"] +lxml = ["lxml (>=4.3,<6.0)"] +networkx = ["networkx (>=2,<4)"] +orjson = ["orjson (>=3.9.14,<4)"] + [[package]] name = "requests" version = "2.32.4" @@ -1806,6 +1860,21 @@ files = [ {file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"}, ] +[[package]] +name = "schemaorg" +version = "0.1.1" +description = "Python functions for applied use of schema.org" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "schemaorg-0.1.1.tar.gz", hash = 
"sha256:567f1735df666221c893d2c206dd70f9cddcc983c8cdc39f3a7b7726884d2c51"}, +] + +[package.dependencies] +lxml = ">=4.1.1" +pyaml = ">=17.12.1" + [[package]] name = "setuptools" version = "80.9.0" @@ -2477,4 +2546,4 @@ files = [ [metadata] lock-version = "2.1" python-versions = ">=3.10, <4.0.0" -content-hash = "58304fd33d6ec1ce3400b43ecffb16b3f48a5621e513c3e8057f9e3e050835e8" +content-hash = "e76de51d1f5dd86486d4cc24a5cdf7d007b16ce5d9d0cc3f7d0f353cf0defff0"