Skip to content

Commit dff8b4f

Browse files
Cleanup data links and add automated test
Co-authored-by: RohanBhattaraiNP <RohanBhattaraiNP@users.noreply.github.com>
1 parent eefde6f commit dff8b4f

18 files changed

+518
-119
lines changed

.github/workflows/test.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# CI workflow: runs the CaltechAuthors matcher tests whenever the matcher
# module or anything under tests/ changes, and on manual dispatch.
name: Test CaltechAuthors Matcher

on:
  push:
    paths:
      - 'ames/matchers/caltechauthors.py'
      - 'tests/**'
  pull_request:
    paths:
      - 'ames/matchers/caltechauthors.py'
      - 'tests/**'
  workflow_dispatch:

jobs:
  test-caltechauthors:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      # requirements.txt is treated as optional, but when it exists a failed
      # install now fails the job instead of being masked by `|| true`.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      # RDMTOK is exposed to the test process from the repository secrets.
      - name: Run tests for caltechauthors
        env:
          RDMTOK: ${{ secrets.RDMTOK }}
        run: |
          PYTHONPATH=${{ github.workspace }} python -m unittest discover -s tests -p 'test_matchers.py'

CITATION.cff

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ authors:
88
- family-names: Doiel
99
given-names: Robert
1010
orcid: https://orcid.org/0000-0003-0900-6903
11+
- family-names: Bhattarai
12+
given-names: Rohan
13+
orcid: https://orcid.org/0009-0007-0323-4733
1114
- family-names: Won
1215
given-names: Elizabeth
1316
orcid: https://orcid.org/0009-0002-2450-6471
@@ -21,4 +24,4 @@ keywords:
2124
- GitHub
2225
- metadata
2326
- software
24-
date-released: 2025-05-19
27+
date-released: 2025-06-04

add_orcid_script.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
"""Run ``run_authors_name_update.py`` once for every row of ``orcids.csv``.

Each row supplies two positional arguments to the update script: the value
in column 10 (used as the clpid) and the value in column 8 (used as the
ORCID). The script is always invoked with ``-add -new-scheme orcid``.
"""
import csv
import subprocess
import sys

# Zero-based CSV columns read from each row.
ORCID_COLUMN = 8
CLPID_COLUMN = 10


def main():
    """Read ``orcids.csv`` and invoke the update script for each usable row."""
    # newline="" is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open("orcids.csv", "r", newline="") as f:
        orcid_list = list(csv.reader(f))

    for row_number, orcid_data in enumerate(orcid_list, start=1):
        # Blank lines and truncated rows would otherwise raise IndexError
        # and abort the whole run.
        if len(orcid_data) <= max(ORCID_COLUMN, CLPID_COLUMN):
            print(
                f"Skipping row {row_number}: not enough columns",
                file=sys.stderr,
            )
            continue

        orcid = orcid_data[ORCID_COLUMN]
        clpid = orcid_data[CLPID_COLUMN]

        # An argument list (no shell) keeps CSV content from being
        # interpreted as shell syntax or split on whitespace.
        # sys.executable runs the update script with the same interpreter.
        completed = subprocess.run(
            [
                sys.executable,
                "run_authors_name_update.py",
                clpid,
                orcid,
                "-add",
                "-new-scheme",
                "orcid",
            ],
            check=False,
        )
        # Keep going on failure (as the os.system version did), but say so.
        if completed.returncode != 0:
            print(
                f"Row {row_number}: update exited with status "
                f"{completed.returncode}",
                file=sys.stderr,
            )


if __name__ == "__main__":
    main()

ames/harvesters/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@
2424
from .caltechauthors import is_file_present
2525
from .caltechauthors import get_series_records
2626
from .caltechauthors import generate_data_citation_csv
27+
from .caltechauthors import get_data_availability_links

ames/harvesters/caltechauthors.py

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,7 @@ def get_author_records(
205205
query = f'?q=metadata.creators.person_or_org.identifiers.identifier%3A"{author_identifier}"'
206206

207207
if date:
208-
query += (
209-
f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D"
210-
)
208+
query += f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D"
211209

212210
if token:
213211
headers = {
@@ -482,3 +480,61 @@ def generate_data_citation_csv():
482480
)
483481

484482
print(f"Saved {len(all_citations)} citations to {output_file}")
483+
484+
485+
def get_data_availability_links(token=None, size=25):
    """Collect https links from the descriptions of data-availability records.

    Queries the CaltechAUTHORS records API for records that have an
    additional description of type ``data-availability`` and, for every
    https link found in those records' additional descriptions, records how
    the link is classified, its cleaned form, the filename derived from it
    and whether that file is present on the record.

    Args:
        token: Optional bearer token for the API. When ``None``, the
            ``RDMTOK`` environment variable is used if it is set; with no
            token at all the request is sent without an Authorization
            header.
        size: Number of records to request from the API (``size`` query
            parameter).

    Returns:
        A list of dicts, one per link, with the keys ``record_id``,
        ``original_link``, ``classification``, ``cleaned_link``,
        ``filename`` and ``file_present``.

    Raises:
        SystemExit: If the API response status is not 200, or if the
            response contains no records. The exit code is left unset,
            matching the previous ``exit()`` calls.
    """
    base_url = (
        "https://authors.library.caltech.edu/api/records"
        "?q=metadata.additional_descriptions.type.id%3A%22data-availability%22"
        f"&size={size}&sort=bestmatch"
    )

    # An explicitly passed token wins; the environment is only a fallback.
    if token is None:
        token = os.environ.get("RDMTOK")

    headers = {}
    if token:
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-type": "application/json",
        }

    response = requests.get(base_url, headers=headers)
    if response.status_code != 200:
        print(
            f"Error: Unable to fetch records from the API. Status code: {response.status_code}"
        )
        # Same exception as the site-provided exit(), without relying on
        # the interactive helper being installed.
        raise SystemExit

    records = response.json().get("hits", {}).get("hits", [])

    if not records:
        print("No records found.")
        raise SystemExit

    results = []
    for record in records:
        record_id = record.get("id")
        # NOTE(review): every additional description of a matching record is
        # scanned, not only those typed "data-availability" - confirm that
        # this is intended.
        descriptions = record.get("metadata", {}).get("additional_descriptions", [])

        for link_data in descriptions:
            description = link_data.get("description", "")
            for link in extract_https_links(description):
                classification = classify_link(link)
                cleaned = clean_link(link)
                filename = extract_filename_from_link(link)
                file_present = is_file_present(record_id, filename)

                results.append(
                    {
                        "record_id": record_id,
                        "original_link": link,
                        "classification": classification,
                        "cleaned_link": cleaned,
                        "filename": filename,
                        "file_present": file_present,
                    }
                )

    return results

ames/matchers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@
2424
from .caltechauthors import save_metadata_to_file
2525
from .caltechauthors import add_related_identifiers_from_csv
2626
from .caltechauthors import add_authors_affiliations
27+
from .caltechauthors import process_link_updates

0 commit comments

Comments
 (0)