diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..42ab51c0
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,36 @@
+name: Test CaltechAuthors Matcher
+
+on:
+  push:
+    paths:
+      - 'ames/matchers/caltechauthors.py'
+      - 'tests/**'
+  pull_request:
+    paths:
+      - 'ames/matchers/caltechauthors.py'
+      - 'tests/**'
+  workflow_dispatch:
+
+jobs:
+  test-caltechauthors:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # tolerated so the job still runs when requirements.txt is absent
+          pip install -r requirements.txt || true
+
+      - name: Run tests for caltechauthors
+        env:
+          RDMTOK: ${{ secrets.RDMTOK }}
+        run: |
+          PYTHONPATH=${{ github.workspace }} python -m unittest discover -s tests -p 'test_matchers.py'
diff --git a/CITATION.cff b/CITATION.cff
index 3381ac61..ff07e63a 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -8,6 +8,9 @@ authors:
   - family-names: Doiel
     given-names: Robert
     orcid: https://orcid.org/0000-0003-0900-6903
+  - family-names: Bhattarai
+    given-names: Rohan
+    orcid: https://orcid.org/0009-0007-0323-4733
   - family-names: Won
     given-names: Elizabeth
     orcid: https://orcid.org/0009-0002-2450-6471
@@ -21,4 +24,4 @@ keywords:
   - GitHub
   - metadata
   - software
-date-released: 2025-05-19
+date-released: 2025-06-04
diff --git a/add_orcid_script.py b/add_orcid_script.py
index 92d0d6f4..0bd59160 100644
--- a/add_orcid_script.py
+++ b/add_orcid_script.py
@@ -1,9 +1,11 @@
-import csv,os
+import csv, os
 
-with open('orcids.csv', 'r') as f:
+with open("orcids.csv", "r") as f:
     reader = csv.reader(f)
     orcid_list = list(reader)
 for orcid_data in orcid_list:
     orcid = orcid_data[8]
     clpid = orcid_data[10]
-    os.system(f'python run_authors_name_update.py {clpid} {orcid} -add -new-scheme orcid')
+    os.system(
+        f"python run_authors_name_update.py {clpid} {orcid} -add -new-scheme orcid"
+    )
diff --git a/ames/harvesters/__init__.py b/ames/harvesters/__init__.py
index 1add4aea..5e290431 100644
--- a/ames/harvesters/__init__.py
+++ b/ames/harvesters/__init__.py
@@ -24,3 +24,4 @@ from .caltechauthors import is_file_present
 from .caltechauthors import get_series_records
 from .caltechauthors import generate_data_citation_csv
+from .caltechauthors import get_data_availability_links
diff --git a/ames/harvesters/caltechauthors.py b/ames/harvesters/caltechauthors.py
index 0695784c..454801d2 100644
--- a/ames/harvesters/caltechauthors.py
+++ b/ames/harvesters/caltechauthors.py
@@ -205,9 +205,7 @@ def get_author_records(
     query = f'?q=metadata.creators.person_or_org.identifiers.identifier%3A"{author_identifier}"'
 
     if date:
-        query += (
-            f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D"
-        )
+        query += f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D"
 
     if token:
         headers = {
@@ -482,3 +480,61 @@ def generate_data_citation_csv():
     )
 
     print(f"Saved {len(all_citations)} citations to {output_file}")
+
+
+def get_data_availability_links(token=None, size=25):
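+    """Harvest records whose additional descriptions are typed
+    'data-availability', classify each https link found in those
+    descriptions, and return one row dict per link for csv.DictWriter."""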
"application/json", + } + + response = requests.get(base_url, headers=headers) + if response.status_code != 200: + print( + f"Error: Unable to fetch records from the API. Status code: {response.status_code}" + ) + exit() + + records = response.json().get("hits", {}).get("hits", []) + + if not records: + print("No records found.") + exit() + + results = [] + for record in records: + record_id = record.get("id") + links = record.get("metadata", {}).get("additional_descriptions", []) + + for link_data in links: + description = link_data.get("description", "") + links_in_description = extract_https_links(description) + for link in links_in_description: + classification = classify_link(link) + cleaned = clean_link(link) + filename = extract_filename_from_link(link) + file_present = is_file_present(record_id, filename) + + results.append( + { + "record_id": record_id, + "original_link": link, + "classification": classification, + "cleaned_link": cleaned, + "filename": filename, + "file_present": file_present, + } + ) + + return results diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py index a5923f56..69fb09bc 100644 --- a/ames/matchers/__init__.py +++ b/ames/matchers/__init__.py @@ -24,3 +24,4 @@ from .caltechauthors import save_metadata_to_file from .caltechauthors import add_related_identifiers_from_csv from .caltechauthors import add_authors_affiliations +from .caltechauthors import process_link_updates diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index 29fd02af..d1956bfd 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -342,7 +342,7 @@ def move_doi(record, token, test=False): ) -def add_related_identifiers_from_csv(csv_path, test=False): +def add_related_identifiers_from_csv(data_rows, token, test=False): """Reads a CSV file and adds related identifiers to each record using the CaltechDATA API.""" base_url = ( @@ -354,108 +354,173 @@ def add_related_identifiers_from_csv(csv_path, test=False): "Authorization": f"Bearer {token}", "Content-type": "application/json", } + results = [] + for row in data_rows: + record_id = row["Test_ID"] + doi = row["CaltechAUTHORS_DOI"] + caltech_author_id = row["CaltechAUTHORS_ID"] + resource_type = row["resource_type"] - with open(csv_path, "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - record_id = row["Test_ID"] - doi = row["CaltechAUTHORS_DOI"] - caltech_author_id = row["CaltechAUTHORS_ID"] - resource_type = row["resource_type"] + print( + f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}" + ) + print(f"Using resource_type: {resource_type}") - print( - f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}" - ) - print(f"Using resource_type: {resource_type}") + # Fetch the current record + response = requests.get(f"{base_url}/api/records/{record_id}", headers=headers) + if response.status_code != 200: + print(f"Error fetching record {record_id}: {response.status_code}") + continue + record_data = response.json() - # Fetch the current record - response = requests.get( - f"{base_url}/api/records/{record_id}", headers=headers + # Draft check or create + draft_response = requests.get( + f"{base_url}/api/records/{record_id}/draft", headers=headers + ) + if draft_response.status_code == 200: + record_data = draft_response.json() + else: + draft_create_response = requests.post( + f"{base_url}/api/records/{record_id}/draft", headers=headers ) - if response.status_code 
+    results = []
+    for row in data_rows:
+        record_id = row["Test_ID"]
+        doi = row["CaltechAUTHORS_DOI"]
+        caltech_author_id = row["CaltechAUTHORS_ID"]
+        resource_type = row["resource_type"]
 
-    with open(csv_path, "r") as csvfile:
-        reader = csv.DictReader(csvfile)
-        for row in reader:
-            record_id = row["Test_ID"]
-            doi = row["CaltechAUTHORS_DOI"]
-            caltech_author_id = row["CaltechAUTHORS_ID"]
-            resource_type = row["resource_type"]
+        print(
+            f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}"
+        )
+        print(f"Using resource_type: {resource_type}")
 
-            print(
-                f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}"
-            )
-            print(f"Using resource_type: {resource_type}")
+        # Fetch the current record
+        response = requests.get(f"{base_url}/api/records/{record_id}", headers=headers)
+        if response.status_code != 200:
+            print(f"Error fetching record {record_id}: {response.status_code}")
+            results.append((record_id, False))
+            continue
+        record_data = response.json()
 
-            # Fetch the current record
-            response = requests.get(
-                f"{base_url}/api/records/{record_id}", headers=headers
-            )
-            if response.status_code != 200:
-                print(f"Error fetching record {record_id}: {response.status_code}")
-                continue
-            record_data = response.json()
+        # Draft check or create
+        draft_response = requests.get(
+            f"{base_url}/api/records/{record_id}/draft", headers=headers
+        )
+        if draft_response.status_code == 200:
+            record_data = draft_response.json()
+        else:
+            draft_create_response = requests.post(
+                f"{base_url}/api/records/{record_id}/draft", headers=headers
+            )
+            if draft_create_response.status_code != 201:
+                print(f"Error creating draft: {draft_create_response.status_code}")
+                results.append((record_id, False))
+                continue
+            record_data = draft_create_response.json()
 
-            # Draft check or create
-            draft_response = requests.get(
-                f"{base_url}/api/records/{record_id}/draft", headers=headers
-            )
-            if draft_response.status_code == 200:
-                record_data = draft_response.json()
-            else:
-                draft_create_response = requests.post(
-                    f"{base_url}/api/records/{record_id}/draft", headers=headers
-                )
-                if draft_create_response.status_code != 201:
-                    print(f"Error creating draft: {draft_create_response.status_code}")
-                    continue
-                record_data = draft_create_response.json()
+        related_identifiers = (
+            record_data.get("metadata", {}).get("related_identifiers", []) or []
+        )
 
-            related_identifiers = (
-                record_data.get("metadata", {}).get("related_identifiers", []) or []
-            )
+        doi_exists = any(ri.get("identifier") == doi for ri in related_identifiers)
+        author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}"
+        author_url_exists = any(
+            ri.get("identifier") == author_url for ri in related_identifiers
+        )
 
-            doi_exists = any(ri.get("identifier") == doi for ri in related_identifiers)
-            author_url = (
-                f"https://authors.library.caltech.edu/records/{caltech_author_id}"
-            )
-            author_url_exists = any(
-                ri.get("identifier") == author_url for ri in related_identifiers
-            )
+        if not doi_exists:
+            related_identifiers.append(
+                {
+                    "relation_type": {"id": "issupplementedby"},
+                    "identifier": doi,
+                    "scheme": "doi",
+                    "resource_type": {"id": resource_type},
+                }
+            )
+            print(f"Adding DOI: {doi}")
+        else:
+            print("DOI already exists")
+
+        if not author_url_exists:
+            related_identifiers.append(
+                {
+                    "relation_type": {"id": "isreferencedby"},
+                    "identifier": author_url,
+                    "scheme": "url",
+                    "resource_type": {"id": resource_type},
+                }
+            )
+            print(f"Adding CaltechAUTHORS link: {author_url}")
+        else:
+            print("CaltechAUTHORS link already exists")
 
-            if not doi_exists:
-                related_identifiers.append(
-                    {
-                        "relation_type": {"id": "issupplementedby"},
-                        "identifier": doi,
-                        "scheme": "doi",
-                        "resource_type": {"id": resource_type},
-                    }
-                )
-                print(f"Adding DOI: {doi}")
-            else:
-                print(f"DOI already exists")
-
-            if not author_url_exists:
-                related_identifiers.append(
-                    {
-                        "relation_type": {"id": "isreferencedby"},
-                        "identifier": author_url,
-                        "scheme": "url",
-                        "resource_type": {"id": resource_type},
-                    }
-                )
-                print(f"Adding CaltechAUTHORS link: {author_url}")
-            else:
-                print(f"CaltechAUTHORS link already exists")
+        record_data["metadata"]["related_identifiers"] = related_identifiers
 
-            record_data["metadata"]["related_identifiers"] = related_identifiers
+        update_response = requests.put(
+            f"{base_url}/api/records/{record_id}/draft",
+            headers=headers,
+            json=record_data,
+        )
+        if update_response.status_code != 200:
+            print(f"Error updating draft: {update_response.status_code}")
+            results.append((record_id, False))
+            continue
 
-            update_response = requests.put(
-                f"{base_url}/api/records/{record_id}/draft",
-                headers=headers,
-                json=record_data,
-            )
-            if update_response.status_code != 200:
-                print(f"Error updating draft: {update_response.status_code}")
-                continue
+        publish_response = requests.post(
+            f"{base_url}/api/records/{record_id}/draft/actions/publish", headers=headers
+        )
+        if publish_response.status_code != 202:
+            print(
+                f"Error publishing record {record_id}: {publish_response.status_code}"
+            )
+            results.append((record_id, False))
+            continue
 
-            publish_response = requests.post(
-                f"{base_url}/api/records/{record_id}/draft/actions/publish",
-                headers=headers,
-            )
-            if publish_response.status_code != 202:
-                print(
-                    f"Error publishing record {record_id}: {publish_response.status_code}"
-                )
-                continue
+        print(f"Successfully updated and published {record_id}")
+        results.append((record_id, True))
+    return results
 
-            print(f"Successfully updated and published {record_id}")
-    print("All records processed.")
+
+def process_link_updates(input_csv):
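+    """Read link classifications from input_csv, check any DOI links,
+    fold the links into each record's related identifiers, and return
+    one result row per record for the results CSV."""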
+    # read the CSV file and build a dictionary:
+    # record_id -> {"links": [(link, classification), ...]}
+    records_data = {}
+    with open(input_csv, newline="") as f:
+        reader = csv.DictReader(f, delimiter=",")
+        for row in reader:
+            record_id = row["record_id"].strip()
+            link = row["link"].strip()
+            classification = row["classification"].strip()
+
+            if record_id not in records_data:
+                records_data[record_id] = {"links": []}
+            records_data[record_id]["links"].append((link, classification))
+
+    results = []
+
+    for record_id, record_info in records_data.items():
+        print(f"Processing record {record_id}")
+
+        # get metadata for the record
+        metadata = get_record_metadata(record_id)
+        if not metadata:
+            # if we failed to get metadata, record the error and continue
+            first_link = record_info["links"][0][0] if record_info["links"] else ""
+            results.append(
+                {
+                    "record_id": record_id,
+                    "link": first_link,
+                    "doi_check": None,
+                    "metadata_updated": False,
+                    "notes": "Failed to retrieve metadata",
+                }
+            )
+            continue
+
+        # check existing related identifiers in the record
+        related_identifiers = metadata.get("metadata", {}).get(
+            "related_identifiers", []
+        )
+
+        # run check_doi if a "doi" is present among the links
+        doi_check = None
+        for lk, ctype in record_info["links"]:
+            if ctype.lower() == "doi":
+                try:
+                    doi_check = check_doi(lk, production=True)
+                except Exception as e:
+                    doi_check = f"Error: {str(e)}"
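+        # (if several DOI-classified links are listed, doi_check keeps
+        # the outcome for the last one)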
found") exit() print(f"Requesting accessions") - file_obj.writerow(["title","identifier","accession_date","agent"] + block_fields()) + file_obj.writerow( + ["title", "identifier", "accession_date", "agent"] + block_fields() + ) for acc in repo.accessions: for uri in acc.subjects: if search_uri == uri.ref: diff --git a/run_authors_affiliation_enhancement.py b/run_authors_affiliation_enhancement.py index 9789bdfd..663d4ba6 100644 --- a/run_authors_affiliation_enhancement.py +++ b/run_authors_affiliation_enhancement.py @@ -23,13 +23,13 @@ args = parser.parse_args() author_identifier = args.author_identifier -#to_update = [get_metadata('6dmax-vx632',authors=True)] +# to_update = [get_metadata('6dmax-vx632',authors=True)] to_update = get_author_records(author_identifier, token, all_metadata=True) for record in to_update: add_authors_affiliations( - record, - token, - dimensions_key, - allowed_identifiers=ror, - ) + record, + token, + dimensions_key, + allowed_identifiers=ror, + ) diff --git a/run_authors_group_report.py b/run_authors_group_report.py index c91e1aa5..529e1b49 100644 --- a/run_authors_group_report.py +++ b/run_authors_group_report.py @@ -3,15 +3,15 @@ group_identifier = sys.argv[1] -#outfile = open(f"{group_identifier}_report.csv", "w") -#writer = csv.writer(outfile) +# outfile = open(f"{group_identifier}_report.csv", "w") +# writer = csv.writer(outfile) to_update = get_group_records(group_identifier) outfile = open(f"{group_identifier}_report.json", "w") outfile.write(json.dumps(to_update, indent=4)) -#for record in to_update: +# for record in to_update: # if "doi" not in record["pids"]: # metadata = record["metadata"] # publisher = "" diff --git a/run_authors_name_update.py b/run_authors_name_update.py index b1228f36..d1a6414b 100644 --- a/run_authors_name_update.py +++ b/run_authors_name_update.py @@ -20,7 +20,7 @@ old_identifier = args.old_identifier new_identifier = args.new_identifier -to_update = get_author_records(old_identifier,token) +to_update = get_author_records(old_identifier, token) for record in to_update: if args.add: edit_author_identifier( diff --git a/run_caltechauthors_get_links.py b/run_caltechauthors_get_links.py new file mode 100644 index 00000000..cd486026 --- /dev/null +++ b/run_caltechauthors_get_links.py @@ -0,0 +1,16 @@ +from ames.harvesters.caltechauthors import get_data_availability_links +import csv +import os + +output_file = "test_results_get_links.csv" +token = os.environ.get("RDMTOK") +results = get_data_availability_links(token=token) + +if results: + with open(output_file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=results[0].keys()) + writer.writeheader() + writer.writerows(results) + print(f"Saved {len(results)} links to {output_file}") +else: + print("No results.") diff --git a/run_caltechauthors_harvestors.py b/run_caltechauthors_harvestors.py index dc217f52..3e43d1a5 100644 --- a/run_caltechauthors_harvestors.py +++ b/run_caltechauthors_harvestors.py @@ -6,11 +6,13 @@ extract_https_links, clean_link, extract_filename_from_link, - is_file_present + is_file_present, ) base_url = "https://authors.library.caltech.edu/api/records?q=metadata.additional_descriptions.type.id%3A%22data-availability%22&size=25&sort=bestmatch" -base_file_url_template = "https://authors.library.caltech.edu/api/records/{record_id}/files" +base_file_url_template = ( + "https://authors.library.caltech.edu/api/records/{record_id}/files" +) token = os.environ.get("RDMTOK") @@ -25,7 +27,9 @@ response = requests.get(base_url, headers=headers) if 
+results = get_data_availability_links(token=token)
+
+if results:
+    with open(output_file, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=results[0].keys())
+        writer.writeheader()
+        writer.writerows(results)
+    print(f"Saved {len(results)} links to {output_file}")
+else:
+    print("No results.")
diff --git a/run_caltechauthors_harvestors.py b/run_caltechauthors_harvestors.py
index dc217f52..3e43d1a5 100644
--- a/run_caltechauthors_harvestors.py
+++ b/run_caltechauthors_harvestors.py
@@ -6,11 +6,13 @@
     extract_https_links,
     clean_link,
     extract_filename_from_link,
-    is_file_present
+    is_file_present,
 )
 
 base_url = "https://authors.library.caltech.edu/api/records?q=metadata.additional_descriptions.type.id%3A%22data-availability%22&size=25&sort=bestmatch"
-base_file_url_template = "https://authors.library.caltech.edu/api/records/{record_id}/files"
+base_file_url_template = (
+    "https://authors.library.caltech.edu/api/records/{record_id}/files"
+)
 
 token = os.environ.get("RDMTOK")
 
@@ -25,7 +27,9 @@
 response = requests.get(base_url, headers=headers)
 if response.status_code != 200:
-    print(f"Error: Unable to fetch records from the API. Status code: {response.status_code}")
+    print(
+        f"Error: Unable to fetch records from the API. Status code: {response.status_code}"
+    )
     exit()
 
 records = response.json().get("hits", {}).get("hits", [])
@@ -48,14 +52,16 @@
             filename = extract_filename_from_link(link)
             file_present = is_file_present(record_id, filename)
 
-            results.append({
-                "record_id": record_id,
-                "original_link": link,
-                "classification": classification,
-                "cleaned_link": cleaned,
-                "filename": filename,
-                "file_present": file_present
-            })
+            results.append(
+                {
+                    "record_id": record_id,
+                    "original_link": link,
+                    "classification": classification,
+                    "cleaned_link": cleaned,
+                    "filename": filename,
+                    "file_present": file_present,
+                }
+            )
 
 if results:
     with open(output_file, "w", newline="") as f:
diff --git a/run_caltechauthors_matchers.py b/run_caltechauthors_matchers.py
index 3d71948c..a5eb361a 100644
--- a/run_caltechauthors_matchers.py
+++ b/run_caltechauthors_matchers.py
@@ -25,9 +25,7 @@ def main():
             classification = row["classification"].strip()
 
             if record_id not in records_data:
-                records_data[record_id] = {
-                    "links": []
-                }
+                records_data[record_id] = {"links": []}
             records_data[record_id]["links"].append((link, classification))
 
     results = []
@@ -52,11 +50,13 @@ def main():
             continue
 
         # check existing related identifiers in the record
-        related_identifiers = metadata.get("metadata", {}).get("related_identifiers", [])
+        related_identifiers = metadata.get("metadata", {}).get(
+            "related_identifiers", []
+        )
 
         # run check_doi if a "doi" is present among the links
         doi_check = None
-        for (lk, ctype) in record_info["links"]:
+        for lk, ctype in record_info["links"]:
             if ctype.lower() == "doi":
                 try:
                     doi_check = check_doi(lk, production=True)
@@ -65,7 +65,7 @@ def main():
 
         # update related identifiers
         updated_metadata, updated_flag = update_related_identifiers(
-                metadata, record_info["links"], source_type="data"
+            metadata, record_info["links"], source_type="data"
         )
         if updated_flag:
             # saving to local JSON file for reference
@@ -84,5 +84,6 @@ def main():
             }
         )
 
+
 if __name__ == "__main__":
     main()
diff --git a/run_caltechauthors_update_links.py b/run_caltechauthors_update_links.py
new file mode 100644
index 00000000..7f58fc48
--- /dev/null
+++ b/run_caltechauthors_update_links.py
@@ -0,0 +1,16 @@
+from ames.matchers.caltechauthors import process_link_updates
+import csv
+
+input_file = "non_publisher_links.csv"
+output_file = "test_results_update_links.csv"
+
+results = process_link_updates(input_file)
+
+if results:
+    with open(output_file, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=results[0].keys())
+        writer.writeheader()
+        writer.writerows(results)
+    print(f"Saved update results to {output_file}")
+else:
+    print("No results.")
diff --git a/run_harvest_links.py b/run_harvest_links.py
new file mode 100644
index 00000000..a5eb361a
--- /dev/null
+++ b/run_harvest_links.py
@@ -0,0 +1,89 @@
+import csv
+
+from ames.matchers.caltechauthors import (
+    get_record_metadata,
+    update_related_identifiers,
+    save_metadata_to_file,
+    check_doi,
+)
+
+
+def main():
+    input_file = "non_publisher_links.csv"
+    output_file = "test_results_matchers.csv"
+
+    # read the CSV file and build a dictionary:
+    # record_id -> {"links": [(link, classification), ...]}
+    records_data = {}
+    with open(input_file, newline="") as f:
+        reader = csv.DictReader(f, delimiter=",")
+        for row in reader:
row["record_id"].strip() + link = row["link"].strip() + classification = row["classification"].strip() + + if record_id not in records_data: + records_data[record_id] = {"links": []} + records_data[record_id]["links"].append((link, classification)) + + results = [] + + for record_id, record_info in records_data.items(): + print(f"Processing record {record_id}") + + # get metadata for the record + metadata = get_record_metadata(record_id) + if not metadata: + # if we failed to get metadata, record the error and continue + first_link = record_info["links"][0][0] if record_info["links"] else "" + results.append( + { + "record_id": record_id, + "link": first_link, + "doi_check": None, + "metadata_updated": False, + "notes": "Failed to retrieve metadata", + } + ) + continue + + # check existing related identifiers in the record + related_identifiers = metadata.get("metadata", {}).get( + "related_identifiers", [] + ) + + # run check_doi if a "doi" is present among the links + doi_check = None + for lk, ctype in record_info["links"]: + if ctype.lower() == "doi": + try: + doi_check = check_doi(lk, production=True) + except Exception as e: + doi_check = f"Error: {str(e)}" + + # update related identifiers + updated_metadata, updated_flag = update_related_identifiers( + metadata, record_info["links"], source_type="data" + ) + if updated_flag: + # saving to local JSON file for reference + save_metadata_to_file(updated_metadata, record_id) + pass + + # preparing the final row for the results CSV + first_link = record_info["links"][0][0] if record_info["links"] else "" + results.append( + { + "record_id": record_id, + "link": first_link, + "doi_check": doi_check, + "metadata_updated": updated_flag, + "notes": "", + } + ) + + +if __name__ == "__main__": + main() diff --git a/tests/test_matchers.py b/tests/test_matchers.py new file mode 100644 index 00000000..a04da0f9 --- /dev/null +++ b/tests/test_matchers.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import csv +import os +import random +import sys +import unittest + +import requests + +# Ensure the local project package is importable when the repo root is the CWD. 
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from ames.matchers.caltechauthors import add_related_identifiers_from_csv  # noqa: E402
+
+TOKEN = os.getenv("RDMTOK")
+CSV_FILE = "test.csv"
+
+print(
+    f"[init] RDMTOK present: {'YES' if TOKEN else 'NO'} "
+    f"(len={len(TOKEN) if TOKEN else 0})"
+)
+
+
+def load_test_data(from_csv: bool = True):
+    """Return rows for the upload function, from CSV when available."""
+    if from_csv and os.path.exists(CSV_FILE):
+        with open(CSV_FILE, newline="") as fh:
+            return list(csv.DictReader(fh))
+
+    doi_stub = random.randint(1000, 9999)
+    return [
+        {
+            "CaltechAUTHORS_ID": "bwww3-z8y74",
+            "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{doi_stub}",
+            "Related_DOI": "10.22002/D1.1458",
+            "Data_ID": "3hqgp-jhw61",
+            "Cross_Link": "No",
+            "Test_ID": "99s7k-d6f58",
+            "resource_type": "publication-article",
+        }
+    ]
+
+
+def verify_related_identifiers_on_site(rows, *, test: bool = True):
+    """Fetch each record and report which links are present or missing."""
+    base = (
+        "https://data.caltechlibrary.dev"
+        if test
+        else "https://data.caltechlibrary.caltech.edu"
+    )
+    headers = {"Authorization": f"Bearer {TOKEN}"}
+    results = []
+
+    for row in rows:
+        record_id = row["Test_ID"]
+        doi = row["CaltechAUTHORS_DOI"]
+        author_link = (
+            f"https://authors.library.caltech.edu/records/{row['CaltechAUTHORS_ID']}"
+        )
+
+        resp = requests.get(f"{base}/api/records/{record_id}", headers=headers)
+        print(f"[verify] {record_id}: {resp.status_code}")
+        if resp.status_code != 200:
+            print("  Error: could not fetch record from server.")
+            results.append((record_id, False))
+            continue
+
+        related = resp.json().get("metadata", {}).get("related_identifiers", [])
+        has_doi = any(x["identifier"] == doi for x in related)
+        has_author = any(x["identifier"] == author_link for x in related)
+
+        status_parts = [
+            "DOI link present" if has_doi else "DOI link missing",
+            "CaltechAUTHORS link present"
+            if has_author
+            else "CaltechAUTHORS link missing",
+        ]
+        print("  " + "; ".join(status_parts))
+
+        results.append((record_id, has_doi and has_author))
+
+    return results
+
+
+class TestCaltechDataUploader(unittest.TestCase):
+    @unittest.skipUnless(TOKEN, "needs RDMTOK to hit CaltechDATA API")
+    def test_add_and_verify_related_identifiers(self):
+        rows = load_test_data(from_csv=False)
+
+        uploads = add_related_identifiers_from_csv(rows, TOKEN, test=True)
+        for record_id, ok in uploads:
+            self.assertTrue(ok, f"upload failed for {record_id}")
+
+        verifies = verify_related_identifiers_on_site(rows, test=True)
+        for record_id, ok in verifies:
+            self.assertTrue(ok, f"verification failed for {record_id}")
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
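+
+# Local usage (assumes RDMTOK is exported in the shell):
+#   python -m unittest tests.test_matchers -v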