caltechlibrary
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 36 additions & 0 deletions
diff --git a/CITATION.cff b/CITATION.cff
Lines changed: 4 additions & 1 deletion
diff --git a/add_orcid_script.py b/add_orcid_script.py
Lines changed: 5 additions & 3 deletions
diff --git a/ames/harvesters/__init__.py b/ames/harvesters/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/ames/harvesters/caltechauthors.py b/ames/harvesters/caltechauthors.py
Lines changed: 59 additions & 3 deletions
diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,36 @@
+# Runs the test_matchers.py unittest module on pushes and pull requests that
+# touch ames/matchers/caltechauthors.py or anything under tests/, and on
+# manual dispatch.
+name: Test CaltechAuthors Matcher
+
+on:
+  push:
+    paths:
+      - 'ames/matchers/caltechauthors.py'
+      - 'tests/**'
+  pull_request:
+    paths:
+      - 'ames/matchers/caltechauthors.py'
+      - 'tests/**'
+  workflow_dispatch:
+
+jobs:
+  test-caltechauthors:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+
+    # `|| true` lets the job continue even when installing requirements.txt
+    # fails. NOTE(review): presumably deliberate; a broken install would then
+    # only surface as import errors in the test step - confirm.
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt || true  
+
+    # RDMTOK is taken from the workflow's secrets and exposed to the tests as
+    # an environment variable. PYTHONPATH is set to the checkout root,
+    # presumably so the tests can import the package without installing it.
+    - name: Run tests for caltechauthors
+      env:
+          RDMTOK: ${{ secrets.RDMTOK }} 
+      run: |
+        PYTHONPATH=${{ github.workspace }} python -m unittest discover -s tests -p 'test_matchers.py'
@@ -8,6 +8,9 @@ authors:
   - family-names: Doiel
     given-names: Robert
     orcid: https://orcid.org/0000-0003-0900-6903
+  - family-names: Bhattarai
+    given-names: Rohan
+    orcid: https://orcid.org/0009-0007-0323-4733
   - family-names: Won
     given-names: Elizabeth
     orcid: https://orcid.org/0009-0002-2450-6471
@@ -21,4 +24,4 @@ keywords:
   - GitHub
   - metadata
   - software
-date-released: 2025-05-19
+date-released: 2025-06-04
@@ -1,9 +1,11 @@
-import csv,os
+import csv, os
 
+# For every row of orcids.csv, run run_authors_name_update.py with the row's
+# clpid and ORCID and the flags `-add -new-scheme orcid`.
-with open('orcids.csv', 'r') as f:
+with open("orcids.csv", "r") as f:
     reader = csv.reader(f)
     orcid_list = list(reader)
     for orcid_data in orcid_list:
+        # 0-based column 8 is read as the ORCID and column 10 as the clpid.
         orcid = orcid_data[8]
         clpid = orcid_data[10]
-        os.system(f'python run_authors_name_update.py {clpid} {orcid} -add -new-scheme orcid')
+        # NOTE(review): both values are interpolated unquoted into a shell
+        # command string; this assumes orcids.csv is trusted - confirm.
+        os.system(
+            f"python run_authors_name_update.py {clpid} {orcid} -add -new-scheme orcid"
+        )
@@ -24,3 +24,4 @@
 from .caltechauthors import is_file_present
 from .caltechauthors import get_series_records
 from .caltechauthors import generate_data_citation_csv
+from .caltechauthors import get_data_availability_links
@@ -205,9 +205,7 @@ def get_author_records(
     query = f'?q=metadata.creators.person_or_org.identifiers.identifier%3A"{author_identifier}"'
 
     if date:
-        query += (
-            f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D"
-        )
+        query += f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D"
 
     if token:
         headers = {
@@ -482,3 +480,61 @@ def generate_data_citation_csv():
             )
 
     print(f"Saved {len(all_citations)} citations to {output_file}")
+
+
+def get_data_availability_links(token=None, size=25):
+    """Collect links found in the descriptions of data-availability records.
+
+    Queries the authors.library.caltech.edu records API for records that
+    carry an additional description of type ``data-availability``. For each
+    link that ``extract_https_links`` finds in a record's additional
+    descriptions, it records the classification, the cleaned link, the file
+    name, and whether that file is present on the record.
+
+    Args:
+        token: API bearer token. When omitted, the ``RDMTOK`` environment
+            variable is used; if neither is set, the request is sent without
+            an Authorization header.
+        size: Number of records requested from the API. Only this single
+            page of results is fetched.
+
+    Returns:
+        A list of dicts with the keys ``record_id``, ``original_link``,
+        ``classification``, ``cleaned_link``, ``filename`` and
+        ``file_present``. The list is empty when the request does not return
+        status 200 or when no records match.
+    """
+    # An explicitly passed token wins; the environment is only a fallback.
+    if token is None:
+        token = os.environ.get("RDMTOK")
+
+    # The page size comes from the ``size`` argument instead of being
+    # hard-coded into the query string.
+    url = (
+        "https://authors.library.caltech.edu/api/records"
+        "?q=metadata.additional_descriptions.type.id%3A%22data-availability%22"
+        f"&size={size}&sort=bestmatch"
+    )
+
+    headers = {}
+    if token:
+        headers = {
+            "Authorization": f"Bearer {token}",
+            "Content-type": "application/json",
+        }
+
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        print(
+            f"Error: Unable to fetch records from the API. Status code: {response.status_code}"
+        )
+        # Return rather than call exit(): a library function must not
+        # terminate the caller's interpreter.
+        return []
+
+    records = response.json().get("hits", {}).get("hits", [])
+    if not records:
+        print("No records found.")
+        return []
+
+    results = []
+    for record in records:
+        record_id = record.get("id")
+        # NOTE(review): every additional description of the record is
+        # scanned, not only the data-availability ones - confirm intended.
+        descriptions = record.get("metadata", {}).get("additional_descriptions", [])
+
+        for entry in descriptions:
+            # A description stored as null is treated like a missing one.
+            text = entry.get("description") or ""
+            for link in extract_https_links(text):
+                classification = classify_link(link)
+                cleaned = clean_link(link)
+                filename = extract_filename_from_link(link)
+                file_present = is_file_present(record_id, filename)
+
+                results.append(
+                    {
+                        "record_id": record_id,
+                        "original_link": link,
+                        "classification": classification,
+                        "cleaned_link": cleaned,
+                        "filename": filename,
+                        "file_present": file_present,
+                    }
+                )
+
+    return results
@@ -24,3 +24,4 @@
 from .caltechauthors import save_metadata_to_file
 from .caltechauthors import add_related_identifiers_from_csv
 from .caltechauthors import add_authors_affiliations
+from .caltechauthors import process_link_updates