From 4f06f50d913430b1786b9b6e186526d453044731 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Mon, 31 Mar 2025 02:55:44 -0700 Subject: [PATCH 01/19] Add files via upload --- ames/harvesters/editor.py | 133 +++++++++++++++++++++++++++++ ames/harvesters/output.csv | 168 +++++++++++++++++++++++++++++++++++++ 2 files changed, 301 insertions(+) create mode 100644 ames/harvesters/editor.py create mode 100644 ames/harvesters/output.csv diff --git a/ames/harvesters/editor.py b/ames/harvesters/editor.py new file mode 100644 index 00000000..6a24a38e --- /dev/null +++ b/ames/harvesters/editor.py @@ -0,0 +1,133 @@ +import csv +import json +import requests +from caltechdata_edit import caltechdata_edit + +# Read the CSV file +records = [] +with open('output.csv', 'r') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + records.append(row) + +# Access token for authentication +token = "moaclkv2MHDHoGklIZs7ABF5mmBZNcxKGgmHY4yyHaUAGbCuyO3DNXhehpL7" + +# Using the development system (production=False) +production = False +base_url = "https://data.caltechlibrary.dev" + +# Set up headers for API requests +headers = { + "Authorization": f"Bearer {token}", + "Content-type": "application/json", +} + +def add_related_identifier(record_id, doi, caltech_author_id): + """Add DOI and CaltechAUTHORS_ID to related identifiers directly using the API""" + print(f"Processing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}") + + # First, get the current record + response = requests.get(f"{base_url}/api/records/{record_id}", headers=headers) + + if response.status_code != 200: + print(f"Error fetching record {record_id}: {response.status_code}") + print(response.text) + return False + + record_data = response.json() + + # Check if there's already a draft + draft_response = requests.get(f"{base_url}/api/records/{record_id}/draft", headers=headers) + + if draft_response.status_code == 200: + # Use the draft if it exists + record_data = draft_response.json() + else: + # Create a draft if it doesn't exist + draft_create_response = requests.post( + f"{base_url}/api/records/{record_id}/draft", + headers=headers + ) + + if draft_create_response.status_code != 201: + print(f"Error creating draft for {record_id}: {draft_create_response.status_code}") + print(draft_create_response.text) + return False + + record_data = draft_create_response.json() + + # Update the related identifiers + related_identifiers = record_data.get("metadata", {}).get("related_identifiers", []) + if related_identifiers is None: + related_identifiers = [] + + # Check if DOI already exists + doi_exists = any(identifier.get("identifier") == doi for identifier in related_identifiers) + + # Check if CaltechAUTHORS_ID URL already exists + author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" + author_url_exists = any(identifier.get("identifier") == author_url for identifier in related_identifiers) + + # Add the DOI if it doesn't exist + if not doi_exists: + new_doi_identifier = { + "relation_type": {"id": "issupplementedby"}, + "identifier": doi, + "scheme": "doi", + "resource_type": {"id": "publication"} + } + related_identifiers.append(new_doi_identifier) + print(f"Adding DOI related identifier: {doi}") + else: + print(f"DOI {doi} already exists in related identifiers") + + # Add the CaltechAUTHORS_ID URL if it doesn't exist + if not author_url_exists: + new_author_identifier = { + "relation_type": {"id": "isreferencedby"}, + 
"identifier": author_url, + "scheme": "url", + "resource_type": {"id": "publication"} + } + related_identifiers.append(new_author_identifier) + print(f"Adding CaltechAUTHORS_ID related identifier: {author_url}") + else: + print(f"CaltechAUTHORS_ID URL {author_url} already exists in related identifiers") + + record_data["metadata"]["related_identifiers"] = related_identifiers + + # Update the draft + update_response = requests.put( + f"{base_url}/api/records/{record_id}/draft", + headers=headers, + json=record_data + ) + + if update_response.status_code != 200: + print(f"Error updating draft for {record_id}: {update_response.status_code}") + print(update_response.text) + return False + + # Publish the draft + publish_response = requests.post( + f"{base_url}/api/records/{record_id}/draft/actions/publish", + headers=headers + ) + + if publish_response.status_code != 202: + print(f"Error publishing draft for {record_id}: {publish_response.status_code}") + print(publish_response.text) + return False + + print(f"Successfully added related identifier {doi} to {record_id} and published the changes") + return True + +# Process each record +for record in records: + test_id = record['Test_ID'] + doi = record['CaltechAUTHORS_DOI'] + caltech_author_id = record['CaltechAUTHORS_ID'] + add_related_identifier(test_id, doi, caltech_author_id) + +print("Processing complete") \ No newline at end of file diff --git a/ames/harvesters/output.csv b/ames/harvesters/output.csv new file mode 100644 index 00000000..2d5bc431 --- /dev/null +++ b/ames/harvesters/output.csv @@ -0,0 +1,168 @@ +CaltechAUTHORS_ID,CaltechAUTHORS_DOI,Related_DOI,Data_ID,Cross_Link,Test_ID +bwww3-z8y74,10.1093/mnras/staa2808,10.22002/D1.1458,3hqgp-jhw61,No,99s7k-d6f58 +dm3mv-q1b76,10.1038/s41524-019-0216-x,10.22002/D1.1256,yrn11-jb916,No,t899g-xww46 +ahsp6-e8a25,10.1186/s40168-020-0785-4,10.22002/D1.1295,88yzp-h0n85,No,ndyrw-3gq31 +n1n6t-pxs56,10.1038/s41586-021-03601-4,10.22002/D1.1455,m47ts-35f81,No,ydg6m-x6q85 +zjfrp-a9k69,10.1126/science.abb3099,10.22002/D1.1444,s3ejh-8rk72,No,9vgt2-8vy76 +jvzym-1mh07,10.1021/acssynbio.1c00592,10.22002/D1.2140,tvy11-z5a48,No,rff39-mtm48 +r8qh8-y4065,10.1128/JCM.01785-21,10.22002/D1.1942,bv2tf-aap55,No,whd0t-96h94 +nrbae-qk103,10.1098/rspa.2021.0561,10.22002/D1.2173,m1b0w-0zs59,No,6atdb-1bj05 +nrbae-qk103,10.1098/rspa.2021.0561,10.22002/D1.2024,e3sea-97705,No,0x4v2-cb178 +dcas8-y4s76,10.1126/science.abg9765,10.22002/D1.1882,s7epj-5ry14,No,c0e9h-2n643 +k3h8s-vjz03,10.1016/j.atmosenv.2021.118809,10.22002/D1.1985,254mc-zpg74,No,4qwbc-5sb81 +9x8wh-65414,10.7554/eLife.65092,10.22002/D1.2076,k9hkr-v0978,No,v22y2-m7n82 +9x8wh-65414,10.7554/eLife.65092,10.22002/D1.2157,g3sp0-33085,No,9eh2f-k1a76 +9x8wh-65414,10.7554/eLife.65092,10.22002/D1.1790,qxm6c-q5p97,No,qn0ny-t3b46 +15qr1-8d538,10.1029/2019je006190,10.22002/D1.1326,dwc1w-r9a68,No,7gpm7-hb420 +344g6-s0176,10.1007/s12678-021-00694-3,10.22002/D1.1632,1km87-52j70,No,fdx2t-3rx08 +nxrw7-zx123,10.1038/s41561-021-00862-6,10.22002/D1.2150,0a0fg-yer22,No,36zbz-rbn41 +pa1b8-a8596,10.1029/2020gl089917,10.22002/D1.1376,9wpke-7eg08,No,358dc-1ck54 +ezbzn-3qz02,10.1038/s41567-021-01492-w,10.22002/D1.2202,59zzx-xhb23,No,kcgnj-he106 +hcg0j-pk874,10.1029/2020gl087477,10.22002/D1.1348,j5fk1-0g306,No,wkwk4-8ym88 +40n3g-jae61,10.1029/2017GC007260,10.22002/D1.320,vymbv-n8p13,No,65wey-dqt49 +40n3g-jae61,10.1029/2017GC007260,10.22002/D1.321,bwxz5-0kc80,No,yz4a8-bd988 +5624m-z4s58,10.1029/2019gc008862,10.22002/D1.1380,mm5dq-05y22,No,aekvq-hh376 
+8scha-rhv61,10.1029/2018MS001313,10.22002/D1.933,a49j4-qt740,No,w08gs-1t250 +prs7s-hfr18,10.1038/s41524-022-00747-1,10.22002/D1.1983,dekcc-2tb35,No,2g1py-bnf41 +99bn3-pg365,10.7554/eLife.55308,10.22002/d1.1336,wyrrw-ffe20,No,yd65f-wkz21 +99bn3-pg365,10.7554/eLife.55308,10.22002/D1.1331,8ftre-92r60,No,q9xqq-xrk20 +q0pn9-dgp08,10.1029/2021ea001637,10.22002/D1.2028,p6614-mjw68,No,a7rma-wkb68 +q0pn9-dgp08,10.1029/2021ea001637,10.22002/D1.222,k8gnx-7hj46,No,6ffmj-n7192 +jk4kd-j8n07,10.1029/2020gl091699,10.22002/D1.1666,bk8pf-qvx09,No,gv80y-yq992 +zyge8-axx47,10.5194/se-11-2283-2020,10.22002/D1.1447,6jegf-05x08,No,443sq-jh364 +8xx2v-fk653,10.1093/pnasnexus/pgad033,10.22002/D1.20223,0yw13-j0441,No,szsyt-ytf03 +6famx-f0654,10.1038/s41467-020-16224-6,10.22002/D1.1371,60wg2-9qz17,No,v8shp-3a074 +qjzq0-71624,10.1029/2019jb018597,10.22002/D1.1377,nwj4y-hcm74,No,aan3s-zfk64 +nm7th-hgm94,10.1073/pnas.2102267118,10.22002/D1.1996,10svm-aq733,No,47mgd-ydg26 +zfjpk-xtj13,10.1063/5.0006684,10.22002/D1.1379,dxrpq-xyx02,No,jmzz8-6td18 +kp5yc-avm39,10.1038/s41586-019-1447-1,10.22002/D1.1160,w8hpj-0y065,No,6a7xt-crp87 +c9geb-11h23,10.1073/pnas.1907869116,10.22002/D1.1241,fbv6r-hg153,No,cqnnk-chw84 +zqrkn-f3w82,10.1038/s41524-019-0213-0,10.22002/D1.1178,ehp06-pcf04,No,9r1k1-95d28 +zqrkn-f3w82,10.1038/s41524-019-0213-0,10.22002/D1.1179,p2bsg-7tb62,No,pehkc-etn63 +arj9r-nfc42,10.1038/s41467-019-13262-7,10.22002/D1.1296,2vdrb-bmr68,No,26ts2-f1j05 +kz506-yqx70,10.1029/2019JB018922,10.22002/d1.1328,yen0b-fed04,No,vw4ra-qrp13 +gjzvx-agj04,10.1038/s41598-020-58586-3,10.22002/D1.1298,8d1ar-e8e29,No,gepec-02x71 +ztte2-h5j24,10.1088/1367-2630/ac1144,10.22002/D1.1451,hvtvx-rjq36,No,nz1k8-4ck35 +ngtts-mjd88,10.1029/2021jb021976,10.22002/D1.2009,7tp28-jp627,No,bzqfj-f7077 +ngtts-mjd88,10.1029/2021jb021976,10.22002/D1.2010,e80sv-5py92,No,ffmm8-ckg43 +jemmb-kp969,,10.22002/D1.1921,d2qn0-tse31,No,e3rhe-kdq51 +ke1kq-rk838,10.1038/s41467-022-33449-9,10.22002/D1.20291,9bkyr-21532,No,8b5nn-hbt51 +4jqdv-nrv37,10.1016/j.cels.2022.03.002,10.22002/D1.1693,4ry4k-5gf89,No,h2e81-mz738 +rsg2f-2nj92,10.1126/sciadv.adg6618,10.22002/D1.2090,c2w53-e7q49,No,15m2r-ep710 +qgw1r-cg440,10.1111/sed.13100,10.22002/D1.1645,8zehn-8rr62,No,mzph0-txc86 +w2ev0-snj25,10.1029/2021jf006392,10.22002/D1.8962,yrkvm-9bh56,No,d3ek1-z9x67 +750xa-xag63,,10.22002/D1.1286,zcndp-vg341,No,27jch-y5k25 +j80sb-te308,10.1088/1748-3190/ad277f,10.22002/c5cyj-mev09,c5cyj-mev09,No,86x6j-n4q55 +mypge-8d791,10.1038/s41586-023-06227-w,10.22002/7h65h-89163,7h65h-89163,No,4m1z1-bhc25 +90vmt-dcf44,10.1029/2019je006289,10.22002/D1.1349,kmde9-m7g44,No,y28dp-mfh71 +fx2at-3ps68,10.1029/2019jg005533,10.22002/D1.1226,xc9rx-8qs95,No,rbw1s-e1198 +pjjee-1w296,10.1029/2021MS002671,10.22002/D1.1429,3p6y1-a8b95,No,ymygq-mpz35 +79r10-1a091,10.1029/2021jf006406,10.22002/D1.2078,5qqjp-5g813,No,bkqq3-kfa29 +79r10-1a091,10.1029/2021jf006406,10.22002/D1.9200,dgxbk-45k21,No,9ynk4-ajv10 +4a7y9-nzb92,10.1016/j.cels.2022.03.001,10.22002/D1.1692,5e4cj-34824,No,bw0gp-8tt54 +4a7y9-nzb92,10.1016/j.cels.2022.03.001,10.22002/D1.20047,aymp3-qzt70,No,15qm4-77249 +6k82a-tnj57,10.24349/pjye-gkeo,10.22002/D1.1877,01pg4-5r437,No,h8ps1-0yp52 +1qwbd-2q359,10.1029/2021av000534,10.22002/D1.2176,kasms-vp209,No,e1m0y-cpf40 +dv27e-2sy47,10.1073/pnas.2023433118,10.22002/D1.1667,8rsdg-rxz52,No,kaayd-19r23 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1915,653dj-3b761,No,mtzc2-18p79 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1900,82zcr-mxa32,No,n1vb7-m1g75 
+tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1905,cj7ec-8g780,No,j8p30-za980 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1914,5vd8b-sz113,No,8rp1w-qfz76 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1919,zh7s1-8g617,No,pqak6-n5333 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1917,2t5tj-e5w45,No,w80kf-hyt30 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1916,0vgd3-9cp11,No,9w2er-n4713 +tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1918,5x3vq-z1b96,No,zb2dj-d0f86 +h0e90-htt06,10.1029/2021jc018375,10.22002/D1.1296,2vdrb-bmr68,No,26ts2-f1j05 +0f32n-bgm37,10.1029/2021gl095227,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 +0f32n-bgm37,10.1029/2021gl095227,10.22002/D1.2033,jn6d1-wfj05,No,t8h2f-87s64 +nde62-t1r58,10.1029/2021JB022462,10.22002/D1.1970,h6438-cgy98,No,pbgwc-7b016 +rwr63-xyt69,10.1029/2021gl096503,10.22002/D1.2135,pyr6e-wt732,No,2kvfz-erc91 +r1pva-c8x21,10.1029/2022jb024329,10.22002/D1.20035,wxsas-96g08,No,8tj1d-22780 +rb73g-vqz89,10.1016/j.xcrp.2022.100959,10.22002/D1.20057,bfap4-h2m21,No,px5ss-5y916 +wr8qq-s8t58,10.1109/WHISPERS.2016.8071774,10.22002/D1.222,k8gnx-7hj46,No,6ffmj-n7192 +ynbgx-0tx98,10.1029/2021ms002747,10.22002/D1.1231,meh5c-wy279,No,61shk-rcs26 +6gcmj-reb48,10.1038/s41586-022-04749-3,10.22002/D1.2155,y72mq-emt30,No,5jjz9-40b67 +vpnng-szs82,10.1126/sciadv.abn9545,10.22002/D1.20048,hend5-jzt61,No,sb743-qp239 +1q6gn-mvc46,10.1029/2022ms003105,10.22002/D1.20052,j8mw7-fm491,No,5mdtq-15724 +ck6pf-68621,10.1093/gji/ggab407,10.22002/D1.1955,31emd-wmv98,No,33y3z-2te02 +d20js-7z640,10.1029/2022gl101715,10.22002/vn6v0-pfr77,vn6v0-pfr77,No,dcy6h-wem15 +rsekp-4g847,10.1109/tgrs.2023.3305194,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 +cmvm1-e9379,10.1029/2023gl105205,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 +wkjh4-6nf06,10.1101/2022.03.02.22271724,10.22002/D1.20049,pmm08-6q581,No,rtc14-wmr12 +p6mn1-gs660,10.1101/2022.06.17.496478,10.22002/D1.20199,5vng7-8ne78,No,afznk-vbp04 +vtef0-x7037,10.1101/2022.07.13.22277113,10.22002/D1.20223,0yw13-j0441,No,szsyt-ytf03 +q40ve-64h03,10.1002/essoar.10510937.2,10.22002/D1.20052,j8mw7-fm491,No,5mdtq-15724 +4w29e-xpa51,10.1002/essoar.10510458.2,10.22002/D1.1891,n1yye-2z213,No,zeyw2-27p09 +e9yy0-v0658,10.1002/essoar.10511838.1,10.22002/D1.20038,e1acg-e0k08,No,3pg72-ddh27 +x0cjg-kxg25,10.1101/2022.07.13.22277513,10.22002/D1.20223,0yw13-j0441,No,szsyt-ytf03 +jh5g2-fyg97,10.1002/essoar.10512148.1,10.22002/D1.20252,dhfgs-p7319,No,wjhsn-kbv79 +fgbgn-2rk49,10.1002/essoar.10512118.2,10.22002/D1.20248,pybpv-w7661,No,wp4x8-1vc62 +91hvf-49c47,,10.22002/D1.1438,3ejxg-69q72,No,2ths5-hkw28 +yvded-18923,10.1101/2020.07.25.210468,10.22002/D1.1407,fc4k3-75q88,No,c4fh9-f1170 +166f3-tj121,,10.22002/D1.1100,dkdt4-e0x94,No,bcn0f-n7z19 +nb3f9-s8p96,10.1002/essoar.10504190.1,10.22002/D1.1617,4peqr-t0723,No,npzd5-bzb58 +5xkg8-an610,10.1101/762773,10.22002/d1.1276,gye0e-gw976,No,gqpst-s3d05 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1915,653dj-3b761,No,mtzc2-18p79 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1900,82zcr-mxa32,No,n1vb7-m1g75 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1905,cj7ec-8g780,No,j8p30-za980 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1914,5vd8b-sz113,No,8rp1w-qfz76 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1917,2t5tj-e5w45,No,w80kf-hyt30 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1916,0vgd3-9cp11,No,9w2er-n4713 +24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1918,5x3vq-z1b96,No,zb2dj-d0f86 
+2nvtm-7qe34,10.1029/2019je006191,10.22002/d1.1304,dmrkv-3xn63,No,91tsz-cfc02 +m07dw-3jq86,10.1038/s41587-019-0372-z,10.22002/D1.1311,a73n8-3pa89,No,dth8s-nba11 +n3qyx-kf341,10.1029/2019je006156,10.22002/D1.1299,df548-mhy96,No,a15h1-f1j49 +ks7h3-tf644,10.1038/s42005-021-00703-3,10.22002/D1.1858,vp3y4-ef326,No,2f5p2-2e148 +8mp48-93523,,10.22002/D1.1315,d8w02-xng39,No,q0346-mn915 +a7vwv-mhw66,10.1101/2022.07.10.499405,10.22002/D1.20215,awtz3-tz122,No,xfqva-b4991 +h26g5-k0311,10.1029/2018ea000416,10.22002/D1.1211,5ptpj-x8c08,No,7f5x5-pa209 +h26g5-k0311,10.1029/2018ea000416,10.22002/D1.1212,3j8a8-bzn05,No,xk40w-p0g08 +y47b9-pmn34,10.48550/arXiv.1905.06360,10.22002/D1.1241,fbv6r-hg153,No,cqnnk-chw84 +zf2v5-vx810,10.1029/2018JE005706,10.22002/D1.1087,hhzzq-yw058,No,e67v1-3kg31 +zf2v5-vx810,10.1029/2018JE005706,10.22002/D1.1085,44km0-er448,No,g867t-x4d04 +zf2v5-vx810,10.1029/2018JE005706,10.22002/D1.1086,c8fwc-kvg38,No,c1gj1-hnb74 +k6jp7-8js32,10.1029/2019gl086424,10.22002/d1.1317,rdv1n-st737,No,8kgzq-0z165 +se7bz-k9a90,10.1029/2019jb018855,10.22002/D1.1293,mtn3h-frk09,No,hrpxj-0tr80 +frxqw-qyw74,10.1063/1.5054927,10.22002/D1.305,0t2qn-hvq19,No,f3cwe-hgm53 +dzgj9-y8y49,10.3390/rs12213586,10.22002/d1.1182,45st6-jvh02,No,xk1ex-axk10 +ge4t4-7e026,10.1029/2019je006298,10.22002/D1.1318,gyvwk-btq07,No,te8rh-sxq88 +ge4t4-7e026,10.1029/2019je006298,10.22002/D1.1211,5ptpj-x8c08,No,7f5x5-pa209 +ze84b-v1782,10.1101/2020.12.09.20239467,10.22002/D1.1702,24265-gtd53,No,y98md-bp961 +d4h00-xf206,10.1029/2022jb025425,10.22002/D1.20252,dhfgs-p7319,No,wjhsn-kbv79 +x179p-jhv14,10.7554/elife.85370,10.22002/D1.2157,g3sp0-33085,No,9eh2f-k1a76 +ywtpy-nka66,10.1038/s41929-021-00618-w,10.22002/D1.1632,1km87-52j70,No,fdx2t-3rx08 +zwfrv-rbn84,10.1038/s41467-021-25443-4,10.22002/D1.2032,jvjkh-d9g50,No,x6gw8-as345 +y99cf-fkr11,10.1016/j.icarus.2022.115079,10.22002/D1.20170,3ra81-96y32,No,fddq3-zmn81 +ernz6-2xp43,10.1101/2022.03.21.484932,10.22002/D1.20060,hwv5v-m9x76,No,vfz20-ydp76 +rxd0x-ag678,10.1029/2020JE006675,10.22002/D1.1617,4peqr-t0723,No,npzd5-bzb58 +t2ser-vfe73,10.1029/2020ja027796,10.22002/D1.1333,r35r5-sb884,No,ngxcf-0a955 +v84d4-vcs34,10.1038/s41586-020-2872-x,10.22002/D1.1647,jhn25-fsd29,No,24b5z-t4m60 +c03z0-nfg11,10.1038/s41598-020-77073-3,10.22002/D1.1407,fc4k3-75q88,No,c4fh9-f1170 +4fdtn-y2e21,10.1029/2020je006606,10.22002/D1.1628,d5jt1-wqt82,No,wx24e-5yd65 +ct2sc-f7m12,10.1029/2019av000140,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 +rdqe2-hsq97,10.1029/2020jb021369,10.22002/D1.1670,e200b-xsm06,No,4qv0g-7yt28 +363j8-nw138,10.1038/s41587-021-00870-2,10.22002/D1.1876,7704n-f6m57,No,3gz1s-d5261 +r7rsd-a7a17,,10.22002/D1.2026,51sah-d9r47,No,mn3a1-x3x94 +r7rsd-a7a17,,10.22002/D1.2025,p8ppf-7ff93,No,e1acb-n8a94 +6f3gb-97h58,10.1126/science.abg2947,10.22002/D1.1976,zpzee-79351,No,px1w1-xqr37 +wfg20-4tn76,10.1029/2020tc006210,10.22002/D1.1388,8dhjv-rvf91,No,zwjd2-rw873 +n2b9p-8jf71,10.1029/2021JB021886,10.22002/D1.1612,ryqnw-bdf94,No,mzwax-jw720 +30s9e-9z096,10.1029/2021jb022676,10.22002/D1.2009,7tp28-jp627,No,bzqfj-f7077 +30s9e-9z096,10.1029/2021jb022676,10.22002/D1.2141,dwkqy-hkj69,No,e0zwb-f9s86 +30s9e-9z096,10.1029/2021jb022676,10.22002/D1.2142,665w3-pbj51,No,dygfc-38b42 +fx63v-hsd80,10.1029/2021gl092598,10.22002/D1.2023,m1r58-kvb98,No,ft45b-8nw98 +vfg98-9zt14,10.5194/gmd-14-6309-2021,10.22002/D1.971,8mkmg-c2938,No,3ydw5-6yx19 +b0xvf-xn162,10.1038/s41561-021-00706-3,10.22002/D1.1874,3hrqe-5x450,No,7m4ep-06188 +b0xvf-xn162,10.1038/s41561-021-00706-3,10.22002/D1.1873,8ydek-yt879,No,m86ym-m3603 
+08dr4-w6943,10.1029/2021je006828,10.22002/D1.1971,gv4qf-qwa77,No,mm42q-wft65 +yzjs7-1cv55,10.1016/j.jsb.2022.107860,10.22002/D1.2096,2vtv3-pp862,No,tyest-mem06 +yzjs7-1cv55,10.1016/j.jsb.2022.107860,10.22002/D1.2099,tz8jq-0mk77,No,yypym-x5693 +yzjs7-1cv55,10.1016/j.jsb.2022.107860,10.22002/D1.2103,kfkqj-q6557,No,4khh3-dpp37 +0mmmp-9pz59,10.1029/2023jb026488,10.22002/D1.20248,pybpv-w7661,No,wp4x8-1vc62 +phbx7-m8a69,10.1088/2515-7655/ac817e,10.22002/D1.20061,q9zpw-g8s64,No,skmrx-14a11 +r5hzq-dzy83,10.2110/jsr.2022.032,10.22002/D1.20044,hzwqz-5wr07,No,0y3zw-g1n05 +sar53-81n52,10.1016/j.epsl.2023.118277,10.22002/D1.1619,s6ey4-qpm11,No,ac32t-mff72 +sar53-81n52,10.1016/j.epsl.2023.118277,10.22002/D1.1620,2znta-5t680,No,tka8c-bjp40 +y5gpx-saw63,10.5194/amt-17-5861-2024,10.14291/TCCON.GGG2014,rhrv4-mcp55,No,jn2dg-2h888 +knwz1-dvb78,,10.14291/tccon.ggg2014.pasadena01.r1/1182415,tb378-y1a55,No,8z4qd-mhz12 +dm3mv-q1b76,10.1038/s41524-019-0216-x,10.22002/D1.1256,yrn11-jb916,No,t899g-xww46 From 79e8a38983fad9f35f392f6479e04182315350b7 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Mon, 31 Mar 2025 14:07:36 -0700 Subject: [PATCH 02/19] Update editor.py File Update caltechauthors.py Update caltechauthors.py Update caltechauthors.py Update caltechauthors.py Update caltechauthors.py Delete ames/harvesters/editor.py Update caltechauthors.py Delete ames/harvesters/updated_file.csv Delete ames/harvesters/output.csv Update __init__.py Update __init__.py Update __init__.py Create test.py Update test.py Create test.yml Update test.yml Create test_matchers.py Update caltechauthors.py Update caltechauthors.py Update test_matchers.py Update test_matchers.py Delete ames/matchers/test.py Update caltechauthors.py Create run_caltechauthors_harvestors.py Create run_caltechauthors_matchers.py Rename run_caltechauthors_harvestors.py to run_harvest_links.py Update caltechauthors.py Update __init__.py Create run_caltechauthors_update_links.py Create run_caltechauthors_get_links.py Delete run_caltechauthors_matchers.py Update caltechauthors.py Update __init__.py Update caltechauthors.py Update caltechauthors.py Update caltechauthors.py Update caltechauthors.py Update caltechauthors.py --- .github/workflows/test.yml | 34 +++ ames/harvesters/__init__.py | 1 + ames/harvesters/caltechauthors.py | 128 +++++++----- ames/harvesters/editor.py | 133 ------------ ames/harvesters/output.csv | 168 --------------- ames/matchers/__init__.py | 1 + ames/matchers/caltechauthors.py | 321 +++++++++++++---------------- run_caltechauthors_get_links.py | 16 ++ run_caltechauthors_update_links.py | 16 ++ run_harvest_links.py | 88 ++++++++ tests/test_matchers.py | 76 +++++++ 11 files changed, 446 insertions(+), 536 deletions(-) create mode 100644 .github/workflows/test.yml delete mode 100644 ames/harvesters/editor.py delete mode 100644 ames/harvesters/output.csv create mode 100644 run_caltechauthors_get_links.py create mode 100644 run_caltechauthors_update_links.py create mode 100644 run_harvest_links.py create mode 100644 tests/test_matchers.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..867667a5 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,34 @@ +name: Test CaltechAuthors Matcher + +on: + push: + paths: + - 'ames/matchers/caltechauthors.py' + - 'tests/**' + pull_request: + paths: + - 'ames/matchers/caltechauthors.py' + - 'tests/**' + workflow_dispatch: + +jobs: + test-caltechauthors: + runs-on: ubuntu-latest + + 
steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt || true + + - name: Run tests for caltechauthors + run: | + PYTHONPATH=${{ github.workspace }} python -m unittest discover -s tests -p 'test_matchers.py' diff --git a/ames/harvesters/__init__.py b/ames/harvesters/__init__.py index 1add4aea..5e290431 100644 --- a/ames/harvesters/__init__.py +++ b/ames/harvesters/__init__.py @@ -24,3 +24,4 @@ from .caltechauthors import is_file_present from .caltechauthors import get_series_records from .caltechauthors import generate_data_citation_csv +from .caltechauthors import get_data_availability_links diff --git a/ames/harvesters/caltechauthors.py b/ames/harvesters/caltechauthors.py index 93717bb2..4484fafa 100644 --- a/ames/harvesters/caltechauthors.py +++ b/ames/harvesters/caltechauthors.py @@ -334,7 +334,6 @@ def get_records_from_date(date="2023-08-25", test=False): return hits - def doi2url(doi): if not doi.startswith("10."): return doi @@ -352,7 +351,6 @@ def doi2url(doi): return resolved_url return doi - def fetch_metadata(record_id): url = f"https://authors.library.caltech.edu/api/records/{record_id}" try: @@ -362,12 +360,11 @@ def fetch_metadata(record_id): except: return None - def search_resource_type(obj): if isinstance(obj, dict): for k, v in obj.items(): - if k == "resource_type" and isinstance(v, dict) and "id" in v: - return v["id"] + if k == 'resource_type' and isinstance(v, dict) and 'id' in v: + return v['id'] result = search_resource_type(v) if result: return result @@ -378,10 +375,8 @@ def search_resource_type(obj): return result return None - def fetch_resource_type(data): - return search_resource_type(data) or "N/A" - + return search_resource_type(data) or 'N/A' def search_records(prefix): base_url = "https://authors.library.caltech.edu/api/records" @@ -392,7 +387,6 @@ def search_records(prefix): return response.json() return None - def extract_data_citations(hits): citations = [] for hit in hits: @@ -401,50 +395,38 @@ def extract_data_citations(hits): if not metadata: continue - caltechauthors_doi = ( - metadata.get("pids", {}).get("doi", {}).get("identifier", "") - ) + caltechauthors_doi = metadata.get("pids", {}).get("doi", {}).get("identifier", "") resource_type = fetch_resource_type(metadata) related_dois = [] for identifier in metadata.get("metadata", {}).get("related_identifiers", []): if identifier.get("scheme") == "doi": doi = identifier["identifier"] - if any( - doi.startswith(prefix) - for prefix in ["10.22002/", "10.14291/", "10.25989/"] - ): + if any(doi.startswith(prefix) for prefix in ["10.22002/", "10.14291/", "10.25989/"]): related_dois.append(doi) for doi in related_dois: caltechdata_url = doi2url(doi) if "data.caltech.edu/records/" in caltechdata_url: caltechdata_id = caltechdata_url.split("/records/")[-1] - caltechdata_metadata = requests.get( - f"https://data.caltech.edu/api/records/{caltechdata_id}" - ).json() + caltechdata_metadata = requests.get(f"https://data.caltech.edu/api/records/{caltechdata_id}").json() cross_link = "No" - for identifier in caltechdata_metadata.get("metadata", {}).get( - "related_identifiers", [] - ): + for identifier in caltechdata_metadata.get("metadata", {}).get("related_identifiers", []): if identifier.get("identifier") == caltechauthors_doi: cross_link = "Yes" break - citations.append( - { - "CaltechAUTHORS_ID": 
record_id, - "CaltechAUTHORS_DOI": caltechauthors_doi, - "Related_DOI": doi, - "CaltechDATA_ID": caltechdata_id, - "Cross_Link": cross_link, - "resource_type": resource_type, - } - ) + citations.append({ + "CaltechAUTHORS_ID": record_id, + "CaltechAUTHORS_DOI": caltechauthors_doi, + "Related_DOI": doi, + "CaltechDATA_ID": caltechdata_id, + "Cross_Link": cross_link, + "resource_type": resource_type + }) return citations - def generate_data_citation_csv(): prefixes = ["10.22002", "10.14291", "10.25989"] all_citations = [] @@ -457,26 +439,66 @@ def generate_data_citation_csv(): output_file = "data_citations_with_type.csv" with open(output_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow( - [ - "CaltechAUTHORS_ID", - "CaltechAUTHORS_DOI", - "Related_DOI", - "CaltechDATA_ID", - "Cross_Link", - "resource_type", - ] - ) + writer.writerow(["CaltechAUTHORS_ID", "CaltechAUTHORS_DOI", "Related_DOI", "CaltechDATA_ID", "Cross_Link", "resource_type"]) for citation in all_citations: - writer.writerow( - [ - citation["CaltechAUTHORS_ID"], - citation["CaltechAUTHORS_DOI"], - citation["Related_DOI"], - citation["CaltechDATA_ID"], - citation["Cross_Link"], - citation["resource_type"], - ] - ) + writer.writerow([ + citation["CaltechAUTHORS_ID"], + citation["CaltechAUTHORS_DOI"], + citation["Related_DOI"], + citation["CaltechDATA_ID"], + citation["Cross_Link"], + citation["resource_type"] + ]) print(f"Saved {len(all_citations)} citations to {output_file}") + +def get_data_availability_links(token=None, size=25): + base_url = "https://authors.library.caltech.edu/api/records?q=metadata.additional_descriptions.type.id%3A%22data-availability%22&size=25&sort=bestmatch" + base_file_url_template = "https://authors.library.caltech.edu/api/records/{record_id}/files" + + token = os.environ.get("RDMTOK") + + output_file = "test_results_harvesters.csv" + + headers = {} + if token: + headers = { + "Authorization": f"Bearer {token}", + "Content-type": "application/json", + } + + response = requests.get(base_url, headers=headers) + if response.status_code != 200: + print(f"Error: Unable to fetch records from the API. 
Status code: {response.status_code}") + exit() + + records = response.json().get("hits", {}).get("hits", []) + + if not records: + print("No records found.") + exit() + + results = [] + for record in records: + record_id = record.get("id") + links = record.get("metadata", {}).get("additional_descriptions", []) + + for link_data in links: + description = link_data.get("description", "") + links_in_description = extract_https_links(description) + for link in links_in_description: + classification = classify_link(link) + cleaned = clean_link(link) + filename = extract_filename_from_link(link) + file_present = is_file_present(record_id, filename) + + results.append({ + "record_id": record_id, + "original_link": link, + "classification": classification, + "cleaned_link": cleaned, + "filename": filename, + "file_present": file_present + }) + + return results diff --git a/ames/harvesters/editor.py b/ames/harvesters/editor.py deleted file mode 100644 index 6a24a38e..00000000 --- a/ames/harvesters/editor.py +++ /dev/null @@ -1,133 +0,0 @@ -import csv -import json -import requests -from caltechdata_edit import caltechdata_edit - -# Read the CSV file -records = [] -with open('output.csv', 'r') as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - records.append(row) - -# Access token for authentication -token = "moaclkv2MHDHoGklIZs7ABF5mmBZNcxKGgmHY4yyHaUAGbCuyO3DNXhehpL7" - -# Using the development system (production=False) -production = False -base_url = "https://data.caltechlibrary.dev" - -# Set up headers for API requests -headers = { - "Authorization": f"Bearer {token}", - "Content-type": "application/json", -} - -def add_related_identifier(record_id, doi, caltech_author_id): - """Add DOI and CaltechAUTHORS_ID to related identifiers directly using the API""" - print(f"Processing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}") - - # First, get the current record - response = requests.get(f"{base_url}/api/records/{record_id}", headers=headers) - - if response.status_code != 200: - print(f"Error fetching record {record_id}: {response.status_code}") - print(response.text) - return False - - record_data = response.json() - - # Check if there's already a draft - draft_response = requests.get(f"{base_url}/api/records/{record_id}/draft", headers=headers) - - if draft_response.status_code == 200: - # Use the draft if it exists - record_data = draft_response.json() - else: - # Create a draft if it doesn't exist - draft_create_response = requests.post( - f"{base_url}/api/records/{record_id}/draft", - headers=headers - ) - - if draft_create_response.status_code != 201: - print(f"Error creating draft for {record_id}: {draft_create_response.status_code}") - print(draft_create_response.text) - return False - - record_data = draft_create_response.json() - - # Update the related identifiers - related_identifiers = record_data.get("metadata", {}).get("related_identifiers", []) - if related_identifiers is None: - related_identifiers = [] - - # Check if DOI already exists - doi_exists = any(identifier.get("identifier") == doi for identifier in related_identifiers) - - # Check if CaltechAUTHORS_ID URL already exists - author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" - author_url_exists = any(identifier.get("identifier") == author_url for identifier in related_identifiers) - - # Add the DOI if it doesn't exist - if not doi_exists: - new_doi_identifier = { - "relation_type": {"id": "issupplementedby"}, - "identifier": doi, - "scheme": 
"doi", - "resource_type": {"id": "publication"} - } - related_identifiers.append(new_doi_identifier) - print(f"Adding DOI related identifier: {doi}") - else: - print(f"DOI {doi} already exists in related identifiers") - - # Add the CaltechAUTHORS_ID URL if it doesn't exist - if not author_url_exists: - new_author_identifier = { - "relation_type": {"id": "isreferencedby"}, - "identifier": author_url, - "scheme": "url", - "resource_type": {"id": "publication"} - } - related_identifiers.append(new_author_identifier) - print(f"Adding CaltechAUTHORS_ID related identifier: {author_url}") - else: - print(f"CaltechAUTHORS_ID URL {author_url} already exists in related identifiers") - - record_data["metadata"]["related_identifiers"] = related_identifiers - - # Update the draft - update_response = requests.put( - f"{base_url}/api/records/{record_id}/draft", - headers=headers, - json=record_data - ) - - if update_response.status_code != 200: - print(f"Error updating draft for {record_id}: {update_response.status_code}") - print(update_response.text) - return False - - # Publish the draft - publish_response = requests.post( - f"{base_url}/api/records/{record_id}/draft/actions/publish", - headers=headers - ) - - if publish_response.status_code != 202: - print(f"Error publishing draft for {record_id}: {publish_response.status_code}") - print(publish_response.text) - return False - - print(f"Successfully added related identifier {doi} to {record_id} and published the changes") - return True - -# Process each record -for record in records: - test_id = record['Test_ID'] - doi = record['CaltechAUTHORS_DOI'] - caltech_author_id = record['CaltechAUTHORS_ID'] - add_related_identifier(test_id, doi, caltech_author_id) - -print("Processing complete") \ No newline at end of file diff --git a/ames/harvesters/output.csv b/ames/harvesters/output.csv deleted file mode 100644 index 2d5bc431..00000000 --- a/ames/harvesters/output.csv +++ /dev/null @@ -1,168 +0,0 @@ -CaltechAUTHORS_ID,CaltechAUTHORS_DOI,Related_DOI,Data_ID,Cross_Link,Test_ID -bwww3-z8y74,10.1093/mnras/staa2808,10.22002/D1.1458,3hqgp-jhw61,No,99s7k-d6f58 -dm3mv-q1b76,10.1038/s41524-019-0216-x,10.22002/D1.1256,yrn11-jb916,No,t899g-xww46 -ahsp6-e8a25,10.1186/s40168-020-0785-4,10.22002/D1.1295,88yzp-h0n85,No,ndyrw-3gq31 -n1n6t-pxs56,10.1038/s41586-021-03601-4,10.22002/D1.1455,m47ts-35f81,No,ydg6m-x6q85 -zjfrp-a9k69,10.1126/science.abb3099,10.22002/D1.1444,s3ejh-8rk72,No,9vgt2-8vy76 -jvzym-1mh07,10.1021/acssynbio.1c00592,10.22002/D1.2140,tvy11-z5a48,No,rff39-mtm48 -r8qh8-y4065,10.1128/JCM.01785-21,10.22002/D1.1942,bv2tf-aap55,No,whd0t-96h94 -nrbae-qk103,10.1098/rspa.2021.0561,10.22002/D1.2173,m1b0w-0zs59,No,6atdb-1bj05 -nrbae-qk103,10.1098/rspa.2021.0561,10.22002/D1.2024,e3sea-97705,No,0x4v2-cb178 -dcas8-y4s76,10.1126/science.abg9765,10.22002/D1.1882,s7epj-5ry14,No,c0e9h-2n643 -k3h8s-vjz03,10.1016/j.atmosenv.2021.118809,10.22002/D1.1985,254mc-zpg74,No,4qwbc-5sb81 -9x8wh-65414,10.7554/eLife.65092,10.22002/D1.2076,k9hkr-v0978,No,v22y2-m7n82 -9x8wh-65414,10.7554/eLife.65092,10.22002/D1.2157,g3sp0-33085,No,9eh2f-k1a76 -9x8wh-65414,10.7554/eLife.65092,10.22002/D1.1790,qxm6c-q5p97,No,qn0ny-t3b46 -15qr1-8d538,10.1029/2019je006190,10.22002/D1.1326,dwc1w-r9a68,No,7gpm7-hb420 -344g6-s0176,10.1007/s12678-021-00694-3,10.22002/D1.1632,1km87-52j70,No,fdx2t-3rx08 -nxrw7-zx123,10.1038/s41561-021-00862-6,10.22002/D1.2150,0a0fg-yer22,No,36zbz-rbn41 -pa1b8-a8596,10.1029/2020gl089917,10.22002/D1.1376,9wpke-7eg08,No,358dc-1ck54 
-ezbzn-3qz02,10.1038/s41567-021-01492-w,10.22002/D1.2202,59zzx-xhb23,No,kcgnj-he106 -hcg0j-pk874,10.1029/2020gl087477,10.22002/D1.1348,j5fk1-0g306,No,wkwk4-8ym88 -40n3g-jae61,10.1029/2017GC007260,10.22002/D1.320,vymbv-n8p13,No,65wey-dqt49 -40n3g-jae61,10.1029/2017GC007260,10.22002/D1.321,bwxz5-0kc80,No,yz4a8-bd988 -5624m-z4s58,10.1029/2019gc008862,10.22002/D1.1380,mm5dq-05y22,No,aekvq-hh376 -8scha-rhv61,10.1029/2018MS001313,10.22002/D1.933,a49j4-qt740,No,w08gs-1t250 -prs7s-hfr18,10.1038/s41524-022-00747-1,10.22002/D1.1983,dekcc-2tb35,No,2g1py-bnf41 -99bn3-pg365,10.7554/eLife.55308,10.22002/d1.1336,wyrrw-ffe20,No,yd65f-wkz21 -99bn3-pg365,10.7554/eLife.55308,10.22002/D1.1331,8ftre-92r60,No,q9xqq-xrk20 -q0pn9-dgp08,10.1029/2021ea001637,10.22002/D1.2028,p6614-mjw68,No,a7rma-wkb68 -q0pn9-dgp08,10.1029/2021ea001637,10.22002/D1.222,k8gnx-7hj46,No,6ffmj-n7192 -jk4kd-j8n07,10.1029/2020gl091699,10.22002/D1.1666,bk8pf-qvx09,No,gv80y-yq992 -zyge8-axx47,10.5194/se-11-2283-2020,10.22002/D1.1447,6jegf-05x08,No,443sq-jh364 -8xx2v-fk653,10.1093/pnasnexus/pgad033,10.22002/D1.20223,0yw13-j0441,No,szsyt-ytf03 -6famx-f0654,10.1038/s41467-020-16224-6,10.22002/D1.1371,60wg2-9qz17,No,v8shp-3a074 -qjzq0-71624,10.1029/2019jb018597,10.22002/D1.1377,nwj4y-hcm74,No,aan3s-zfk64 -nm7th-hgm94,10.1073/pnas.2102267118,10.22002/D1.1996,10svm-aq733,No,47mgd-ydg26 -zfjpk-xtj13,10.1063/5.0006684,10.22002/D1.1379,dxrpq-xyx02,No,jmzz8-6td18 -kp5yc-avm39,10.1038/s41586-019-1447-1,10.22002/D1.1160,w8hpj-0y065,No,6a7xt-crp87 -c9geb-11h23,10.1073/pnas.1907869116,10.22002/D1.1241,fbv6r-hg153,No,cqnnk-chw84 -zqrkn-f3w82,10.1038/s41524-019-0213-0,10.22002/D1.1178,ehp06-pcf04,No,9r1k1-95d28 -zqrkn-f3w82,10.1038/s41524-019-0213-0,10.22002/D1.1179,p2bsg-7tb62,No,pehkc-etn63 -arj9r-nfc42,10.1038/s41467-019-13262-7,10.22002/D1.1296,2vdrb-bmr68,No,26ts2-f1j05 -kz506-yqx70,10.1029/2019JB018922,10.22002/d1.1328,yen0b-fed04,No,vw4ra-qrp13 -gjzvx-agj04,10.1038/s41598-020-58586-3,10.22002/D1.1298,8d1ar-e8e29,No,gepec-02x71 -ztte2-h5j24,10.1088/1367-2630/ac1144,10.22002/D1.1451,hvtvx-rjq36,No,nz1k8-4ck35 -ngtts-mjd88,10.1029/2021jb021976,10.22002/D1.2009,7tp28-jp627,No,bzqfj-f7077 -ngtts-mjd88,10.1029/2021jb021976,10.22002/D1.2010,e80sv-5py92,No,ffmm8-ckg43 -jemmb-kp969,,10.22002/D1.1921,d2qn0-tse31,No,e3rhe-kdq51 -ke1kq-rk838,10.1038/s41467-022-33449-9,10.22002/D1.20291,9bkyr-21532,No,8b5nn-hbt51 -4jqdv-nrv37,10.1016/j.cels.2022.03.002,10.22002/D1.1693,4ry4k-5gf89,No,h2e81-mz738 -rsg2f-2nj92,10.1126/sciadv.adg6618,10.22002/D1.2090,c2w53-e7q49,No,15m2r-ep710 -qgw1r-cg440,10.1111/sed.13100,10.22002/D1.1645,8zehn-8rr62,No,mzph0-txc86 -w2ev0-snj25,10.1029/2021jf006392,10.22002/D1.8962,yrkvm-9bh56,No,d3ek1-z9x67 -750xa-xag63,,10.22002/D1.1286,zcndp-vg341,No,27jch-y5k25 -j80sb-te308,10.1088/1748-3190/ad277f,10.22002/c5cyj-mev09,c5cyj-mev09,No,86x6j-n4q55 -mypge-8d791,10.1038/s41586-023-06227-w,10.22002/7h65h-89163,7h65h-89163,No,4m1z1-bhc25 -90vmt-dcf44,10.1029/2019je006289,10.22002/D1.1349,kmde9-m7g44,No,y28dp-mfh71 -fx2at-3ps68,10.1029/2019jg005533,10.22002/D1.1226,xc9rx-8qs95,No,rbw1s-e1198 -pjjee-1w296,10.1029/2021MS002671,10.22002/D1.1429,3p6y1-a8b95,No,ymygq-mpz35 -79r10-1a091,10.1029/2021jf006406,10.22002/D1.2078,5qqjp-5g813,No,bkqq3-kfa29 -79r10-1a091,10.1029/2021jf006406,10.22002/D1.9200,dgxbk-45k21,No,9ynk4-ajv10 -4a7y9-nzb92,10.1016/j.cels.2022.03.001,10.22002/D1.1692,5e4cj-34824,No,bw0gp-8tt54 -4a7y9-nzb92,10.1016/j.cels.2022.03.001,10.22002/D1.20047,aymp3-qzt70,No,15qm4-77249 
-6k82a-tnj57,10.24349/pjye-gkeo,10.22002/D1.1877,01pg4-5r437,No,h8ps1-0yp52 -1qwbd-2q359,10.1029/2021av000534,10.22002/D1.2176,kasms-vp209,No,e1m0y-cpf40 -dv27e-2sy47,10.1073/pnas.2023433118,10.22002/D1.1667,8rsdg-rxz52,No,kaayd-19r23 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1915,653dj-3b761,No,mtzc2-18p79 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1900,82zcr-mxa32,No,n1vb7-m1g75 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1905,cj7ec-8g780,No,j8p30-za980 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1914,5vd8b-sz113,No,8rp1w-qfz76 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1919,zh7s1-8g617,No,pqak6-n5333 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1917,2t5tj-e5w45,No,w80kf-hyt30 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1916,0vgd3-9cp11,No,9w2er-n4713 -tr0p8-5we23,10.1016/j.cell.2021.11.014,10.22002/D1.1918,5x3vq-z1b96,No,zb2dj-d0f86 -h0e90-htt06,10.1029/2021jc018375,10.22002/D1.1296,2vdrb-bmr68,No,26ts2-f1j05 -0f32n-bgm37,10.1029/2021gl095227,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 -0f32n-bgm37,10.1029/2021gl095227,10.22002/D1.2033,jn6d1-wfj05,No,t8h2f-87s64 -nde62-t1r58,10.1029/2021JB022462,10.22002/D1.1970,h6438-cgy98,No,pbgwc-7b016 -rwr63-xyt69,10.1029/2021gl096503,10.22002/D1.2135,pyr6e-wt732,No,2kvfz-erc91 -r1pva-c8x21,10.1029/2022jb024329,10.22002/D1.20035,wxsas-96g08,No,8tj1d-22780 -rb73g-vqz89,10.1016/j.xcrp.2022.100959,10.22002/D1.20057,bfap4-h2m21,No,px5ss-5y916 -wr8qq-s8t58,10.1109/WHISPERS.2016.8071774,10.22002/D1.222,k8gnx-7hj46,No,6ffmj-n7192 -ynbgx-0tx98,10.1029/2021ms002747,10.22002/D1.1231,meh5c-wy279,No,61shk-rcs26 -6gcmj-reb48,10.1038/s41586-022-04749-3,10.22002/D1.2155,y72mq-emt30,No,5jjz9-40b67 -vpnng-szs82,10.1126/sciadv.abn9545,10.22002/D1.20048,hend5-jzt61,No,sb743-qp239 -1q6gn-mvc46,10.1029/2022ms003105,10.22002/D1.20052,j8mw7-fm491,No,5mdtq-15724 -ck6pf-68621,10.1093/gji/ggab407,10.22002/D1.1955,31emd-wmv98,No,33y3z-2te02 -d20js-7z640,10.1029/2022gl101715,10.22002/vn6v0-pfr77,vn6v0-pfr77,No,dcy6h-wem15 -rsekp-4g847,10.1109/tgrs.2023.3305194,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 -cmvm1-e9379,10.1029/2023gl105205,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 -wkjh4-6nf06,10.1101/2022.03.02.22271724,10.22002/D1.20049,pmm08-6q581,No,rtc14-wmr12 -p6mn1-gs660,10.1101/2022.06.17.496478,10.22002/D1.20199,5vng7-8ne78,No,afznk-vbp04 -vtef0-x7037,10.1101/2022.07.13.22277113,10.22002/D1.20223,0yw13-j0441,No,szsyt-ytf03 -q40ve-64h03,10.1002/essoar.10510937.2,10.22002/D1.20052,j8mw7-fm491,No,5mdtq-15724 -4w29e-xpa51,10.1002/essoar.10510458.2,10.22002/D1.1891,n1yye-2z213,No,zeyw2-27p09 -e9yy0-v0658,10.1002/essoar.10511838.1,10.22002/D1.20038,e1acg-e0k08,No,3pg72-ddh27 -x0cjg-kxg25,10.1101/2022.07.13.22277513,10.22002/D1.20223,0yw13-j0441,No,szsyt-ytf03 -jh5g2-fyg97,10.1002/essoar.10512148.1,10.22002/D1.20252,dhfgs-p7319,No,wjhsn-kbv79 -fgbgn-2rk49,10.1002/essoar.10512118.2,10.22002/D1.20248,pybpv-w7661,No,wp4x8-1vc62 -91hvf-49c47,,10.22002/D1.1438,3ejxg-69q72,No,2ths5-hkw28 -yvded-18923,10.1101/2020.07.25.210468,10.22002/D1.1407,fc4k3-75q88,No,c4fh9-f1170 -166f3-tj121,,10.22002/D1.1100,dkdt4-e0x94,No,bcn0f-n7z19 -nb3f9-s8p96,10.1002/essoar.10504190.1,10.22002/D1.1617,4peqr-t0723,No,npzd5-bzb58 -5xkg8-an610,10.1101/762773,10.22002/d1.1276,gye0e-gw976,No,gqpst-s3d05 -24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1915,653dj-3b761,No,mtzc2-18p79 -24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1900,82zcr-mxa32,No,n1vb7-m1g75 -24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1905,cj7ec-8g780,No,j8p30-za980 
-24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1914,5vd8b-sz113,No,8rp1w-qfz76 -24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1917,2t5tj-e5w45,No,w80kf-hyt30 -24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1916,0vgd3-9cp11,No,9w2er-n4713 -24wrg-rx971,10.1101/2021.05.13.444042,10.22002/D1.1918,5x3vq-z1b96,No,zb2dj-d0f86 -2nvtm-7qe34,10.1029/2019je006191,10.22002/d1.1304,dmrkv-3xn63,No,91tsz-cfc02 -m07dw-3jq86,10.1038/s41587-019-0372-z,10.22002/D1.1311,a73n8-3pa89,No,dth8s-nba11 -n3qyx-kf341,10.1029/2019je006156,10.22002/D1.1299,df548-mhy96,No,a15h1-f1j49 -ks7h3-tf644,10.1038/s42005-021-00703-3,10.22002/D1.1858,vp3y4-ef326,No,2f5p2-2e148 -8mp48-93523,,10.22002/D1.1315,d8w02-xng39,No,q0346-mn915 -a7vwv-mhw66,10.1101/2022.07.10.499405,10.22002/D1.20215,awtz3-tz122,No,xfqva-b4991 -h26g5-k0311,10.1029/2018ea000416,10.22002/D1.1211,5ptpj-x8c08,No,7f5x5-pa209 -h26g5-k0311,10.1029/2018ea000416,10.22002/D1.1212,3j8a8-bzn05,No,xk40w-p0g08 -y47b9-pmn34,10.48550/arXiv.1905.06360,10.22002/D1.1241,fbv6r-hg153,No,cqnnk-chw84 -zf2v5-vx810,10.1029/2018JE005706,10.22002/D1.1087,hhzzq-yw058,No,e67v1-3kg31 -zf2v5-vx810,10.1029/2018JE005706,10.22002/D1.1085,44km0-er448,No,g867t-x4d04 -zf2v5-vx810,10.1029/2018JE005706,10.22002/D1.1086,c8fwc-kvg38,No,c1gj1-hnb74 -k6jp7-8js32,10.1029/2019gl086424,10.22002/d1.1317,rdv1n-st737,No,8kgzq-0z165 -se7bz-k9a90,10.1029/2019jb018855,10.22002/D1.1293,mtn3h-frk09,No,hrpxj-0tr80 -frxqw-qyw74,10.1063/1.5054927,10.22002/D1.305,0t2qn-hvq19,No,f3cwe-hgm53 -dzgj9-y8y49,10.3390/rs12213586,10.22002/d1.1182,45st6-jvh02,No,xk1ex-axk10 -ge4t4-7e026,10.1029/2019je006298,10.22002/D1.1318,gyvwk-btq07,No,te8rh-sxq88 -ge4t4-7e026,10.1029/2019je006298,10.22002/D1.1211,5ptpj-x8c08,No,7f5x5-pa209 -ze84b-v1782,10.1101/2020.12.09.20239467,10.22002/D1.1702,24265-gtd53,No,y98md-bp961 -d4h00-xf206,10.1029/2022jb025425,10.22002/D1.20252,dhfgs-p7319,No,wjhsn-kbv79 -x179p-jhv14,10.7554/elife.85370,10.22002/D1.2157,g3sp0-33085,No,9eh2f-k1a76 -ywtpy-nka66,10.1038/s41929-021-00618-w,10.22002/D1.1632,1km87-52j70,No,fdx2t-3rx08 -zwfrv-rbn84,10.1038/s41467-021-25443-4,10.22002/D1.2032,jvjkh-d9g50,No,x6gw8-as345 -y99cf-fkr11,10.1016/j.icarus.2022.115079,10.22002/D1.20170,3ra81-96y32,No,fddq3-zmn81 -ernz6-2xp43,10.1101/2022.03.21.484932,10.22002/D1.20060,hwv5v-m9x76,No,vfz20-ydp76 -rxd0x-ag678,10.1029/2020JE006675,10.22002/D1.1617,4peqr-t0723,No,npzd5-bzb58 -t2ser-vfe73,10.1029/2020ja027796,10.22002/D1.1333,r35r5-sb884,No,ngxcf-0a955 -v84d4-vcs34,10.1038/s41586-020-2872-x,10.22002/D1.1647,jhn25-fsd29,No,24b5z-t4m60 -c03z0-nfg11,10.1038/s41598-020-77073-3,10.22002/D1.1407,fc4k3-75q88,No,c4fh9-f1170 -4fdtn-y2e21,10.1029/2020je006606,10.22002/D1.1628,d5jt1-wqt82,No,wx24e-5yd65 -ct2sc-f7m12,10.1029/2019av000140,10.22002/D1.1347,8hm1f-w5492,No,9tcpx-vf039 -rdqe2-hsq97,10.1029/2020jb021369,10.22002/D1.1670,e200b-xsm06,No,4qv0g-7yt28 -363j8-nw138,10.1038/s41587-021-00870-2,10.22002/D1.1876,7704n-f6m57,No,3gz1s-d5261 -r7rsd-a7a17,,10.22002/D1.2026,51sah-d9r47,No,mn3a1-x3x94 -r7rsd-a7a17,,10.22002/D1.2025,p8ppf-7ff93,No,e1acb-n8a94 -6f3gb-97h58,10.1126/science.abg2947,10.22002/D1.1976,zpzee-79351,No,px1w1-xqr37 -wfg20-4tn76,10.1029/2020tc006210,10.22002/D1.1388,8dhjv-rvf91,No,zwjd2-rw873 -n2b9p-8jf71,10.1029/2021JB021886,10.22002/D1.1612,ryqnw-bdf94,No,mzwax-jw720 -30s9e-9z096,10.1029/2021jb022676,10.22002/D1.2009,7tp28-jp627,No,bzqfj-f7077 -30s9e-9z096,10.1029/2021jb022676,10.22002/D1.2141,dwkqy-hkj69,No,e0zwb-f9s86 -30s9e-9z096,10.1029/2021jb022676,10.22002/D1.2142,665w3-pbj51,No,dygfc-38b42 
-fx63v-hsd80,10.1029/2021gl092598,10.22002/D1.2023,m1r58-kvb98,No,ft45b-8nw98 -vfg98-9zt14,10.5194/gmd-14-6309-2021,10.22002/D1.971,8mkmg-c2938,No,3ydw5-6yx19 -b0xvf-xn162,10.1038/s41561-021-00706-3,10.22002/D1.1874,3hrqe-5x450,No,7m4ep-06188 -b0xvf-xn162,10.1038/s41561-021-00706-3,10.22002/D1.1873,8ydek-yt879,No,m86ym-m3603 -08dr4-w6943,10.1029/2021je006828,10.22002/D1.1971,gv4qf-qwa77,No,mm42q-wft65 -yzjs7-1cv55,10.1016/j.jsb.2022.107860,10.22002/D1.2096,2vtv3-pp862,No,tyest-mem06 -yzjs7-1cv55,10.1016/j.jsb.2022.107860,10.22002/D1.2099,tz8jq-0mk77,No,yypym-x5693 -yzjs7-1cv55,10.1016/j.jsb.2022.107860,10.22002/D1.2103,kfkqj-q6557,No,4khh3-dpp37 -0mmmp-9pz59,10.1029/2023jb026488,10.22002/D1.20248,pybpv-w7661,No,wp4x8-1vc62 -phbx7-m8a69,10.1088/2515-7655/ac817e,10.22002/D1.20061,q9zpw-g8s64,No,skmrx-14a11 -r5hzq-dzy83,10.2110/jsr.2022.032,10.22002/D1.20044,hzwqz-5wr07,No,0y3zw-g1n05 -sar53-81n52,10.1016/j.epsl.2023.118277,10.22002/D1.1619,s6ey4-qpm11,No,ac32t-mff72 -sar53-81n52,10.1016/j.epsl.2023.118277,10.22002/D1.1620,2znta-5t680,No,tka8c-bjp40 -y5gpx-saw63,10.5194/amt-17-5861-2024,10.14291/TCCON.GGG2014,rhrv4-mcp55,No,jn2dg-2h888 -knwz1-dvb78,,10.14291/tccon.ggg2014.pasadena01.r1/1182415,tb378-y1a55,No,8z4qd-mhz12 -dm3mv-q1b76,10.1038/s41524-019-0216-x,10.22002/D1.1256,yrn11-jb916,No,t899g-xww46 diff --git a/ames/matchers/__init__.py b/ames/matchers/__init__.py index a5923f56..69fb09bc 100644 --- a/ames/matchers/__init__.py +++ b/ames/matchers/__init__.py @@ -24,3 +24,4 @@ from .caltechauthors import save_metadata_to_file from .caltechauthors import add_related_identifiers_from_csv from .caltechauthors import add_authors_affiliations +from .caltechauthors import process_link_updates diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index 29fd02af..02017c85 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -342,201 +342,158 @@ def move_doi(record, token, test=False): ) -def add_related_identifiers_from_csv(csv_path, test=False): +def add_related_identifiers_from_csv(data_rows, token, test=False): """Reads a CSV file and adds related identifiers to each record using the CaltechDATA API.""" - base_url = ( - "https://data.caltechlibrary.dev" - if test - else "https://data.caltechlibrary.caltech.edu" - ) + base_url = "https://data.caltechlibrary.dev" if test else "https://data.caltechlibrary.caltech.edu" headers = { "Authorization": f"Bearer {token}", "Content-type": "application/json", } + results = [] + for row in data_rows: + record_id = row['Test_ID'] + doi = row['CaltechAUTHORS_DOI'] + caltech_author_id = row['CaltechAUTHORS_ID'] + resource_type = row['resource_type'] + + print(f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}") + print(f"Using resource_type: {resource_type}") + + # Fetch the current record + response = requests.get(f"{base_url}/api/records/{record_id}", headers=headers) + if response.status_code != 200: + print(f"Error fetching record {record_id}: {response.status_code}") + continue + record_data = response.json() - with open(csv_path, "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - record_id = row["Test_ID"] - doi = row["CaltechAUTHORS_DOI"] - caltech_author_id = row["CaltechAUTHORS_ID"] - resource_type = row["resource_type"] - - print( - f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}" - ) - print(f"Using resource_type: {resource_type}") - - # Fetch the current record - response = 
requests.get( - f"{base_url}/api/records/{record_id}", headers=headers - ) - if response.status_code != 200: - print(f"Error fetching record {record_id}: {response.status_code}") + # Draft check or create + draft_response = requests.get(f"{base_url}/api/records/{record_id}/draft", headers=headers) + if draft_response.status_code == 200: + record_data = draft_response.json() + else: + draft_create_response = requests.post(f"{base_url}/api/records/{record_id}/draft", headers=headers) + if draft_create_response.status_code != 201: + print(f"Error creating draft: {draft_create_response.status_code}") continue - record_data = response.json() - - # Draft check or create - draft_response = requests.get( - f"{base_url}/api/records/{record_id}/draft", headers=headers - ) - if draft_response.status_code == 200: - record_data = draft_response.json() - else: - draft_create_response = requests.post( - f"{base_url}/api/records/{record_id}/draft", headers=headers - ) - if draft_create_response.status_code != 201: - print(f"Error creating draft: {draft_create_response.status_code}") - continue - record_data = draft_create_response.json() - - related_identifiers = ( - record_data.get("metadata", {}).get("related_identifiers", []) or [] - ) + record_data = draft_create_response.json() + + related_identifiers = record_data.get("metadata", {}).get("related_identifiers", []) or [] + + doi_exists = any(ri.get("identifier") == doi for ri in related_identifiers) + author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" + author_url_exists = any(ri.get("identifier") == author_url for ri in related_identifiers) + + if not doi_exists: + related_identifiers.append({ + "relation_type": {"id": "issupplementedby"}, + "identifier": doi, + "scheme": "doi", + "resource_type": {"id": resource_type} + }) + print(f"Adding DOI: {doi}") + else: + print(f"DOI already exists") + + if not author_url_exists: + related_identifiers.append({ + "relation_type": {"id": "isreferencedby"}, + "identifier": author_url, + "scheme": "url", + "resource_type": {"id": resource_type} + }) + print(f"Adding CaltechAUTHORS link: {author_url}") + else: + print(f"CaltechAUTHORS link already exists") - doi_exists = any(ri.get("identifier") == doi for ri in related_identifiers) - author_url = ( - f"https://authors.library.caltech.edu/records/{caltech_author_id}" - ) - author_url_exists = any( - ri.get("identifier") == author_url for ri in related_identifiers - ) + record_data["metadata"]["related_identifiers"] = related_identifiers - if not doi_exists: - related_identifiers.append( - { - "relation_type": {"id": "issupplementedby"}, - "identifier": doi, - "scheme": "doi", - "resource_type": {"id": resource_type}, - } - ) - print(f"Adding DOI: {doi}") - else: - print(f"DOI already exists") - - if not author_url_exists: - related_identifiers.append( - { - "relation_type": {"id": "isreferencedby"}, - "identifier": author_url, - "scheme": "url", - "resource_type": {"id": resource_type}, - } - ) - print(f"Adding CaltechAUTHORS link: {author_url}") - else: - print(f"CaltechAUTHORS link already exists") + update_response = requests.put( + f"{base_url}/api/records/{record_id}/draft", headers=headers, json=record_data + ) + if update_response.status_code != 200: + print(f"Error updating draft: {update_response.status_code}") + continue - record_data["metadata"]["related_identifiers"] = related_identifiers + publish_response = requests.post( + f"{base_url}/api/records/{record_id}/draft/actions/publish", headers=headers + ) + if 
publish_response.status_code != 202: + print(f"Error publishing record {record_id}: {publish_response.status_code}") + results.append((record_id, False)) + continue - update_response = requests.put( - f"{base_url}/api/records/{record_id}/draft", - headers=headers, - json=record_data, - ) - if update_response.status_code != 200: - print(f"Error updating draft: {update_response.status_code}") - continue + print(f"Successfully updated and published {record_id}") + results.append((record_id, True)) + return results - publish_response = requests.post( - f"{base_url}/api/records/{record_id}/draft/actions/publish", - headers=headers, +def process_link_updates(input_csv): + # read the CSV file and build a dictionary: record_id -> {"links": [(link, classification), ...]} + records_data = {} + with open(input_file, newline="") as f: + reader = csv.DictReader(f, delimiter=",") + for row in reader: + record_id = row["record_id"].strip() + link = row["link"].strip() + classification = row["classification"].strip() + + if record_id not in records_data: + records_data[record_id] = { + "links": [] + } + records_data[record_id]["links"].append((link, classification)) + + results = [] + + for record_id, record_info in records_data.items(): + print(f"Processing record {record_id}") + + # get metadata for the record + metadata = get_record_metadata(record_id) + if not metadata: + # if we failed to get metadata, record the error and continue + first_link = record_info["links"][0][0] if record_info["links"] else "" + results.append( + { + "record_id": record_id, + "link": first_link, + "doi_check": None, + "metadata_updated": False, + "notes": "Failed to retrieve metadata", + } ) - if publish_response.status_code != 202: - print( - f"Error publishing record {record_id}: {publish_response.status_code}" - ) - continue - - print(f"Successfully updated and published {record_id}") - - print("All records processed.") - - -def add_authors_affiliations(record, token, dimensions_key, allowed_identifiers=None): - # Add dimensions affiliations to a record + continue - record_id = record["id"] - if "doi" in record["pids"]: - doi = record["pids"]["doi"]["identifier"] - else: - doi = None - if "identifiers" in record["metadata"]: - for idv in record["metadata"]["identifiers"]: - if idv["scheme"] == "doi": - doi = idv["identifier"] - if doi: - endpoint = "https://cris-api.dimensions.ai/v3" - dimcli.login(key=dimensions_key, endpoint=endpoint, verbose=False) - dsl = dimcli.Dsl() - res = dsl.query_iterative( - f""" - search publications - where doi = "{doi}" - return publications[basics+extras+abstract] """, - verbose=False, + # check existing related identifiers in the record + related_identifiers = metadata.get("metadata", {}).get("related_identifiers", []) + + # run check_doi if a "doi" is present among the links + doi_check = None + for (lk, ctype) in record_info["links"]: + if ctype.lower() == "doi": + try: + doi_check = check_doi(lk, production=True) + except Exception as e: + doi_check = f"Error: {str(e)}" + + # update related identifiers + updated_metadata, updated_flag = update_related_identifiers( + metadata, record_info["links"], source_type="data" ) - publication = res.json["publications"] - update = False - if len(publication) == 1: - publication = publication[0] - dimensions_authors = publication.get("authors", []) - existing_authors = record["metadata"]["creators"] - if len(dimensions_authors) == len(existing_authors): - for position in range(len(dimensions_authors)): - author = existing_authors[position] - 
dimensions_author = dimensions_authors[position] - if "affiliations" not in author: - affiliations = [] - affiliation_ids = [] - if dimensions_author["affiliations"] not in [[], None]: - for affiliation in dimensions_author["affiliations"]: - affil = {} - if "id" in affiliation: - if affiliation["id"] is not None: - ror = grid_to_ror(affiliation["id"]) - if ror is not None: - if allowed_identifiers is not None: - if ror in allowed_identifiers: - affil["id"] = ror - else: - print( - "ROR %s not in allowed identifiers list" - % ror - ) - else: - print( - "Missing ROR for affiliation %s" - % affiliation["id"] - ) - # We have to manually handle incorrectly mapped JPL - # affiliations - if "raw_affiliation" in affiliation: - raw = affiliation["raw_affiliation"] - affil["name"] = raw - if "91109" in raw: - affil["id"] = "027k65916" - if "Jet Propulsion Laboratory" in raw: - affil["id"] = "027k65916" - if "JPL" in raw: - affil["id"] = "027k65916" - # Some dimensions records don't include id values. - # We ignore those for now - if "id" in affil: - if affil["id"] not in affiliation_ids: - update = True - affiliation_ids.append(affil["id"]) - affiliations.append(affil) - existing_authors[position]["affiliations"] = affiliations - if update: - caltechdata_edit( - record_id, - metadata=record, - token=token, - production=True, - publish=True, - authors=True, - ) + if updated_flag: + # saving to local JSON file for reference + save_metadata_to_file(updated_metadata, record_id) + pass + + # preparing the final row for the results CSV + first_link = record_info["links"][0][0] if record_info["links"] else "" + results.append( + { + "record_id": record_id, + "link": first_link, + "doi_check": doi_check, + "metadata_updated": updated_flag, + "notes": "", + } + ) + return results diff --git a/run_caltechauthors_get_links.py b/run_caltechauthors_get_links.py new file mode 100644 index 00000000..cd486026 --- /dev/null +++ b/run_caltechauthors_get_links.py @@ -0,0 +1,16 @@ +from ames.harvesters.caltechauthors import get_data_availability_links +import csv +import os + +output_file = "test_results_get_links.csv" +token = os.environ.get("RDMTOK") +results = get_data_availability_links(token=token) + +if results: + with open(output_file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=results[0].keys()) + writer.writeheader() + writer.writerows(results) + print(f"Saved {len(results)} links to {output_file}") +else: + print("No results.") diff --git a/run_caltechauthors_update_links.py b/run_caltechauthors_update_links.py new file mode 100644 index 00000000..7f58fc48 --- /dev/null +++ b/run_caltechauthors_update_links.py @@ -0,0 +1,16 @@ +from ames.matchers.caltechauthors import process_link_updates +import csv + +input_file = "non_publisher_links.csv" +output_file = "test_results_update_links.csv" + +results = process_link_updates(input_file) + +if results: + with open(output_file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=results[0].keys()) + writer.writeheader() + writer.writerows(results) + print(f"Saved update results to {output_file}") +else: + print("No results.") diff --git a/run_harvest_links.py b/run_harvest_links.py new file mode 100644 index 00000000..3d71948c --- /dev/null +++ b/run_harvest_links.py @@ -0,0 +1,88 @@ +import os +import csv +import json +import requests + +from ames.matchers.caltechauthors import ( + get_record_metadata, + update_related_identifiers, + save_metadata_to_file, + check_doi, +) + + +def main(): + input_file = "non_publisher_links.csv" + 
output_file = "test_results_matchers.csv" + + # read the CSV file and build a dictionary: record_id -> {"links": [(link, classification), ...]} + records_data = {} + with open(input_file, newline="") as f: + reader = csv.DictReader(f, delimiter=",") + for row in reader: + record_id = row["record_id"].strip() + link = row["link"].strip() + classification = row["classification"].strip() + + if record_id not in records_data: + records_data[record_id] = { + "links": [] + } + records_data[record_id]["links"].append((link, classification)) + + results = [] + + for record_id, record_info in records_data.items(): + print(f"Processing record {record_id}") + + # get metadata for the record + metadata = get_record_metadata(record_id) + if not metadata: + # if we failed to get metadata, record the error and continue + first_link = record_info["links"][0][0] if record_info["links"] else "" + results.append( + { + "record_id": record_id, + "link": first_link, + "doi_check": None, + "metadata_updated": False, + "notes": "Failed to retrieve metadata", + } + ) + continue + + # check existing related identifiers in the record + related_identifiers = metadata.get("metadata", {}).get("related_identifiers", []) + + # run check_doi if a "doi" is present among the links + doi_check = None + for (lk, ctype) in record_info["links"]: + if ctype.lower() == "doi": + try: + doi_check = check_doi(lk, production=True) + except Exception as e: + doi_check = f"Error: {str(e)}" + + # update related identifiers + updated_metadata, updated_flag = update_related_identifiers( + metadata, record_info["links"], source_type="data" + ) + if updated_flag: + # saving to local JSON file for reference + save_metadata_to_file(updated_metadata, record_id) + pass + + # preparing the final row for the results CSV + first_link = record_info["links"][0][0] if record_info["links"] else "" + results.append( + { + "record_id": record_id, + "link": first_link, + "doi_check": doi_check, + "metadata_updated": updated_flag, + "notes": "", + } + ) + +if __name__ == "__main__": + main() diff --git a/tests/test_matchers.py b/tests/test_matchers.py new file mode 100644 index 00000000..9e8c5840 --- /dev/null +++ b/tests/test_matchers.py @@ -0,0 +1,76 @@ +import os +import unittest +import csv +import random +import requests +import sys + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from ames.matchers.caltechauthors import add_related_identifiers_from_csv + +token = "0UrVehnBSM9c7DQZZCM2EtB4lpuEwbTiLue6rf0Vme3lnzswlMA9whjJbmhX" +CSV_FILE = "test.csv" + +def load_test_data(from_csv=True): + if from_csv and os.path.exists(CSV_FILE): + with open(CSV_FILE, 'r') as f: + reader = csv.DictReader(f) + return list(reader) + else: + return [{ + "CaltechAUTHORS_ID": "bwww3-z8y74", + "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{random.randint(1000, 9999)}", + "Related_DOI": "10.22002/D1.1458", + "Data_ID": "3hqgp-jhw61", + "Cross_Link": "No", + "Test_ID": "99s7k-d6f58", + "resource_type": "publication-article" + }] + +def verify_related_identifiers_on_site(data_rows, test=False): + base_url = "https://data.caltechlibrary.dev" if test else "https://data.caltechlibrary.caltech.edu" + headers = {"Authorization": f"Bearer {token}"} + results = [] + + for row in data_rows: + record_id = row['Test_ID'] + doi = row['CaltechAUTHORS_DOI'] + caltech_author_id = row['CaltechAUTHORS_ID'] + author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" + + r = requests.get(f"{base_url}/api/records/{record_id}", 
headers=headers) + if r.status_code != 200: + print(f"❌ Could not fetch record {record_id}") + results.append((record_id, False)) + continue + + metadata = r.json().get("metadata", {}) + related = metadata.get("related_identifiers", []) + found_doi = any(x["identifier"] == doi for x in related) + found_author = any(x["identifier"] == author_url for x in related) + + if found_doi and found_author: + print(f"✅ Verified: {record_id}") + results.append((record_id, True)) + else: + print(f"❌ Verification failed: {record_id}") + results.append((record_id, False)) + + return results + +class TestCaltechDataUploader(unittest.TestCase): + + def test_add_and_verify_related_identifiers(self): + test_data = load_test_data(from_csv=False) # <-- change this flag to toggle source + upload_results = add_related_identifiers_from_csv(test_data, token, test=True) + for record_id, success in upload_results: + self.assertTrue(success, f"❌ Upload failed for record {record_id}") + + verify_results = verify_related_identifiers_on_site(test_data, test=True) + for record_id, success in verify_results: + self.assertTrue(success, f"❌ Verification failed for record {record_id}") + + +if __name__ == "__main__": + unittest.main() From 25b94d800336e2f9063ee02362785c9fdd2592ae Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP Date: Mon, 19 May 2025 09:56:39 +0000 Subject: [PATCH 03/19] Add updated CITATION.cff from codemeta.json file --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 4c17ad48..5900c905 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -21,4 +21,4 @@ keywords: - GitHub - metadata - software -date-released: 2025-05-07 +date-released: 2025-05-19 From d9318e55d30a0e3522184c24a135883ebd737354 Mon Sep 17 00:00:00 2001 From: Rohan Bhattarai Date: Mon, 19 May 2025 14:44:34 -0700 Subject: [PATCH 04/19] Black_Formatting --- add_orcid_script.py | 8 +- ames/harvesters/caltechauthors.py | 128 ++++++++++++++++--------- ames/matchers/caltechauthors.py | 83 ++++++++++------ run_archives_report.py | 5 +- run_authors_affiliation_enhancement.py | 12 +-- run_authors_group_report.py | 6 +- run_authors_name_update.py | 2 +- run_caltechauthors_harvestors.py | 28 +++--- run_caltechauthors_matchers.py | 13 +-- run_harvest_links.py | 13 +-- tests/test_matchers.py | 43 +++++---- 11 files changed, 211 insertions(+), 130 deletions(-) diff --git a/add_orcid_script.py b/add_orcid_script.py index 92d0d6f4..0bd59160 100644 --- a/add_orcid_script.py +++ b/add_orcid_script.py @@ -1,9 +1,11 @@ -import csv,os +import csv, os -with open('orcids.csv', 'r') as f: +with open("orcids.csv", "r") as f: reader = csv.reader(f) orcid_list = list(reader) for orcid_data in orcid_list: orcid = orcid_data[8] clpid = orcid_data[10] - os.system(f'python run_authors_name_update.py {clpid} {orcid} -add -new-scheme orcid') + os.system( + f"python run_authors_name_update.py {clpid} {orcid} -add -new-scheme orcid" + ) diff --git a/ames/harvesters/caltechauthors.py b/ames/harvesters/caltechauthors.py index 326d73d9..b68fd60d 100644 --- a/ames/harvesters/caltechauthors.py +++ b/ames/harvesters/caltechauthors.py @@ -205,9 +205,7 @@ def get_author_records( query = f'?q=metadata.creators.person_or_org.identifiers.identifier%3A"{author_identifier}"' if date: - query += ( - f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D" - ) + query += f"%20AND%20metadata.publication_date%3A%5B{date}%20TO%20%2A%20%5D" if token: headers = { @@ -334,6 +332,7 @@ def get_records_from_date(date="2023-08-25", 
test=False): return hits + def doi2url(doi): if not doi.startswith("10."): return doi @@ -351,6 +350,7 @@ def doi2url(doi): return resolved_url return doi + def fetch_metadata(record_id): url = f"https://authors.library.caltech.edu/api/records/{record_id}" try: @@ -360,11 +360,12 @@ def fetch_metadata(record_id): except: return None + def search_resource_type(obj): if isinstance(obj, dict): for k, v in obj.items(): - if k == 'resource_type' and isinstance(v, dict) and 'id' in v: - return v['id'] + if k == "resource_type" and isinstance(v, dict) and "id" in v: + return v["id"] result = search_resource_type(v) if result: return result @@ -375,8 +376,10 @@ def search_resource_type(obj): return result return None + def fetch_resource_type(data): - return search_resource_type(data) or 'N/A' + return search_resource_type(data) or "N/A" + def search_records(prefix): base_url = "https://authors.library.caltech.edu/api/records" @@ -387,6 +390,7 @@ def search_records(prefix): return response.json() return None + def extract_data_citations(hits): citations = [] for hit in hits: @@ -395,38 +399,50 @@ def extract_data_citations(hits): if not metadata: continue - caltechauthors_doi = metadata.get("pids", {}).get("doi", {}).get("identifier", "") + caltechauthors_doi = ( + metadata.get("pids", {}).get("doi", {}).get("identifier", "") + ) resource_type = fetch_resource_type(metadata) related_dois = [] for identifier in metadata.get("metadata", {}).get("related_identifiers", []): if identifier.get("scheme") == "doi": doi = identifier["identifier"] - if any(doi.startswith(prefix) for prefix in ["10.22002/", "10.14291/", "10.25989/"]): + if any( + doi.startswith(prefix) + for prefix in ["10.22002/", "10.14291/", "10.25989/"] + ): related_dois.append(doi) for doi in related_dois: caltechdata_url = doi2url(doi) if "data.caltech.edu/records/" in caltechdata_url: caltechdata_id = caltechdata_url.split("/records/")[-1] - caltechdata_metadata = requests.get(f"https://data.caltech.edu/api/records/{caltechdata_id}").json() + caltechdata_metadata = requests.get( + f"https://data.caltech.edu/api/records/{caltechdata_id}" + ).json() cross_link = "No" - for identifier in caltechdata_metadata.get("metadata", {}).get("related_identifiers", []): + for identifier in caltechdata_metadata.get("metadata", {}).get( + "related_identifiers", [] + ): if identifier.get("identifier") == caltechauthors_doi: cross_link = "Yes" break - citations.append({ - "CaltechAUTHORS_ID": record_id, - "CaltechAUTHORS_DOI": caltechauthors_doi, - "Related_DOI": doi, - "CaltechDATA_ID": caltechdata_id, - "Cross_Link": cross_link, - "resource_type": resource_type - }) + citations.append( + { + "CaltechAUTHORS_ID": record_id, + "CaltechAUTHORS_DOI": caltechauthors_doi, + "Related_DOI": doi, + "CaltechDATA_ID": caltechdata_id, + "Cross_Link": cross_link, + "resource_type": resource_type, + } + ) return citations + def generate_data_citation_csv(): prefixes = ["10.22002", "10.14291", "10.25989"] all_citations = [] @@ -439,50 +455,66 @@ def generate_data_citation_csv(): output_file = "data_citations_with_type.csv" with open(output_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["CaltechAUTHORS_ID", "CaltechAUTHORS_DOI", "Related_DOI", "CaltechDATA_ID", "Cross_Link", "resource_type"]) + writer.writerow( + [ + "CaltechAUTHORS_ID", + "CaltechAUTHORS_DOI", + "Related_DOI", + "CaltechDATA_ID", + "Cross_Link", + "resource_type", + ] + ) for citation in all_citations: - writer.writerow([ - citation["CaltechAUTHORS_ID"], - 
citation["CaltechAUTHORS_DOI"], - citation["Related_DOI"], - citation["CaltechDATA_ID"], - citation["Cross_Link"], - citation["resource_type"] - ]) + writer.writerow( + [ + citation["CaltechAUTHORS_ID"], + citation["CaltechAUTHORS_DOI"], + citation["Related_DOI"], + citation["CaltechDATA_ID"], + citation["Cross_Link"], + citation["resource_type"], + ] + ) print(f"Saved {len(all_citations)} citations to {output_file}") + def get_data_availability_links(token=None, size=25): base_url = "https://authors.library.caltech.edu/api/records?q=metadata.additional_descriptions.type.id%3A%22data-availability%22&size=25&sort=bestmatch" - base_file_url_template = "https://authors.library.caltech.edu/api/records/{record_id}/files" - + base_file_url_template = ( + "https://authors.library.caltech.edu/api/records/{record_id}/files" + ) + token = os.environ.get("RDMTOK") - + output_file = "test_results_harvesters.csv" - + headers = {} if token: headers = { "Authorization": f"Bearer {token}", "Content-type": "application/json", } - + response = requests.get(base_url, headers=headers) if response.status_code != 200: - print(f"Error: Unable to fetch records from the API. Status code: {response.status_code}") + print( + f"Error: Unable to fetch records from the API. Status code: {response.status_code}" + ) exit() - + records = response.json().get("hits", {}).get("hits", []) - + if not records: print("No records found.") exit() - + results = [] for record in records: record_id = record.get("id") links = record.get("metadata", {}).get("additional_descriptions", []) - + for link_data in links: description = link_data.get("description", "") links_in_description = extract_https_links(description) @@ -491,14 +523,16 @@ def get_data_availability_links(token=None, size=25): cleaned = clean_link(link) filename = extract_filename_from_link(link) file_present = is_file_present(record_id, filename) - - results.append({ - "record_id": record_id, - "original_link": link, - "classification": classification, - "cleaned_link": cleaned, - "filename": filename, - "file_present": file_present - }) - + + results.append( + { + "record_id": record_id, + "original_link": link, + "classification": classification, + "cleaned_link": cleaned, + "filename": filename, + "file_present": file_present, + } + ) + return results diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index 02017c85..c41dbf9d 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -345,19 +345,25 @@ def move_doi(record, token, test=False): def add_related_identifiers_from_csv(data_rows, token, test=False): """Reads a CSV file and adds related identifiers to each record using the CaltechDATA API.""" - base_url = "https://data.caltechlibrary.dev" if test else "https://data.caltechlibrary.caltech.edu" + base_url = ( + "https://data.caltechlibrary.dev" + if test + else "https://data.caltechlibrary.caltech.edu" + ) headers = { "Authorization": f"Bearer {token}", "Content-type": "application/json", } results = [] for row in data_rows: - record_id = row['Test_ID'] - doi = row['CaltechAUTHORS_DOI'] - caltech_author_id = row['CaltechAUTHORS_ID'] - resource_type = row['resource_type'] + record_id = row["Test_ID"] + doi = row["CaltechAUTHORS_DOI"] + caltech_author_id = row["CaltechAUTHORS_ID"] + resource_type = row["resource_type"] - print(f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: {caltech_author_id}") + print( + f"\nProcessing Test_ID: {record_id} with DOI: {doi} and CaltechAUTHORS_ID: 
{caltech_author_id}" + ) print(f"Using resource_type: {resource_type}") # Fetch the current record @@ -368,40 +374,52 @@ def add_related_identifiers_from_csv(data_rows, token, test=False): record_data = response.json() # Draft check or create - draft_response = requests.get(f"{base_url}/api/records/{record_id}/draft", headers=headers) + draft_response = requests.get( + f"{base_url}/api/records/{record_id}/draft", headers=headers + ) if draft_response.status_code == 200: record_data = draft_response.json() else: - draft_create_response = requests.post(f"{base_url}/api/records/{record_id}/draft", headers=headers) + draft_create_response = requests.post( + f"{base_url}/api/records/{record_id}/draft", headers=headers + ) if draft_create_response.status_code != 201: print(f"Error creating draft: {draft_create_response.status_code}") continue record_data = draft_create_response.json() - related_identifiers = record_data.get("metadata", {}).get("related_identifiers", []) or [] + related_identifiers = ( + record_data.get("metadata", {}).get("related_identifiers", []) or [] + ) doi_exists = any(ri.get("identifier") == doi for ri in related_identifiers) author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" - author_url_exists = any(ri.get("identifier") == author_url for ri in related_identifiers) + author_url_exists = any( + ri.get("identifier") == author_url for ri in related_identifiers + ) if not doi_exists: - related_identifiers.append({ - "relation_type": {"id": "issupplementedby"}, - "identifier": doi, - "scheme": "doi", - "resource_type": {"id": resource_type} - }) + related_identifiers.append( + { + "relation_type": {"id": "issupplementedby"}, + "identifier": doi, + "scheme": "doi", + "resource_type": {"id": resource_type}, + } + ) print(f"Adding DOI: {doi}") else: print(f"DOI already exists") if not author_url_exists: - related_identifiers.append({ - "relation_type": {"id": "isreferencedby"}, - "identifier": author_url, - "scheme": "url", - "resource_type": {"id": resource_type} - }) + related_identifiers.append( + { + "relation_type": {"id": "isreferencedby"}, + "identifier": author_url, + "scheme": "url", + "resource_type": {"id": resource_type}, + } + ) print(f"Adding CaltechAUTHORS link: {author_url}") else: print(f"CaltechAUTHORS link already exists") @@ -409,7 +427,9 @@ def add_related_identifiers_from_csv(data_rows, token, test=False): record_data["metadata"]["related_identifiers"] = related_identifiers update_response = requests.put( - f"{base_url}/api/records/{record_id}/draft", headers=headers, json=record_data + f"{base_url}/api/records/{record_id}/draft", + headers=headers, + json=record_data, ) if update_response.status_code != 200: print(f"Error updating draft: {update_response.status_code}") @@ -419,7 +439,9 @@ def add_related_identifiers_from_csv(data_rows, token, test=False): f"{base_url}/api/records/{record_id}/draft/actions/publish", headers=headers ) if publish_response.status_code != 202: - print(f"Error publishing record {record_id}: {publish_response.status_code}") + print( + f"Error publishing record {record_id}: {publish_response.status_code}" + ) results.append((record_id, False)) continue @@ -427,6 +449,7 @@ def add_related_identifiers_from_csv(data_rows, token, test=False): results.append((record_id, True)) return results + def process_link_updates(input_csv): # read the CSV file and build a dictionary: record_id -> {"links": [(link, classification), ...]} records_data = {} @@ -438,9 +461,7 @@ def process_link_updates(input_csv): 
classification = row["classification"].strip() if record_id not in records_data: - records_data[record_id] = { - "links": [] - } + records_data[record_id] = {"links": []} records_data[record_id]["links"].append((link, classification)) results = [] @@ -465,11 +486,13 @@ def process_link_updates(input_csv): continue # check existing related identifiers in the record - related_identifiers = metadata.get("metadata", {}).get("related_identifiers", []) + related_identifiers = metadata.get("metadata", {}).get( + "related_identifiers", [] + ) # run check_doi if a "doi" is present among the links doi_check = None - for (lk, ctype) in record_info["links"]: + for lk, ctype in record_info["links"]: if ctype.lower() == "doi": try: doi_check = check_doi(lk, production=True) @@ -478,7 +501,7 @@ def process_link_updates(input_csv): # update related identifiers updated_metadata, updated_flag = update_related_identifiers( - metadata, record_info["links"], source_type="data" + metadata, record_info["links"], source_type="data" ) if updated_flag: # saving to local JSON file for reference diff --git a/run_archives_report.py b/run_archives_report.py index ebd0e69c..0e591fc2 100644 --- a/run_archives_report.py +++ b/run_archives_report.py @@ -139,6 +139,7 @@ def block_fields(): "text_4", ] + def accession_format_report(file_obj, repo, aspace, subject=None, years=None): fields = [ "title", @@ -187,7 +188,9 @@ def accession_report(file_obj, repo, aspace, subject=None, years=None): print(f"subject {subject} not found") exit() print(f"Requesting accessions") - file_obj.writerow(["title","identifier","accession_date","agent"] + block_fields()) + file_obj.writerow( + ["title", "identifier", "accession_date", "agent"] + block_fields() + ) for acc in repo.accessions: for uri in acc.subjects: if search_uri == uri.ref: diff --git a/run_authors_affiliation_enhancement.py b/run_authors_affiliation_enhancement.py index 9789bdfd..663d4ba6 100644 --- a/run_authors_affiliation_enhancement.py +++ b/run_authors_affiliation_enhancement.py @@ -23,13 +23,13 @@ args = parser.parse_args() author_identifier = args.author_identifier -#to_update = [get_metadata('6dmax-vx632',authors=True)] +# to_update = [get_metadata('6dmax-vx632',authors=True)] to_update = get_author_records(author_identifier, token, all_metadata=True) for record in to_update: add_authors_affiliations( - record, - token, - dimensions_key, - allowed_identifiers=ror, - ) + record, + token, + dimensions_key, + allowed_identifiers=ror, + ) diff --git a/run_authors_group_report.py b/run_authors_group_report.py index c91e1aa5..529e1b49 100644 --- a/run_authors_group_report.py +++ b/run_authors_group_report.py @@ -3,15 +3,15 @@ group_identifier = sys.argv[1] -#outfile = open(f"{group_identifier}_report.csv", "w") -#writer = csv.writer(outfile) +# outfile = open(f"{group_identifier}_report.csv", "w") +# writer = csv.writer(outfile) to_update = get_group_records(group_identifier) outfile = open(f"{group_identifier}_report.json", "w") outfile.write(json.dumps(to_update, indent=4)) -#for record in to_update: +# for record in to_update: # if "doi" not in record["pids"]: # metadata = record["metadata"] # publisher = "" diff --git a/run_authors_name_update.py b/run_authors_name_update.py index b1228f36..d1a6414b 100644 --- a/run_authors_name_update.py +++ b/run_authors_name_update.py @@ -20,7 +20,7 @@ old_identifier = args.old_identifier new_identifier = args.new_identifier -to_update = get_author_records(old_identifier,token) +to_update = get_author_records(old_identifier, token) 
for record in to_update: if args.add: edit_author_identifier( diff --git a/run_caltechauthors_harvestors.py b/run_caltechauthors_harvestors.py index dc217f52..3e43d1a5 100644 --- a/run_caltechauthors_harvestors.py +++ b/run_caltechauthors_harvestors.py @@ -6,11 +6,13 @@ extract_https_links, clean_link, extract_filename_from_link, - is_file_present + is_file_present, ) base_url = "https://authors.library.caltech.edu/api/records?q=metadata.additional_descriptions.type.id%3A%22data-availability%22&size=25&sort=bestmatch" -base_file_url_template = "https://authors.library.caltech.edu/api/records/{record_id}/files" +base_file_url_template = ( + "https://authors.library.caltech.edu/api/records/{record_id}/files" +) token = os.environ.get("RDMTOK") @@ -25,7 +27,9 @@ response = requests.get(base_url, headers=headers) if response.status_code != 200: - print(f"Error: Unable to fetch records from the API. Status code: {response.status_code}") + print( + f"Error: Unable to fetch records from the API. Status code: {response.status_code}" + ) exit() records = response.json().get("hits", {}).get("hits", []) @@ -48,14 +52,16 @@ filename = extract_filename_from_link(link) file_present = is_file_present(record_id, filename) - results.append({ - "record_id": record_id, - "original_link": link, - "classification": classification, - "cleaned_link": cleaned, - "filename": filename, - "file_present": file_present - }) + results.append( + { + "record_id": record_id, + "original_link": link, + "classification": classification, + "cleaned_link": cleaned, + "filename": filename, + "file_present": file_present, + } + ) if results: with open(output_file, "w", newline="") as f: diff --git a/run_caltechauthors_matchers.py b/run_caltechauthors_matchers.py index 3d71948c..a5eb361a 100644 --- a/run_caltechauthors_matchers.py +++ b/run_caltechauthors_matchers.py @@ -25,9 +25,7 @@ def main(): classification = row["classification"].strip() if record_id not in records_data: - records_data[record_id] = { - "links": [] - } + records_data[record_id] = {"links": []} records_data[record_id]["links"].append((link, classification)) results = [] @@ -52,11 +50,13 @@ def main(): continue # check existing related identifiers in the record - related_identifiers = metadata.get("metadata", {}).get("related_identifiers", []) + related_identifiers = metadata.get("metadata", {}).get( + "related_identifiers", [] + ) # run check_doi if a "doi" is present among the links doi_check = None - for (lk, ctype) in record_info["links"]: + for lk, ctype in record_info["links"]: if ctype.lower() == "doi": try: doi_check = check_doi(lk, production=True) @@ -65,7 +65,7 @@ def main(): # update related identifiers updated_metadata, updated_flag = update_related_identifiers( - metadata, record_info["links"], source_type="data" + metadata, record_info["links"], source_type="data" ) if updated_flag: # saving to local JSON file for reference @@ -84,5 +84,6 @@ def main(): } ) + if __name__ == "__main__": main() diff --git a/run_harvest_links.py b/run_harvest_links.py index 3d71948c..a5eb361a 100644 --- a/run_harvest_links.py +++ b/run_harvest_links.py @@ -25,9 +25,7 @@ def main(): classification = row["classification"].strip() if record_id not in records_data: - records_data[record_id] = { - "links": [] - } + records_data[record_id] = {"links": []} records_data[record_id]["links"].append((link, classification)) results = [] @@ -52,11 +50,13 @@ def main(): continue # check existing related identifiers in the record - related_identifiers = metadata.get("metadata", 
{}).get("related_identifiers", []) + related_identifiers = metadata.get("metadata", {}).get( + "related_identifiers", [] + ) # run check_doi if a "doi" is present among the links doi_check = None - for (lk, ctype) in record_info["links"]: + for lk, ctype in record_info["links"]: if ctype.lower() == "doi": try: doi_check = check_doi(lk, production=True) @@ -65,7 +65,7 @@ def main(): # update related identifiers updated_metadata, updated_flag = update_related_identifiers( - metadata, record_info["links"], source_type="data" + metadata, record_info["links"], source_type="data" ) if updated_flag: # saving to local JSON file for reference @@ -84,5 +84,6 @@ def main(): } ) + if __name__ == "__main__": main() diff --git a/tests/test_matchers.py b/tests/test_matchers.py index 9e8c5840..bc6d7b24 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -5,38 +5,46 @@ import requests import sys -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from ames.matchers.caltechauthors import add_related_identifiers_from_csv token = "0UrVehnBSM9c7DQZZCM2EtB4lpuEwbTiLue6rf0Vme3lnzswlMA9whjJbmhX" CSV_FILE = "test.csv" + def load_test_data(from_csv=True): if from_csv and os.path.exists(CSV_FILE): - with open(CSV_FILE, 'r') as f: + with open(CSV_FILE, "r") as f: reader = csv.DictReader(f) return list(reader) else: - return [{ - "CaltechAUTHORS_ID": "bwww3-z8y74", - "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{random.randint(1000, 9999)}", - "Related_DOI": "10.22002/D1.1458", - "Data_ID": "3hqgp-jhw61", - "Cross_Link": "No", - "Test_ID": "99s7k-d6f58", - "resource_type": "publication-article" - }] + return [ + { + "CaltechAUTHORS_ID": "bwww3-z8y74", + "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{random.randint(1000, 9999)}", + "Related_DOI": "10.22002/D1.1458", + "Data_ID": "3hqgp-jhw61", + "Cross_Link": "No", + "Test_ID": "99s7k-d6f58", + "resource_type": "publication-article", + } + ] + def verify_related_identifiers_on_site(data_rows, test=False): - base_url = "https://data.caltechlibrary.dev" if test else "https://data.caltechlibrary.caltech.edu" + base_url = ( + "https://data.caltechlibrary.dev" + if test + else "https://data.caltechlibrary.caltech.edu" + ) headers = {"Authorization": f"Bearer {token}"} results = [] for row in data_rows: - record_id = row['Test_ID'] - doi = row['CaltechAUTHORS_DOI'] - caltech_author_id = row['CaltechAUTHORS_ID'] + record_id = row["Test_ID"] + doi = row["CaltechAUTHORS_DOI"] + caltech_author_id = row["CaltechAUTHORS_ID"] author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" r = requests.get(f"{base_url}/api/records/{record_id}", headers=headers) @@ -59,10 +67,13 @@ def verify_related_identifiers_on_site(data_rows, test=False): return results + class TestCaltechDataUploader(unittest.TestCase): def test_add_and_verify_related_identifiers(self): - test_data = load_test_data(from_csv=False) # <-- change this flag to toggle source + test_data = load_test_data( + from_csv=False + ) # <-- change this flag to toggle source upload_results = add_related_identifiers_from_csv(test_data, token, test=True) for record_id, success in upload_results: self.assertTrue(success, f"❌ Upload failed for record {record_id}") From abe8aaad54307cf1a76c5cc24e22c134ef13b43a Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Mon, 19 May 2025 14:51:28 -0700 Subject: [PATCH 05/19] Update test_matchers.py 
--- tests/test_matchers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_matchers.py b/tests/test_matchers.py index bc6d7b24..305aa752 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -5,11 +5,12 @@ import requests import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from ames.matchers.caltechauthors import add_related_identifiers_from_csv -token = "0UrVehnBSM9c7DQZZCM2EtB4lpuEwbTiLue6rf0Vme3lnzswlMA9whjJbmhX" +token = os.environ.get("RDMTOK") CSV_FILE = "test.csv" From b6af8ae111b6f0e6aeb3cf0ec664fd4d4326f6e6 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Mon, 19 May 2025 14:57:02 -0700 Subject: [PATCH 06/19] Update caltechauthors.py --- ames/matchers/caltechauthors.py | 85 +++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index c41dbf9d..d1956bfd 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -520,3 +520,88 @@ def process_link_updates(input_csv): } ) return results + +def add_authors_affiliations(record, token, dimensions_key, allowed_identifiers=None): + # Add dimensions affiliations to a record + + record_id = record["id"] + if "doi" in record["pids"]: + doi = record["pids"]["doi"]["identifier"] + else: + doi = None + if "identifiers" in record["metadata"]: + for idv in record["metadata"]["identifiers"]: + if idv["scheme"] == "doi": + doi = idv["identifier"] + if doi: + endpoint = "https://cris-api.dimensions.ai/v3" + dimcli.login(key=dimensions_key, endpoint=endpoint, verbose=False) + dsl = dimcli.Dsl() + res = dsl.query_iterative( + f""" + search publications + where doi = "{doi}" + return publications[basics+extras+abstract] """, + verbose=False, + ) + publication = res.json["publications"] + update = False + if len(publication) == 1: + publication = publication[0] + dimensions_authors = publication.get("authors", []) + existing_authors = record["metadata"]["creators"] + if len(dimensions_authors) == len(existing_authors): + for position in range(len(dimensions_authors)): + author = existing_authors[position] + dimensions_author = dimensions_authors[position] + if "affiliations" not in author: + affiliations = [] + affiliation_ids = [] + if dimensions_author["affiliations"] not in [[], None]: + for affiliation in dimensions_author["affiliations"]: + affil = {} + if "id" in affiliation: + if affiliation["id"] is not None: + ror = grid_to_ror(affiliation["id"]) + if ror is not None: + if allowed_identifiers is not None: + if ror in allowed_identifiers: + affil["id"] = ror + else: + print( + "ROR %s not in allowed identifiers list" + % ror + ) + else: + print( + "Missing ROR for affiliation %s" + % affiliation["id"] + ) + # We have to manually handle incorrectly mapped JPL + # affiliations + if "raw_affiliation" in affiliation: + raw = affiliation["raw_affiliation"] + affil["name"] = raw + if "91109" in raw: + affil["id"] = "027k65916" + if "Jet Propulsion Laboratory" in raw: + affil["id"] = "027k65916" + if "JPL" in raw: + affil["id"] = "027k65916" + # Some dimensions records don't include id values. 
+ # We ignore those for now + if "id" in affil: + if affil["id"] not in affiliation_ids: + update = True + affiliation_ids.append(affil["id"]) + affiliations.append(affil) + existing_authors[position]["affiliations"] = affiliations + if update: + caltechdata_edit( + record_id, + metadata=record, + token=token, + production=True, + publish=True, + authors=True, + ) From d1afcdf481c153a67f89aabdddcc206dd31f1743 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Mon, 19 May 2025 15:08:39 -0700 Subject: [PATCH 07/19] Update test_matchers.py --- tests/test_matchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_matchers.py b/tests/test_matchers.py index 305aa752..0373dad2 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -33,7 +33,7 @@ def load_test_data(from_csv=True): ] -def verify_related_identifiers_on_site(data_rows, test=False): +def verify_related_identifiers_on_site(data_rows, test=True): base_url = ( "https://data.caltechlibrary.dev" if test From 472967670d204370670407b8c37434a19d4139aa Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP Date: Wed, 4 Jun 2025 09:58:48 +0000 Subject: [PATCH 08/19] Add updated CITATION.cff from codemeta.json file --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 3381ac61..b8693bb4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -21,4 +21,4 @@ keywords: - GitHub - metadata - software -date-released: 2025-05-19 +date-released: 2025-06-04 From c88bf89a40a6f04239f1d018e0ee624940535af5 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:02:28 -0700 Subject: [PATCH 09/19] Update caltechauthors.py --- ames/matchers/caltechauthors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index d1956bfd..14cdc92c 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -4,7 +4,7 @@ from caltechdata_api import caltechdata_edit -# function to get metadata for a record +# function to get metadata for a records def get_record_metadata(record_id): metadata_url = f"https://authors.library.caltech.edu/api/records/{record_id}" headers = {} From 52f4c308709f43c79b692107ed9191037ebb0b1f Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:10:49 -0700 Subject: [PATCH 10/19] Update caltechauthors.py --- ames/matchers/caltechauthors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index 14cdc92c..d1956bfd 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -4,7 +4,7 @@ from caltechdata_api import caltechdata_edit -# function to get metadata for a records +# function to get metadata for a record def get_record_metadata(record_id): metadata_url = f"https://authors.library.caltech.edu/api/records/{record_id}" headers = {} From 0025215c9c4893ac3eb57d59783b82ae85526515 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:42:45 -0700 Subject: [PATCH 11/19] Update test_matchers.py --- tests/test_matchers.py | 82 +++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/tests/test_matchers.py b/tests/test_matchers.py index 
0373dad2..34bcbe18 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -1,10 +1,17 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Extended-logging version of tests/test_matchers.py +Adds prints so you can see where the flow dies. +""" + import os import unittest import csv import random import requests import sys - +from datetime import datetime sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) @@ -13,24 +20,33 @@ token = os.environ.get("RDMTOK") CSV_FILE = "test.csv" +print(f"[{datetime.now().isoformat()}] RDMTOK present? {'YES' if token else 'NO'}") + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- def load_test_data(from_csv=True): + print(f"[loader] from_csv={from_csv}") if from_csv and os.path.exists(CSV_FILE): - with open(CSV_FILE, "r") as f: + print(f"[loader] reading {CSV_FILE}") + with open(CSV_FILE, "r", newline="") as f: reader = csv.DictReader(f) - return list(reader) + rows = list(reader) + print(f"[loader] loaded {len(rows)} rows") + return rows else: - return [ - { - "CaltechAUTHORS_ID": "bwww3-z8y74", - "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{random.randint(1000, 9999)}", - "Related_DOI": "10.22002/D1.1458", - "Data_ID": "3hqgp-jhw61", - "Cross_Link": "No", - "Test_ID": "99s7k-d6f58", - "resource_type": "publication-article", - } - ] + dummy = { + "CaltechAUTHORS_ID": "bwww3-z8y74", + "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{random.randint(1000, 9999)}", + "Related_DOI": "10.22002/D1.1458", + "Data_ID": "3hqgp-jhw61", + "Cross_Link": "No", + "Test_ID": "99s7k-d6f58", + "resource_type": "publication-article", + } + print(f"[loader] generated 1 synthetic row -> DOI {dummy['CaltechAUTHORS_DOI']}") + return [dummy] def verify_related_identifiers_on_site(data_rows, test=True): @@ -48,14 +64,19 @@ def verify_related_identifiers_on_site(data_rows, test=True): caltech_author_id = row["CaltechAUTHORS_ID"] author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" - r = requests.get(f"{base_url}/api/records/{record_id}", headers=headers) + url = f"{base_url}/api/records/{record_id}" + print(f"[verify] GET {url}") + r = requests.get(url, headers=headers) + print(f"[verify] status={r.status_code}") if r.status_code != 200: - print(f"❌ Could not fetch record {record_id}") + print(f"❌ Could not fetch record {record_id}: {r.text[:300]}") results.append((record_id, False)) continue metadata = r.json().get("metadata", {}) related = metadata.get("related_identifiers", []) + print(f"[verify] related_identifiers → {related}") + found_doi = any(x["identifier"] == doi for x in related) found_author = any(x["identifier"] == author_url for x in related) @@ -63,26 +84,31 @@ def verify_related_identifiers_on_site(data_rows, test=True): print(f"✅ Verified: {record_id}") results.append((record_id, True)) else: - print(f"❌ Verification failed: {record_id}") + print(f"❌ Verification failed: {record_id} " + f"(doi={found_doi}, author={found_author})") results.append((record_id, False)) return results +# --------------------------------------------------------------------------- +# Unit-test +# --------------------------------------------------------------------------- + class TestCaltechDataUploader(unittest.TestCase): def test_add_and_verify_related_identifiers(self): - test_data = load_test_data( - from_csv=False - ) # <-- change this flag to toggle source - upload_results = 
add_related_identifiers_from_csv(test_data, token, test=True) - for record_id, success in upload_results: - self.assertTrue(success, f"❌ Upload failed for record {record_id}") + test_data = load_test_data(from_csv=False) # flip flag to change source - verify_results = verify_related_identifiers_on_site(test_data, test=True) - for record_id, success in verify_results: - self.assertTrue(success, f"❌ Verification failed for record {record_id}") + print("[test] calling add_related_identifiers_from_csv ...") + upload_results = add_related_identifiers_from_csv( + test_data, token, test=True + ) + print(f"[test] upload_results → {upload_results}") + for record_id, success in upload_results: + print(f"[test] upload {record_id}: {'OK' if success else 'FAIL'}") + self.assertTrue(success, f"❌ Upload failed for record {record_id}") -if __name__ == "__main__": - unittest.main() + print("[test] verifying on server ...") + verify_results = verify_related_identifier_ From 1e3c2f1282d7590b529350032abe86bf0017cf08 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:44:54 -0700 Subject: [PATCH 12/19] Update test_matchers.py --- tests/test_matchers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_matchers.py b/tests/test_matchers.py index 34bcbe18..a5751c52 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -20,7 +20,8 @@ token = os.environ.get("RDMTOK") CSV_FILE = "test.csv" -print(f"[{datetime.now().isoformat()}] RDMTOK present? {'YES' if token else 'NO'}") +print(f"[debug] RDMTOK present? {'YES' if token else 'NO'} " + f"(len={len(token) if token else 0})") # --------------------------------------------------------------------------- # Helpers From 3ad7bd27553a597811a8c8f411530820811e3118 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:48:44 -0700 Subject: [PATCH 13/19] Update test.yml --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 867667a5..42ab51c0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,5 +30,7 @@ jobs: pip install -r requirements.txt || true - name: Run tests for caltechauthors + env: + RDMTOK: ${{ secrets.RDMTOK }} run: | PYTHONPATH=${{ github.workspace }} python -m unittest discover -s tests -p 'test_matchers.py' From 9300b13c1f6049615e55f4ea806aa7d5f475daf7 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:49:03 -0700 Subject: [PATCH 14/19] Update caltechauthors.py --- ames/matchers/caltechauthors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index d1956bfd..14cdc92c 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -4,7 +4,7 @@ from caltechdata_api import caltechdata_edit -# function to get metadata for a record +# function to get metadata for a records def get_record_metadata(record_id): metadata_url = f"https://authors.library.caltech.edu/api/records/{record_id}" headers = {} From 2f7d245c6c6181700bbcb1d3d142f9e9160e04a9 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:53:01 -0700 Subject: [PATCH 15/19] Update test_matchers.py --- tests/test_matchers.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/tests/test_matchers.py b/tests/test_matchers.py index a5751c52..83752fe7 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -112,4 +112,4 @@ def test_add_and_verify_related_identifiers(self): self.assertTrue(success, f"❌ Upload failed for record {record_id}") print("[test] verifying on server ...") - verify_results = verify_related_identifier_ + verify_results = verify_related_identifiers_on_site(test_data, test=True) From d283a146c1de88af6235f33b04b4206668d9f88f Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 04:00:13 -0700 Subject: [PATCH 16/19] Update test_matchers.py --- tests/test_matchers.py | 120 +++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 71 deletions(-) diff --git a/tests/test_matchers.py b/tests/test_matchers.py index 83752fe7..a04da0f9 100644 --- a/tests/test_matchers.py +++ b/tests/test_matchers.py @@ -1,115 +1,93 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" -Extended-logging version of tests/test_matchers.py -Adds prints so you can see where the flow dies. -""" - -import os -import unittest import csv +import os import random -import requests import sys -from datetime import datetime +import unittest -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +import requests -from ames.matchers.caltechauthors import add_related_identifiers_from_csv +# Ensure the local project package is importable when the repo root is the CWD. +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +from ames.matchers.caltechauthors import add_related_identifiers_from_csv # noqa: E402 -token = os.environ.get("RDMTOK") +TOKEN = os.getenv("RDMTOK") CSV_FILE = "test.csv" -print(f"[debug] RDMTOK present? 
{'YES' if token else 'NO'} " - f"(len={len(token) if token else 0})") +print(f"[init] RDMTOK present: {'YES' if TOKEN else 'NO'} (len={len(TOKEN) if TOKEN else 0})") -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- -def load_test_data(from_csv=True): - print(f"[loader] from_csv={from_csv}") +def load_test_data(from_csv: bool = True): + """Return rows for the upload function, from CSV when available.""" if from_csv and os.path.exists(CSV_FILE): - print(f"[loader] reading {CSV_FILE}") - with open(CSV_FILE, "r", newline="") as f: - reader = csv.DictReader(f) - rows = list(reader) - print(f"[loader] loaded {len(rows)} rows") - return rows - else: - dummy = { + with open(CSV_FILE, newline="") as fh: + return list(csv.DictReader(fh)) + + doi_stub = random.randint(1000, 9999) + return [ + { "CaltechAUTHORS_ID": "bwww3-z8y74", - "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{random.randint(1000, 9999)}", + "CaltechAUTHORS_DOI": f"10.1093/mnras/staa{doi_stub}", "Related_DOI": "10.22002/D1.1458", "Data_ID": "3hqgp-jhw61", "Cross_Link": "No", "Test_ID": "99s7k-d6f58", "resource_type": "publication-article", } - print(f"[loader] generated 1 synthetic row -> DOI {dummy['CaltechAUTHORS_DOI']}") - return [dummy] + ] -def verify_related_identifiers_on_site(data_rows, test=True): - base_url = ( +def verify_related_identifiers_on_site(rows, *, test: bool = True): + """Fetch each record and report which links are present or missing.""" + base = ( "https://data.caltechlibrary.dev" if test else "https://data.caltechlibrary.caltech.edu" ) - headers = {"Authorization": f"Bearer {token}"} + headers = {"Authorization": f"Bearer {TOKEN}"} results = [] - for row in data_rows: + for row in rows: record_id = row["Test_ID"] doi = row["CaltechAUTHORS_DOI"] - caltech_author_id = row["CaltechAUTHORS_ID"] - author_url = f"https://authors.library.caltech.edu/records/{caltech_author_id}" - - url = f"{base_url}/api/records/{record_id}" - print(f"[verify] GET {url}") - r = requests.get(url, headers=headers) - print(f"[verify] status={r.status_code}") - if r.status_code != 200: - print(f"❌ Could not fetch record {record_id}: {r.text[:300]}") + author_link = f"https://authors.library.caltech.edu/records/{row['CaltechAUTHORS_ID']}" + + resp = requests.get(f"{base}/api/records/{record_id}", headers=headers) + print(f"[verify] {record_id}: {resp.status_code}") + if resp.status_code != 200: + print(" Error: could not fetch record from server.") results.append((record_id, False)) continue - metadata = r.json().get("metadata", {}) - related = metadata.get("related_identifiers", []) - print(f"[verify] related_identifiers → {related}") + related = resp.json().get("metadata", {}).get("related_identifiers", []) + has_doi = any(x["identifier"] == doi for x in related) + has_author = any(x["identifier"] == author_link for x in related) - found_doi = any(x["identifier"] == doi for x in related) - found_author = any(x["identifier"] == author_url for x in related) + status_parts = [ + "DOI link present" if has_doi else "DOI link missing", + "CaltechAUTHORS link present" if has_author else "CaltechAUTHORS link missing", + ] + print(" " + "; ".join(status_parts)) - if found_doi and found_author: - print(f"✅ Verified: {record_id}") - results.append((record_id, True)) - else: - print(f"❌ Verification failed: {record_id} " - f"(doi={found_doi}, author={found_author})") - results.append((record_id, False)) + results.append((record_id, 
has_doi and has_author)) return results -# --------------------------------------------------------------------------- -# Unit-test -# --------------------------------------------------------------------------- - class TestCaltechDataUploader(unittest.TestCase): - + @unittest.skipUnless(TOKEN, "needs RDMTOK to hit CaltechDATA API") def test_add_and_verify_related_identifiers(self): - test_data = load_test_data(from_csv=False) # flip flag to change source + rows = load_test_data(from_csv=False) + + uploads = add_related_identifiers_from_csv(rows, TOKEN, test=True) + for record_id, ok in uploads: + self.assertTrue(ok, f"upload failed for {record_id}") - print("[test] calling add_related_identifiers_from_csv ...") - upload_results = add_related_identifiers_from_csv( - test_data, token, test=True - ) - print(f"[test] upload_results → {upload_results}") + verifies = verify_related_identifiers_on_site(rows, test=True) + for record_id, ok in verifies: + self.assertTrue(ok, f"verification failed for {record_id}") - for record_id, success in upload_results: - print(f"[test] upload {record_id}: {'OK' if success else 'FAIL'}") - self.assertTrue(success, f"❌ Upload failed for record {record_id}") - print("[test] verifying on server ...") - verify_results = verify_related_identifiers_on_site(test_data, test=True) +if __name__ == "__main__": + unittest.main(verbosity=2) From 3c6d71739e133af880e9f3f1d278562878017a81 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 09:01:54 -0700 Subject: [PATCH 17/19] Update caltechauthors.py --- ames/matchers/caltechauthors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ames/matchers/caltechauthors.py b/ames/matchers/caltechauthors.py index 14cdc92c..d1956bfd 100644 --- a/ames/matchers/caltechauthors.py +++ b/ames/matchers/caltechauthors.py @@ -4,7 +4,7 @@ from caltechdata_api import caltechdata_edit -# function to get metadata for a records +# function to get metadata for a record def get_record_metadata(record_id): metadata_url = f"https://authors.library.caltech.edu/api/records/{record_id}" headers = {} From ec967339515e7227453a3f7d6e0824e5291d9a22 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Wed, 4 Jun 2025 09:07:03 -0700 Subject: [PATCH 18/19] Update codemeta.json --- codemeta.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/codemeta.json b/codemeta.json index 639447c7..9acd3040 100755 --- a/codemeta.json +++ b/codemeta.json @@ -30,6 +30,17 @@ "email": "rsdoiel@caltech.edu", "@id": "https://orcid.org/0000-0003-0900-6903" }, + { + "@type": "Person", + "givenName": "Rohan", + "familyName": "Bhattarai", + "affiliation": { + "@type": "Organization", + "name": "Caltech" + }, + "email": "rbhattar@caltech.edu", + "@id": "https://orcid.org/0009-0007-0323-4733" + }, { "@type": "Person", "givenName": "Elizabeth", From 638d0aa47c96217c7eb9ae308f3545333ebddf7f Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP Date: Wed, 4 Jun 2025 16:07:21 +0000 Subject: [PATCH 19/19] Add updated CITATION.cff from codemeta.json file --- CITATION.cff | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CITATION.cff b/CITATION.cff index b8693bb4..ff07e63a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -8,6 +8,9 @@ authors: - family-names: Doiel given-names: Robert orcid: https://orcid.org/0000-0003-0900-6903 + - family-names: Bhattarai + given-names: Rohan + orcid: https://orcid.org/0009-0007-0323-4733 - family-names: 
Won
    given-names: Elizabeth
    orcid: https://orcid.org/0009-0002-2450-6471