From 627e53f3c666d708a062e15716573481c21ab998 Mon Sep 17 00:00:00 2001 From: yucongalicechen Date: Fri, 2 May 2025 19:15:35 -0400 Subject: [PATCH 1/4] feat: add function that returns cif filenames based on chemical formula --- news/get-cif.rst | 23 ++++++++++++++++ src/diffpy/utils/tools.py | 57 +++++++++++++++++++++++++++++++++++++++ tests/test_tools.py | 41 ++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+) create mode 100644 news/get-cif.rst diff --git a/news/get-cif.rst b/news/get-cif.rst new file mode 100644 index 0000000..5ded69e --- /dev/null +++ b/news/get-cif.rst @@ -0,0 +1,23 @@ +**Added:** + +* Function that returns a list of cif filenames based on a given chemical formula. + +**Changed:** + +* + +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* + +**Security:** + +* diff --git a/src/diffpy/utils/tools.py b/src/diffpy/utils/tools.py index 63e10ba..5b6647b 100644 --- a/src/diffpy/utils/tools.py +++ b/src/diffpy/utils/tools.py @@ -4,6 +4,8 @@ from pathlib import Path import numpy as np +import requests +from bs4 import BeautifulSoup from scipy.optimize import dual_annealing from scipy.signal import convolve from xraydb import material_mu @@ -214,6 +216,61 @@ def get_package_info(package_names, metadata=None): return metadata +def fetch_cif_filenames(hill_formula): + """Fetches a list of CIF filenames from the Crystallography Open Database + (COD) based on the given chemical formula in Hill notation, where elements + are separated by whitespace and the count of 1 is omitted (e.g., "Cl Na"). + + Parameters + ---------- + hill_formula : str + The chemical formula in Hill notation. + + Returns + ------- + list of str + A list of CIF filenames (e.g., ["1000041.cif", "2104025.cif"]). + + Raises + ------ + ValueError + If no CIF files are found for the given formula. + + Notes + ----- + The data is retrieved from the Crystallography Open Database (COD). 
+ If you use COD data in your research, + please acknowledge the COD project as described at + https://www.crystallography.net/cod/acknowledgements.html. + """ + search_url = ( + f"https://www.crystallography.net/cod/" + f"result.php?formula={hill_formula}" + ) + response = requests.get(search_url) + if response.status_code != 200: + raise Exception( + f"Failed to retrieve search results. " + f"HTTP status code: {response.status_code}." + ) + cif_links = BeautifulSoup(response.text, "html.parser").find_all("a") + cif_filenames = [] + for link in cif_links: + href = link.get("href", "") + if href.endswith(".cif"): + filename = href.split("/")[-1] + cif_filenames.append(filename) + if len(cif_filenames) == 0: + raise ValueError( + f"No CIF files found for the given formula: {hill_formula}. " + "Please ensure it's in Hill notation (e.g., 'Cl Na'). " + "You can use ``to_hill_notation`` for conversion. " + "If the formula is correct, it is possible that " + "no CIF files are available for this formula in the COD." + ) + return cif_filenames + + def get_density_from_cloud(sample_composition, mp_token=""): """Function to get material density from the MP or COD database. 
diff --git a/tests/test_tools.py b/tests/test_tools.py index 6be3870..dfd8581 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1,6 +1,7 @@ import importlib.metadata import json import os +import re from pathlib import Path import numpy as np @@ -11,6 +12,7 @@ check_and_build_global_config, compute_mu_using_xraydb, compute_mud, + fetch_cif_filenames, get_package_info, get_user_info, ) @@ -270,6 +272,45 @@ def test_get_package_info(monkeypatch, inputs, expected): assert actual_metadata == expected +def test_fetch_cif_filenames(): + actual_cif_filenames = fetch_cif_filenames("Cl Na") + expected_cif_filenames = [ + "1000041.cif", + "2104025.cif", + "2108652.cif", + "2311042.cif", + "4300180.cif", + "4320809.cif", + "7132177.cif", + "9000629.cif", + "9003308.cif", + "9003309.cif", + "9003310.cif", + "9003311.cif", + "9003312.cif", + "9003313.cif", + "9003314.cif", + "9006369.cif", + "9006370.cif", + "9006371.cif", + "9006372.cif", + "9006373.cif", + ] + assert sorted(actual_cif_filenames) == sorted(expected_cif_filenames) + + +def test_fetch_cif_filenames_bad(): + expected_error_msg = ( + "No CIF files found for the given formula: NaCl. " + "Please ensure it's in Hill notation (e.g., 'Cl Na'). " + "You can use ``to_hill_notation`` for conversion. " + "If the formula is correct, it is possible that " + "no CIF files are available for this formula in the COD." 
+ ) + with pytest.raises(ValueError, match=re.escape(expected_error_msg)): + fetch_cif_filenames("NaCl") + + @pytest.mark.parametrize( "inputs", [ From de392e904cf9001eebb9a9103610414e37dda04e Mon Sep 17 00:00:00 2001 From: yucongalicechen Date: Fri, 2 May 2025 19:23:23 -0400 Subject: [PATCH 2/4] add bs4 module in pip and conda.txt --- requirements/conda.txt | 1 + requirements/pip.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements/conda.txt b/requirements/conda.txt index 9de10da..bf17e13 100644 --- a/requirements/conda.txt +++ b/requirements/conda.txt @@ -1,3 +1,4 @@ numpy xraydb scipy +bs4 diff --git a/requirements/pip.txt b/requirements/pip.txt index 9de10da..bf17e13 100644 --- a/requirements/pip.txt +++ b/requirements/pip.txt @@ -1,3 +1,4 @@ numpy xraydb scipy +bs4 From 12443328ff57b8dc3d67773f1cad32f0507f9e88 Mon Sep 17 00:00:00 2001 From: yucongalicechen Date: Fri, 2 May 2025 21:57:04 -0400 Subject: [PATCH 3/4] fix: use api instead of bs4 for data retrieval --- src/diffpy/utils/tools.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/src/diffpy/utils/tools.py b/src/diffpy/utils/tools.py index 5b6647b..799aeda 100644 --- a/src/diffpy/utils/tools.py +++ b/src/diffpy/utils/tools.py @@ -5,7 +5,6 @@ import numpy as np import requests -from bs4 import BeautifulSoup from scipy.optimize import dual_annealing from scipy.signal import convolve from xraydb import material_mu @@ -235,31 +234,17 @@ def fetch_cif_filenames(hill_formula): ------ ValueError If no CIF files are found for the given formula. - - Notes - ----- - The data is retrieved from the Crystallography Open Database (COD). - If you use COD data in your research, - please acknowledge the COD project as described at - https://www.crystallography.net/cod/acknowledgements.html. 
""" - search_url = ( - f"https://www.crystallography.net/cod/" - f"result.php?formula={hill_formula}" - ) - response = requests.get(search_url) + base_url = "https://www.crystallography.net/cod/result.php" + params = {"formula": hill_formula, "format": "json"} + response = requests.get(base_url, params=params, timeout=30) if response.status_code != 200: raise Exception( f"Failed to retrieve search results. " f"HTTP status code: {response.status_code}." ) - cif_links = BeautifulSoup(response.text, "html.parser").find_all("a") - cif_filenames = [] - for link in cif_links: - href = link.get("href", "") - if href.endswith(".cif"): - filename = href.split("/")[-1] - cif_filenames.append(filename) + data = response.json() + cif_filenames = [str(entry["file"]) + ".cif" for entry in data] if len(cif_filenames) == 0: raise ValueError( f"No CIF files found for the given formula: {hill_formula}. " From 4c1521f396874ac985c9fbbad00836ccbc965499 Mon Sep 17 00:00:00 2001 From: yucongalicechen Date: Fri, 2 May 2025 21:59:41 -0400 Subject: [PATCH 4/4] fix: remove bs4 from pip and conda.txt --- requirements/conda.txt | 1 - requirements/pip.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements/conda.txt b/requirements/conda.txt index bf17e13..9de10da 100644 --- a/requirements/conda.txt +++ b/requirements/conda.txt @@ -1,4 +1,3 @@ numpy xraydb scipy -bs4 diff --git a/requirements/pip.txt b/requirements/pip.txt index bf17e13..9de10da 100644 --- a/requirements/pip.txt +++ b/requirements/pip.txt @@ -1,4 +1,3 @@ numpy xraydb scipy -bs4