Commit 9aa595e

Committed by Andrew Yang
Merge commit, 2 parents: c963063 + 7190375

File tree: 16 files changed (+471, -34 lines)

CHANGELOG.md

Lines changed: 14 additions & 6 deletions

@@ -1,21 +1,29 @@
 # Release notes
 
-## Version 3.1.0 – 2022-12-09
+## Version 3.2.0 – 2023-08-**
 
 ### Added
 
-- Compatibility with Python 3.10, 3.9, 3.8
+- CI coverage.
+- New tests for loadData function.
+- loadData function now toggleable. Can return either (a) data read from data blocks or (b) header
+  information stored above the data block.
 
-### Changed
+### Removed
 
-### Deprecated
+- Remove use of pkg_resources (deprecated).
+- No longer use Travis.
+
+## Version 3.1.0 – 2022-12-09
+
+### Added
+
+- Compatibility with Python 3.10, 3.9, 3.8
 
 ### Removed
 
 - Remove the support for Python 3.5, 3.6.
 
-### Fixed
-
 ## Version 3.0.0 -- 2019-03-12
 
 Differences from version 1.2.2.

conda-recipe/run_test.py

Lines changed: 4 additions & 0 deletions

@@ -1,4 +1,8 @@
 #!/usr/bin/env python
 
+import sys
+import pathlib
+sys.path.append((pathlib.Path.cwd().parent.absolute() / "src").as_posix())
+
 import diffpy.utils.tests
 assert diffpy.utils.tests.test().wasSuccessful()

src/diffpy/utils/parsers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -17,9 +17,11 @@
 """
 
 from .loaddata import loadData
+from .serialization import serialize_data, deserialize_data
 from .resample import resample
 
 # silence the pyflakes syntax checker
 assert loadData or resample or True
+assert serialize_data or deserialize_data or True
 
 # End of file

src/diffpy/utils/parsers/custom_exceptions.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+##############################################################################
+#
+# diffpy.utils      by DANSE Diffraction group
+#                   Simon J. L. Billinge
+#                   (c) 2010 The Trustees of Columbia University
+#                   in the City of New York.  All rights reserved.
+#
+# File coded by:
+#
+# See AUTHORS.txt for a list of people who contributed.
+# See LICENSE_DANSE.txt for license information.
+#
+##############################################################################
+
+class UnsupportedTypeError(Exception):
+    """For file types not supported by our parsers.
+
+    supported_types -- list of supported types.
+    file            -- file triggering the error.
+    message         -- for writing a custom message.
+    """
+
+    def __init__(self, file, supported_types=None, message=None):
+        if message is None:
+            self.message = f"The file {file} is not supported."
+            if supported_types is not None:
+                self.message += " Supported file types include: "
+                for t in supported_types:
+                    self.message += t + ", "
+                self.message = self.message[:-2] + "."
+        else:
+            self.message = message  # use the caller-supplied message
+        super().__init__(self.message)
+
+
+class ImproperSizeError(Exception):
+    """When the size of an object does not match expectations.
+
+    bad_object -- object with improper size.
+    message    -- for writing a custom message.
+    """
+
+    def __init__(self, bad_object, message=None):
+        if message is None:
+            self.message = f"The size of {bad_object} is different than expected."
+        else:
+            self.message = message  # use the caller-supplied message
+        super().__init__(self.message)
src/diffpy/utils/parsers/loaddata.py

Lines changed: 40 additions & 27 deletions

@@ -19,27 +19,40 @@
 def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwargs):
     """Find and load data from a text file.
 
-    The data reading starts at the first matrix block of at least minrows rows
-    and constant number of columns. This seems to work for most of the
-    datafiles including those generated by PDFGetX2.
+    The data block is identified as the first matrix block of at least minrows rows
+    and constant number of columns. This seems to work for most of the datafiles
+    including those generated by diffpy programs.
 
-    filename -- name of the file we want to load data from.
-    minrows  -- minimum number of rows in the first data block.
-                All rows must have the same number of floating point values.
-    headers  -- return also a dictionary of parameters specified in header
-    hdel     -- delimiter for parsing header information
-    hignore  -- ignore header rows beginning with any elements in the hignore list
-    usecols  -- zero-based index of columns to be loaded, by default use
-                all detected columns. The reading skips data blocks that
-                do not have the usecols-specified columns.
-    unpack   -- return data as a sequence of columns that allows tuple
-                unpacking such as x, y = loadData(FILENAME, unpack=True).
-                Note transposing the loaded array as loadData(FILENAME).T
-                has the same effect.
-    kwargs   -- keyword arguments that are passed to numpy.loadtxt
+    filename -- name of the file we want to load data from.
+    minrows  -- minimum number of rows in the first data block.
+                All rows must have the same number of floating point values.
+    headers  -- when False (default), the function returns a numpy array of the
+                data in the data block. When True, the function instead returns a
+                dictionary of parameters and their corresponding values parsed from
+                the header (information prior to the data block). See hdel and
+                hignore for options to help with parsing header information.
+    hdel     -- (only used when headers enabled) delimiter for parsing header
+                information (default '='). E.g. using the default hdel, the line
+                'parameter = p_value' is put into the dictionary as
+                {parameter: p_value}.
+    hignore  -- (only used when headers enabled) ignore header rows beginning
+                with any elements in the hignore list. E.g. hignore=['# ', '[']
+                means the following lines are skipped: '# qmax=10', '[defaults]'.
+    kwargs   -- keyword arguments that are passed to numpy.loadtxt, including
+                the arguments below. (See also numpy.loadtxt for more details.)
+    delimiter -- delimiter for the data in the block (default: use whitespace).
+                 For comma-separated data blocks, set delimiter to ','.
+    usecols  -- zero-based index of columns to be loaded, by default use
+                all detected columns. The reading skips data blocks that
+                do not have the usecols-specified columns.
+    unpack   -- return data as a sequence of columns that allows tuple
+                unpacking such as x, y = loadData(FILENAME, unpack=True).
+                Note transposing the loaded array as loadData(FILENAME).T
+                has the same effect.
 
-    Return a numpy array of the data.
-    See also numpy.loadtxt for more details.
+    Return a numpy array of the data (data_block). If headers enabled, instead
+    return a dictionary of parameters read from the header (hdata).
     """
     from numpy import array, loadtxt
     # for storing header data

@@ -124,22 +137,22 @@ def countcolumnsvalues(line):
         # block was found here!
         if nrows >= minrows:
             break
+
+    # Return header data if requested
+    if headers:
+        return hdata  # return here, so we do not proceed to reading the data block
+
     # Return an empty array when no data found.
     # loadtxt would otherwise raise an exception on loading from EOF.
     if start is None:
-        rv = array([], dtype=float)
+        data_block = array([], dtype=float)
     else:
         fid.seek(start)
         # always use usecols argument so that loadtxt does not crash
         # in case of trailing delimiters.
         kwargs.setdefault('usecols', list(range(ncvblock[0])))
-        rv = loadtxt(fid, **kwargs)
-
-    # return headers if requested
-    if headers:
-        return hdata, rv
-    # otherwise do not
-    return rv
+        data_block = loadtxt(fid, **kwargs)
+    return data_block
 
 
 class TextDataLoader(object):
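
A minimal sketch of the new toggle (the file name "measurement.gr" and its
header contents are assumptions for illustration, not part of the commit):

    from diffpy.utils.parsers import loadData

    data_block = loadData("measurement.gr")  # numpy array of the data block
    hdata = loadData("measurement.gr", headers=True, hdel="=", hignore=["# ", "["])
    # hdata is a dict of header parameters, e.g. {'qmax': 10.0, ...}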
src/diffpy/utils/parsers/serialization.py

Lines changed: 151 additions & 0 deletions

@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+##############################################################################
+#
+# diffpy.utils      by DANSE Diffraction group
+#                   Simon J. L. Billinge
+#                   (c) 2010 The Trustees of Columbia University
+#                   in the City of New York.  All rights reserved.
+#
+# File coded by:
+#
+# See AUTHORS.txt for a list of people who contributed.
+# See LICENSE_DANSE.txt for license information.
+#
+##############################################################################
+
+import pathlib
+import json
+
+from .custom_exceptions import UnsupportedTypeError, ImproperSizeError
+
+# FIXME: add support for yaml, xml
+supported_formats = ['.json']
+
+
+def serialize_data(filename, hdata: dict, data_table: list, show_path=True, dt_colnames=None, serial_file=None):
+    """Serialize file data into a dictionary. Can also save the dictionary into a serial language file.
+    The dictionary is formatted as {filename: data}.
+
+    Requires hdata and data_table generated from loadData.
+
+    filename    -- name of the file whose data is being serialized.
+    hdata       -- dictionary of PDF metadata generated by loadData.
+    data_table  -- list storing data parsed by loadData.
+    dt_colnames -- list containing names of each column in data_table. Every name in
+                   dt_colnames will be put into the dictionary as a key with a value
+                   of that column in data_table (stored as a list). Put None for
+                   columns without names. If dt_colnames has fewer non-None entries
+                   than columns in data_table, the pair {'data table': data_table}
+                   will be put in the dictionary. (Default None: only the entry
+                   {'data table': data_table} will be added to the dictionary.)
+    show_path   -- include a path element in the database entry (default True).
+                   If 'path' is not included in hdata, extract the path from filename.
+    serial_file -- serial language file to dump the dictionary into.
+
+    Returns the dictionary loaded from/into the updated database file.
+    """
+
+    # compile data_table and hdata together
+    data = {}
+
+    # handle getting the name of the file for a variety of filename types
+    abs_path = pathlib.Path(filename).resolve()
+    # add path to start of data if requested
+    if show_path and 'path' not in hdata.keys():
+        data.update({'path': abs_path.as_posix()})
+    # title the entry with the name of the file (taken from the end of the path)
+    title = abs_path.name
+
+    # first add named columns in dt_colnames
+    named_columns = 0  # initial value
+    max_columns = 1  # higher than named_columns to trigger 'data table' entry
+    if dt_colnames is not None:
+        num_columns = [len(row) for row in data_table]
+        max_columns = max(num_columns)
+        num_col_names = len(dt_colnames)
+        if max_columns < num_col_names:  # assume numpy.loadtxt gives a non-irregular array
+            raise ImproperSizeError("More entries in dt_colnames than columns in data_table.")
+        named_columns = 0
+        for idx in range(num_col_names):
+            colname = dt_colnames[idx]
+            if colname is not None:
+                data.update({colname: list(data_table[:, idx])})
+                named_columns += 1
+
+    # second add data in the hdata dict
+    data.update(hdata)
+
+    # finally add data_table as an entry named 'data table' if not all columns were parsed
+    if named_columns < max_columns:
+        if 'data table' not in data.keys():
+            data.update({'data table': data_table})
+        else:  # if 'data table' is already a key, keep adding primes to the end
+            dt_name = 'data table'
+            while dt_name in data.keys():
+                dt_name += " prime"
+            data.update({dt_name: data_table})
+
+    # parse name using pathlib and generate dictionary entry
+    entry = {title: data}
+
+    # no save
+    if serial_file is None:
+        return entry
+
+    # saving/updating file
+    # check if supported type
+    sf = pathlib.Path(serial_file)
+    sf_name = sf.name
+    extension = sf.suffix
+    if extension not in supported_formats:
+        raise UnsupportedTypeError(sf_name, supported_formats)
+
+    # new file or update
+    existing = sf.exists()  # check without leaving a file handle open
+
+    # json
+    if extension == '.json':
+        # dump if non-existing
+        if not existing:
+            with open(serial_file, 'w') as jsonfile:
+                file_data = entry  # for return
+                json.dump(file_data, jsonfile, indent=2)
+
+        # update if existing
+        else:
+            with open(serial_file, 'r') as json_read:
+                file_data = json.load(json_read)
+                file_data.update(entry)
+            with open(serial_file, 'w') as json_write:
+                json.dump(file_data, json_write, indent=2)
+
+    return file_data
+
+
+def deserialize_data(filename):
+    """Load a dictionary from a serial file.
+
+    filename -- database file to load from.
+
+    Returns a dictionary of database information.
+    """
+
+    # check if supported type
+    f = pathlib.Path(filename)
+    f_name = f.name
+    extension = f.suffix
+    if extension not in supported_formats:
+        raise UnsupportedTypeError(f_name, supported_formats)
+
+    # json
+    if extension == '.json':
+        with open(filename, 'r') as json_file:
+            j_dict = json.load(json_file)
+
+    return j_dict
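
A possible round trip under the new API (the file and column names here are
invented for the example, and assume loadData returns a two-column array):

    from diffpy.utils.parsers import loadData, serialize_data, deserialize_data

    data_table = loadData("sample.gr")
    hdata = loadData("sample.gr", headers=True)
    # name both columns so they are stored as json-friendly lists
    serialize_data("sample.gr", hdata, data_table,
                   dt_colnames=["r", "gr"], serial_file="db.json")
    db = deserialize_data("db.json")
    # -> {'sample.gr': {'path': ..., 'r': [...], 'gr': [...], ...}}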

src/diffpy/utils/tests/test_loaddata.py

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ def test_loadData_headers(self):
         """
         hignore = ["# ", "// ", "["]  # ignore lines beginning with these strings
         delimiter = ": "  # what our data should be separated by
-        hdata, rv = loadData(loaddatawithheaders, headers=True, hdel=delimiter, hignore=hignore)
+        hdata = loadData(loaddatawithheaders, headers=True, hdel=delimiter, hignore=hignore)
         # only fourteen lines of data are formatted properly
         assert len(hdata) == 14
         # check the following are floats
