Merge branch 'loaddata_headers' of https://github.com/Sparks29032/diffpy.utils

Andrew Yang · Andrew Yang · commit c96306335248 · 2023-07-31T12:31:31.000-04:00
diff --git a/src/diffpy/utils/parsers/loaddata.py b/src/diffpy/utils/parsers/loaddata.py
@@ -16,7 +16,7 @@
 import numpy
 
 
-def loadData(filename, minrows=10, **kwargs):
+def loadData(filename, minrows=10, headers=False, hdel='=', hignore=None, **kwargs):
     """Find and load data from a text file.
 
     The data reading starts at the first matrix block of at least minrows rows
@@ -26,6 +26,9 @@ def loadData(filename, minrows=10, **kwargs):
     filename -- name of the file we want to load data from.
     minrows  -- minimum number of rows in the first data block.
                 All rows must have the same number of floating point values.
+    headers  -- if True, return (hdata, rv) with hdata a dict of header parameters
+    hdel     -- delimiter separating a parameter name from its value in a header row
+    hignore  -- list of strings; header rows whose name begins with any of them are ignored
     usecols  -- zero-based index of columns to be loaded, by default use
                 all detected columns.  The reading skips data blocks that
                 do not have the usecols-specified columns.
@@ -39,6 +42,8 @@ def loadData(filename, minrows=10, **kwargs):
     See also numpy.loadtxt for more details.
     """
     from numpy import array, loadtxt
+    # for storing header data
+    hdata = {}
     # determine the arguments
     delimiter = kwargs.get('delimiter')
     usecols = kwargs.get('usecols')
@@ -72,8 +77,39 @@ def countcolumnsvalues(line):
         fpos = (0, 0)
         nrows = 0
         for line in fid:
+            # decode line
+            dline = line.decode()
+            # find header information if requested
+            if headers:
+                hpair = dline.split(hdel)
+                flag = True
+                # ensure number of non-blank arguments is two
+                if len(hpair) != 2:
+                    flag = False
+                else:
+                    # ignore if an argument is blank
+                    hpair[0] = hpair[0].strip()  # name of data entry
+                    hpair[1] = hpair[1].strip()  # value of entry
+                    if not hpair[0] or not hpair[1]:
+                        flag = False
+                    else:
+                        # check if row has an ignore tag
+                        if hignore is not None:
+                            for tag in hignore:
+                                taglen = len(tag)
+                                if len(hpair[0]) >= taglen and hpair[0][:taglen] == tag:
+                                    flag = False
+                # add header data
+                if flag:
+                    name = hpair[0]
+                    value = hpair[1]
+                    # check if data value should be stored as float
+                    if isfloat(hpair[1]):
+                        value = float(hpair[1])
+                    hdata.update({name: value})
+            # continue search for the start of datablock
             fpos = (fpos[1], fpos[1] + len(line))
-            line = line.decode()
+            line = dline
             ncv = countcolumnsvalues(line)
             if ncv < mincv:
                 start = None
@@ -98,6 +134,11 @@ def countcolumnsvalues(line):
             # in case of trailing delimiters.
             kwargs.setdefault('usecols', list(range(ncvblock[0])))
             rv = loadtxt(fid, **kwargs)
+
+    # return headers if requested
+    if headers:
+        return hdata, rv
+    # otherwise do not
     return rv
 
 
@@ -247,4 +288,4 @@ def isfloat(s):
         pass
     return False
 
-# End of file
+# End of file
diff --git a/src/diffpy/utils/tests/test_loaddata.py b/src/diffpy/utils/tests/test_loaddata.py
@@ -9,6 +9,7 @@
 from diffpy.utils.tests.testhelpers import datafile
 
 loaddata01 = datafile('loaddata01.txt')
+loaddatawithheaders = datafile('loaddatawithheaders.txt')
 
 ##############################################################################
 class TestLoadData(unittest.TestCase):
@@ -44,9 +45,28 @@ def test_loadData_1column(self):
         self.assertFalse(numpy.array_equal(d1c, d))
         return
 
+
+    def test_loadData_headers(self):
+        """Check loadData() with the headers option enabled."""
+        # header rows whose name starts with any of these tags are skipped
+        hignore = ["# ", "// ", "["]
+        delimiter = ": "  # separates a header name from its value
+        hdata, rv = loadData(loaddatawithheaders, headers=True, hdel=delimiter, hignore=hignore)
+        # only fourteen lines of data are formatted properly
+        self.assertEqual(len(hdata), 14)
+        # these entries must have been converted to float
+        vfloats = ["wavelength", "qmaxinst", "qmin", "qmax", "bgscale"]
+        for name in vfloats:
+            self.assertIsInstance(hdata.get(name), float)
+        # these entries must NOT be floats; .get() yields None for any absent name
+        vnfloats = ["composition", "rmax", "rmin", "rstep", "rpoly"]
+        for name in vnfloats:
+            self.assertNotIsInstance(hdata.get(name), float)
+
+
 # End of class TestRoutines
 
 if __name__ == '__main__':
     unittest.main()
 
-# End of file
+# End of file
diff --git a/src/diffpy/utils/tests/testdata/loaddatawithheaders.txt b/src/diffpy/utils/tests/testdata/loaddatawithheaders.txt