From b657862dd19fefe5734086802ba5b528d3c3e3c6 Mon Sep 17 00:00:00 2001 From: Aleksandar Jelenak Date: Sat, 29 Sep 2018 11:03:15 -0400 Subject: [PATCH] Add specifying library version for created HDF5 file --- h5json/hdf5db.py | 178 ++++++++++++++++++------------------ h5json/jsontoh5/jsontoh5.py | 26 ++++-- 2 files changed, 107 insertions(+), 97 deletions(-) diff --git a/h5json/hdf5db.py b/h5json/hdf5db.py index 38c1265..ef546e7 100644 --- a/h5json/hdf5db.py +++ b/h5json/hdf5db.py @@ -16,7 +16,7 @@ if six.PY3: unicode = str - + """ @@ -75,7 +75,7 @@ import json import logging -from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .hdf5dtype import getTypeItem, createDataType, getItemSize # global dictionary to direct back to the Hdf5db instance by filename # (needed for visititems callback) @@ -138,7 +138,7 @@ def getVersionInfo(): def __init__(self, filePath, dbFilePath=None, readonly=False, app_logger=None, root_uuid=None, update_timestamps=True, - userid=None): + userid=None, libver='latest'): if app_logger: self.log = app_logger else: @@ -158,13 +158,13 @@ def __init__(self, filePath, dbFilePath=None, readonly=False, else: mode = 'r+' self.readonly = False - + self.log.info("init -- filePath: " + filePath + " mode: " + mode) self.update_timestamps = update_timestamps - self.f = h5py.File(filePath, mode, libver='latest') + self.f = h5py.File(filePath, mode, libver=libver) self.root_uuid = root_uuid @@ -1338,9 +1338,9 @@ def makeNullTermStringAttribute(self, obj, attr_name, strLength, value): if strLength < len(value): self.log.warning("makeNullTermStringAttribute: value string longer than length") #value = value[:strLength] # truncate to length - - - if six.PY3 and type(attr_name) is str: + + + if six.PY3 and type(attr_name) is str: try: attr_name = attr_name.encode('ascii') except UnicodeDecodeError: @@ -1432,7 +1432,7 @@ def makeAttribute(self, obj, attr_name, shape, attr_type, value): # create numpy array npdata = np.zeros(shape, dtype=dt) - + if rank == 0: npdata[()] = self.toNumPyValue(attr_type, value, npdata[()]) else: @@ -1689,9 +1689,9 @@ def toNumPyValue(self, typeItem, src, des): try: src.encode('ascii') except UnicodeDecodeError: - raise TypeError("non-ascii value not allowed with H5T_CSET_ASCII") + raise TypeError("non-ascii value not allowed with H5T_CSET_ASCII") des = src - + else: msg = "Unexpected type class: " + typeClass self.log.info(msg) @@ -1870,9 +1870,9 @@ def listToRef(self, data): self.log.info(msg) raise IOError(errno.EINVAL, msg) return out - + """ - Convert list that may contain bytes type elements to list of string elements + Convert list that may contain bytes type elements to list of string elements """ def bytesArrayToList(self, data): if type(data) in (bytes, str, unicode): @@ -1886,16 +1886,16 @@ def bytesArrayToList(self, data): else: is_list = False else: - is_list = True + is_list = True elif type(data) in (list, tuple): is_list = True else: is_list = False - + if is_list: out = [] for item in data: - out.append(self.bytesArrayToList(item)) # recursive call + out.append(self.bytesArrayToList(item)) # recursive call elif type(data) is bytes: if six.PY3: out = data.decode("utf-8") @@ -1903,9 +1903,9 @@ def bytesArrayToList(self, data): out = data else: out = data - + return out - + """ Get item description of region reference value """ @@ -2058,7 +2058,7 @@ def createRegionReference(self, item): h5py.h5s.SpaceID.select_hyperslab(space_id, start, count, op=h5py.h5s.SELECT_OR) # now that we've selected the desired region in the space, 
return a region reference - + if six.PY3: dset_name = dset.name.encode('utf-8') else: @@ -2091,12 +2091,12 @@ def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): msg = "only json and binary formats are supported" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if dset is None: msg = "Dataset: " + obj_uuid + " not found" self.log.info(msg) raise IOError(errno.ENXIO, msg) - + values = None dt = dset.dtype typeItem = getTypeItem(dt) @@ -2105,13 +2105,13 @@ def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): msg = "Only JSON is supported for for this data type" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if dset.shape is None: # null space dataset (with h5py 2.6.0) - return None - + return None + rank = len(dset.shape) - + if rank == 0: # check for null dataspace try: @@ -2131,7 +2131,7 @@ def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): msg = "Unexpected error: getDatasetValuesByUuid: number of dims in selection not same as rank" self.log.error(msg) raise IOError(errno.EIO, msg) - + if dt.kind == 'O': if format != "json": msg = "Only JSON is supported for for this data type" @@ -2170,31 +2170,31 @@ def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): values = dset[slices].tobytes() else: values = dset[slices] - + # just use tolist to dump - if format == "json": + if format == "json": values = values.tolist() else: #values = base64.b64encode(dset[slices].tobytes()) values = values.tobytes() - + return values - + """ doDatasetQueryByUuid: return rows based on query string Return rows from a dataset that matches query string. - + Note: Only supported for compound_type/one-dimensional datasets """ def doDatasetQueryByUuid(self, obj_uuid, query, start=0, stop=-1, step=1, limit=None): self.log.info("doQueryByUuid - uuid: " + obj_uuid + " query:" + query) self.log.info("start: " + str(start) + " stop: " + str(stop) + " step: " + str(step) + " limit: " + str(limit)) - dset = self.getDatasetObjByUuid(obj_uuid) + dset = self.getDatasetObjByUuid(obj_uuid) if dset is None: msg = "Dataset: " + obj_uuid + " not found" self.log.info(msg) raise IOError(errno.ENXIO, msg) - + values = [] dt = dset.dtype typeItem = getTypeItem(dt) @@ -2203,33 +2203,33 @@ def doDatasetQueryByUuid(self, obj_uuid, query, start=0, stop=-1, step=1, limit= msg = "Only compound type datasets can be used as query target" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if dset.shape is None: # null space dataset (with h5py 2.6.0) - return None - + return None + rank = len(dset.shape) if rank != 1: msg = "One one-dimensional datasets can be used as query target" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + values = [] indexes = [] count = 0 - + num_elements = dset.shape[0] if stop == -1: stop = num_elements elif stop > num_elements: stop = num_elements block_size = self._getBlockSize(dset) - self.log.info("block_size: " + str(block_size)) - + self.log.info("block_size: " + str(block_size)) + field_names = list(dset.dtype.fields.keys()) - eval_str = self._getEvalStr(query, field_names) - + eval_str = self._getEvalStr(query, field_names) + while start < stop: if limit and (count == limit): break # no more rows for this batch @@ -2248,21 +2248,21 @@ def doDatasetQueryByUuid(self, obj_uuid, query, start=0, stop=-1, step=1, limit= count += 1 if limit and (count == limit): break # no more rows for this batch - + start = end # go to next block - - + + # values = self.getDataValue(item_type, values, 
dimension=1, dims=(len(values),)) - - self.log.info("got " + str(count) + " query matches") + + self.log.info("got " + str(count) + " query matches") return (indexes, values) - + """ _getBlockSize: Get number of rows to read from disk - + heurestic to get reasonable sized chunk of data to fetch. make multiple of chunk_size if possible - """ + """ def _getBlockSize(self, dset): target_block_size = 256 * 1000 if dset.chunks: @@ -2274,12 +2274,12 @@ def _getBlockSize(self, dset): else: block_size = target_block_size return block_size - + """ _getEvalStr: Get eval string for given query - + Gets Eval string to use with numpy where method. - """ + """ def _getEvalStr(self, query, field_names): i = 0 eval_str = "" @@ -2310,7 +2310,7 @@ def _getEvalStr(self, query, field_names): eval_str += "rows['" + var_name + "']" var_name = None var_count += 1 - + if end_quote_char: if ch == end_quote_char: # end of literal @@ -2352,7 +2352,7 @@ def _getEvalStr(self, query, field_names): msg = "Mismatched paren" self.log.info("EINVAL: " + msg) raise IOError(errno.EINVAL, msg) - + return eval_str """ @@ -2365,7 +2365,7 @@ def getDatasetPointSelectionByUuid(self, obj_uuid, points): msg = "Dataset: " + obj_uuid + " not found" self.log.info(msg) raise IOError(errno.ENXIO, msg) - + rank = len(dset.shape) values = np.zeros(len(points), dtype=dset.dtype) try: @@ -2389,22 +2389,22 @@ def getDatasetPointSelectionByUuid(self, obj_uuid, points): """ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): dset = self.getDatasetObjByUuid(obj_uuid) - + if format not in ("json", "binary"): msg = "only json and binary formats are supported" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if format == "binary" and type(data) is not bytes: msg ="data must be of type bytes for binary writing" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if dset is None: msg = "Dataset: " + obj_uuid + " not found" self.log.info(msg) - raise IOError(errno.ENXIO, msg) - + raise IOError(errno.ENXIO, msg) + dt = dset.dtype typeItem = getTypeItem(dt) itemSize = getItemSize(typeItem) @@ -2412,11 +2412,11 @@ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): arraySize = 1 for extent in dset.shape: arraySize *= arraySize - + if itemSize == "H5T_VARIABLE" and format == "binary": msg = "Only JSON is supported for for this data type" self.log.info(msg) - raise IOError(errno.EINVAL, msg) + raise IOError(errno.EINVAL, msg) if slices is None: slices = [] @@ -2425,24 +2425,24 @@ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): s = slice(0, dset.shape[dim], 1) slices.append(s) slices = tuple(slices) - - + + if type(slices) != tuple: msg = "setDatasetValuesByUuid: bad type for dim parameter" self.log.error(msg) raise IOError(erno.EIO, msg) - + if len(slices) != rank: msg = "number of dims in selection not same as rank" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + npoints = 1 np_shape = [] for i in range(rank): s = slices[i] - + if s.start < 0 or s.step <= 0 or s.stop < s.start: msg = "invalid slice specification" self.log.info(msg) @@ -2452,17 +2452,17 @@ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): self.log.info(msg) raise IOError(errno.EINVAL, msg) np_shape.append(s.stop - s.start) - + count = (s.stop - s.start) // s.step if count <= 0: msg = "invalid slice specification" self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - npoints *= count - + raise IOError(errno.EINVAL, msg) + + npoints *= count + np_shape = 
tuple(np_shape) # for comparison with ndarray shape - + self.log.info("selection shape:" + str(np_shape)) @@ -2481,9 +2481,9 @@ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): if format == "binary": msg = "Only JSON is supported for for this data type" self.log.info(msg) - raise IOError(errno.EINVAL, msg) + raise IOError(errno.EINVAL, msg) data = self.listToRef(data) - + if format == "binary": if npoints*itemSize != len(data): msg = "Expected: " + str(npoints*itemSize) + " bytes, but got: " + str(len(data)) @@ -2524,16 +2524,16 @@ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): if selection_extent == 1: np_index += 1 continue # skip singleton selection - + # selection/data mismatch! msg = "data shape doesn't match selection shape" msg += "--data shape: " + str(arr.shape) msg += "--selection shape: " + str(np_shape) - + self.log.info(msg) raise IOError(errno.EINVAL, msg) - - # write temp numpy array to dataset + + # write temp numpy array to dataset if rank == 1: s = slices[0] try: @@ -2558,29 +2558,29 @@ def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): """ def setDatasetValuesByPointSelection(self, obj_uuid, data, points, format="json"): dset = self.getDatasetObjByUuid(obj_uuid) - + if format not in ("json", "binary"): msg = "only json and binary formats are supported" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if format == "binary" and type(data) is not bytes: msg ="data must be of type bytes for binary writing" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + if dset is None: msg = "Dataset: " + obj_uuid + " not found" self.log.info(msg) - raise IOError(errno.ENXIO, msg) - + raise IOError(errno.ENXIO, msg) + dt = dset.dtype typeItem = getTypeItem(dt) itemSize = getItemSize(typeItem) if itemSize == "H5T_VARIABLE" and format == "binary": msg = "Only JSON is supported for for this data type" self.log.info(msg) - raise IOError(errno.EINVAL, msg) + raise IOError(errno.EINVAL, msg) rank = len(dset.shape) @@ -2592,7 +2592,7 @@ def setDatasetValuesByPointSelection(self, obj_uuid, data, points, format="json" #for i in range(len(data)): # converted_data.append(self.toTuple(data[i])) #data = converted_data - + if format == "json": try: @@ -2608,7 +2608,7 @@ def setDatasetValuesByPointSelection(self, obj_uuid, data, points, format="json" msg = "setDatasetValuesByPointSelection, out of range error" self.log.info(msg) raise IOError(errno.EINVAL, msg) - + else: #binary arr = np.fromstring(data, dtype=dset.dtype) @@ -2779,7 +2779,7 @@ def createDataset(self, datatype, datashape, max_shape=None, else: # create the dataset - + try: newDataset = datasets.create_dataset( obj_uuid, shape=datashape, maxshape=max_shape, diff --git a/h5json/jsontoh5/jsontoh5.py b/h5json/jsontoh5/jsontoh5.py index 82707a8..9edc199 100755 --- a/h5json/jsontoh5/jsontoh5.py +++ b/h5json/jsontoh5/jsontoh5.py @@ -13,7 +13,7 @@ if six.PY3: unicode = str - + import sys import json import argparse @@ -232,10 +232,18 @@ def writeFile(self): self.createAttributes() # create attributes for objects self.createLinks() # link it all together + def main(): - parser = argparse.ArgumentParser(usage='%(prog)s [-h] ') - parser.add_argument('in_filename', nargs='+', help='JSon file to be converted to h5') - parser.add_argument('out_filename', nargs='+', help='name of HDF5 output file') + parser = argparse.ArgumentParser( + usage='%(prog)s [-h] ', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('in_filename', 
nargs='+',
+                        help='JSON file to be converted to HDF5')
+    parser.add_argument('out_filename', nargs='+',
+                        help='name of HDF5 output file')
+    parser.add_argument('--libver', default='latest',
+                        choices=['earliest', 'latest'],
+                        help='HDF5 library version to use when storing objects')
     args = parser.parse_args()
 
     # create logger
@@ -249,7 +257,7 @@ def main():
     log.addHandler(handler)
 
     text = open(args.in_filename[0]).read()
-    
+
     # parse the json file
     h5json = json.loads(text)
 
@@ -258,11 +266,12 @@ def main():
     root_uuid = h5json["root"]
 
     filename = args.out_filename[0]
-    
+
     # create the file, will raise IOError if there's a problem
-    Hdf5db.createHDF5File(filename) 
+    Hdf5db.createHDF5File(filename)
 
-    with Hdf5db(filename, root_uuid=root_uuid, update_timestamps=False, app_logger=log) as db:
+    with Hdf5db(filename, root_uuid=root_uuid, update_timestamps=False,
+                app_logger=log, libver=args.libver) as db:
        h5writer = Writeh5(db, h5json)
         h5writer.writeFile()
 
@@ -275,5 +284,6 @@ def main():
 
     print("done!")
 
+
 if __name__ == "__main__":
     main()
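
A note on the new keyword: libver is handed straight to h5py.File, so it
follows h5py's semantics: 'earliest' keeps the created file readable by the
oldest HDF5 library versions, while 'latest' lets the linked library use its
newest on-disk format features. A minimal sketch of the equivalent direct
h5py call (the file name is hypothetical):

    import h5py

    # 'earliest' maximizes backward compatibility of the created file;
    # 'latest' enables the newest format features of the linked HDF5 library.
    with h5py.File("example.h5", "w", libver="earliest") as f:
        f.create_dataset("x", data=[1, 2, 3])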
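
The same conversion can be driven from Python along the path main() takes
above; a sketch assuming the package is importable as h5json and that
"mydata.json" was produced by h5tojson (input and output file names are
hypothetical):

    import json
    import logging

    from h5json.hdf5db import Hdf5db
    from h5json.jsontoh5.jsontoh5 import Writeh5

    log = logging.getLogger("jsontoh5.example")

    with open("mydata.json") as f:
        h5json_doc = json.load(f)

    # create the output file, then populate it with the chosen libver
    Hdf5db.createHDF5File("out.h5")
    with Hdf5db("out.h5", root_uuid=h5json_doc["root"],
                update_timestamps=False, app_logger=log,
                libver="earliest") as db:
        Writeh5(db, h5json_doc).writeFile()

From the command line the equivalent is (running the module directly; an
installed entry point, if any, may be named differently):

    python -m h5json.jsontoh5.jsontoh5 --libver earliest mydata.json out.h5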