From 98c80f427f472be7f850a736ce2413e3f47b62f2 Mon Sep 17 00:00:00 2001 From: mattjala <124107509+mattjala@users.noreply.github.com> Date: Fri, 7 Jun 2024 04:29:05 -0500 Subject: [PATCH] Multi-Link API (#203) * Delete multiple links * PUT_Links support for group __setitem__ * Multi-link options for group.get() * Support retrieving multiple links by name * Cleanup --------- Co-authored-by: John Readey --- h5pyd/_hl/group.py | 166 ++++++++++++++++++++++++---- h5pyd/_hl/httpconn.py | 10 +- test/hl/test_group.py | 245 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 391 insertions(+), 30 deletions(-) diff --git a/h5pyd/_hl/group.py b/h5pyd/_hl/group.py index 5e2aad9..a24712d 100644 --- a/h5pyd/_hl/group.py +++ b/h5pyd/_hl/group.py @@ -679,11 +679,23 @@ def __getitem__(self, name, track_order=False): tgt._name = name return tgt - def get(self, name, default=None, getclass=False, getlink=False, track_order=False): + def _objectify_link_Json(self, link_json): + if "id" in link_json: + link_obj = HardLink(link_json["id"]) + elif "h5path" in link_json and "h5domain" not in link_json: + link_obj = SoftLink(link_json["h5path"]) + elif "h5path" in link_json and "h5domain" in link_json: + link_obj = ExternalLink(link_json["h5domain"], link_json["h5path"]) + else: + raise ValueError("Invalid link JSON") + + return link_obj + + def get(self, name, default=None, getclass=False, getlink=False, track_order=False, **kwds): """ Retrieve an item or other information. "name" given only: - Return the item, or "default" if it doesn't exist + Return the item with the given name, or "default" if nothing with that name exists "getclass" is True: Return the class of object (Group, Dataset, etc.), or "default" @@ -697,6 +709,21 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Fal Return HardLink, SoftLink and ExternalLink classes. Return "default" if nothing with that name exists. 
+ "limit" is an integer: + If "name" is None, this will return the first "limit" links in the group. + + "marker" is a string: + If "name" is None, this will return only the links that come after the marker in the group's link ordering. + + "pattern" is a string: + If "name" is None, this will return only the links that match the given pattern + in the target group (and subgroups, if follow_links is provided). + Matching is done according to Unix pathname expansion rules. + + "follow_links" is True: + If "name" is None, subgroups of the target group will be recursively searched + for links that match the given names or pattern. + Example: >>> cls = group.get('foo', getclass=True) @@ -709,7 +736,7 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Fal except KeyError: return default - if name not in self: + if not isinstance(name, list) and name is not None and name not in self: return default elif getclass and not getlink: @@ -726,23 +753,80 @@ def get(self, name, default=None, getclass=False, getlink=False, track_order=Fal raise TypeError("Unknown object type") elif getlink: - parent_uuid, link_json = self._get_link_json(name) - typecode = link_json['class'] + if name is None or isinstance(name, list): + # Get all links in target group(s) + # Retrieve "limit", "marker", and "pattern" from kwds + limit = kwds.get("limit", None) + marker = kwds.get("marker", None) + pattern = kwds.get("pattern", None) + follow_links = kwds.get("follow_links", False) + + if name and (limit or marker or pattern or follow_links): + raise ValueError("Cannot specify 'name' along with 'limit', 'marker', 'pattern', or 'follow_links'") + + req = "/groups/" + self.id.uuid + "/links" + params = {} + + if limit: + params["Limit"] = limit + if marker: + params["Marker"] = marker + if pattern: + params["pattern"] = pattern + if follow_links: + params["follow_links"] = 1 + if track_order: + params["CreateOrder"] = 1 + + if name: + body = {} + + titles = 
[linkname.decode('utf-8') if + isinstance(linkname, bytes) else linkname for linkname in name] + body['titles'] = titles + rsp = self.POST(req, body=body, params=params) + else: + rsp = self.GET(req, params=params) + + if "links" in rsp: + # Process list of link objects so they may be accessed by name + links = rsp['links'] + links_out = {} + if all([isUUID(k) for k in links]): + # Multiple groups queried, links are returned under group ids + for group_id in links: + group_links = {} - if typecode == 'H5L_TYPE_SOFT': - if getclass: - return SoftLink + for link in links[group_id]: + group_links[link["title"]] = self._objectify_link_Json(link) - return SoftLink(link_json['h5path']) - elif typecode == 'H5L_TYPE_EXTERNAL': - if getclass: - return ExternalLink + links_out[group_id] = group_links - return ExternalLink(link_json['h5domain'], link_json['h5path']) - elif typecode == 'H5L_TYPE_HARD': - return HardLink if getclass else HardLink() + else: + for link in links: + links_out[link["title"]] = self._objectify_link_Json(link) + else: + raise ValueError("Can't parse server response to links query") + + return links_out else: - raise TypeError("Unknown link type") + parent_uuid, link_json = self._get_link_json(name) + typecode = link_json['class'] + + if typecode == 'H5L_TYPE_SOFT': + if getclass: + return SoftLink + + return SoftLink(link_json['h5path']) + elif typecode == 'H5L_TYPE_EXTERNAL': + if getclass: + return ExternalLink + + return ExternalLink(link_json['h5domain'], link_json['h5path']) + elif typecode == 'H5L_TYPE_HARD': + return HardLink if getclass else HardLink(link_json['id']) + else: + raise TypeError("Unknown link type") def __setitem__(self, name, obj): """ Add an object to the group. The name must not already be in use. @@ -768,7 +852,27 @@ def __setitem__(self, name, obj): values are stored as scalar datasets. Raise ValueError if we can't understand the resulting array dtype. 
""" - if name.find('/') != -1: + if isinstance(name, list) and isinstance(obj, list): + if len(name) != len(obj): + raise ValueError("name and object list lengths do not match") + + links = {} + + for i in range(len(name)): + if isinstance(obj[i], HLObject): + links[name[i]] = {"id": obj[i].id.uuid} + elif isinstance(obj[i], SoftLink): + links[name[i]] = {"h5path": obj[i].path} + elif isinstance(obj[i], ExternalLink): + links[name[i]] = {"h5path": obj[i].path, "h5domain": obj[i].filename} + else: + raise ValueError("only links are supported for multiple object creation") + + body = {"links": links} + req = "/groups/" + self.id.uuid + "/links" + self.PUT(req, body=body) + + elif name.find('/') != -1: parent_path = op.dirname(name) basename = op.basename(name) if not basename: @@ -855,12 +959,20 @@ def __delitem__(self, name): raise IOError("Not found") else: - # delete the link, not an object - req = "/groups/" + self.id.uuid + "/links/" + name + # delete the link(s), not an object + if isinstance(name, list): + # delete multiple links + req = "/groups/" + self.id.uuid + "/links?titles=" + '/'.join(name) + else: + # delete single link + req = "/groups/" + self.id.uuid + "/links/" + name + self.DELETE(req) - if name.find('/') == -1 and name in self._link_db: - # remove from link cache - del self._link_db[name] + + for n in (name if isinstance(name, list) else [name]): + if n.find('/') == -1 and n in self._link_db: + # remove from link cache + del self._link_db[n] def __len__(self): """ Number of members attached to this group """ @@ -1186,8 +1298,16 @@ class HardLink(object): Represents a hard link in an HDF5 file. Provided only so that Group.get works in a sensible way. Has no other function. 
""" + @property + # The uuid of the target object + def id(self): + return self._id + + def __init__(self, id=None): + self._id = id - pass + def __repr__(self): + return f'<HardLink to "{self._id}">' # TODO: implement equality testing for these diff --git a/h5pyd/_hl/httpconn.py b/h5pyd/_hl/httpconn.py index 9fce52d..260ac81 100644 --- a/h5pyd/_hl/httpconn.py +++ b/h5pyd/_hl/httpconn.py @@ -439,9 +439,9 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): check_cache = self._cache is not None and use_cache and format == "json" check_cache = check_cache and params["domain"] == self._domain - - if any(param in params for param in no_cache_params): - check_cache = False + check_cache = check_cache and "select" not in params and "query" not in params + check_cache = check_cache and "follow_links" not in params and "pattern" not in params + check_cache = check_cache and "Limit" not in params and "Marker" not in params if check_cache: self.log.debug("httpcon - checking cache") @@ -453,6 +453,7 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): self.log.info( f"GET: {self._endpoint + req} [{params['domain']}] timeout: {self._timeout}" ) + for k in params: if k != "domain": v = params[k] @@ -467,6 +468,7 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): stream = False else: stream = True + rsp = s.get( self._endpoint + req, params=params, @@ -502,6 +504,8 @@ def GET(self, req, format="json", params=None, headers=None, use_cache=True): add_to_cache = content_type and content_type.startswith("application/json") add_to_cache = add_to_cache and content_length < MAX_CACHE_ITEM_SIZE and not req.endswith("/value") + add_to_cache = add_to_cache and "follow_links" not in params and "pattern" not in params + add_to_cache = add_to_cache and "Limit" not in params and "Marker" not in params if add_to_cache: # add to our _cache diff --git a/test/hl/test_group.py b/test/hl/test_group.py index 99c5b4c..222e7f0 
100644 --- 
a/test/hl/test_group.py +++ b/test/hl/test_group.py @@ -274,10 +274,7 @@ def get_count(grp): for item in grp: count += 1 return count - # create a file for use a link target - if config.get("use_h5py"): - # for some reason this test is failing in Travis - return + # create a file for use as a link target filename = self.getFileName("test_link_removal") print(f"filename: {filename}") @@ -302,6 +299,246 @@ def get_count(grp): f.close() + def test_link_multi_removal(self): + # create a file for use a link target + if config.get("use_h5py"): + return + filename = self.getFileName("test_link_multi_removal") + print(filename) + + f = h5py.File(filename, 'w') + g1 = f.create_group("g1") + g1_clone = f["g1"] + # create multiple subgroups + names = ["subgroup" + str(i) for i in range(10)] + subgrps = [] + for name in names: + subgrps.append(g1.create_group(name)) + + self.assertEqual(len(g1), 10) + + # Remove first 5 subgroups + del g1[names[0:5]] + + self.assertEqual(len(g1), 5) + self.assertEqual(len(g1_clone), 5) + + for name in names[0:5]: + self.assertFalse(name in g1) + self.assertFalse(name in g1_clone) + + for name in names[5:]: + self.assertTrue(name in g1) + self.assertTrue(name in g1_clone) + + # delete links with names that must be URL-encoded + names = ['link with spaces', 'link%', 'unicodeå…«link'] + + for name in names: + g1[name] = g1 + + del g1[names] + + for name in names: + self.assertTrue(name not in g1) + + f.close() + + def test_link_multi_create(self): + if config.get("use_h5py"): + return + + filename = self.getFileName("test_link_multi_create") + print(filename) + + f = h5py.File(filename, 'w') + g1 = f.create_group("g1") + + # Create 10 soft links + num_links = 10 + names = ["link" + str(i) for i in range(num_links)] + links = [] + + for name in names: + new_link = h5py.SoftLink("dummy_path_" + str(name)) + links.append(new_link) + + g1[names] = links + + self.assertEqual(len(g1), num_links) + + for i in range(num_links): + name = names[i] + 
self.assertTrue(name in g1) + self.assertEqual(g1.get(name, getlink=True).path, links[i].path) + + # Create soft and hard links + names = ["link" + str(i) for i in range(num_links, 2 * num_links)] + links = [] + + for i in range(num_links, 2 * num_links): + if i % 2 == 0: + new_link = h5py.SoftLink("dummy_path_" + str(i)) + else: + # Hard link to g1 + new_link = g1 + + links.append(new_link) + + g1[names] = links + + self.assertEqual(len(g1), num_links * 2) + + for i in range(num_links, 2 * num_links): + name = "link" + str(i) + self.assertTrue(name in g1) + + if i % 2 == 0: + link = g1.get(name, getlink=True) + self.assertEqual(link.path, links[i % num_links].path) + else: + g1_clone = g1.get(name) + self.assertEqual(len(g1_clone), len(g1)) + self.assertEqual(g1_clone.id.id, g1.id.id) + + # Create external links + + names = ["link" + str(i) for i in range(num_links * 2, num_links * 3)] + links = [] + + for i in range(num_links * 2, num_links * 3): + filename = "dummy_filename_" + str(i) + path = "dummy_path_" + str(i) + new_link = h5py.ExternalLink(filename=filename, path=path) + links.append(new_link) + + g1[names] = links + + self.assertEqual(len(g1), num_links * 3) + + for i in range(num_links * 2, num_links * 3): + name = "link" + str(i) + self.assertTrue(name in g1) + + link = g1.get(name, getlink=True) + self.assertEqual(link.path, links[i % num_links]._path) + self.assertEqual(link.filename, links[i % num_links]._filename) + + def test_link_get_multi(self): + if config.get("use_h5py"): + return + + filename = self.getFileName("test_link_get_multi") + print(filename) + + f = h5py.File(filename, 'w') + g1 = f.create_group("g1") + + # Create subgroups + g2 = g1.create_group("g2") + g3 = g2.create_group("g3") + + # Create links in each group + + num_links = 20 + names = ["link" + str(i) for i in range(num_links)] + + for name in names: + g1[name] = g1 + g2[name] = g2 + g3[name] = g3 + + # Get all links from g1 only + links_out = g1.get(None, getlink=True) + + 
self.assertEqual(len(links_out), num_links + 1) + + for name in names: + self.assertTrue(name in links_out) + link = links_out[name] + self.assertEqual(link.id, g1.id.uuid) + + # Get all links from g1 and subgroups + links_out = g1.get(None, getlink=True, follow_links=True) + + # 3 groups containing links + self.assertEqual(len(links_out), 3) + + for group_id in [g1.id.uuid, g2.id.uuid, g3.id.uuid]: + self.assertTrue(group_id in links_out) + links = links_out[group_id] + + if group_id == g3.id.uuid: + self.assertEqual(len(links), num_links) + else: + self.assertEqual(len(links), num_links + 1) + + for name in names: + self.assertTrue(name in links) + link = links[name] + self.assertEqual(link.id, group_id) + + # Make sure cache does not erroneously return recursive links + links_out = g1.get(None, getlink=True) + self.assertEqual(len(links_out), num_links + 1) + + # Return only 5 links from group + + links_out = g1.get(None, getlink=True, limit=5) + self.assertEqual(len(links_out), 5) + + self.assertTrue("g2" in links_out) + for name in sorted(names)[0:4]: + self.assertTrue(name in links_out) + link = links_out[name] + self.assertEqual(link.id, g1.id.uuid) + + # Return next 5 links via marker + links_out = g1.get(None, getlink=True, limit=5, marker=sorted(names)[3]) + + self.assertEqual(len(links_out), 5) + + for name in sorted(names)[4:9]: + self.assertTrue(name in links_out) + link = links_out[name] + self.assertEqual(link.id, g1.id.uuid) + + # Return all links in g1 besides g2 + links_out = g1.get(None, getlink=True, pattern="link*") + self.assertEqual(len(links_out), 20) + + for name in names: + if name.startswith("link1"): + self.assertTrue(name in links_out) + link = links_out[name] + self.assertEqual(link.id, g1.id.uuid) + + # Return all links in g1/g2/g3 except for the group links + links_out = g1.get(None, getlink=True, follow_links=True, pattern="link*") + self.assertEqual(len(links_out), 3) + + for group_id in [g1.id.uuid, g2.id.uuid, g3.id.uuid]: + 
self.assertTrue(group_id in links_out) + links = links_out[group_id] + + self.assertEqual(len(links), num_links) + + for name in names: + self.assertTrue(name in links) + link = links[name] + self.assertEqual(link.id, group_id) + + # Retrieve a set of links by name + names = ["link" + str(i) for i in range(5, 15)] + links_out = g1.get(names, getlink=True) + + self.assertEqual(len(links_out), 10) + + for name in names: + self.assertTrue(name in links_out) + link = links_out[name] + self.assertEqual(link.id, g1.id.uuid) + class TestTrackOrder(TestCase):