From d458b780eac858dedb9996a89c4acb591effd403 Mon Sep 17 00:00:00 2001
From: IanCa
Date: Wed, 17 Jul 2024 19:00:15 -0500
Subject: [PATCH] Cache and retrieve library_data.json from hed-schemas

---
 hed/schema/hed_cache.py               | 144 +++++++++---------
 hed/schema/hed_cache_lock.py          |  90 +++++++++++
 .../library_data/library_data.json    |  11 ++
 hed/schema/schema_io/ontology_util.py |  24 +--
 spec_tests/test_hed_cache.py          |  69 ++++++++-
 tests/schema/test_hed_schema_io.py    |   5 +-
 6 files changed, 261 insertions(+), 82 deletions(-)
 create mode 100644 hed/schema/hed_cache_lock.py
 create mode 100644 hed/schema/schema_data/library_data/library_data.json

diff --git a/hed/schema/hed_cache.py b/hed/schema/hed_cache.py
index 5e3941fc..6d829a2f 100644
--- a/hed/schema/hed_cache.py
+++ b/hed/schema/hed_cache.py
@@ -6,11 +6,12 @@
 import json
 from hashlib import sha1
 from shutil import copyfile
+import functools
+
 import re
 from semantic_version import Version
 
-import portalocker
-import time
+from hed.schema.hed_cache_lock import CacheException, CacheLock
 from hed.schema.schema_io.schema_util import url_to_file, make_url_request
 from pathlib import Path
 import urllib
@@ -32,6 +33,7 @@
 DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema"
 LIBRARY_HED_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/library_schemas"
+LIBRARY_DATA_URL = "https://raw.githubusercontent.com/hed-standard/hed-schemas/main/library_data.json"
 
 DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL,)
 DEFAULT_LIBRARY_URL_LIST = (LIBRARY_HED_URL,)
 
@@ -39,8 +41,6 @@
 DEFAULT_SKIP_FOLDERS = ('deprecated',)
 
 HED_CACHE_DIRECTORY = os.path.join(Path.home(), '.hedtools/hed_cache/')
-TIMESTAMP_FILENAME = "last_update.txt"
-CACHE_TIME_THRESHOLD = 300 * 6
 
 # This is the schemas included in the hedtools package.
 INSTALLED_CACHE_LOCATION = os.path.realpath(os.path.join(os.path.dirname(__file__), 'schema_data/'))
@@ -144,13 +144,11 @@ def cache_local_versions(cache_folder):
     """
     if not cache_folder:
         cache_folder = HED_CACHE_DIRECTORY
-    os.makedirs(cache_folder, exist_ok=True)
 
     try:
-        cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
-        with portalocker.Lock(cache_lock_filename, timeout=1):
-            _copy_installed_schemas_to_cache(cache_folder)
-    except portalocker.exceptions.LockException:
+        with CacheLock(cache_folder, write_time=False):
+            _copy_installed_folder_to_cache(cache_folder)
+    except CacheException:
         return -1
 
 
@@ -165,33 +163,25 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_
         cache_folder (str): The folder holding the cache.
 
     Returns:
-        float: Returns -1 if cache failed, a positive number meaning time in seconds since last update
-        if it didn't cache, 0 if it cached successfully this time.
+        int: Returns -1 if cache failed for any reason, including having been cached too recently.
+            Returns 0 if it successfully cached this time.
 
     Notes:
         - The Default skip_folders is 'deprecated'.
        - The HED cache folder defaults to HED_CACHE_DIRECTORY.
        - The directories on GitHub are of the form:
-            https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml
+            https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema
     """
     if not cache_folder:
         cache_folder = HED_CACHE_DIRECTORY
 
-    if isinstance(hed_base_urls, str):
-        hed_base_urls = [hed_base_urls]
-    if isinstance(hed_library_urls, str):
-        hed_library_urls = [hed_library_urls]
-    os.makedirs(cache_folder, exist_ok=True)
-    last_timestamp = _read_last_cached_time(cache_folder)
-    current_timestamp = time.time()
-    time_since_update = current_timestamp - last_timestamp
-    if time_since_update < CACHE_TIME_THRESHOLD:
-        return time_since_update
-
     try:
-        cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
-        with portalocker.Lock(cache_lock_filename, timeout=1):
+        with CacheLock(cache_folder):
+            if isinstance(hed_base_urls, str):
+                hed_base_urls = [hed_base_urls]
+            if isinstance(hed_library_urls, str):
+                hed_library_urls = [hed_library_urls]
             all_hed_versions = {}
             for hed_base_url in hed_base_urls:
                 new_hed_versions = _get_hed_xml_versions_one_library(hed_base_url)
@@ -205,60 +195,78 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_
             for version, version_info in hed_versions.items():
                 _cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)
 
-        _write_last_cached_time(current_timestamp, cache_folder)
-    except portalocker.exceptions.LockException or ValueError or URLError:
+    except (CacheException, ValueError, URLError):
         return -1
 
     return 0
 
 
-def _copy_installed_schemas_to_cache(cache_folder):
-    """Copies the schemas from the install folder to the cache"""
-    installed_files = os.listdir(INSTALLED_CACHE_LOCATION)
-    for install_name in installed_files:
-        _, basename = os.path.split(install_name)
-        cache_name = os.path.join(cache_folder, basename)
-        install_name = os.path.join(INSTALLED_CACHE_LOCATION, basename)
-        if not os.path.exists(cache_name):
-            shutil.copy(install_name, cache_name)
+@functools.lru_cache(maxsize=50)
+def get_library_data(library_name, cache_folder=None):
+    """Retrieve the library data for the given library.
 
+    Currently, this is just the valid ID range.
 
-def _read_last_cached_time(cache_folder):
-    """ Check the given cache folder to see when it was last updated.
+    Parameters:
+        library_name(str): The schema name.  "" for the standard schema.
+        cache_folder(str): The cache folder to use if not using the default.
 
-    Parameters:
-        cache_folder (str): The folder we're caching hed schema in.
+    Returns:
+        library_data(dict): The data for a specific library.
+    """
+    if cache_folder is None:
+        cache_folder = HED_CACHE_DIRECTORY
 
-    Returns:
-        float: The time we last updated the cache.  Zero if no update found.
+    cache_lib_data_folder = os.path.join(cache_folder, "library_data")
 
-    """
-    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
+    local_library_data_filename = os.path.join(cache_lib_data_folder, "library_data.json")
+    try:
+        with open(local_library_data_filename) as file:
+            library_data = json.load(file)
+            specific_library = library_data[library_name]
+            return specific_library
+    except (OSError, CacheException, ValueError, KeyError):
+        pass
 
     try:
-        with open(timestamp_filename, "r") as f:
-            timestamp = float(f.readline())
-            return timestamp
-    except FileNotFoundError or ValueError or IOError:
-        return 0
+        with CacheLock(cache_lib_data_folder, write_time=False):
+            _copy_installed_folder_to_cache(cache_lib_data_folder, "library_data")
+
+        with open(local_library_data_filename) as file:
+            library_data = json.load(file)
+            specific_library = library_data[library_name]
+            return specific_library
+    except (OSError, CacheException, ValueError, KeyError):
+        pass
 
+    try:
+        with CacheLock(cache_lib_data_folder):
+            # If this fails, the load in the next step will also fail.
+            _cache_specific_url(LIBRARY_DATA_URL, local_library_data_filename)
+        with open(local_library_data_filename) as file:
+            library_data = json.load(file)
+            specific_library = library_data[library_name]
+            return specific_library
+    except (OSError, CacheException, ValueError, URLError, KeyError):
+        pass
 
-def _write_last_cached_time(new_time, cache_folder):
-    """ Set the time of last cache update.
+    # This failed to get any data for some reason.
+    return {}
 
-    Parameters:
-        new_time (float): The time this was updated.
-        cache_folder (str): The folder used for caching the hed schema.
 
-    :raises ValueError:
-        - something went wrong writing to the file
-    """
-    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
-    try:
-        with open(timestamp_filename, "w") as f:
-            f.write(str(new_time))
-    except Exception:
-        raise ValueError("Error writing timestamp to hed cache")
+def _copy_installed_folder_to_cache(cache_folder, sub_folder=""):
+    """Copies the files from the installed schema folder (or one of its sub folders) to the cache."""
+    source_folder = INSTALLED_CACHE_LOCATION
+    if sub_folder:
+        source_folder = os.path.join(INSTALLED_CACHE_LOCATION, sub_folder)
+
+    installed_files = os.listdir(source_folder)
+    for install_name in installed_files:
+        _, basename = os.path.split(install_name)
+        cache_name = os.path.join(cache_folder, basename)
+        install_name = os.path.join(source_folder, basename)
+        if not os.path.isdir(install_name) and not os.path.exists(cache_name):
+            shutil.copy(install_name, cache_name)
 
 
 def _check_if_url(hed_xml_or_url):
@@ -435,13 +443,13 @@ def _cache_hed_version(version, library_name, version_info, cache_folder):
         return _cache_specific_url(download_url, possible_cache_filename)
 
 
-def _cache_specific_url(hed_xml_url, cache_filename):
+def _cache_specific_url(source_url, cache_filename):
     """Copies a specific url to the cache at the given filename"""
     cache_folder = cache_filename.rpartition("/")[0]
     os.makedirs(cache_folder, exist_ok=True)
-    temp_hed_xml_file = url_to_file(hed_xml_url)
-    if temp_hed_xml_file:
-        cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
-        os.remove(temp_hed_xml_file)
+    temp_filename = url_to_file(source_url)
+    if temp_filename:
+        cache_filename = _safe_move_tmp_to_folder(temp_filename, cache_filename)
+        os.remove(temp_filename)
         return cache_filename
     return None
diff --git a/hed/schema/hed_cache_lock.py b/hed/schema/hed_cache_lock.py
new file mode 100644
index 00000000..1df223cf
--- /dev/null
+++ b/hed/schema/hed_cache_lock.py
@@ -0,0 +1,90 @@
+"""Support utilities for hed_cache locking"""
+import time
+import os
+import portalocker
+
+
+TIMESTAMP_FILENAME = "last_update.txt"
+CACHE_TIME_THRESHOLD = 300 * 6
+
+
+class CacheException(Exception):
+    """Exception for cache locking or threshold errors."""
+    pass
+
+
+class CacheLock:
+    """Locks the cache folder so it is not modified by two processes at the same time."""
+    def __init__(self, cache_folder, write_time=True, time_threshold=CACHE_TIME_THRESHOLD):
+        """Constructor for the HED cache lock.
+
+        Parameters:
+            cache_folder(str): The folder to create the lock in (implicitly locking that folder).
+            write_time(bool): If True, read and write the cache time.  Additionally, won't operate if too recent.
+                              Generally False for local operations.
+            time_threshold(int): Time in seconds before the cache is allowed to refresh again.
+
+        """
+        self.cache_folder = cache_folder
+        self.cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
+        self.cache_lock = None
+        self.current_timestamp = None
+        self.write_time = write_time
+        self.time_threshold = time_threshold
+
+    def __enter__(self):
+        os.makedirs(self.cache_folder, exist_ok=True)
+        last_timestamp = _read_last_cached_time(self.cache_folder)
+        self.current_timestamp = time.time()
+        time_since_update = self.current_timestamp - last_timestamp
+        if self.write_time and time_since_update < self.time_threshold:
+            raise CacheException(f"Last updated {time_since_update} seconds ago.  Threshold is {self.time_threshold}")
+        try:
+            self.cache_lock = portalocker.Lock(self.cache_lock_filename, timeout=1)
+            self.cache_lock.acquire()
+        except portalocker.exceptions.LockException:
+            raise CacheException(f"Could not lock cache using {self.cache_lock_filename}")
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self.write_time:
+            _write_last_cached_time(self.current_timestamp, self.cache_folder)
+        self.cache_lock.release()
+
+
+def _read_last_cached_time(cache_folder):
+    """ Check the given cache folder to see when it was last updated.
+
+    Parameters:
+        cache_folder (str): The folder we're caching hed schema in.
+
+    Returns:
+        float: The time we last updated the cache.  Zero if no update found.
+
+    """
+    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
+
+    try:
+        with open(timestamp_filename, "r") as f:
+            timestamp = float(f.readline())
+            return timestamp
+    except (FileNotFoundError, ValueError, IOError):
+        return 0
+
+
+def _write_last_cached_time(new_time, cache_folder):
+    """ Set the time of last cache update.
+
+    Parameters:
+        new_time (float): The time this was updated.
+        cache_folder (str): The folder used for caching the hed schema.
+
+    :raises ValueError:
+        - something went wrong writing to the file
+    """
+    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
+    try:
+        with open(timestamp_filename, "w") as f:
+            f.write(str(new_time))
+    except Exception:
+        raise ValueError("Error writing timestamp to hed cache")
diff --git a/hed/schema/schema_data/library_data/library_data.json b/hed/schema/schema_data/library_data/library_data.json
new file mode 100644
index 00000000..3b5ca5e0
--- /dev/null
+++ b/hed/schema/schema_data/library_data/library_data.json
@@ -0,0 +1,11 @@
+{
+    "": {
+        "id_range":[10000, 39999]
+    },
+    "score": {
+        "id_range":[40000, 59999]
+    },
+    "lang": {
+        "id_range":[60000, 79999]
+    }
+}
\ No newline at end of file
diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py
index 898c55a8..59cd34c6 100644
--- a/hed/schema/schema_io/ontology_util.py
+++ b/hed/schema/schema_io/ontology_util.py
@@ -8,13 +8,10 @@
 from hed.schema import hed_schema_df_constants as constants
 from hed.schema.hed_schema_constants import HedKey
 from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line
+from hed.schema.hed_cache import get_library_data
 
-library_index_ranges = {
-    "": (10000, 40000),
-    "score": (40000, 60000),
-    "lang": (60000, 80000)
-}
-UNKNOWN_LIBRARY_VALUE = 9910000
+
+UNKNOWN_LIBRARY_VALUE = 0
 
 object_type_id_offset = {
     constants.OBJECT_KEY: (100, 300),
@@ -39,10 +36,11 @@ def get_library_name_and_id(schema):
         library_name(str): The capitalized library name
         first_id(int): the first id for a given library
     """
-    name = schema.library
-
-    starting_id, _ = library_index_ranges.get(name, (UNKNOWN_LIBRARY_VALUE, 0))
 
+    name = schema.library
+
+    library_data = get_library_data(name)
+    starting_id, _ = library_data.get("id_range", (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE))
     if not name:
         name = "standard"
     return name.capitalize(), starting_id
@@ -61,9 +59,10 @@ def _get_hedid_range(schema_name, df_key):
     if df_key == constants.STRUCT_KEY:
         raise NotImplementedError("Cannot assign hed_ids struct section")
 
-    if schema_name not in library_index_ranges:
+    library_data = get_library_data(schema_name)
+    if not library_data:
         return set()
-    starting_id, ending_id = library_index_ranges[schema_name]
+    starting_id, ending_id = library_data["id_range"]
 
     start_object_range, end_object_range = object_type_id_offset[df_key]
     if df_key == constants.TAG_KEY:
@@ -73,7 +72,8 @@ def _get_hedid_range(schema_name, df_key):
     final_start = starting_id + start_object_range + initial_tag_adj
     final_end = starting_id + end_object_range
     if end_object_range == -1:
-        final_end = ending_id
+        # Add one, since the id ranges on hed-schemas are stored as inclusive maximums (max_value - 1).
+        final_end = ending_id + 1
 
     return set(range(final_start, final_end))
diff --git a/spec_tests/test_hed_cache.py b/spec_tests/test_hed_cache.py
index f9910d15..79ffb83d 100644
--- a/spec_tests/test_hed_cache.py
+++ b/spec_tests/test_hed_cache.py
@@ -41,7 +41,8 @@ def tearDownClass(cls):
 
     def test_cache_again(self):
         time_since_update = hed_cache.cache_xml_versions(cache_folder=self.hed_cache_dir)
-        self.assertGreater(time_since_update, 0)
+        # This should fail to cache, since it was cached too recently.
+        self.assertEqual(time_since_update, -1)
 
 
     def test_get_cache_directory(self):
@@ -91,6 +92,7 @@ def test_find_hed_expression(self):
             final_version = f"HED{version}.xml"
             self.assertFalse(hed_cache.version_pattern.match(final_version))
 
+
 class TestLocal(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -143,6 +145,71 @@ def test_schema_load_schema_version_invalid(self):
             load_schema_version(["8.1.0", "notreallibrary_1.0.0"])
         self.assertEqual(context8.exception.args[0], 'fileNotFound')
 
 
+
+class TestLibraryDataCache(unittest.TestCase):
+    # Verify get_library_data properly caches from the internet and locally
+    @classmethod
+    def setUpClass(cls):
+        hed_cache_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../schema_cache_test_get_library_data/')
+        if os.path.exists(hed_cache_dir) and os.path.isdir(hed_cache_dir):
+            shutil.rmtree(hed_cache_dir)
+        hed_cache.get_library_data.cache_clear()
+        cls.hed_cache_dir = hed_cache_dir
+        cls.saved_cache_folder = hed_cache.HED_CACHE_DIRECTORY
+        schema.set_cache_directory(cls.hed_cache_dir)
+        cls.saved_install_cache = hed_cache.INSTALLED_CACHE_LOCATION
+        cls.empty_source_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../schema_install_empty_local/")
+        if os.path.exists(cls.empty_source_dir) and os.path.isdir(cls.empty_source_dir):
+            shutil.rmtree(cls.empty_source_dir)
+        os.makedirs(cls.empty_source_dir)
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.hed_cache_dir)
+        schema.set_cache_directory(cls.saved_cache_folder)
+        shutil.rmtree(cls.empty_source_dir)
+        hed_cache.INSTALLED_CACHE_LOCATION = cls.saved_install_cache
+        hed_cache.get_library_data.cache_clear()
+
+    def test_local_cache_off(self):
+        hed_cache.get_library_data.cache_clear()
+        shutil.rmtree(self.hed_cache_dir)
+        saved_url = hed_cache.LIBRARY_DATA_URL
+        try:
+            hed_cache.LIBRARY_DATA_URL = ""
+            hed_cache.INSTALLED_CACHE_LOCATION = self.empty_source_dir
+            self.assertEqual(hed_cache.get_library_data(""), {})
+            self.assertEqual(hed_cache.get_library_data("score"), {})
+            self.assertEqual(hed_cache.get_library_data("not_real_library_name"), {})
+        finally:
+            hed_cache.LIBRARY_DATA_URL = saved_url
+            hed_cache.INSTALLED_CACHE_LOCATION = self.saved_install_cache
+
+    def test_local_cache_on(self):
+        hed_cache.get_library_data.cache_clear()
+        shutil.rmtree(self.hed_cache_dir)
+        saved_url = hed_cache.LIBRARY_DATA_URL
+        try:
+            hed_cache.LIBRARY_DATA_URL = ""
+            self.assertTrue(hed_cache.get_library_data(""))
+            self.assertTrue(hed_cache.get_library_data("score"))
+            self.assertEqual(hed_cache.get_library_data("not_real_library_name"), {})
+        finally:
+            hed_cache.LIBRARY_DATA_URL = saved_url
+
+    def test_url_cache(self):
+        hed_cache.get_library_data.cache_clear()
+        shutil.rmtree(self.hed_cache_dir)
+        hed_cache.INSTALLED_CACHE_LOCATION = self.empty_source_dir
+        try:
+            # Use the default LIBRARY_DATA_URL here, so the data must come from the web.
+            self.assertTrue(hed_cache.get_library_data(""))
+            self.assertTrue(hed_cache.get_library_data("score"))
+            self.assertEqual(hed_cache.get_library_data("not_real_library_name"), {})
+        finally:
+            hed_cache.INSTALLED_CACHE_LOCATION = self.saved_install_cache
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py
index b2d33086..9e7fc63f 100644
--- a/tests/schema/test_hed_schema_io.py
+++ b/tests/schema/test_hed_schema_io.py
@@ -230,7 +230,10 @@ def setUpClass(cls):
         cls.source_library_name = "score_1.1.0"
 
         for filename in os.listdir(hed_cache.INSTALLED_CACHE_LOCATION):
-            loaded_schema = schema.load_schema(os.path.join(hed_cache.INSTALLED_CACHE_LOCATION, filename))
+            final_filename = os.path.join(hed_cache.INSTALLED_CACHE_LOCATION, filename)
+            if os.path.isdir(final_filename):
+                continue
+            loaded_schema = schema.load_schema(final_filename)
             loaded_schema.save_as_xml(os.path.join(cls.hed_cache_dir, filename), save_merged=False)
             if filename == f"HED_{cls.source_library_name}.xml":
                 new_filename = f"HED_{cls.dupe_library_name}.xml"
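
Usage sketch (reviewer illustration, not part of the patch): a minimal example of how the new hed_cache.get_library_data() added above is expected to behave, based only on the code and the bundled library_data.json in this diff.

# Illustration only -- not part of the diff.  Assumes a hedtools install with this
# change applied; the expected values follow the bundled library_data.json above.
from hed.schema import hed_cache

# get_library_data() tries the local cache first, then the installed copy of
# library_data.json, then LIBRARY_DATA_URL, and returns {} if every source fails.
score_data = hed_cache.get_library_data("score")
print(score_data.get("id_range"))                      # expected: [40000, 59999]
print(hed_cache.get_library_data("").get("id_range"))  # expected: [10000, 39999] (standard schema)
print(hed_cache.get_library_data("not_a_library"))     # expected: {}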
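
A second sketch (also illustrative, not in the patch) of the CacheLock context manager that replaces the old portalocker/timestamp handling in hed_cache.py:

# Illustration only -- not part of the diff.
from hed.schema import hed_cache
from hed.schema.hed_cache_lock import CacheException, CacheLock

try:
    # With write_time=True (the default), CacheLock also enforces CACHE_TIME_THRESHOLD,
    # so refreshing again within that window raises CacheException instead of re-downloading.
    with CacheLock(hed_cache.HED_CACHE_DIRECTORY):
        pass  # work that updates the cache folder would go here
except CacheException as error:
    print(f"Cache not refreshed: {error}")

# With write_time=False the lock is taken without touching the timestamp,
# mirroring what cache_local_versions() does.
with CacheLock(hed_cache.HED_CACHE_DIRECTORY, write_time=False):
    pass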