Skip to content

Commit

Permalink
Cache and retrieve library_data.json from hed-schemas
Browse files Browse the repository at this point in the history
  • Loading branch information
IanCa committed Jul 18, 2024
1 parent 53a4fcf commit d458b78
Show file tree
Hide file tree
Showing 6 changed files with 261 additions and 82 deletions.
144 changes: 76 additions & 68 deletions hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import json
from hashlib import sha1
from shutil import copyfile
import functools


import re
from semantic_version import Version
import portalocker
import time
from hed.schema.hed_cache_lock import CacheException, CacheLock
from hed.schema.schema_io.schema_util import url_to_file, make_url_request
from pathlib import Path
import urllib
Expand All @@ -32,15 +33,14 @@

DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema"
LIBRARY_HED_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/library_schemas"
LIBRARY_DATA_URL = "https://raw.githubusercontent.com/hed-standard/hed-schemas/main/library_data.json"
DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL,)
DEFAULT_LIBRARY_URL_LIST = (LIBRARY_HED_URL,)


DEFAULT_SKIP_FOLDERS = ('deprecated',)

HED_CACHE_DIRECTORY = os.path.join(Path.home(), '.hedtools/hed_cache/')
TIMESTAMP_FILENAME = "last_update.txt"
CACHE_TIME_THRESHOLD = 300 * 6

# This is the schemas included in the hedtools package.
INSTALLED_CACHE_LOCATION = os.path.realpath(os.path.join(os.path.dirname(__file__), 'schema_data/'))
Expand Down Expand Up @@ -144,13 +144,11 @@ def cache_local_versions(cache_folder):
"""
if not cache_folder:
cache_folder = HED_CACHE_DIRECTORY
os.makedirs(cache_folder, exist_ok=True)

try:
cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
with portalocker.Lock(cache_lock_filename, timeout=1):
_copy_installed_schemas_to_cache(cache_folder)
except portalocker.exceptions.LockException:
with CacheLock(cache_folder, write_time=False):
_copy_installed_folder_to_cache(cache_folder)
except CacheException:
return -1


Expand All @@ -165,33 +163,25 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_
cache_folder (str): The folder holding the cache.
Returns:
float: Returns -1 if cache failed, a positive number meaning time in seconds since last update
if it didn't cache, 0 if it cached successfully this time.
float: Returns -1 if cache failed for any reason, including having been cached too recently.
Returns 0 if it successfully cached this time.
Notes:
- The Default skip_folders is 'deprecated'.
- The HED cache folder defaults to HED_CACHE_DIRECTORY.
- The directories on GitHub are of the form:
https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml
https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema
"""
if not cache_folder:
cache_folder = HED_CACHE_DIRECTORY

if isinstance(hed_base_urls, str):
hed_base_urls = [hed_base_urls]
if isinstance(hed_library_urls, str):
hed_library_urls = [hed_library_urls]
os.makedirs(cache_folder, exist_ok=True)
last_timestamp = _read_last_cached_time(cache_folder)
current_timestamp = time.time()
time_since_update = current_timestamp - last_timestamp
if time_since_update < CACHE_TIME_THRESHOLD:
return time_since_update

try:
cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
with portalocker.Lock(cache_lock_filename, timeout=1):
with CacheLock(cache_folder):
if isinstance(hed_base_urls, str):
hed_base_urls = [hed_base_urls]
if isinstance(hed_library_urls, str):
hed_library_urls = [hed_library_urls]
all_hed_versions = {}
for hed_base_url in hed_base_urls:
new_hed_versions = _get_hed_xml_versions_one_library(hed_base_url)
Expand All @@ -205,60 +195,78 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_
for version, version_info in hed_versions.items():
_cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)

_write_last_cached_time(current_timestamp, cache_folder)
except portalocker.exceptions.LockException or ValueError or URLError:
except CacheException or ValueError or URLError:
return -1

return 0


def _copy_installed_schemas_to_cache(cache_folder):
"""Copies the schemas from the install folder to the cache"""
installed_files = os.listdir(INSTALLED_CACHE_LOCATION)
for install_name in installed_files:
_, basename = os.path.split(install_name)
cache_name = os.path.join(cache_folder, basename)
install_name = os.path.join(INSTALLED_CACHE_LOCATION, basename)
if not os.path.exists(cache_name):
shutil.copy(install_name, cache_name)
@functools.lru_cache(maxsize=50)
def get_library_data(library_name, cache_folder=None):
"""Retrieve the library data for the given library.
Currently, this is just the valid ID range.
def _read_last_cached_time(cache_folder):
""" Check the given cache folder to see when it was last updated.
Parameters:
library_name(str): The schema name. "" for standard schema.
cache_folder(str): The cache folder to use if not using the default.
Parameters:
cache_folder (str): The folder we're caching hed schema in.
Returns:
library_data(dict): The data for a specific library.
"""
if cache_folder is None:
cache_folder = HED_CACHE_DIRECTORY

Returns:
float: The time we last updated the cache. Zero if no update found.
cache_lib_data_folder = os.path.join(cache_folder, "library_data")

"""
timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
local_library_data_filename = os.path.join(cache_lib_data_folder, "library_data.json")
try:
with open(local_library_data_filename) as file:
library_data = json.load(file)
specific_library = library_data[library_name]
return specific_library
except (OSError, CacheException, ValueError, KeyError):
pass

try:
with open(timestamp_filename, "r") as f:
timestamp = float(f.readline())
return timestamp
except FileNotFoundError or ValueError or IOError:
return 0
with CacheLock(cache_lib_data_folder, write_time=False):
_copy_installed_folder_to_cache(cache_lib_data_folder, "library_data")

with open(local_library_data_filename) as file:
library_data = json.load(file)
specific_library = library_data[library_name]
return specific_library
except (OSError, CacheException, ValueError, KeyError):
pass

try:
with CacheLock(cache_lib_data_folder):
# if this fails it'll fail to load in the next step
_cache_specific_url(LIBRARY_DATA_URL, local_library_data_filename)
with open(local_library_data_filename) as file:
library_data = json.load(file)
specific_library = library_data[library_name]
return specific_library
except (OSError, CacheException, ValueError, URLError, KeyError) as e:
pass

def _write_last_cached_time(new_time, cache_folder):
""" Set the time of last cache update.
# This failed to get any data for some reason
return {}

Parameters:
new_time (float): The time this was updated.
cache_folder (str): The folder used for caching the hed schema.

:raises ValueError:
- something went wrong writing to the file
"""
timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
try:
with open(timestamp_filename, "w") as f:
f.write(str(new_time))
except Exception:
raise ValueError("Error writing timestamp to hed cache")
def _copy_installed_folder_to_cache(cache_folder, sub_folder=""):
    """Seed the cache folder with the files shipped in the installed package.

    Parameters:
        cache_folder (str): Destination folder for the copies.  It is not created here.
        sub_folder (str): Optional sub folder of INSTALLED_CACHE_LOCATION to copy from.
            The default "" copies from INSTALLED_CACHE_LOCATION itself.

    Notes:
        - Only the direct children of the source folder are considered; directories are skipped.
        - A file that already exists in the cache folder is never overwritten.
    """
    if sub_folder:
        source_folder = os.path.join(INSTALLED_CACHE_LOCATION, sub_folder)
    else:
        source_folder = INSTALLED_CACHE_LOCATION

    for entry in os.listdir(source_folder):
        basename = os.path.basename(entry)
        source_path = os.path.join(source_folder, basename)
        target_path = os.path.join(cache_folder, basename)
        # Skip sub folders and anything the cache already holds.
        if os.path.isdir(source_path) or os.path.exists(target_path):
            continue
        shutil.copy(source_path, target_path)


def _check_if_url(hed_xml_or_url):
Expand Down Expand Up @@ -435,13 +443,13 @@ def _cache_hed_version(version, library_name, version_info, cache_folder):
return _cache_specific_url(download_url, possible_cache_filename)


def _cache_specific_url(hed_xml_url, cache_filename):
def _cache_specific_url(source_url, cache_filename):
"""Copies a specific url to the cache at the given filename"""
cache_folder = cache_filename.rpartition("/")[0]
os.makedirs(cache_folder, exist_ok=True)
temp_hed_xml_file = url_to_file(hed_xml_url)
if temp_hed_xml_file:
cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
os.remove(temp_hed_xml_file)
temp_filename = url_to_file(source_url)
if temp_filename:
cache_filename = _safe_move_tmp_to_folder(temp_filename, cache_filename)
os.remove(temp_filename)
return cache_filename
return None
90 changes: 90 additions & 0 deletions hed/schema/hed_cache_lock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Support utilities for hed_cache locking"""
import time
import os
import portalocker


TIMESTAMP_FILENAME = "last_update.txt"
CACHE_TIME_THRESHOLD = 300 * 6


class CacheException(Exception):
    """Raised when the cache cannot be locked or was updated too recently."""


class CacheLock:
    """Context manager that locks a cache folder so concurrent processes don't update it at the same time.

    On entry the folder is created if needed, the folder's last-update time is compared against the
    refresh threshold, and the lock file "cache_lock.lock" inside the folder is acquired.
    On exit the last-update time is optionally rewritten and the lock is released.
    """

    def __init__(self, cache_folder, write_time=True, time_threshold=CACHE_TIME_THRESHOLD):
        """Constructor for hed locking object

        Parameters:
            cache_folder(str): The folder to create the lock in(implicitly locking that folder)
            write_time(bool): If true, record the time of entry as the folder's last-update time on exit.
                              Generally False for local operations.
                              Note the "updated too recently" check on entry runs regardless of this flag.
            time_threshold(int): Time before cache is allowed to refresh again.
        """
        self.cache_folder = cache_folder
        self.cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
        self.cache_lock = None
        self.timestamp = None
        # Set in __enter__; initialised here so the attribute always exists.
        self.current_timestamp = None
        self.write_time = write_time
        self.time_threshold = time_threshold

    def __enter__(self):
        """Check the refresh threshold and acquire the lock.

        Returns:
            CacheLock: This object.

        :raises CacheException:
            - The folder was updated less than time_threshold seconds ago
            - The lock file could not be acquired within one second
        """
        os.makedirs(self.cache_folder, exist_ok=True)
        last_timestamp = _read_last_cached_time(self.cache_folder)
        self.current_timestamp = time.time()
        time_since_update = self.current_timestamp - last_timestamp
        if time_since_update < self.time_threshold:
            raise CacheException(f"Last updated {time_since_update} seconds ago. Threshold is {self.time_threshold}")

        # Constructing portalocker.Lock does not take the lock; acquire() must be called explicitly.
        cache_lock = portalocker.Lock(self.cache_lock_filename, timeout=1)
        try:
            cache_lock.acquire()
        except portalocker.exceptions.LockException as err:
            raise CacheException(f"Could not lock cache using {self.cache_lock_filename}") from err
        self.cache_lock = cache_lock
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Optionally write the update time, then always release the lock."""
        try:
            if self.write_time:
                _write_last_cached_time(self.current_timestamp, self.cache_folder)
        finally:
            # Release even if writing the timestamp failed, so the lock file is never left held.
            self.cache_lock.release()


def _read_last_cached_time(cache_folder):
    """ Check the given cache folder to see when it was last updated.

    Parameters:
        cache_folder (str): The folder we're caching hed schema in.

    Returns:
        float: The time we last updated the cache.  Zero if no update found
               (timestamp file missing, unreadable, empty, or not a number).
    """
    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)

    try:
        with open(timestamp_filename, "r") as f:
            return float(f.readline())
    except (OSError, ValueError):
        # The exception types must be a tuple: "except A or B" only catches A.
        # OSError covers a missing or unreadable file; ValueError covers empty or corrupt contents.
        return 0


def _write_last_cached_time(new_time, cache_folder):
    """ Record when the cache in the given folder was last updated.

    Parameters:
        new_time (float): The update time to store.
        cache_folder (str): The folder used for caching the hed schema.

    :raises ValueError:
        - The timestamp file could not be written
    """
    stamp_path = os.path.join(cache_folder, TIMESTAMP_FILENAME)
    stamp_text = str(new_time)
    try:
        with open(stamp_path, "w") as stamp_file:
            stamp_file.write(stamp_text)
    except Exception:
        raise ValueError("Error writing timestamp to hed cache")
11 changes: 11 additions & 0 deletions hed/schema/schema_data/library_data/library_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"": {
"id_range":[10000, 39999]
},
"score": {
"id_range":[40000, 59999]
},
"lang": {
"id_range":[60000, 79999]
}
}
24 changes: 12 additions & 12 deletions hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,10 @@
from hed.schema import hed_schema_df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line
from hed.schema.hed_cache import get_library_data

library_index_ranges = {
"": (10000, 40000),
"score": (40000, 60000),
"lang": (60000, 80000)
}
UNKNOWN_LIBRARY_VALUE = 9910000

UNKNOWN_LIBRARY_VALUE = 0

object_type_id_offset = {
constants.OBJECT_KEY: (100, 300),
Expand All @@ -39,10 +36,11 @@ def get_library_name_and_id(schema):
library_name(str): The capitalized library name
first_id(int): the first id for a given library
"""
name = schema.library

starting_id, _ = library_index_ranges.get(name, (UNKNOWN_LIBRARY_VALUE, 0))

name = schema.library

library_data = get_library_data(name)
starting_id, _ = library_data.get("id_range", (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE))
if not name:
name = "standard"
return name.capitalize(), starting_id
Expand All @@ -61,9 +59,10 @@ def _get_hedid_range(schema_name, df_key):
if df_key == constants.STRUCT_KEY:
raise NotImplementedError("Cannot assign hed_ids struct section")

if schema_name not in library_index_ranges:
library_data = get_library_data(schema_name)
if not library_data:
return set()
starting_id, ending_id = library_index_ranges[schema_name]
starting_id, ending_id = library_data["id_range"]

start_object_range, end_object_range = object_type_id_offset[df_key]
if df_key == constants.TAG_KEY:
Expand All @@ -73,7 +72,8 @@ def _get_hedid_range(schema_name, df_key):
final_start = starting_id + start_object_range + initial_tag_adj
final_end = starting_id + end_object_range
if end_object_range == -1:
final_end = ending_id
# Add one since the versions on hed-schemas are set to max_value - 1
final_end = ending_id + 1
return set(range(final_start, final_end))


Expand Down
Loading

0 comments on commit d458b78

Please sign in to comment.