Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache and retrieve library_data.json from hed-schemas #985

Merged
merged 1 commit into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 76 additions & 68 deletions hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import json
from hashlib import sha1
from shutil import copyfile
import functools


import re
from semantic_version import Version
import portalocker
import time
from hed.schema.hed_cache_lock import CacheException, CacheLock
from hed.schema.schema_io.schema_util import url_to_file, make_url_request
from pathlib import Path
import urllib
Expand All @@ -32,15 +33,14 @@

DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema"
LIBRARY_HED_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/library_schemas"
LIBRARY_DATA_URL = "https://raw.githubusercontent.com/hed-standard/hed-schemas/main/library_data.json"
DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL,)
DEFAULT_LIBRARY_URL_LIST = (LIBRARY_HED_URL,)


DEFAULT_SKIP_FOLDERS = ('deprecated',)

HED_CACHE_DIRECTORY = os.path.join(Path.home(), '.hedtools/hed_cache/')
TIMESTAMP_FILENAME = "last_update.txt"
CACHE_TIME_THRESHOLD = 300 * 6

# This is the schemas included in the hedtools package.
INSTALLED_CACHE_LOCATION = os.path.realpath(os.path.join(os.path.dirname(__file__), 'schema_data/'))
Expand Down Expand Up @@ -144,13 +144,11 @@ def cache_local_versions(cache_folder):
"""
if not cache_folder:
cache_folder = HED_CACHE_DIRECTORY
os.makedirs(cache_folder, exist_ok=True)

try:
cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
with portalocker.Lock(cache_lock_filename, timeout=1):
_copy_installed_schemas_to_cache(cache_folder)
except portalocker.exceptions.LockException:
with CacheLock(cache_folder, write_time=False):
_copy_installed_folder_to_cache(cache_folder)
except CacheException:
return -1


Expand All @@ -165,33 +163,25 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_
cache_folder (str): The folder holding the cache.

Returns:
float: Returns -1 if cache failed, a positive number meaning time in seconds since last update
if it didn't cache, 0 if it cached successfully this time.
float: Returns -1 if cache failed for any reason, including having been cached too recently.
Returns 0 if it successfully cached this time.

Notes:
- The Default skip_folders is 'deprecated'.
- The HED cache folder defaults to HED_CACHE_DIRECTORY.
- The directories on GitHub are of the form:
https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml
https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema

"""
if not cache_folder:
cache_folder = HED_CACHE_DIRECTORY

if isinstance(hed_base_urls, str):
hed_base_urls = [hed_base_urls]
if isinstance(hed_library_urls, str):
hed_library_urls = [hed_library_urls]
os.makedirs(cache_folder, exist_ok=True)
last_timestamp = _read_last_cached_time(cache_folder)
current_timestamp = time.time()
time_since_update = current_timestamp - last_timestamp
if time_since_update < CACHE_TIME_THRESHOLD:
return time_since_update

try:
cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
with portalocker.Lock(cache_lock_filename, timeout=1):
with CacheLock(cache_folder):
if isinstance(hed_base_urls, str):
hed_base_urls = [hed_base_urls]
if isinstance(hed_library_urls, str):
hed_library_urls = [hed_library_urls]
all_hed_versions = {}
for hed_base_url in hed_base_urls:
new_hed_versions = _get_hed_xml_versions_one_library(hed_base_url)
Expand All @@ -205,60 +195,78 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_
for version, version_info in hed_versions.items():
_cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)

_write_last_cached_time(current_timestamp, cache_folder)
except portalocker.exceptions.LockException or ValueError or URLError:
except CacheException or ValueError or URLError:
return -1

return 0


def _copy_installed_schemas_to_cache(cache_folder):
"""Copies the schemas from the install folder to the cache"""
installed_files = os.listdir(INSTALLED_CACHE_LOCATION)
for install_name in installed_files:
_, basename = os.path.split(install_name)
cache_name = os.path.join(cache_folder, basename)
install_name = os.path.join(INSTALLED_CACHE_LOCATION, basename)
if not os.path.exists(cache_name):
shutil.copy(install_name, cache_name)
@functools.lru_cache(maxsize=50)
def get_library_data(library_name, cache_folder=None):
"""Retrieve the library data for the given library.

Currently, this is just the valid ID range.

def _read_last_cached_time(cache_folder):
""" Check the given cache folder to see when it was last updated.
Parameters:
library_name(str): The schema name. "" for standard schema.
cache_folder(str): The cache folder to use if not using the default.

Parameters:
cache_folder (str): The folder we're caching hed schema in.
Returns:
library_data(dict): The data for a specific library.
"""
if cache_folder is None:
cache_folder = HED_CACHE_DIRECTORY

Returns:
float: The time we last updated the cache. Zero if no update found.
cache_lib_data_folder = os.path.join(cache_folder, "library_data")

"""
timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
local_library_data_filename = os.path.join(cache_lib_data_folder, "library_data.json")
try:
with open(local_library_data_filename) as file:
library_data = json.load(file)
specific_library = library_data[library_name]
return specific_library
except (OSError, CacheException, ValueError, KeyError):
pass

try:
with open(timestamp_filename, "r") as f:
timestamp = float(f.readline())
return timestamp
except FileNotFoundError or ValueError or IOError:
return 0
with CacheLock(cache_lib_data_folder, write_time=False):
_copy_installed_folder_to_cache(cache_lib_data_folder, "library_data")

with open(local_library_data_filename) as file:
library_data = json.load(file)
specific_library = library_data[library_name]
return specific_library
except (OSError, CacheException, ValueError, KeyError):
pass

try:
with CacheLock(cache_lib_data_folder):
# if this fails it'll fail to load in the next step
_cache_specific_url(LIBRARY_DATA_URL, local_library_data_filename)
with open(local_library_data_filename) as file:
library_data = json.load(file)
specific_library = library_data[library_name]
return specific_library
except (OSError, CacheException, ValueError, URLError, KeyError) as e:
pass

def _write_last_cached_time(new_time, cache_folder):
""" Set the time of last cache update.
# This failed to get any data for some reason
return {}

Parameters:
new_time (float): The time this was updated.
cache_folder (str): The folder used for caching the hed schema.

:raises ValueError:
- something went wrong writing to the file
"""
timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
try:
with open(timestamp_filename, "w") as f:
f.write(str(new_time))
except Exception:
raise ValueError("Error writing timestamp to hed cache")
def _copy_installed_folder_to_cache(cache_folder, sub_folder=""):
    """Copy the schema files shipped with the hedtools package into the cache.

    Parameters:
        cache_folder (str): Destination cache folder to copy into.
        sub_folder (str): Optional sub folder of the installed data to copy from.
            "" copies from the root of the installed cache location.
    """
    source_folder = INSTALLED_CACHE_LOCATION
    if sub_folder:
        source_folder = os.path.join(INSTALLED_CACHE_LOCATION, sub_folder)

    # The destination may not exist yet on a fresh install.
    os.makedirs(cache_folder, exist_ok=True)
    # os.listdir already yields bare names, so no os.path.split is needed.
    for basename in os.listdir(source_folder):
        install_name = os.path.join(source_folder, basename)
        cache_name = os.path.join(cache_folder, basename)
        # Copy plain files only; never overwrite an already-cached copy.
        if not os.path.isdir(install_name) and not os.path.exists(cache_name):
            shutil.copy(install_name, cache_name)


def _check_if_url(hed_xml_or_url):
Expand Down Expand Up @@ -435,13 +443,13 @@ def _cache_hed_version(version, library_name, version_info, cache_folder):
return _cache_specific_url(download_url, possible_cache_filename)


def _cache_specific_url(hed_xml_url, cache_filename):
def _cache_specific_url(source_url, cache_filename):
"""Copies a specific url to the cache at the given filename"""
cache_folder = cache_filename.rpartition("/")[0]
os.makedirs(cache_folder, exist_ok=True)
temp_hed_xml_file = url_to_file(hed_xml_url)
if temp_hed_xml_file:
cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
os.remove(temp_hed_xml_file)
temp_filename = url_to_file(source_url)
if temp_filename:
cache_filename = _safe_move_tmp_to_folder(temp_filename, cache_filename)
os.remove(temp_filename)
return cache_filename
return None
90 changes: 90 additions & 0 deletions hed/schema/hed_cache_lock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Support utilities for hed_cache locking"""
import time
import os
import portalocker


TIMESTAMP_FILENAME = "last_update.txt"
CACHE_TIME_THRESHOLD = 300 * 6


class CacheException(Exception):
    """Raised when the cache cannot be locked or was refreshed too recently."""


class CacheLock:
    """Context manager locking a cache folder against concurrent updates.

    Entering creates the folder if needed, enforces the refresh threshold
    (only when ``write_time`` is True, per the documented contract), and
    acquires an exclusive file lock.  Exiting releases the lock and, on a
    clean exit, records the new cache timestamp.
    """
    def __init__(self, cache_folder, write_time=True, time_threshold=CACHE_TIME_THRESHOLD):
        """Constructor for hed locking object.

        Parameters:
            cache_folder(str): The folder to create the lock in (implicitly locking that folder).
            write_time(bool): If True, read and write the cache time. Additionally, won't operate if too recent.
                              Generally False for local operations.
            time_threshold(int): Time in seconds before the cache is allowed to refresh again.
        """
        self.cache_folder = cache_folder
        self.cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
        self.cache_lock = None
        self.timestamp = None
        self.current_timestamp = None
        self.write_time = write_time
        self.time_threshold = time_threshold

    def __enter__(self):
        """Acquire the lock.

        :raises CacheException:
            - The cache was refreshed more recently than time_threshold (timed updates only).
            - The lock file could not be acquired.
        """
        os.makedirs(self.cache_folder, exist_ok=True)
        self.current_timestamp = time.time()
        if self.write_time:
            # Only timed (remote) updates honor the refresh threshold; local
            # copy operations (write_time=False) always proceed, matching the
            # documented write_time contract.
            last_timestamp = _read_last_cached_time(self.cache_folder)
            time_since_update = self.current_timestamp - last_timestamp
            if time_since_update < self.time_threshold:
                raise CacheException(
                    f"Last updated {time_since_update} seconds ago. Threshold is {self.time_threshold}")

        try:
            # portalocker.Lock does not take the lock on construction;
            # acquire() must be called explicitly or no lock is ever held.
            self.cache_lock = portalocker.Lock(self.cache_lock_filename, timeout=1)
            self.cache_lock.acquire()
        except portalocker.exceptions.LockException as exc:
            raise CacheException(f"Could not lock cache using {self.cache_lock_filename}") from exc
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the lock; record the timestamp only if the body succeeded."""
        # Don't stamp the cache as freshly updated when the update failed,
        # otherwise a failed refresh blocks retries for the whole threshold.
        if self.write_time and exc_type is None:
            _write_last_cached_time(self.current_timestamp, self.cache_folder)
        self.cache_lock.release()


def _read_last_cached_time(cache_folder):
    """ Check the given cache folder to see when it was last updated.

    Parameters:
        cache_folder (str): The folder we're caching hed schema in.

    Returns:
        float: The time we last updated the cache. Zero if no update found.

    """
    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)

    try:
        with open(timestamp_filename, "r") as f:
            return float(f.readline())
    except (OSError, ValueError):
        # The original `except A or B or C` only caught the first class, so a
        # corrupt timestamp (ValueError) propagated instead of returning 0.
        # OSError covers both FileNotFoundError and IOError.
        return 0


def _write_last_cached_time(new_time, cache_folder):
    """ Set the time of last cache update.

    Parameters:
        new_time (float): The time this was updated.
        cache_folder (str): The folder used for caching the hed schema.

    :raises ValueError:
        - something went wrong writing to the file
    """
    timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME)
    try:
        with open(timestamp_filename, "w") as f:
            f.write(str(new_time))
    except OSError as exc:
        # Narrowed from a bare `except Exception` (which hid unrelated bugs)
        # and chained so the original I/O failure stays in the traceback.
        raise ValueError("Error writing timestamp to hed cache") from exc
11 changes: 11 additions & 0 deletions hed/schema/schema_data/library_data/library_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"": {
"id_range":[10000, 39999]
},
"score": {
"id_range":[40000, 59999]
},
"lang": {
"id_range":[60000, 79999]
}
}
24 changes: 12 additions & 12 deletions hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,10 @@
from hed.schema import hed_schema_df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line
from hed.schema.hed_cache import get_library_data

library_index_ranges = {
"": (10000, 40000),
"score": (40000, 60000),
"lang": (60000, 80000)
}
UNKNOWN_LIBRARY_VALUE = 9910000

UNKNOWN_LIBRARY_VALUE = 0

object_type_id_offset = {
constants.OBJECT_KEY: (100, 300),
Expand All @@ -39,10 +36,11 @@ def get_library_name_and_id(schema):
library_name(str): The capitalized library name
first_id(int): the first id for a given library
"""
name = schema.library

starting_id, _ = library_index_ranges.get(name, (UNKNOWN_LIBRARY_VALUE, 0))

name = schema.library

library_data = get_library_data(name)
starting_id, _ = library_data.get("id_range", (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE))
if not name:
name = "standard"
return name.capitalize(), starting_id
Expand All @@ -61,9 +59,10 @@ def _get_hedid_range(schema_name, df_key):
if df_key == constants.STRUCT_KEY:
raise NotImplementedError("Cannot assign hed_ids struct section")

if schema_name not in library_index_ranges:
library_data = get_library_data(schema_name)
if not library_data:
return set()
starting_id, ending_id = library_index_ranges[schema_name]
starting_id, ending_id = library_data["id_range"]

start_object_range, end_object_range = object_type_id_offset[df_key]
if df_key == constants.TAG_KEY:
Expand All @@ -73,7 +72,8 @@ def _get_hedid_range(schema_name, df_key):
final_start = starting_id + start_object_range + initial_tag_adj
final_end = starting_id + end_object_range
if end_object_range == -1:
final_end = ending_id
# Add one since the versions on hed-schemas are set to max_value - 1
final_end = ending_id + 1
return set(range(final_start, final_end))


Expand Down
Loading
Loading