Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for caching prerelease schemas. #901

Merged
merged 1 commit into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ class SchemaWarnings:
SCHEMA_NON_PLACEHOLDER_HAS_CLASS = 'SCHEMA_NON_PLACEHOLDER_HAS_CLASS'
SCHEMA_PROLOGUE_CHARACTER_INVALID = "SCHEMA_PROLOGUE_CHARACTER_INVALID"

SCHEMA_PRERELEASE_VERSION_USED = "SCHEMA_PRERELEASE_VERSION_USED"


class SchemaAttributeErrors:
SCHEMA_ATTRIBUTE_INVALID = 'SCHEMA_ATTRIBUTE_INVALID'
Expand Down
5 changes: 5 additions & 0 deletions hed/errors/schema_error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ def schema_error_unknown_attribute(attribute_name, source_tag):
f"or was used outside of it's defined class."


@hed_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED, default_severity=ErrorSeverity.WARNING)
def schema_error_SCHEMA_PRERELEASE_VERSION_USED(current_version, known_versions):
    """ Build the warning text for a schema version that is not a known release.

    Parameters:
        current_version (str): The schema version that was used.
        known_versions (iterable of str): The known versions, joined with ", " in the message.

    Returns:
        str: The formatted warning message.
    """
    known_versions_text = ', '.join(known_versions)
    return (f"Schema version {current_version} used, which is prerelease or unofficial. "
            f"Known versions are: {known_versions_text}")


@hed_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, default_severity=ErrorSeverity.WARNING,
actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID)
def schema_error_invalid_character_prologue(char_index, source_string, section_name):
Expand Down
173 changes: 106 additions & 67 deletions hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,13 @@
HED_XML_PREFIX = 'HED'
HED_XML_EXTENSION = '.xml'
hedxml_suffix = "/hedxml" # The suffix for schema and library schema at the given urls
prerelease_suffix = "/prerelease" # The prerelease schemas at the given URLs

DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml"
DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema"
LIBRARY_HED_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/library_schemas"
DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL, LIBRARY_HED_URL,)
DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL,)
DEFAULT_LIBRARY_URL_LIST = (LIBRARY_HED_URL,)


DEFAULT_SKIP_FOLDERS = ('deprecated',)

Expand Down Expand Up @@ -62,14 +65,15 @@ def get_cache_directory():
return HED_CACHE_DIRECTORY


def get_hed_versions(local_hed_directory=None, library_name=None):
def get_hed_versions(local_hed_directory=None, library_name=None, check_prerelease=False):
""" Get the HED versions in the hed directory.

Parameters:
local_hed_directory (str): Directory to check for versions which defaults to hed_cache.
library_name (str or None): An optional schema library name.
None retrieves the standard schema only.
Pass "all" to retrieve all standard and library schemas as a dict.
check_prerelease (bool): If True, results can include prerelease schemas

Returns:
list or dict: List of version numbers or dictionary {library_name: [versions]}.
Expand All @@ -83,6 +87,8 @@ def get_hed_versions(local_hed_directory=None, library_name=None):

all_hed_versions = {}
local_directory = local_hed_directory
if check_prerelease and not local_directory.endswith(prerelease_suffix):
local_directory += prerelease_suffix
try:
hed_files = os.listdir(local_directory)
except FileNotFoundError:
Expand All @@ -104,26 +110,26 @@ def get_hed_versions(local_hed_directory=None, library_name=None):
return all_hed_versions


def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None, check_prerelease=False):
    """ Get HED XML file path in a directory.  Only returns filenames that exist.

    Parameters:
        xml_version (str): Returns the path for this version if it exists.
        library_name (str or None): Optional schema library name.
        local_hed_directory (str): Path to local hed directory.  Defaults to HED_CACHE_DIRECTORY.
        check_prerelease (bool): If True, resolve the version against the prerelease schemas.
            The returned path then points into the prerelease subfolder.

    Returns:
        str or None: The path to the requested HED version, or None if no version was
            requested or the version is not among those found.

    """
    if not local_hed_directory:
        local_hed_directory = HED_CACHE_DIRECTORY

    hed_versions = get_hed_versions(local_hed_directory, library_name, check_prerelease)
    if not hed_versions or not xml_version:
        return None
    if xml_version in hed_versions:
        return _create_xml_filename(xml_version, library_name, local_hed_directory, check_prerelease)
    return None


def cache_local_versions(cache_folder):
Expand All @@ -148,11 +154,12 @@ def cache_local_versions(cache_folder):
return -1


def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_LIBRARY_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
""" Cache all schemas at the given URLs.

Parameters:
hed_base_urls (str or list): Path or list of paths.
hed_base_urls (str or list): Path or list of paths. These should point to a single folder.
hed_library_urls (str or list): Path or list of paths. These should point to a folder containing library folders.
skip_folders (list): A list of subfolders to skip over when downloading.
cache_folder (str): The folder holding the cache.

Expand All @@ -170,8 +177,10 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP
if not cache_folder:
cache_folder = HED_CACHE_DIRECTORY

if not isinstance(hed_base_urls, (list, tuple)):
if isinstance(hed_base_urls, str):
hed_base_urls = [hed_base_urls]
if isinstance(hed_library_urls, str):
hed_library_urls = [hed_library_urls]
os.makedirs(cache_folder, exist_ok=True)
last_timestamp = _read_last_cached_time(cache_folder)
current_timestamp = time.time()
Expand All @@ -182,12 +191,17 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP
try:
cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
with portalocker.Lock(cache_lock_filename, timeout=1):
all_hed_versions = {}
for hed_base_url in hed_base_urls:
all_hed_versions = _get_hed_xml_versions_from_url(hed_base_url, skip_folders=skip_folders,
get_libraries=True)
for library_name, hed_versions in all_hed_versions.items():
for version, version_info in hed_versions.items():
_cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)
new_hed_versions = _get_hed_xml_versions_one_library(hed_base_url)
_merge_in_versions(all_hed_versions, new_hed_versions)
for hed_library_url in hed_library_urls:
new_hed_versions = _get_hed_xml_versions_from_url_all_libraries(hed_library_url, skip_folders=skip_folders)
_merge_in_versions(all_hed_versions, new_hed_versions)

for library_name, hed_versions in all_hed_versions.items():
for version, version_info in hed_versions.items():
_cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)

_write_last_cached_time(current_timestamp, cache_folder)
except portalocker.exceptions.LockException or ValueError or URLError:
Expand All @@ -196,18 +210,6 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP
return 0


def _cache_specific_url(hed_xml_url, cache_filename):
    """ Copy a specific url to the cache at the given filename.

    Parameters:
        hed_xml_url (str): The url to download.
        cache_filename (str): The destination path for the downloaded file.

    Returns:
        str or None: The filename returned by _safe_move_tmp_to_folder,
            or None if the download did not produce a temporary file.
    """
    # os.path.dirname honors the platform's path separators.  Splitting on "/" yields
    # "" for a path without a forward slash, and os.makedirs("") raises.
    cache_folder = os.path.dirname(cache_filename)
    if cache_folder:
        os.makedirs(cache_folder, exist_ok=True)

    temp_hed_xml_file = url_to_file(hed_xml_url)
    if not temp_hed_xml_file:
        return None

    cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
    os.remove(temp_hed_xml_file)
    return cache_filename


def _copy_installed_schemas_to_cache(cache_folder):
"""Copies the schemas from the install folder to the cache"""
installed_files = os.listdir(INSTALLED_CACHE_LOCATION)
Expand Down Expand Up @@ -264,12 +266,13 @@ def _check_if_url(hed_xml_or_url):
return False


def _create_xml_filename(hed_xml_version, library_name=None, hed_directory=None):
def _create_xml_filename(hed_xml_version, library_name=None, hed_directory=None, prerelease=False):
"""Returns the default file name format for the given version"""
prerelease_prefix = f"prerelease/" if prerelease else ""
if library_name:
hed_xml_basename = f"{HED_XML_PREFIX}_{library_name}_{hed_xml_version}{HED_XML_EXTENSION}"
hed_xml_basename = f"{prerelease_prefix}{HED_XML_PREFIX}_{library_name}_{hed_xml_version}{HED_XML_EXTENSION}"
else:
hed_xml_basename = HED_XML_PREFIX + hed_xml_version + HED_XML_EXTENSION
hed_xml_basename = prerelease_prefix + HED_XML_PREFIX + hed_xml_version + HED_XML_EXTENSION

if hed_directory:
hed_xml_filename = os.path.join(hed_directory, hed_xml_basename)
Expand All @@ -281,15 +284,60 @@ def _sort_version_list(hed_versions):
return sorted(hed_versions, key=Version, reverse=True)


def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False):
def _get_hed_xml_versions_one_folder(hed_folder_url):
    """ Get the schema files listed directly in one folder url.

    Parameters:
        hed_folder_url (str): Url whose response is a JSON list of entries with
            'type', 'name', 'sha' and 'download_url' keys.

    Returns:
        dict: {library_name: {version: (sha, download_url, is_prerelease)}}.
            is_prerelease is True when hed_folder_url ends with prerelease_suffix.

    Notes:
        - Sub-directories are not descended into.
        - library_name and version are groups 2 and 3 of version_pattern
          (presumably library_name is None for the standard schema - confirm against version_pattern).
    """
    response = make_url_request(hed_folder_url)
    folder_listing = json.loads(str(response.read(), 'utf-8'))

    versions_by_library = {}
    for entry in folder_listing:
        if entry['type'] == "dir":
            continue
        name_match = version_pattern.match(entry["name"])
        if name_match is None:
            continue
        schema_version = name_match.group(3)
        library = name_match.group(2)
        is_prerelease = hed_folder_url.endswith(prerelease_suffix)
        versions_by_library.setdefault(library, {})[schema_version] = \
            (entry["sha"], entry["download_url"], is_prerelease)

    return versions_by_library


def _get_hed_xml_versions_one_library(hed_one_library_url):
    """ Get the released and prerelease versions found under one schema folder url.

    Parameters:
        hed_one_library_url (str): Url of a schema folder; the hedxml and prerelease
            suffixes are appended to it to find the two sub-folders.

    Returns:
        dict: {library_name: {version: version_info}}, with each library's versions
            in the order produced by _sort_version_list.
    """
    merged_versions = {}
    # Either sub-folder may be missing; one that cannot be fetched is silently ignored for now.
    for folder_suffix in (hedxml_suffix, prerelease_suffix):
        try:
            folder_versions = _get_hed_xml_versions_one_folder(hed_one_library_url + folder_suffix)
            _merge_in_versions(merged_versions, folder_versions)
        except urllib.error.URLError:
            continue

    return {
        library: {version: versions[version] for version in _sort_version_list(versions)}
        for library, versions in merged_versions.items()
    }


def _get_hed_xml_versions_from_url_all_libraries(hed_base_library_url, library_name=None, skip_folders=DEFAULT_SKIP_FOLDERS):
""" Get all available schemas and their hash values

Parameters:
hed_base_url (str): A single GitHub API url to cache
library_name(str or None): If str, cache only the named library schemas
hed_base_library_url(str): A single GitHub API url to cache, which contains library schema folders
The subfolders should be a schema folder containing hedxml and/or prerelease folders.
library_name(str or None): If str, cache only the named library schemas.
skip_folders (list): A list of sub folders to skip over when downloading.
get_libraries (bool): If True, return a dictionary of version numbers, with an entry for each library name.

Returns:
list or dict: List of version numbers or dictionary {library_name: [versions]}.
Expand All @@ -300,46 +348,25 @@ def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
- The directories on GitHub are of the form:
https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml
"""
url_request = make_url_request(hed_base_url)
url_request = make_url_request(hed_base_library_url)
url_data = str(url_request.read(), 'utf-8')
loaded_json = json.loads(url_data)

all_hed_versions = {}
for file_entry in loaded_json:
if file_entry['type'] == "dir":
if hed_base_url.endswith(hedxml_suffix):
continue
if file_entry['name'] in skip_folders:
continue
try:
sub_folder_versions = \
_get_hed_xml_versions_from_url(hed_base_url + "/" + file_entry['name'] + hedxml_suffix,
skip_folders=skip_folders, get_libraries=True)
except urllib.error.URLError:
# Silently ignore ones without a hedxml section for now.
continue
_merge_in_versions(all_hed_versions, sub_folder_versions)
expression_match = version_pattern.match(file_entry["name"])
if expression_match is not None:
version = expression_match.group(3)
found_library_name = expression_match.group(2)
if not get_libraries and found_library_name != library_name:
found_library_name = file_entry['name']
if library_name is not None and found_library_name != library_name:
continue
if found_library_name not in all_hed_versions:
all_hed_versions[found_library_name] = {}
all_hed_versions[found_library_name][version] = file_entry["sha"], file_entry["download_url"]

ordered_versions = {}
for hed_library_name, hed_versions in all_hed_versions.items():
ordered_versions1 = _sort_version_list(hed_versions)
ordered_versions2 = [(version, hed_versions[version]) for version in ordered_versions1]
ordered_versions[hed_library_name] = dict(ordered_versions2)
single_library_versions = _get_hed_xml_versions_one_library(hed_base_library_url + "/" + found_library_name)
_merge_in_versions(all_hed_versions, single_library_versions)
continue

if get_libraries:
return ordered_versions
if library_name in ordered_versions:
return ordered_versions[library_name]
return {}
if library_name in all_hed_versions:
return all_hed_versions[library_name]
return all_hed_versions


def _merge_in_versions(all_hed_versions, sub_folder_versions):
Expand Down Expand Up @@ -393,12 +420,24 @@ def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename):

def _cache_hed_version(version, library_name, version_info, cache_folder):
    """ Cache the given hed version, downloading it only if the local copy differs.

    Parameters:
        version (str): The schema version to cache.
        library_name (str or None): The library the version belongs to.
        version_info (tuple): (sha_hash, download_url, prerelease).  The older two-item
            form (sha_hash, download_url) is still accepted and treated as not prerelease.
        cache_folder (str): The folder holding the cache.

    Returns:
        str or None: The cached filename if the local hash already matches,
            otherwise whatever _cache_specific_url returns for the download.
    """
    sha_hash, download_url, *extra_info = version_info
    prerelease = bool(extra_info and extra_info[0])

    possible_cache_filename = _create_xml_filename(version, library_name, cache_folder, prerelease)
    local_sha_hash = _calculate_sha1(possible_cache_filename)

    # A matching hash means the cached copy is already current, so skip the download.
    if sha_hash == local_sha_hash:
        return possible_cache_filename

    return _cache_specific_url(download_url, possible_cache_filename)


def _cache_specific_url(hed_xml_url, cache_filename):
    """ Copy a specific url to the cache at the given filename.

    Parameters:
        hed_xml_url (str): The url to download.
        cache_filename (str): The destination path for the downloaded file.

    Returns:
        str or None: The filename returned by _safe_move_tmp_to_folder,
            or None if the download did not produce a temporary file.
    """
    # os.path.dirname honors the platform's path separators.  Splitting on "/" yields
    # "" for a path without a forward slash, and os.makedirs("") raises.
    cache_folder = os.path.dirname(cache_filename)
    if cache_folder:
        os.makedirs(cache_folder, exist_ok=True)

    temp_hed_xml_file = url_to_file(hed_xml_url)
    if not temp_hed_xml_file:
        return None

    cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
    os.remove(temp_hed_xml_file)
    return cache_filename
7 changes: 7 additions & 0 deletions hed/schema/hed_schema_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,15 +220,22 @@ def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None,
f"Must specify a schema version by number, found no version on {xml_version} schema.",
filename=name)
try:
# 1. Try fully local copy
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
if not final_hed_xml_file:
hed_cache.cache_local_versions(xml_folder)
# 2. Cache the schemas included in hedtools and try local again
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
hed_schema = load_schema(final_hed_xml_file, schema=schema, name=name)
except HedFileError as e:
if e.code == HedExceptions.FILE_NOT_FOUND:
# Cache all schemas if we haven't recently.
hed_cache.cache_xml_versions(cache_folder=xml_folder)
# 3. See if we got a copy from online
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
# 4. Finally check for a pre-release one
if not final_hed_xml_file:
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder, check_prerelease=True)
if not final_hed_xml_file:
raise HedFileError(HedExceptions.FILE_NOT_FOUND,
f"HED version '{xml_version}' not found in cache: {hed_cache.get_cache_directory()}",
Expand Down
22 changes: 22 additions & 0 deletions hed/schema/schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
get_allowed_characters_by_name, get_problem_indexes, validate_schema_description_new
from hed.schema.schema_validation_util_deprecated import validate_schema_tag, validate_schema_description, verify_no_brackets
from functools import partial
from hed.schema import hed_cache
from semantic_version import Version


def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handler=None):
Expand Down Expand Up @@ -36,6 +38,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl
name = hed_schema.filename
error_handler.push_error_context(ErrorContext.FILE_NAME, name)

issues_list += validator.check_if_prerelease_version()
issues_list += validator.check_prologue_epilogue()
issues_list += validator.check_invalid_chars()
issues_list += validator.check_attributes()
Expand Down Expand Up @@ -85,6 +88,25 @@ def __init__(self, hed_schema, error_handler):
self.error_handler = error_handler
self._new_character_validation = hed_schema.schema_83_props

def check_if_prerelease_version(self):
    """ Warn about schema versions that are newer than every known version.

    Each (library, version) pair of the schema is checked, followed by the
    with_standard version when it is set.  Each is compared against the
    versions reported by hed_cache.get_hed_versions.

    Returns:
        list: The SCHEMA_PRERELEASE_VERSION_USED issues that were found.
    """
    library_names = self.hed_schema.library.split(",")
    version_numbers = self.hed_schema.version_number.split(",")
    versions_to_check = list(zip(library_names, version_numbers))
    if self.hed_schema.with_standard:
        # library_name=None is the default of get_hed_versions (standard schema only).
        versions_to_check.append((None, self.hed_schema.with_standard))

    issues = []
    for library_name, version in versions_to_check:
        all_known_versions = hed_cache.get_hed_versions(library_name=library_name)
        # Test for "nothing known" first: indexing [0] is only valid on a non-empty list.
        # NOTE(review): assumes get_hed_versions returns the newest version first - confirm.
        if not all_known_versions or Version(all_known_versions[0]) < Version(version):
            issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED, version,
                                                all_known_versions)

    self.error_handler.add_context_and_filter(issues)
    return issues

def check_prologue_epilogue(self):
issues = []
if self._new_character_validation:
Expand Down
Loading
Loading