Skip to content

Commit

Permalink
fix: Impose stricter data file names
Browse files — browse the repository at this point in the history
  • Loading branch information
karatugo committed Jan 10, 2025
1 parent e5b07e3 commit ae4a8f5
Showing 1 changed file with 10 additions and 11 deletions.
21 changes: 10 additions & 11 deletions sumstats_service/resources/api_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -843,24 +843,23 @@ def get_md5_for_accession(
- A dictionary with the matching filename and its MD5 checksum. Empty if no
match is found.
"""
possible_keys = (
[f"{accession_id}.tsv", f"{accession_id}.tsv.gz"]
if not is_harmonised
else [f"{accession_id}.h.tsv", f"{accession_id}.h.tsv.gz"]
)

# Check for exact matches first
for key in possible_keys:
if key in md5_checksums:
return {key: md5_checksums[key]}
for key in md5_checksums:
if not is_harmonised:
if key.endswith((f"{accession_id}.tsv", f"{accession_id}.tsv.gz")):
return {key: md5_checksums[key]}
else:
if key.endswith((f"{accession_id}.h.tsv", f"{accession_id}.h.tsv.gz")):
return {key: md5_checksums[key]}

# Check for partial matches if no exact match is found
# i.e., files are named <GCST ID>_<build number>.*
# e.g. http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90308001-GCST90309000/GCST90308682/ # noqa:E501
for key in md5_checksums:
if (
accession_id in key
and "yaml" not in key
and key.endswith((".tsv", ".tsv.gz", ".txt", ".txt.gz", ".csv", ".csv.gz"))
and ".yaml" not in key
and ".tbi" not in key
and "running.log" not in key
and "README" not in key
Expand All @@ -875,7 +874,7 @@ def get_md5_for_accession(
if (
key.endswith((".tsv", ".tsv.gz", ".txt", ".txt.gz", ".csv", ".csv.gz"))
and key != "md5sums.txt"
and "yaml" not in key
and ".yaml" not in key
and ".tbi" not in key
and "running.log" not in key
and "README" not in key
Expand Down

0 comments on commit ae4a8f5

Please sign in to comment.