Skip to content

Commit

Permalink
Hotfix - explicitly read latin-1 encoding from fasta files (#637)
Browse files Browse the repository at this point in the history
  • Loading branch information
atc3 authored Dec 31, 2023
1 parent f039ad2 commit 6c13d01
Showing 1 changed file with 1 addition and 2 deletions.
3 changes: 1 addition & 2 deletions workflow_flu_gisaid_ingest/scripts/chunk_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def process_fasta_file(fasta_file, metadata, output_path, chunk_size=10_000):
# Get the date from the fasta file name, as a string
# file_date = Path(fasta_file).name.replace(".fa.gz", "")

- with open(fasta_file, "rt") as fp:
+ with open(fasta_file, "rt", encoding="latin-1") as fp:
lines = fp.readlines()
for i, line in enumerate(lines):
# Strip whitespace
Expand Down Expand Up @@ -114,7 +114,6 @@ def process_fasta_file(fasta_file, metadata, output_path, chunk_size=10_000):
date_lookup = dict(zip(metadata.index, metadata["submission_date"]))

for name, seq in entries:

# Flush results if chunk is full
if chunk_i == chunk_size:
print("Writing {} sequences".format(chunk_i))
Expand Down

0 comments on commit 6c13d01

Please sign in to comment.