From 6c13d015de5cdd6789c32951e06338a3d77a928e Mon Sep 17 00:00:00 2001 From: Albert Tian Chen Date: Sun, 31 Dec 2023 16:33:13 -0500 Subject: [PATCH] Hotfix - explicitly read latin-1 encoding from fasta files (#637) --- workflow_flu_gisaid_ingest/scripts/chunk_sequences.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow_flu_gisaid_ingest/scripts/chunk_sequences.py b/workflow_flu_gisaid_ingest/scripts/chunk_sequences.py index d0dea402..b77c7cc0 100755 --- a/workflow_flu_gisaid_ingest/scripts/chunk_sequences.py +++ b/workflow_flu_gisaid_ingest/scripts/chunk_sequences.py @@ -61,7 +61,7 @@ def process_fasta_file(fasta_file, metadata, output_path, chunk_size=10_000): # Get the date from the fasta file name, as a string # file_date = Path(fasta_file).name.replace(".fa.gz", "") - with open(fasta_file, "rt") as fp: + with open(fasta_file, "rt", encoding="latin-1") as fp: lines = fp.readlines() for i, line in enumerate(lines): # Strip whitespace @@ -114,7 +114,6 @@ def process_fasta_file(fasta_file, metadata, output_path, chunk_size=10_000): date_lookup = dict(zip(metadata.index, metadata["submission_date"])) for name, seq in entries: - # Flush results if chunk is full if chunk_i == chunk_size: print("Writing {} sequences".format(chunk_i))