Skip to content

Commit

Permalink
Do not output empty alignments (#963)
Browse files Browse the repository at this point in the history
  • Loading branch information
eu9ene authored Dec 20, 2024
1 parent 8977fbf commit e71c831
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions pipeline/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,16 @@ def build_dataset_tsv(
  empty_alignments = []

  for src_line, trg_line, aln_line in zip(src_lines, trg_lines, aln_lines):
-     if not aln_line:
+     if aln_line.strip():
+         tsv_outfile.write(
+             f"{src_line.strip()}\t{trg_line.strip()}\t{aln_line.strip()}\n"
+         )
+     else:
+         # do not write lines with empty alignments to TSV, Marian will complain and skip those
          empty_alignments.append((src_line, trg_line))
-         continue
-     tsv_outfile.write(f"{src_line.strip()}\t{trg_line.strip()}\t{aln_line.strip()}\n")

  if empty_alignments:
-     logger.info(f"Number of empty alignments for {len(alignments_file)}")
+     logger.info(f"Number of empty alignments is {len(empty_alignments)}")
      logger.info("Sample of empty alignments:")
      random.shuffle(empty_alignments)
      for src_line, trg_line in empty_alignments[:50]:
Expand Down

0 comments on commit e71c831

Please sign in to comment.