From 99360c979e66016b4902840a159f4d07af71b3b7 Mon Sep 17 00:00:00 2001
From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com>
Date: Tue, 29 Oct 2024 08:35:30 -0500
Subject: [PATCH] Throw error in writer on extra columns

---
Review note: the extra-column check now uses `raise ValueError(...)`.
The previous revision called `numpy.f2py.auxfuncs.throw_error(msg)`, which
only constructs a helper object and never raises, so the check was a silent
no-op; the stray numpy import is dropped. The post-image hash on the `index`
line below was not regenerated after this edit.

 src/koza/io/writer/tsv_writer.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/koza/io/writer/tsv_writer.py b/src/koza/io/writer/tsv_writer.py
index 5c586bd..7bae5be 100644
--- a/src/koza/io/writer/tsv_writer.py
+++ b/src/koza/io/writer/tsv_writer.py
@@ -2,8 +2,8 @@
 # NOTE - May want to rename to KGXWriter at some point, if we develop writers for other models non biolink/kgx specific
 
 from pathlib import Path
-from typing import Dict, Iterable, List, Literal, Set, Union
+from typing import Dict, Iterable, List, Literal, Set, Tuple, Union
 
 from ordered_set import OrderedSet
 
 from koza.converter.kgx_converter import KGXConverter
@@ -69,6 +69,13 @@ def write_row(self, record: Dict, record_type: Literal["node", "edge"]) -> None:
         fh = self.nodeFH if record_type == "node" else self.edgeFH
         columns = self.node_columns if record_type == "node" else self.edge_columns
         row = build_export_row(record, list_delimiter=self.list_delimiter)
+
+        # Raise if the record carries columns that are not in this writer's column list
+        columns_tuple = tuple(columns)
+        row_keys_tuple = tuple(row.keys())
+        if self.has_extra_columns(row_keys_tuple, columns_tuple):
+            raise ValueError(f"Record has extra columns: {set(row.keys()) - set(columns)} not defined in {record_type}")
+
         values = []
         if record_type == "node":
             row["id"] = record["id"]
@@ -87,6 +94,19 @@ def finalize(self):
         if hasattr(self, "edgeFH"):
             self.edgeFH.close()
 
+    @staticmethod
+    def has_extra_columns(row_keys: Tuple[str, ...], columns_tuple: Tuple[str, ...]) -> bool:
+        """Check if a row has extra columns.
+
+        Args:
+            row_keys: Tuple[str, ...] - A tuple of row keys
+            columns_tuple: Tuple[str, ...] - A tuple of columns
+
+        Returns:
+            bool - True if row has extra columns, False otherwise
+        """
+        return not set(row_keys).issubset(set(columns_tuple))
+
     @staticmethod
     def _order_columns(cols: Set, record_type: Literal["node", "edge"]) -> OrderedSet:
         """Arrange node or edge columns in a defined order.