diff --git a/docs/Usage/ingests.md b/docs/Usage/ingests.md index c12dc0a..dfa8a32 100644 --- a/docs/Usage/ingests.md +++ b/docs/Usage/ingests.md @@ -49,8 +49,9 @@ Creating this ingest will require three things: | __Optional CSV Specific Properties__ | | | `columns` | List of columns to include in output (CSV only) | | `delimiter` | Delimiter for csv files | - | `header_delimiter` | Delimiter for header in csv files | | `header` | Header row index for csv files | + | `header_delimiter` | Delimiter for header in csv files | + | `header_prefix` | Prefix for header in csv files | | `comment_char` | Comment character for csv files | | `skip_blank_lines` | Skip blank lines in csv files | diff --git a/pyproject.toml b/pyproject.toml index f76841a..a3699a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "koza" -version = "0.5.1" +version = "0.5.2" description = "Data transformation framework for LinkML data models" authors = [ "The Monarch Initiative ", diff --git a/src/koza/cli_runner.py b/src/koza/cli_runner.py index 053965d..f45cd2f 100644 --- a/src/koza/cli_runner.py +++ b/src/koza/cli_runner.py @@ -79,7 +79,9 @@ def transform_source( logger, ) - koza_app = _set_koza_app(koza_source, translation_table, output_dir, output_format, schema, node_type, edge_type, logger) + koza_app = _set_koza_app( + koza_source, translation_table, output_dir, output_format, schema, node_type, edge_type, logger + ) koza_app.process_maps() koza_app.process_sources() @@ -172,7 +174,9 @@ def _set_koza_app( ) -> KozaApp: """Create a KozaApp object for a given source""" - koza_apps[source.config.name] = KozaApp(source, translation_table, output_dir, output_format, schema, node_type, edge_type, logger) + koza_apps[source.config.name] = KozaApp( + source, translation_table, output_dir, output_format, schema, node_type, edge_type, logger + ) logger.debug(f"koza_apps entry created for {source.config.name}: {koza_apps[source.config.name]}") return koza_apps[source.config.name] diff --git a/src/koza/io/reader/csv_reader.py b/src/koza/io/reader/csv_reader.py index 6227511..a7d97f0 100644 --- a/src/koza/io/reader/csv_reader.py +++ b/src/koza/io/reader/csv_reader.py @@ -49,6 +49,7 @@ def __init__( delimiter: str = ",", header: Union[int, HeaderMode] = HeaderMode.infer, header_delimiter: str = None, + header_prefix: str = None, dialect: str = "excel", skip_blank_lines: bool = True, name: str = "csv file", @@ -69,6 +70,7 @@ def __init__( if field_type_map is None this will raise a ValueError :param header_delimiter: delimiter for the header row, default = self.delimiter + :param header_prefix: prefix for the header row, default = None :param dialect: csv dialect, default=excel :param skip_blank_lines: true to skip blank lines, false to insert NaN for blank lines, :param name: filename or alias @@ -82,6 +84,7 @@ def __init__( self.dialect = dialect self.header = header self.header_delimiter = header_delimiter if header_delimiter else delimiter + self.header_prefix = header_prefix self.skip_blank_lines = skip_blank_lines self.name = name self.comment_char = comment_char @@ -205,11 +208,12 @@ def _parse_header_line(self, skip_blank_or_commented_lines: bool = False) -> Lis Parse the header line and return a list of headers """ fieldnames = next(reader(self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect})) + if self.header_prefix and fieldnames[0].startswith(self.header_prefix): + fieldnames[0] = fieldnames[0].lstrip(self.header_prefix) if skip_blank_or_commented_lines: # there has to be a cleaner way to do this while not fieldnames or (self.comment_char is not None and fieldnames[0].startswith(self.comment_char)): fieldnames = next(reader(self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect})) - fieldnames[0] = fieldnames[0].lstrip(self.comment_char) return [f.strip() for f in fieldnames] diff --git a/src/koza/model/config/source_config.py b/src/koza/model/config/source_config.py index 74a3eb6..39aa016 100644 --- a/src/koza/model/config/source_config.py +++ b/src/koza/model/config/source_config.py @@ -147,8 +147,9 @@ class SourceConfig: columns: List[str] (optional) - list of columns to include required_properties: List[str] (optional) - list of properties which must be in json data files delimiter: str (optional) - delimiter for csv files - header_delimiter: str (optional) - delimiter for header in csv files header: int (optional) - header row index + header_delimiter: str (optional) - delimiter for header in csv files + header_prefix: str (optional) - prefix for header in csv files comment_char: str (optional) - comment character for csv files skip_blank_lines: bool (optional) - skip blank lines in csv files filters: List[ColumnFilter] (optional) - list of filters to apply @@ -171,6 +172,7 @@ class SourceConfig: delimiter: Optional[str] = None header: Union[int, HeaderMode] = HeaderMode.infer header_delimiter: Optional[str] = None + header_prefix: Optional[str] = None comment_char: str = "#" skip_blank_lines: bool = True filters: List[ColumnFilter] = field(default_factory=list) @@ -290,7 +292,7 @@ def __post_init__(self): raise ValueError("Field type map contains more than one key") for key, val in field.items(): field_type_map[key] = val - print(f"FIELD TYPE MAP: {field_type_map}") + # print(f"FIELD TYPE MAP: {field_type_map}") self.field_type_map = field_type_map diff --git a/src/koza/model/source.py b/src/koza/model/source.py index 2fa50b9..294c643 100644 --- a/src/koza/model/source.py +++ b/src/koza/model/source.py @@ -40,8 +40,9 @@ def __init__(self, config: Union[PrimaryFileConfig, MapFileConfig], row_limit: O name=config.name, field_type_map=config.field_type_map, delimiter=config.delimiter, - header_delimiter=config.header_delimiter, header=config.header, + header_delimiter=config.header_delimiter, + header_prefix=config.header_prefix, comment_char=self.config.comment_char, row_limit=self.row_limit, )