diff --git a/linkml_runtime/loaders/__init__.py b/linkml_runtime/loaders/__init__.py
index 1fe4395d..e0b5dc58 100644
--- a/linkml_runtime/loaders/__init__.py
+++ b/linkml_runtime/loaders/__init__.py
@@ -4,6 +4,7 @@
 from linkml_runtime.loaders.tsv_loader import TSVLoader
 from linkml_runtime.loaders.yaml_loader import YAMLLoader
 from linkml_runtime.loaders.csv_loader import CSVLoader
+from linkml_runtime.loaders.passthrough_loader import PassthroughLoader
 
 json_loader = JSONLoader()
 rdf_loader = RDFLoader()
@@ -11,3 +12,19 @@
 yaml_loader = YAMLLoader()
 csv_loader = CSVLoader()
 tsv_loader = TSVLoader()
+
+__all__ = [
+    "JSONLoader",
+    "RDFLoader",
+    "RDFLibLoader",
+    "TSVLoader",
+    "YAMLLoader",
+    "CSVLoader",
+    "PassthroughLoader",
+    "json_loader",
+    "rdf_loader",
+    "rdflib_loader",
+    "yaml_loader",
+    "csv_loader",
+    "tsv_loader",
+]
diff --git a/linkml_runtime/loaders/delimited_file_loader.py b/linkml_runtime/loaders/delimited_file_loader.py
index d5c48110..1f24c9bc 100644
--- a/linkml_runtime/loaders/delimited_file_loader.py
+++ b/linkml_runtime/loaders/delimited_file_loader.py
@@ -1,7 +1,9 @@
+import csv
+import re
 from abc import ABC, abstractmethod
 from json_flattener import unflatten_from_csv, KeyConfig, GlobalConfig, Serializer
 import json
-from typing import Type, Union, List
+from typing import Iterator, Optional, Type, Union, List, TextIO
 from linkml_runtime.utils.yamlutils import YAMLRoot
 from pydantic import BaseModel
 
@@ -14,6 +16,20 @@
 
 class DelimitedFileLoader(Loader, ABC):
 
+    def __init__(self,
+                 source: Optional[Union[str, dict, TextIO]] = None,
+                 skip_empty_rows: bool = False,
+                 index_slot_name: Optional[str] = None):
+        """Store the source and the row-handling options used by ``iter_instances``.
+
+        :param source: data source; it is passed to ``open()`` when rows are iterated
+        :param skip_empty_rows: skip rows whose values are all falsy
+        :param index_slot_name: if set, all rows are yielded in one dict under this key
+        """
+        super().__init__(source)
+        self.skip_empty_rows = skip_empty_rows
+        self.index_slot_name = index_slot_name
+
     @property
     @abstractmethod
     def delimiter(self):
@@ -60,4 +76,50 @@ def _get_json_str_to_load(self,
         configmap = get_configmap(schemaview, index_slot)
         config = GlobalConfig(key_configs=configmap, csv_delimiter=self.delimiter)
         objs = unflatten_from_csv(input, config=config, **kwargs)
-        return json.dumps({index_slot: objs})
\ No newline at end of file
+        return json.dumps({index_slot: objs})
+
+    def _rows(self) -> Iterator[dict]:
+        """Yield one dict per data row of the delimited file at ``self.source``.
+
+        Entries whose key is ``None`` or whose value is the empty string are
+        dropped; the remaining values go through ``_parse_numeric``. When
+        ``self.skip_empty_rows`` is set, rows whose values are all falsy are skipped.
+        """
+        # The csv module asks for newline="" so it can handle line endings itself.
+        with open(self.source, newline="") as file:
+            reader: csv.DictReader = csv.DictReader(file, delimiter=self.delimiter, skipinitialspace=True)
+            for row in reader:
+                if self.skip_empty_rows and not any(row.values()):
+                    continue
+                yield {k: _parse_numeric(v) for k, v in row.items() if k is not None and v != ""}
+
+    def iter_instances(self) -> Iterator[dict]:
+        """Lazily yield instances from the delimited file.
+
+        If ``index_slot_name`` is set, yield a single dict mapping that name to
+        the list of all rows. Otherwise, yield each row on its own.
+
+        :return: Iterator over data instances
+        :rtype: Iterator[dict]
+        """
+        if self.index_slot_name is not None:
+            yield {self.index_slot_name: list(self._rows())}
+        else:
+            yield from self._rows()
+
+
+def _parse_numeric(value: str):
+    """Return ``value`` as an ``int`` or ``float`` when it parses as one.
+
+    Non-string values and strings without an ASCII digit are returned unchanged.
+    """
+    if not isinstance(value, str) or not re.search(r"[0-9]", value):
+        return value
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        pass
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        return value
diff --git a/linkml_runtime/loaders/json_loader.py b/linkml_runtime/loaders/json_loader.py
index 0d7f58b1..4e7cd86e 100644
--- a/linkml_runtime/loaders/json_loader.py
+++ b/linkml_runtime/loaders/json_loader.py
@@ -1,6 +1,6 @@
 import json
 import logging
-from typing import Union, TextIO, Optional, Dict, Type, List
+from typing import Any, Iterator, Union, TextIO, Optional, Dict, Type, List
 
 from hbreader import FileInfo
 
@@ -34,3 +34,18 @@ def load_any(self,
                 logging.warning(f"Warning: input type mismatch. Expected: {target_class.__name__}, Actual: {typ}")
 
         return self._construct_target_class(data_as_dict, target_class)
+
+    def iter_instances(self) -> Iterator[Any]:
+        """Lazily yield instances from JSON source.
+
+        If the root of the JSON is an array, yield each element of the array. Otherwise,
+        yield the root element itself.
+
+        :return: Iterator over data instances
+        :rtype: Iterator[Any]
+        """
+        data = self.load_as_dict(self.source)
+        if isinstance(data, list):
+            yield from data
+        else:
+            yield data
diff --git a/linkml_runtime/loaders/loader_root.py b/linkml_runtime/loaders/loader_root.py
index e690c4b3..da4656ad 100644
--- a/linkml_runtime/loaders/loader_root.py
+++ b/linkml_runtime/loaders/loader_root.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import TextIO, Union, Optional, Callable, Dict, Type, Any, List
+from typing import Iterator, TextIO, Union, Optional, Callable, Dict, Type, Any, List
 from logging import getLogger
 
 from pydantic import BaseModel
@@ -15,6 +15,14 @@
 
 class Loader(ABC):
 
+    def __init__(self, source: Optional[Union[str, dict, TextIO]] = None):
+        """Store the data source on the loader as ``self.source``.
+
+        :param source: data source; how it is read is up to the subclass
+        """
+        self.source = source
+        super().__init__()
+
     @staticmethod
     def json_clean(inp: Any) -> Any:
         """
@@ -119,7 +127,16 @@ def loads(self, source: str, target_class: Type[Union[BaseModel, YAMLRoot]], *,
         """
         return self.load(source, target_class, metadata=metadata)
 
+    @abstractmethod
+    def iter_instances(self) -> Iterator[Any]:
+        """Lazily load data instances from the source
+
+        :return: Iterator over data instances
+        :rtype: Iterator[Any]
+        """
+        pass
+
     def _construct_target_class(self,
                                 data_as_dict: Union[dict, List[dict]],
                                 target_class: Union[Type[YAMLRoot], Type[BaseModel]]) -> Optional[Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]]:
         if data_as_dict:
diff --git a/linkml_runtime/loaders/passthrough_loader.py b/linkml_runtime/loaders/passthrough_loader.py
new file mode 100644
index 00000000..2e92b44b
--- /dev/null
+++ b/linkml_runtime/loaders/passthrough_loader.py
@@ -0,0 +1,24 @@
+from typing import Any, Iterator
+
+from linkml_runtime.loaders.loader_root import Loader
+
+
+class PassthroughLoader(Loader):
+    """A loader that passes through from an existing Iterator
+
+    :param source: An Iterator
+    """
+
+    def __init__(self, source: Iterator) -> None:
+        super().__init__(source)
+
+    def iter_instances(self) -> Iterator[Any]:
+        """Pass through instances from an Iterator
+
+        :return: Iterator over data instances
+        :rtype: Iterator[Any]
+        """
+        yield from self.source
+
+    def load_any(self, *args, **kwargs):
+        raise NotImplementedError('Passthrough loader doesnt actually load anything')
diff --git a/linkml_runtime/loaders/rdf_loader.py b/linkml_runtime/loaders/rdf_loader.py
index abaa8a79..516cff38 100644
--- a/linkml_runtime/loaders/rdf_loader.py
+++ b/linkml_runtime/loaders/rdf_loader.py
@@ -1,4 +1,4 @@
-from typing import Union, TextIO, Optional, Type, List
+from typing import Any, Union, TextIO, Optional, Type, List, Iterator
 
 from hbreader import FileInfo
 
@@ -90,3 +90,6 @@ def loader(data: Union[str, dict], _: FileInfo) -> Optional[dict]:
         # TODO: Make the SSL option a settable parameter in the package itself
         with no_ssl_verification():
             return self.load_source(source, loader, target_class, accept_header=RDF_MIME_TYPES, metadata=metadata)
+
+    def iter_instances(self) -> Iterator[Any]:
+        raise NotImplementedError("RDF Loader doesn't have instance iterator yet!")
diff --git a/linkml_runtime/loaders/rdflib_loader.py b/linkml_runtime/loaders/rdflib_loader.py
index e26d3707..21db9a72 100644
--- a/linkml_runtime/loaders/rdflib_loader.py
+++ b/linkml_runtime/loaders/rdflib_loader.py
@@ -2,7 +2,7 @@
 import urllib
 from copy import copy
 from dataclasses import dataclass
-from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set
+from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set, Iterator
 
 from curies import Converter
 from hbreader import FileInfo
@@ -276,4 +276,6 @@ def loads(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot]:
 
     def load_any(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]:
         return self.load(source, **kwargs)
-
+
+    def iter_instances(self) -> Iterator[Any]:
+        raise NotImplementedError("RDF Loader doesn't have instance iterator yet!")
diff --git a/linkml_runtime/loaders/yaml_loader.py b/linkml_runtime/loaders/yaml_loader.py
index eee6e973..5e0beb9c 100644
--- a/linkml_runtime/loaders/yaml_loader.py
+++ b/linkml_runtime/loaders/yaml_loader.py
@@ -1,6 +1,6 @@
 import os
 from io import StringIO
-from typing import Union, TextIO, Optional, Dict, Type, List
+from typing import Union, TextIO, Optional, Dict, Type, List, Iterator, Any
 
 import yaml
 from hbreader import FileInfo
@@ -51,3 +51,19 @@ def loads_any(self, source: str, target_class: Type[Union[BaseModel, YAMLRoot]],
         @return: instance of taarget_class
         """
         return self.load_any(source, target_class, metadata=metadata)
+
+    def iter_instances(self) -> Iterator[Any]:
+        """Lazily yield instances from YAML source.
+
+        If the root of the document is an array, yield each element of the array. Otherwise,
+        yield the root element itself. Repeat for each document in the YAML file.
+
+        :return: Iterator over data instances
+        :rtype: Iterator[Any]
+        """
+        with open(self.source) as source_file:
+            for document in yaml.safe_load_all(source_file):
+                if isinstance(document, list):
+                    yield from document
+                else:
+                    yield document