Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate loaders #305

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions linkml_runtime/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,27 @@
from linkml_runtime.loaders.tsv_loader import TSVLoader
from linkml_runtime.loaders.yaml_loader import YAMLLoader
from linkml_runtime.loaders.csv_loader import CSVLoader
from linkml_runtime.loaders.passthrough_loader import PassthroughLoader

# Shared module-level loader instances, each constructed with no arguments.
json_loader = JSONLoader()
rdf_loader = RDFLoader()
rdflib_loader = RDFLibLoader()
yaml_loader = YAMLLoader()
csv_loader = CSVLoader()
tsv_loader = TSVLoader()

# Public names of this package: the loader classes first, then the shared
# instances created above. PassthroughLoader is exported as a class only;
# no shared instance of it is created here.
__all__ = [
"JSONLoader",
"RDFLoader",
"RDFLibLoader",
"TSVLoader",
"YAMLLoader",
"CSVLoader",
"PassthroughLoader",
"json_loader",
"rdf_loader",
"rdflib_loader",
"yaml_loader",
"csv_loader",
"tsv_loader"
]
41 changes: 39 additions & 2 deletions linkml_runtime/loaders/delimited_file_loader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import csv
import re
from abc import ABC, abstractmethod
from json_flattener import unflatten_from_csv, KeyConfig, GlobalConfig, Serializer
import json
from typing import Type, Union, List
from typing import Iterator, Optional, Type, Union, List, TextIO
from linkml_runtime.utils.yamlutils import YAMLRoot
from pydantic import BaseModel

Expand All @@ -14,6 +16,15 @@

class DelimitedFileLoader(Loader, ABC):

def __init__(
    self,
    source: Union[str, dict, TextIO] = None,
    skip_empty_rows: bool = False,
    index_slot_name: Optional[str] = None,
):
    """Create a loader for a delimited (CSV/TSV-style) source.

    :param source: passed unchanged to the parent ``Loader`` initializer
    :param skip_empty_rows: when true, ``_rows`` drops rows in which no
        column holds a truthy value
    :param index_slot_name: when not ``None``, ``iter_instances`` yields a
        single dict mapping this name to the list of all rows instead of
        yielding the rows one by one
    """
    super().__init__(source)
    self.index_slot_name = index_slot_name
    self.skip_empty_rows = skip_empty_rows


@property
@abstractmethod
def delimiter(self):
Expand Down Expand Up @@ -60,4 +71,30 @@
configmap = get_configmap(schemaview, index_slot)
config = GlobalConfig(key_configs=configmap, csv_delimiter=self.delimiter)
objs = unflatten_from_csv(input, config=config, **kwargs)
return json.dumps({index_slot: objs})
return json.dumps({index_slot: objs})

def _rows(self) -> Iterator[dict]:
with open(self.source) as file:
reader: csv.DictReader = csv.DictReader(file, delimiter=self.delimiter, skipinitialspace=True)

Check warning on line 78 in linkml_runtime/loaders/delimited_file_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/delimited_file_loader.py#L78

Added line #L78 was not covered by tests
for row in reader:
if self.skip_empty_rows and not any(row.values()):
continue

Check warning on line 81 in linkml_runtime/loaders/delimited_file_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/delimited_file_loader.py#L81

Added line #L81 was not covered by tests
yield {k: _parse_numeric(v) for k, v in row.items() if k is not None and v != ""}

def iter_instances(self) -> Iterator[dict]:
if self.index_slot_name is not None:
yield {self.index_slot_name: list(self._rows())}

Check warning on line 86 in linkml_runtime/loaders/delimited_file_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/delimited_file_loader.py#L86

Added line #L86 was not covered by tests
else:
yield from self._rows()

Check warning on line 88 in linkml_runtime/loaders/delimited_file_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/delimited_file_loader.py#L88

Added line #L88 was not covered by tests

def _parse_numeric(value: str):
if not isinstance(value, str) or not re.search(r"[0-9]", value):
return value
try:
return int(value)
except (TypeError, ValueError):
pass
try:
return float(value)
except (TypeError, ValueError, OverflowError):
return value

Check warning on line 100 in linkml_runtime/loaders/delimited_file_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/delimited_file_loader.py#L92-L100

Added lines #L92 - L100 were not covered by tests
17 changes: 16 additions & 1 deletion linkml_runtime/loaders/json_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import logging
from typing import Union, TextIO, Optional, Dict, Type, List
from typing import Any, Iterator, Union, TextIO, Optional, Dict, Type, List

from hbreader import FileInfo

Expand Down Expand Up @@ -34,3 +34,18 @@
logging.warning(f"Warning: input type mismatch. Expected: {target_class.__name__}, Actual: {typ}")

return self._construct_target_class(data_as_dict, target_class)

def iter_instances(self) -> Iterator[Any]:
"""Lazily yield instance from JSON source.

If the root of the JSON is an array, yield each element of the array. Otherwise,
yield the root element itself.

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
data = self.load_as_dict(self.source)

Check warning on line 47 in linkml_runtime/loaders/json_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/json_loader.py#L47

Added line #L47 was not covered by tests
if isinstance(data, list):
yield from data

Check warning on line 49 in linkml_runtime/loaders/json_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/json_loader.py#L49

Added line #L49 was not covered by tests
else:
yield data

Check warning on line 51 in linkml_runtime/loaders/json_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/json_loader.py#L51

Added line #L51 was not covered by tests
18 changes: 16 additions & 2 deletions linkml_runtime/loaders/loader_root.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import TextIO, Union, Optional, Callable, Dict, Type, Any, List
from typing import Iterator, TextIO, Union, Optional, Callable, Dict, Type, Any, List
from logging import getLogger

from pydantic import BaseModel
Expand All @@ -15,6 +15,10 @@

class Loader(ABC):

def __init__(
    self,
    source: Union[str, dict, TextIO] = None,
):
    """Remember the data source this loader works on.

    :param source: stored unchanged as ``self.source``; defaults to ``None``
    """
    self.source = source
    super().__init__()

@staticmethod
def json_clean(inp: Any) -> Any:
"""
Expand Down Expand Up @@ -119,7 +123,17 @@
"""
return self.load(source, target_class, metadata=metadata)

def _construct_target_class(self,
@abstractmethod
def iter_instances(self) -> Iterator[Any]:
"""Lazily load data instances from the source
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor: can be iterated on in new PR]

Can we clarify what a data instance would be?

It seems that this is canonically a dict and never an instance of a class (whether dataclass or pydantic)? Or could it be one (e.g. with pkl serialization)? Would rdflib_loader eventually implement this with a Triple/Quad object, or with a 3- or 4-tuple?

I'm tending towards a more predictable signature (iterates over dicts) with some guarantees

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally - the type annotation here would be refined in the child loaders; keeping it "Any" here is just to say "there will be some iterator" (it could actually just be Iterator, and then we would do Iterator[dict[str, JsonObj | dict | list]] or whatever in the child objects). We could make this type a union of all the child types, but it wouldn't really give us much because the child implementation should override it.


:return: Iterator over data instances
:rtype: Iterator[Any]
"""
pass

Check warning on line 133 in linkml_runtime/loaders/loader_root.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/loader_root.py#L133

Added line #L133 was not covered by tests


def _construct_target_class(self,
data_as_dict: Union[dict, List[dict]],
target_class: Union[Type[YAMLRoot], Type[BaseModel]]) -> Optional[Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]]:
if data_as_dict:
Expand Down
24 changes: 24 additions & 0 deletions linkml_runtime/loaders/passthrough_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Any, Iterator

from linkml_runtime.loaders.loader_root import Loader


class PassthroughLoader(Loader):
"""A loader that passes through from an existing Iterator

:param source: An Iterator
"""

def __init__(self, source: Iterator) -> None:
    """Wrap an existing iterator.

    :param source: the iterator, handed unchanged to the parent
        ``Loader`` initializer
    """
    super().__init__(source)

Check warning on line 13 in linkml_runtime/loaders/passthrough_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/passthrough_loader.py#L13

Added line #L13 was not covered by tests

def iter_instances(self) -> Iterator[Any]:
    """Yield every item of the wrapped source, unchanged and in order.

    :return: Iterator over the items produced by ``self.source``
    :rtype: Iterator[Any]
    """
    wrapped = self.source
    # Delegate straight to the wrapped iterable; nothing is transformed.
    yield from wrapped

Check warning on line 21 in linkml_runtime/loaders/passthrough_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/passthrough_loader.py#L21

Added line #L21 was not covered by tests

def load_any(self, *args, **kwargs):
    """Reject every call; this loader only supports ``iter_instances``.

    :raises NotImplementedError: always, whatever arguments are given
    """
    message = 'Passthrough loader doesnt actually load anything'
    raise NotImplementedError(message)
5 changes: 4 additions & 1 deletion linkml_runtime/loaders/rdf_loader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, TextIO, Optional, Type, List
from typing import Any, Union, TextIO, Optional, Type, List, Iterator

from hbreader import FileInfo

Expand Down Expand Up @@ -90,3 +90,6 @@ def loader(data: Union[str, dict], _: FileInfo) -> Optional[dict]:
# TODO: Make the SSL option a settable parameter in the package itself
with no_ssl_verification():
return self.load_source(source, loader, target_class, accept_header=RDF_MIME_TYPES, metadata=metadata)

def iter_instances(self) -> Iterator[Any]:
    """Placeholder: instance iteration is not available for this loader.

    :raises NotImplementedError: always
    """
    reason = "RDF Loader doesn't have instance iterator yet!"
    raise NotImplementedError(reason)
5 changes: 3 additions & 2 deletions linkml_runtime/loaders/rdflib_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import urllib
from copy import copy
from dataclasses import dataclass
from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set
from typing import Optional, Any, Dict, Type, Union, TextIO, List, Tuple, Set, Iterator

from curies import Converter
from hbreader import FileInfo
Expand Down Expand Up @@ -276,4 +276,5 @@ def loads(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot]:
def load_any(self, source: str, **kwargs) -> Union[BaseModel, YAMLRoot, List[BaseModel], List[YAMLRoot]]:
return self.load(source, **kwargs)


def iter_instances(self) -> Iterator[Any]:
    """Placeholder: instance iteration is not available for this loader.

    :raises NotImplementedError: always
    """
    reason = "RDF Loader doesn't have instance iterator yet!"
    raise NotImplementedError(reason)
18 changes: 17 additions & 1 deletion linkml_runtime/loaders/yaml_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from io import StringIO
from typing import Union, TextIO, Optional, Dict, Type, List
from typing import Union, TextIO, Optional, Dict, Type, List, Iterator, Any

import yaml
from hbreader import FileInfo
Expand Down Expand Up @@ -51,3 +51,19 @@
@return: instance of taarget_class
"""
return self.load_any(source, target_class, metadata=metadata)

def iter_instances(self) -> Iterator[Any]:
"""Lazily yield instances from YAML source.

If the root of the document is an array, yield each element of the array. Otherwise,
yield the root element itself. Repeat for each document in the YAML file.

:return: Iterator over data instances
:rtype: Iterator[Any]
"""
with open(self.source) as source_file:
for document in yaml.safe_load_all(source_file):
if isinstance(document, list):
yield from document

Check warning on line 67 in linkml_runtime/loaders/yaml_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/yaml_loader.py#L67

Added line #L67 was not covered by tests
else:
yield document

Check warning on line 69 in linkml_runtime/loaders/yaml_loader.py

View check run for this annotation

Codecov / codecov/patch

linkml_runtime/loaders/yaml_loader.py#L69

Added line #L69 was not covered by tests
Loading