Skip to content

Commit

Permalink
Merge branch 'main' into dv-WS_uploader_performance
Browse files Browse the repository at this point in the history
  • Loading branch information
Xiangs18 committed Jan 31, 2024
2 parents a412797 + 8b48835 commit 779ae79
Show file tree
Hide file tree
Showing 8 changed files with 212 additions and 79 deletions.
24 changes: 16 additions & 8 deletions src/common/product_models/columnar_attribs_common_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@


FIELD_COLUMNS = "columns"
NON_VISIBLE = "non_visible"


class ColumnType(str, Enum):
Expand Down Expand Up @@ -49,9 +50,9 @@ class FilterStrategy(str, Enum):
""" A search based on ngram matching. """


class AttributesColumnSpec(BaseModel):
class AttributesColumnBase(BaseModel):
"""
A specification for a column in an attributes table.
A base class for a specification for a column in an attributes table.
"""
key: str = Field(
example="checkm_completeness",
Expand All @@ -66,11 +67,7 @@ class AttributesColumnSpec(BaseModel):
description="The filter strategy for the column if any. Not all column types need "
+ "a filter strategy."
)] = None
non_visible: Annotated[bool, Field(
example=False,
description="Whether the column is visible to the user. "
+ "If True, the display name and category fields are not required"
)] = False

display_name: Annotated[str | None, Field(
example="Completeness",
description="The display name of the column. "
Expand All @@ -95,6 +92,17 @@ def _check_filter_strategy(self) -> Self:
raise ValueError("Only string types may have a filter strategy")
return self


class AttributesColumnSpec(AttributesColumnBase):
"""
A specification for a column in an attributes table.
"""
non_visible: Annotated[bool, Field(
example=False,
description="Whether the column is visible to the user. "
+ "If True, the display name and category fields are not required"
)] = False

@model_validator(mode="after")
def _check_visible_col(self) -> Self:
if not self.non_visible:
Expand All @@ -115,7 +123,7 @@ class ColumnarAttributesSpec(BaseModel):
)] = list()


class AttributesColumn(AttributesColumnSpec):
class AttributesColumn(AttributesColumnBase):
min_value: Annotated[int | float | str | None, Field(
example="2023-08-25T22:08:30.576+0000",
description="The minimum value for the column for numeric and date columns. "
Expand Down
21 changes: 9 additions & 12 deletions src/loaders/common/loader_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from src.common.collection_column_specs.load_specs import load_spec
from src.common.product_models.columnar_attribs_common_models import (
ColumnType,
AttributesColumn,
ColumnarAttributesMeta,
)
from src.common.storage.db_doc_conversions import (
collection_data_id_key,
Expand Down Expand Up @@ -55,7 +53,7 @@ def _convert_to_iso8601(date_string: str) -> str:
# Convert a date string to ISO 8601 format
formats_to_try = ["%Y/%m/%d",
"%Y-%m-%d",
"%m/%d/%y",] # Add more formats as needed
"%m/%d/%y", ] # Add more formats as needed
# The current code always leaves the date in day precision with no time zone information as that's
# all that's available from the current data.
# If higher precision dates are encountered in the future the code should be adapted to
Expand Down Expand Up @@ -143,17 +141,16 @@ def process_columnar_meta(
enum_values = list(set(values))
enum_values.sort()

attri_column = AttributesColumn(
**col_spec.model_dump(),
min_value=min_value,
max_value=max_value,
enum_values=enum_values
)
columns.append(attri_column)
attri_column = {
'min_value': min_value,
'max_value': max_value,
'enum_values': enum_values,
**col_spec.model_dump()
}

columnar_attri_meta = ColumnarAttributesMeta(columns=columns, count=len(docs))
columns.append(attri_column)

meta_doc = columnar_attri_meta.model_dump()
meta_doc = {'columns': columns, 'count': len(docs)}
meta_doc.update({
names.FLD_ARANGO_KEY: collection_load_version_key(kbase_collection, load_ver),
names.FLD_COLLECTION_ID: kbase_collection,
Expand Down
9 changes: 5 additions & 4 deletions src/loaders/workspace_downloader/workspace_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def _process_object_info(
loader_common_names.FLD_KB_OBJ_TIMESTAMP: obj_info[3],
loader_common_names.FLD_KB_OBJ_GENOME_UPA: "{6}/{0}/{4}".format(*genome_info),
loader_common_names.ASSEMBLY_OBJ_INFO_KEY: obj_info,
loader_common_names.GENOME_METADATA_FILE: genome_info}
loader_common_names.GENOME_OBJ_INFO_KEY: genome_info}

return res_dict

Expand Down Expand Up @@ -245,14 +245,15 @@ def _find_sample_upa(
# find one and only one sample associated upa from input upas and retrieve the sample data
# raise error if multiple samples are found

found_sample, sample_ret, sample_upa, sample_effective_time = False, None, None, None
found_sample_id, sample_ret, sample_upa, sample_effective_time = None, None, None, None

for upa in upas:
try:
sample_ret, sample_effective_time = _retrieve_sample(conf, upa)
if found_sample:
sample_id = sample_ret['id']
if found_sample_id and found_sample_id != sample_id:
raise ValueError(f"Found multiple samples in input {upas}")
found_sample, sample_upa = True, upa
found_sample_id, sample_upa = sample_id, upa
except NoDataLinkError:
pass

Expand Down
76 changes: 75 additions & 1 deletion src/service/data_products/common_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
"""
from typing import Any, Callable, NamedTuple

from fastapi import Request

import src.common.storage.collection_and_field_names as names
from src.common.product_models import columnar_attribs_common_models as col_models
from src.common.storage.db_doc_conversions import (
collection_load_version_key,
collection_data_id_key,
)
from src.service import errors, kb_auth, models
from src.service import errors, kb_auth, models, app_state
from src.service.filtering.filters import FilterSet
from src.service.storage_arango import ArangoStorage

Expand Down Expand Up @@ -109,6 +112,77 @@ async def get_collection_singleton_from_db(
)


async def get_columnar_attribs_meta(
    storage: ArangoStorage,
    collection: str,
    collection_id: str,
    load_ver: str,
    load_ver_override: bool,
    return_only_visible: bool = False
) -> col_models.ColumnarAttributesMeta:
    """
    Get the columnar attributes meta document for a collection. The document is expected to be
    a singleton per collection load version.

    storage - the storage system.
    collection - the arango collection containing the document.
    collection_id - the KBase collection containing the document.
    load_ver - the load version of the collection.
    load_ver_override - whether to override the load version.
    return_only_visible - whether to return only visible columns. When False (the default)
        every column in the document is returned.

    Returns the meta document, with its columns converted to AttributesColumn models and
    the collection keys removed, as a ColumnarAttributesMeta.
    """
    doc = await get_collection_singleton_from_db(
        storage,
        collection,
        collection_id,
        load_ver,
        load_ver_override
    )

    # A stored column without the non-visible flag is treated as visible rather than
    # raising a KeyError; this mirrors the flag's default of False in the column spec.
    doc[col_models.FIELD_COLUMNS] = [
        col_models.AttributesColumn(**d) for d in doc[col_models.FIELD_COLUMNS]
        if not (return_only_visible and d.get(col_models.NON_VISIBLE, False))
    ]

    return col_models.ColumnarAttributesMeta(**remove_collection_keys(doc))


async def get_product_meta(
    r: Request,
    collection: str,
    collection_id: str,
    data_product: str,
    load_ver_override: str,
    user: kb_auth.KBaseUser,
) -> col_models.ColumnarAttributesMeta:
    """
    Fetch the columnar attributes meta document served by the /meta endpoint. Only columns
    that are visible to the user are included in the result.

    r - the request; the storage system is obtained from its application state.
    collection - the arango collection containing the meta information.
    collection_id - the ID of the Collection for which to retrieve the meta information.
    data_product - the ID of the data product from which to retrieve the load version.
    load_ver_override - an override for the load version. If provided:
        * the user must be a service administrator
        * the collection is not checked for the existence of the data product.
    user - the user. Ignored if load_ver_override is not provided; must be a service administrator.
    """
    arango = app_state.get_app_state(r).arangostorage
    # Only the load version is needed from the lookup result.
    _, resolved_ver = await get_load_version(
        arango, collection_id, data_product, load_ver_override, user
    )
    return await get_columnar_attribs_meta(
        storage=arango,
        collection=collection,
        collection_id=collection_id,
        load_ver=resolved_ver,
        load_ver_override=bool(load_ver_override),
        return_only_visible=True,
    )


async def get_doc_from_collection_by_unique_id(
store: ArangoStorage,
collection: str,
Expand Down
6 changes: 3 additions & 3 deletions src/service/data_products/data_product_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ async def get_load_version_and_processes( # pretty huge method sig here
match_id: str | None = None,
selection_id: str | None = None,
multiple_ids: bool = False,
) -> tuple[str, models.DataProductProcess, models.DataProductProcess]:
) -> tuple[str, models.DataProductProcess, models.DataProductProcess, models.ActiveCollection | None]:
f"""
Get the appropriate load version to use when querying data along with match and / or selection
processes, if match and selection IDs are provided.
Expand Down Expand Up @@ -93,7 +93,7 @@ async def get_load_version_and_processes( # pretty huge method sig here
appstate, coll, selection_id, data_product,
partial(_process_subset, collection, multiple_ids)
)
return load_ver, dp_match, dp_sel
return load_ver, dp_match, dp_sel, coll


async def _process_subset(
Expand Down Expand Up @@ -154,7 +154,7 @@ async def get_missing_ids(
if not match_id and not selection_id:
raise errors.IllegalParameterError(
"At last one of a match ID or selection ID must be supplied")
_, dp_match, dp_sel = await get_load_version_and_processes(
_, dp_match, dp_sel, _ = await get_load_version_and_processes(
appstate,
user,
collection,
Expand Down
Loading

0 comments on commit 779ae79

Please sign in to comment.