Merge branch 'main' into dv-WS_uploader_performance

kbase · Jan 31, 2024 · 779ae79 · 779ae79
2 parents a412797 + 8b48835
commit 779ae79
Show file tree

Hide file tree

Showing 8 changed files with 212 additions and 79 deletions.
diff --git a/src/common/product_models/columnar_attribs_common_models.py b/src/common/product_models/columnar_attribs_common_models.py
@@ -13,6 +13,7 @@
 
 
 FIELD_COLUMNS = "columns"
+NON_VISIBLE = "non_visible"
 
 
 class ColumnType(str, Enum):
@@ -49,9 +50,9 @@ class FilterStrategy(str, Enum):
     """ A search based on ngram matching. """
 
 
-class AttributesColumnSpec(BaseModel):
+class AttributesColumnBase(BaseModel):
     """
-    A specification for a column in an attributes table.
+    A base class for a specification for a column in an attributes table.
     """
     key: str = Field(
         example="checkm_completeness",
@@ -66,11 +67,7 @@ class AttributesColumnSpec(BaseModel):
         description="The filter strategy for the column if any. Not all column types need "
             + "a filter strategy."
     )] = None
-    non_visible: Annotated[bool, Field(
-        example=False,
-        description="Whether the column is visible to the user. "
-             + "If True, the display name and category fields are not required"
-    )] = False
+
     display_name: Annotated[str | None, Field(
         example="Completeness",
         description="The display name of the column. "
@@ -95,6 +92,17 @@ def _check_filter_strategy(self) -> Self:
             raise ValueError("Only string types may have a filter strategy")
         return self
 
+
+class AttributesColumnSpec(AttributesColumnBase):
+    """
+    A specification for a column in an attributes table.
+    """
+    non_visible: Annotated[bool, Field(
+        example=False,
+        description="Whether the column is visible to the user. "
+             + "If True, the display name and category fields are not required"
+    )] = False
+
     @model_validator(mode="after")
     def _check_visible_col(self) -> Self:
         if not self.non_visible:
@@ -115,7 +123,7 @@ class ColumnarAttributesSpec(BaseModel):
     )] = list()
 
 
-class AttributesColumn(AttributesColumnSpec):
+class AttributesColumn(AttributesColumnBase):
     min_value: Annotated[int | float | str | None, Field(
         example="2023-08-25T22:08:30.576+0000",
         description="The minimum value for the column for numeric and date columns. "

diff --git a/src/loaders/common/loader_helper.py b/src/loaders/common/loader_helper.py
@@ -21,8 +21,6 @@
 from src.common.collection_column_specs.load_specs import load_spec
 from src.common.product_models.columnar_attribs_common_models import (
     ColumnType,
-    AttributesColumn,
-    ColumnarAttributesMeta,
 )
 from src.common.storage.db_doc_conversions import (
     collection_data_id_key,
@@ -55,7 +53,7 @@ def _convert_to_iso8601(date_string: str) -> str:
     # Convert a date string to ISO 8601 format
     formats_to_try = ["%Y/%m/%d",
                       "%Y-%m-%d",
-                      "%m/%d/%y",]  # Add more formats as needed
+                      "%m/%d/%y", ]  # Add more formats as needed
     # The current code always leaves the date in day precision with no time zone information as that's
     # all that's available from the current data.
     # If higher precision dates are encountered in the future the code should be adapted to
@@ -143,17 +141,16 @@ def process_columnar_meta(
             enum_values = list(set(values))
             enum_values.sort()
 
-        attri_column = AttributesColumn(
-            **col_spec.model_dump(),
-            min_value=min_value,
-            max_value=max_value,
-            enum_values=enum_values
-        )
-        columns.append(attri_column)
+        attri_column = {
+            'min_value': min_value,
+            'max_value': max_value,
+            'enum_values': enum_values,
+            **col_spec.model_dump()
+        }
 
-    columnar_attri_meta = ColumnarAttributesMeta(columns=columns, count=len(docs))
+        columns.append(attri_column)
 
-    meta_doc = columnar_attri_meta.model_dump()
+    meta_doc = {'columns': columns, 'count': len(docs)}
     meta_doc.update({
         names.FLD_ARANGO_KEY: collection_load_version_key(kbase_collection, load_ver),
         names.FLD_COLLECTION_ID: kbase_collection,

diff --git a/src/loaders/workspace_downloader/workspace_downloader.py b/src/loaders/workspace_downloader/workspace_downloader.py
@@ -133,7 +133,7 @@ def _process_object_info(
                 loader_common_names.FLD_KB_OBJ_TIMESTAMP: obj_info[3],
                 loader_common_names.FLD_KB_OBJ_GENOME_UPA: "{6}/{0}/{4}".format(*genome_info),
                 loader_common_names.ASSEMBLY_OBJ_INFO_KEY: obj_info,
-                loader_common_names.GENOME_METADATA_FILE: genome_info}
+                loader_common_names.GENOME_OBJ_INFO_KEY: genome_info}
 
     return res_dict
 
@@ -245,14 +245,15 @@ def _find_sample_upa(
     # find one and only one sample associated upa from input upas and retrieve the sample data
     # raise error if multiple samples are found
 
-    found_sample, sample_ret, sample_upa, sample_effective_time = False, None, None, None
+    found_sample_id, sample_ret, sample_upa, sample_effective_time = None, None, None, None
 
     for upa in upas:
         try:
             sample_ret, sample_effective_time = _retrieve_sample(conf, upa)
-            if found_sample:
+            sample_id = sample_ret['id']
+            if found_sample_id and found_sample_id != sample_id:
                 raise ValueError(f"Found multiple samples in input {upas}")
-            found_sample, sample_upa = True, upa
+            found_sample_id, sample_upa = sample_id, upa
         except NoDataLinkError:
             pass
 

diff --git a/src/service/data_products/common_functions.py b/src/service/data_products/common_functions.py
@@ -3,12 +3,15 @@
 """
 from typing import Any, Callable, NamedTuple
 
+from fastapi import Request
+
 import src.common.storage.collection_and_field_names as names
+from src.common.product_models import columnar_attribs_common_models as col_models
 from src.common.storage.db_doc_conversions import (
     collection_load_version_key,
     collection_data_id_key,
 )
-from src.service import errors, kb_auth, models
+from src.service import errors, kb_auth, models, app_state
 from src.service.filtering.filters import FilterSet
 from src.service.storage_arango import ArangoStorage
 
@@ -109,6 +112,77 @@ async def get_collection_singleton_from_db(
     )
 
 
+async def get_columnar_attribs_meta(
+        storage: ArangoStorage,
+        collection: str,
+        collection_id: str,
+        load_ver: str,
+        load_ver_override: bool,
+        return_only_visible: bool = False
+) -> col_models.ColumnarAttributesMeta:
+    """
+    Get the columnar attributes meta document for a collection. The document is expected to be
+    a singleton per collection load version.
+
+    storage - the storage system.
+    collection - the arango collection containing the document.
+    collection_id - the KBase collection containing the document.
+    load_ver - the load version of the collection.
+    load_ver_override - override flag, passed unchanged to get_collection_singleton_from_db.
+    return_only_visible - if True, columns whose stored non_visible field is truthy are omitted.
+    Returns the document as a ColumnarAttributesMeta, after applying remove_collection_keys.
+    """
+    doc = await get_collection_singleton_from_db(
+            storage,
+            collection,
+            collection_id,
+            load_ver,
+            load_ver_override
+    )
+
+    doc[col_models.FIELD_COLUMNS] = [
+        col_models.AttributesColumn(**d) for d in doc[col_models.FIELD_COLUMNS]
+        if not return_only_visible or not d[col_models.NON_VISIBLE]
+    ]
+
+    return col_models.ColumnarAttributesMeta(**remove_collection_keys(doc))
+
+
+async def get_product_meta(
+        r: Request,
+        collection: str,
+        collection_id: str,
+        data_product: str,
+        load_ver_override: str,
+        user: kb_auth.KBaseUser,
+
+) -> col_models.ColumnarAttributesMeta:
+    """
+    Get the columnar attributes meta document for a collection, as used by the /meta endpoint.
+
+    r - the request; the storage system is obtained from its app state.
+    collection - the arango collection containing the meta information.
+    collection_id - the ID of the Collection for which to retrieve the meta information.
+    data_product - the ID of the data product from which to retrieve the load version.
+    load_ver_override - an override for the load version. If provided:
+        * the user must be a service administrator
+        * the collection is not checked for the existence of the data product.
+    user - the user. Ignored if load_ver_override is not provided; else must be a service admin.
+    Returns the meta document restricted to visible columns (return_only_visible=True).
+    """
+
+    storage = app_state.get_app_state(r).arangostorage
+    _, load_ver = await get_load_version(storage, collection_id, data_product, load_ver_override, user)
+    meta = await get_columnar_attribs_meta(storage,
+                                           collection,
+                                           collection_id,
+                                           load_ver,
+                                           bool(load_ver_override),
+                                           return_only_visible=True)
+
+    return meta
+
+
 async def get_doc_from_collection_by_unique_id(
     store: ArangoStorage,
     collection: str,

diff --git a/src/service/data_products/data_product_processing.py b/src/service/data_products/data_product_processing.py
@@ -43,7 +43,7 @@ async def get_load_version_and_processes( # pretty huge method sig here
     match_id: str | None = None,
     selection_id: str | None = None,
     multiple_ids: bool = False,
-) -> tuple[str, models.DataProductProcess, models.DataProductProcess]:
+) -> tuple[str, models.DataProductProcess, models.DataProductProcess, models.ActiveCollection | None]:
     f"""
     Get the appropriate load version to use when querying data along with match and / or selection
     processes, if match and selection IDs are provided.
@@ -93,7 +93,7 @@ async def get_load_version_and_processes( # pretty huge method sig here
             appstate, coll, selection_id, data_product,
             partial(_process_subset, collection, multiple_ids)
         )
-    return load_ver, dp_match, dp_sel
+    return load_ver, dp_match, dp_sel, coll
 
 
 async def _process_subset(
@@ -154,7 +154,7 @@ async def get_missing_ids(
     if not match_id and not selection_id:
         raise errors.IllegalParameterError(
             "At last one of a match ID or selection ID must be supplied")
-    _, dp_match, dp_sel = await get_load_version_and_processes(
+    _, dp_match, dp_sel, _ = await get_load_version_and_processes(
         appstate,
         user,
         collection,