Merge pull request #1375 from Sage-Bionetworks/develop-FDS-241-mypy-utils

FDS-241 mypy utils files
Sage-Bionetworks · Feb 26, 2024 · 6462e63 · 6462e63
2 parents 3a6ff92 + 22b2172
commit 6462e63
Show file tree

Hide file tree

Showing 8 changed files with 106 additions and 52 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -102,6 +102,7 @@ jobs:
           # poetry run mypy --install-types --non-interactive
           # add here when enforced
           poetry run mypy --disallow-untyped-defs --install-types --non-interactive schematic/configuration/*.py schematic/exceptions.py schematic/help.py schematic/loader.py schematic/version.py schematic/visualization
+          poetry run mypy --disallow-untyped-defs --install-types --non-interactive schematic/utils/cli_utils.py schematic/utils/curie_utils.py schematic/utils/df_utils.py schematic/utils/general.py schematic/utils/google_api_utils.py schematic/utils/validate_rules_utils.py schematic/utils/viz_utils.py
 
       #----------------------------------------------
       #             linting

diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py
@@ -32,7 +32,7 @@ def extract(dictionary: Any, key: Any) -> Union[Any, None]:
             return None
         return dictionary.get(key)
 
-    return reduce(extract, keys, dictionary)
+    return reduce(extract, keys, dictionary)  # type: ignore
 
 
 def log_value_from_config(arg_name: str, config_value: Any) -> None:

diff --git a/schematic/utils/curie_utils.py b/schematic/utils/curie_utils.py
@@ -1,7 +1,12 @@
 """Curie utils"""
 
 import logging
+from typing import Any, Union
 
+Context = dict[str, str]
+Record = dict[str, Union[str, list, dict, None]]
+Graph = list[Record]
+Schema = dict[str, Any]
 
 logger = logging.getLogger(__name__)
 
@@ -15,7 +20,7 @@ def extract_name_from_uri_or_curie(item: str) -> str:
     raise ValueError("Error extracting name from URI or Curie.")
 
 
-def expand_curie_to_uri(curie: str, context_info: dict[str, str]) -> str:
+def expand_curie_to_uri(curie: str, context_info: Context) -> str:
     """Expand curie to uri based on the context given
 
     parmas
@@ -36,27 +41,27 @@ def expand_curie_to_uri(curie: str, context_info: dict[str, str]) -> str:
     return curie
 
 
-def expand_curies_in_schema(schema):
+def expand_curies_in_schema(schema: Schema) -> Schema:
     """Expand all curies in a SchemaOrg JSON-LD file into URI"""
-    context = schema["@context"]
-    graph = schema["@graph"]
+    context: Context = schema["@context"]
+    graph: Graph = schema["@graph"]
     new_schema = {"@context": context, "@graph": [], "@id": schema["@id"]}
     for record in graph:
-        new_record = {}
+        new_record: Record = {}
         for key, value in record.items():
             if isinstance(value, str):
                 new_record[expand_curie_to_uri(key, context)] = expand_curie_to_uri(
                     value, context
                 )
             elif isinstance(value, list):
+                uri = expand_curie_to_uri(key, context)
                 if isinstance(value[0], dict):
-                    new_record[expand_curie_to_uri(key, context)] = []
+                    lst: list[dict[str, str]] = []
+                    new_record[uri] = lst
                     for _item in value:
-                        new_record[expand_curie_to_uri(key, context)].append(
-                            {"@id": expand_curie_to_uri(_item["@id"], context)}
-                        )
+                        lst.append({"@id": expand_curie_to_uri(_item["@id"], context)})
                 else:
-                    new_record[expand_curie_to_uri(key, context)] = [
+                    new_record[uri] = [
                         expand_curie_to_uri(_item, context) for _item in value
                     ]
             elif isinstance(value, dict) and "@id" in value:
@@ -69,7 +74,7 @@ def expand_curies_in_schema(schema):
     return new_schema
 
 
-def uri2label(uri, schema):
+def uri2label(uri: str, schema: Schema) -> list:
     """Given a URI, return the label"""
     return [
         record["rdfs:label"] for record in schema["@graph"] if record["@id"] == uri

diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py
@@ -92,18 +92,21 @@ def find_and_convert_ints(dataframe: pd.DataFrame) -> tuple[pd.DataFrame, pd.Dat
     if (
         dataframe.size < large_manifest_cutoff_size
     ):  # If small manifest, iterate as normal for improved performance
-        ints = dataframe.map(
+        ints = dataframe.map(  # type:ignore
             lambda cell: convert_ints(cell), na_action="ignore"
         ).fillna(False)
 
     else:  # parallelize iterations for large manifests
         pandarallel.initialize(verbose=1)
-        ints = dataframe.parallel_applymap(
+        ints = dataframe.parallel_applymap(  # type:ignore
             lambda cell: convert_ints(cell), na_action="ignore"
         ).fillna(False)
 
     # Identify cells converted to integers
-    is_int = ints.map(pd.api.types.is_integer)
+    is_int = ints.map(pd.api.types.is_integer)  # type:ignore
+
+    assert isinstance(ints, pd.DataFrame)
+    assert isinstance(is_int, pd.DataFrame)
 
     return ints, is_int
 

diff --git a/schematic/utils/general.py b/schematic/utils/general.py
@@ -66,14 +66,17 @@ def str2list(item: Any) -> Optional[list]:
     return None
 
 
-def unlist(seq: Sequence) -> Any:
+X = TypeVar("X")
+
+
+def unlist(seq: Sequence[X]) -> Union[Sequence[X], X]:
     """Returns the first item of a sequence
 
     Args:
-        seq (Sequence): Any sequence
+        seq (Sequence[X]): A Sequence of any type
 
     Returns:
-        Any:
+        Union[Sequence[X], X]:
           if sequence is length one, return the first item
           otherwise return the sequence
     """
@@ -151,12 +154,12 @@ def check_synapse_cache_size(
         size_in_mb = float(size.rstrip("M"))
         byte_size = size_in_mb * 1000000
     elif "G" in size:
-        size_in_gb = float(size.rstrip("G"))
+        size_in_gb = int(size.rstrip("G"))
         byte_size = convert_gb_to_bytes(size_in_gb)
     elif "B" in size:
         byte_size = float(size.rstrip("B"))
     else:
-        logger.error("Cannot recongize the file size unit")
+        logger.error("Cannot recognize the file size unit")
     return byte_size
 
 
@@ -275,9 +278,9 @@ def profile(
         Callable: Profile of the decorated function
     """
 
-    def inner(func):
+    def inner(func: Callable) -> Callable:
         @wraps(func)
-        def wrapper(*args, **kwargs):
+        def wrapper(*args: Any, **kwargs: Any) -> Callable:
             _output_file = output_file or func.__name__ + ".prof"
             profiler = Profile()
             profiler.enable()
@@ -299,7 +302,7 @@ def wrapper(*args, **kwargs):
                         p_stats.sort_stats(*sort_by)
                     else:
                         p_stats.sort_stats(sort_by)
-                    p_stats.print_stats(lines_to_print)
+                    p_stats.print_stats(lines_to_print)  # type: ignore
             return retval
 
         return wrapper

diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py
@@ -5,10 +5,10 @@
 import os
 import logging
 import json
-from typing import Any, Union
+from typing import Any, Union, no_type_check, TypedDict
 
 import pandas as pd
-from googleapiclient.discovery import build  # type: ignore
+from googleapiclient.discovery import build, Resource  # type: ignore
 from google.oauth2 import service_account  # type: ignore
 from schematic.configuration.configuration import CONFIG
 from schematic.store.synapse import SynapseStorage
@@ -23,11 +23,18 @@
 ]
 
 
-def build_service_account_creds() -> dict[str, Any]:
+class GoogleServiceAcountCreds(TypedDict):
+    "Service account credentials for Google sheets"
+    sheet_service: Resource
+    drive_service: Resource
+    creds: service_account.Credentials
+
+
+def build_service_account_creds() -> GoogleServiceAcountCreds:
     """Build Google service account credentials
 
     Returns:
-        dict[str, Any]: The credentials
+        GoogleServiceAcountCreds: The credentials
     """
     if "SERVICE_ACCOUNT_CREDS" in os.environ:
         dict_creds = json.loads(os.environ["SERVICE_ACCOUNT_CREDS"])
@@ -48,15 +55,16 @@ def build_service_account_creds() -> dict[str, Any]:
         )
 
     # get a Google Sheet API service
-    sheet_service = build("sheets", "v4", credentials=credentials)
+    sheet_service: Resource = build("sheets", "v4", credentials=credentials)
     # get a Google Drive API service
-    drive_service = build("drive", "v3", credentials=credentials)
+    drive_service: Resource = build("drive", "v3", credentials=credentials)
 
-    return {
+    creds: GoogleServiceAcountCreds = {
         "sheet_service": sheet_service,
         "drive_service": drive_service,
         "creds": credentials,
     }
+    return creds
 
 
 def download_creds_file() -> None:
@@ -92,7 +100,8 @@ def download_creds_file() -> None:
         )
 
 
-def execute_google_api_requests(service: Any, requests_body: Any, **kwargs) -> Any:
+@no_type_check
+def execute_google_api_requests(service, requests_body, **kwargs) -> Any:
     """
     Execute google API requests batch; attempt to execute in parallel.