From ba043c6df04cd067f4f6e8208d288571ef3389b7 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 11:59:36 -0800 Subject: [PATCH 01/17] lint all utils files except schema utils --- schematic/manifest/generator.py | 2 +- schematic/utils/cli_utils.py | 92 ++++++----- schematic/utils/curie_utils.py | 47 +++--- schematic/utils/df_utils.py | 152 +++++++++--------- schematic/utils/general.py | 195 +++++++++++++++--------- schematic/utils/google_api_utils.py | 121 +++++++++------ schematic/utils/io_utils.py | 29 ++-- schematic/utils/validate_rules_utils.py | 50 +++--- schematic/utils/validate_utils.py | 73 ++++----- schematic/utils/viz_utils.py | 22 ++- 10 files changed, 452 insertions(+), 331 deletions(-) diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index 090024f04..79c8ed4af 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -1503,7 +1503,7 @@ def export_sheet_to_excel( export_manifest_drive_service( manifest_url, file_path=output_excel_file_path, - mimeType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + mime_Type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ) return output_excel_file_path diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index 684bafba1..ce701834e 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -1,9 +1,8 @@ -#!/usr/bin/env python3 +"""CLI utils""" -import inspect import logging -from typing import Any, Mapping, Sequence, Union, List +from typing import Any, Mapping, Sequence, Union, Optional from functools import reduce import re @@ -11,6 +10,7 @@ # We are using fstrings in logger methods # pylint: disable=logging-fstring-interpolation +# pylint: disable = anomalous-backslash-in-string def query_dict(dictionary: Mapping[Any, Any], keys: Sequence[Any]) -> Union[Any, None]: @@ -48,52 +48,66 @@ def log_value_from_config(arg_name: str, config_value: Any): ) -def parse_synIDs( - ctx, - param, - synIDs, -) -> List[str]: - """Parse and validate a comma separated string of synIDs +def parse_synIDs( # pylint: disable=invalid-name + ctx: Any, # pylint: disable=unused-argument + param: str, # pylint: disable=unused-argument + synIDs: str, # pylint: disable=invalid-name +) -> Optional[list[str]]: + """For backwards compatibility""" + parse_syn_ids(ctx, param, synIDs) - Args: - ctx: - click option context - param: - click option argument name - synIDs: - comma separated string of synIDs - Returns: - List of synID strings +def parse_syn_ids( + ctx: Any, # pylint: disable=unused-argument + param: str, # pylint: disable=unused-argument + syn_ids: str, +) -> Optional[list[str]]: + """Parse and validate a comma separated string of synapse ids + + Args: + ctx (Any): click option context + param (str): click option argument name + syn_ids (str): comma separated string of synapse ids Raises: - ValueError: If the entire string does not match a regex for + ValueError: If the entire string does not match a regex for a valid comma separated string of SynIDs + + Returns: + Optional[list[str]]: List of synapse ids """ - if synIDs: - project_regex = re.compile("(syn\d+\,?)+") - valid = project_regex.fullmatch(synIDs) + if not syn_ids: + return None - if valid: - synIDs = synIDs.split(",") + project_regex = re.compile("(syn\d+\,?)+") + valid = project_regex.fullmatch(syn_ids) - return synIDs + if not valid: + raise ValueError( + f"The provided list of project synID(s): {syn_ids}, is not formatted correctly. 
" + "\nPlease check your list of projects for errors." + ) - else: - raise ValueError( - f"The provided list of project synID(s): {synIDs}, is not formatted correctly. " - "\nPlease check your list of projects for errors." - ) - else: - return + syn_ids = syn_ids.split(",") + return syn_ids def parse_comma_str_to_list( - ctx, - param, - comma_string, -) -> List[str]: - if comma_string: - return comma_string.split(",") - else: + ctx: Any, # pylint: disable=unused-argument + param: str, # pylint: disable=unused-argument + comma_string: str, +) -> Optional[list[str]]: + """Separates a comma separated sting into a list of strings + + Args: + ctx (Any): click option context + param (str): click option argument name + comma_string (str): comma separated string + + Returns: + Optional[list[str]]: _description_ + """ + if not comma_string: return None + + return comma_string.split(",") diff --git a/schematic/utils/curie_utils.py b/schematic/utils/curie_utils.py index 42361b482..fd24fd297 100644 --- a/schematic/utils/curie_utils.py +++ b/schematic/utils/curie_utils.py @@ -1,3 +1,5 @@ +"""Curie utils""" + import logging @@ -8,10 +10,9 @@ def extract_name_from_uri_or_curie(item): """Extract name from uri or curie""" if "http" not in item and len(item.split(":")) == 2: return item.split(":")[-1] - elif len(item.split("//")[-1].split("/")) > 1: + if len(item.split("//")[-1].split("/")) > 1: return item.split("//")[-1].split("/")[-1] - else: - raise ValueError("Error extracting name from URI or Curie.") + raise ValueError("Error extracting name from URI or Curie.") def expand_curie_to_uri(curie, context_info): @@ -24,17 +25,15 @@ def expand_curie_to_uri(curie, context_info): "http://schema.biothings.io/"}) """ # as suggested in SchemaOrg standard file, these prefixes don't expand - PREFIXES_NOT_EXPAND = ["rdf", "rdfs", "xsd"] + prefixes_not_expand = ["rdf", "rdfs", "xsd"] # determine if a value is curie if len(curie.split(":")) == 2: prefix, value = curie.split(":") - if prefix in context_info and prefix not in PREFIXES_NOT_EXPAND: + if prefix in context_info and prefix not in prefixes_not_expand: return context_info[prefix] + value # if the input is not curie, return the input unmodified - else: - return curie - else: return curie + return curie def expand_curies_in_schema(schema): @@ -44,28 +43,28 @@ def expand_curies_in_schema(schema): new_schema = {"@context": context, "@graph": [], "@id": schema["@id"]} for record in graph: new_record = {} - for k, v in record.items(): - if type(v) == str: - new_record[expand_curie_to_uri(k, context)] = expand_curie_to_uri( - v, context + for key, value in record.items(): + if isinstance(value, str): + new_record[expand_curie_to_uri(key, context)] = expand_curie_to_uri( + value, context ) - elif type(v) == list: - if type(v[0]) == dict: - new_record[expand_curie_to_uri(k, context)] = [] - for _item in v: - new_record[expand_curie_to_uri(k, context)].append( + elif isinstance(value, list): + if isinstance(value[0], dict): + new_record[expand_curie_to_uri(key, context)] = [] + for _item in value: + new_record[expand_curie_to_uri(key, context)].append( {"@id": expand_curie_to_uri(_item["@id"], context)} ) else: - new_record[expand_curie_to_uri(k, context)] = [ - expand_curie_to_uri(_item, context) for _item in v + new_record[expand_curie_to_uri(key, context)] = [ + expand_curie_to_uri(_item, context) for _item in value ] - elif type(v) == dict and "@id" in v: - new_record[expand_curie_to_uri(k, context)] = { - "@id": expand_curie_to_uri(v["@id"], context) + elif 
isinstance(value, dict) and "@id" in value: + new_record[expand_curie_to_uri(key, context)] = { + "@id": expand_curie_to_uri(value["@id"], context) } - elif v == None: - new_record[expand_curie_to_uri(k, context)] = None + elif value is None: + new_record[expand_curie_to_uri(key, context)] = None new_schema["@graph"].append(new_record) return new_schema diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 14c8d6b2b..2fbd8cc4d 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -1,28 +1,38 @@ +"""df utils""" + import logging from copy import deepcopy from time import perf_counter -import datetime as dt import dateparser as dp import pandas as pd import numpy as np from pandarallel import pandarallel +# pylint: disable=logging-fstring-interpolation + logger = logging.getLogger(__name__) -def load_df(file_path, preserve_raw_input=True, data_model=False, **load_args): +def load_df( + file_path: str, + preserve_raw_input: bool = True, + data_model: bool = False, + **load_args: dict, +) -> pd.DataFrame: """ Universal function to load CSVs and return DataFrames Parses string entries to convert as appropriate to type int, float, and pandas timestamp - Pandarallel is used for type inference for large manfiests to improve performance + Pandarallel is used for type inference for large manifests to improve performance + Args: - file_path: path of csv to open - preserve_raw_input: Bool. If false, convert cell datatypes to an inferred type - data_model: bool, indicates if importing a data model - load_args: dict of key value pairs to be passed to the pd.read_csv function - **kwargs: keyword arguments for pd.read_csv() + file_path (str): path of csv to open + preserve_raw_input (bool, optional): If false, convert cell datatypes to an inferred type + data_model (bool, optional): bool, indicates if importing a data model + **load_args(dict): dict of key value pairs to be passed to the pd.read_csv function - Returns: a processed dataframe for manifests or unprocessed df for data models and where indicated + Returns: + pd.DataFrame: a processed dataframe for manifests or unprocessed df for data models and + where indicated """ large_manifest_cutoff_size = 1000 # start performance timer @@ -42,48 +52,48 @@ def load_df(file_path, preserve_raw_input=True, data_model=False, **load_args): return org_df # If type inferences is allowed: infer types, trim, and return - else: - # create a separate copy of the manifest - # before beginning conversions to store float values - float_df = deepcopy(org_df) - - # Cast the columns in the dataframe to string and - # replace Null values with empty strings - null_cells = org_df.isnull() - org_df = org_df.astype(str).mask(null_cells, "") - - # Find integers stored as strings and replace with entries of type np.int64 - if ( - org_df.size < large_manifest_cutoff_size - ): # If small manifest, iterate as normal for improved performance - ints = org_df.applymap( - lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" - ).fillna(False) - - else: # parallelize iterations for large manfiests - pandarallel.initialize(verbose=1) - ints = org_df.parallel_applymap( - lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" - ).fillna(False) - - # Identify cells converted to intergers - ints_tf_df = ints.applymap(pd.api.types.is_integer) - - # convert strings to numerical dtype (float) if possible, preserve non-numerical strings - for col in org_df.columns: - float_df[col] = pd.to_numeric(float_df[col], 
errors="coerce") - # replace values that couldn't be converted to float with the original str values - float_df[col].fillna(org_df[col][float_df[col].isna()], inplace=True) - - # Trim nans and empty rows and columns - processed_df = trim_commas_df(float_df) - - # Store values that were converted to type int in the final dataframe - processed_df = processed_df.mask(ints_tf_df, other=ints) - - # log manifest load and processing time - logger.debug(f"Load Elapsed time {perf_counter()-t_load_df}") - return processed_df + + # create a separate copy of the manifest + # before beginning conversions to store float values + float_df = deepcopy(org_df) + + # Cast the columns in the dataframe to string and + # replace Null values with empty strings + null_cells = org_df.isnull() + org_df = org_df.astype(str).mask(null_cells, "") + + # Find integers stored as strings and replace with entries of type np.int64 + if ( + org_df.size < large_manifest_cutoff_size + ): # If small manifest, iterate as normal for improved performance + ints = org_df.applymap( + lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" + ).fillna(False) + + else: # parallelize iterations for large manfiests + pandarallel.initialize(verbose=1) + ints = org_df.parallel_applymap( + lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" + ).fillna(False) + + # Identify cells converted to integers + ints_tf_df = ints.applymap(pd.api.types.is_integer) + + # convert strings to numerical dtype (float) if possible, preserve non-numerical strings + for col in org_df.columns: + float_df[col] = pd.to_numeric(float_df[col], errors="coerce") + # replace values that couldn't be converted to float with the original str values + float_df[col].fillna(org_df[col][float_df[col].isna()], inplace=True) + + # Trim nan's and empty rows and columns + processed_df = trim_commas_df(float_df) + + # Store values that were converted to type int in the final dataframe + processed_df = processed_df.mask(ints_tf_df, other=ints) + + # log manifest load and processing time + logger.debug(f"Load Elapsed time {perf_counter()-t_load_df}") + return processed_df def _parse_dates(date_string): @@ -94,11 +104,11 @@ def _parse_dates(date_string): return False -def normalize_table(df: pd.DataFrame, primary_key: str) -> pd.DataFrame: +def normalize_table(dataframe: pd.DataFrame, primary_key: str) -> pd.DataFrame: """ Function to normalize a table (e.g. dedup) Args: - df: data frame to normalize + dataframe: data frame to normalize primary_key: primary key on which to perform dedup Returns: a dedupped dataframe @@ -106,9 +116,9 @@ def normalize_table(df: pd.DataFrame, primary_key: str) -> pd.DataFrame: try: # if valid primary key has been provided normalize df - df = df.reset_index() - df_norm = df.drop_duplicates(subset=[primary_key]) - df_norm = df.drop(columns=["index"]) + dataframe = dataframe.reset_index() + df_norm = dataframe.drop_duplicates(subset=[primary_key]) + df_norm = dataframe.drop(columns=["index"]) return df_norm except KeyError: # if the primary key is not in the df; then return the same df w/o changes @@ -116,7 +126,7 @@ def normalize_table(df: pd.DataFrame, primary_key: str) -> pd.DataFrame: "Specified primary key is not in table schema. Proceeding without table changes." 
) - return df + return dataframe def update_df( @@ -171,47 +181,47 @@ def update_df( return input_df_idx -def trim_commas_df(df: pd.DataFrame): +def trim_commas_df(dataframe: pd.DataFrame) -> pd.DataFrame: """Removes empty (trailing) columns and empty rows from pandas dataframe (manifest data). Args: - df: pandas dataframe with data from manifest file. + dataframe: pandas dataframe with data from manifest file. Returns: df: cleaned-up pandas dataframe. """ # remove all columns which have substring "Unnamed" in them - df = df.loc[:, ~df.columns.str.contains("^Unnamed")] + dataframe = dataframe.loc[:, ~dataframe.columns.str.contains("^Unnamed")] # remove all completely empty rows - df = df.dropna(how="all", axis=0) + dataframe = dataframe.dropna(how="all", axis=0) # Fill in nan cells with empty strings - df.fillna("", inplace=True) - return df + dataframe.fillna("", inplace=True) + return dataframe -def col_in_dataframe(col: str, df: pd.DataFrame) -> bool: - """Check if a column is in a dataframe, without worring about case +def col_in_dataframe(col: str, dataframe: pd.DataFrame) -> bool: + """Check if a column is in a dataframe, without worrying about case Args: col: name of column whose presence in the dataframe is being checked - df: pandas dataframe with data from manifest file. + dataframe: pandas dataframe with data from manifest file. Returns: bool: whether or not the column name is a column in the dataframe, case agnostic """ return col.lower() in [ - manifest_col.lower() for manifest_col in df.columns.to_list() + manifest_col.lower() for manifest_col in dataframe.columns.to_list() ] def populate_df_col_with_another_col( - df: pd.DataFrame, source_col: str, target_col: str + dataframe: pd.DataFrame, source_col: str, target_col: str ) -> pd.DataFrame: """Copy the values from one column in a dataframe to another column in the same dataframe Args: - df: pandas dataframe with data from manifest file. + dataframe: pandas dataframe with data from manifest file. 
source_col: column whose contents to copy over target_col: column to be updated with other contents @@ -219,5 +229,5 @@ def populate_df_col_with_another_col( dataframe with contents updated """ # Copy the contents over - df[target_col] = df[source_col] - return df + dataframe[target_col] = dataframe[source_col] + return dataframe diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 66ddb2252..5af2bf6a2 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -1,6 +1,6 @@ -# allows specifying explicit variable types +"""General utils""" + import logging -import math import os import pstats import subprocess @@ -8,53 +8,92 @@ from cProfile import Profile from datetime import datetime, timedelta from functools import wraps -from typing import Union +from typing import Union, TypeVar, Any, Optional, Sequence, Callable from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.entity import File, Folder, Project from synapseclient.table import EntityViewSchema +from synapseclient.core import cache +from synapseclient import Synapse -import synapseclient.core.cache as cache +# pylint: disable=logging-fstring-interpolation logger = logging.getLogger(__name__) +T = TypeVar("T") + -def find_duplicates(_list): +def find_duplicates(_list: list[T]) -> set[T]: """Find duplicate items in a list""" - return set([x for x in _list if _list.count(x) > 1]) + return {x for x in _list if _list.count(x) > 1} -def dict2list(dictionary): - if type(dictionary) == list: - return dictionary - elif type(dictionary) == dict: - return [dictionary] +def dict2list(item: Any) -> Optional[Union[dict, list]]: + """Puts a dictionary into a list + Args: + item (Any): Any type of input -def str2list(_str): - if type(_str) == str: - return [_str] - elif type(_str) == list: - return _str + Returns: + Optional[Union[dict, list]]: + If input is a list, return it + If input is a dict, return it in a list + Return None for anything else + """ + if isinstance(item, list): + return item + if isinstance(item, dict): + return [item] + return None -def unlist(_list): - if len(_list) == 1: - return _list[0] - else: - return _list +def str2list(item: Any) -> Optional[list]: + """Puts a string into a list + + Args: + item (Any): Any type of input + + Returns: + Optional[list]: + If input is a list, return it + If input is a string, return it in a list + Return None for anything else + """ + if isinstance(item, str): + return [item] + if isinstance(item, list): + return item + return None + + +def unlist(seq: Sequence) -> Any: + """Returns the first item of a sequence + + Args: + seq (Sequence): Any sequence + + Returns: + Any: + if sequence is length one, return the first item + otherwise return the sequence + """ + if len(seq) == 1: + return seq[0] + return seq -def get_dir_size(path: str): - """Recursively descend the directory tree rooted at the top and call .st_size function to calculate size of files in bytes. +def get_dir_size(path: str) -> int: + """ + Recursively descend the directory tree rooted at the top and call + .st_size function to calculate size of files in bytes. Args: path: path to a folder return: total size of all the files in a given directory in bytes. 
""" total = 0 # Recursively scan directory to find entries - with os.scandir(path) as it: - for entry in it: + with os.scandir(path) as itr: + for entry in itr: if entry.is_file(): total += entry.stat().st_size elif entry.is_dir(): @@ -70,7 +109,8 @@ def calculate_datetime( Args: input_date (datetime): date time object provided by users minutes (int): number of minutes - before_or_after (str): default to "before". if "before", calculate x minutes before current date time. if "after", calculate x minutes after current date time. + before_or_after (str): default to "before". if "before", calculate x minutes before + current date time. if "after", calculate x minutes after current date time. Returns: datetime: return result of date time calculation @@ -93,9 +133,10 @@ def check_synapse_cache_size(directory="/root/.synapseCache") -> Union[float, in Returns: float or integer: returns size of .synapsecache directory in bytes """ - # Note: this command might fail on windows user. But since this command is primarily for running on AWS, it is fine. + # Note: this command might fail on windows user. + # But since this command is primarily for running on AWS, it is fine. command = ["du", "-sh", directory] - output = subprocess.run(command, capture_output=True).stdout.decode("utf-8") + output = subprocess.run(command, capture_output=True, check=False).stdout.decode("utf-8") # Parsing the output to extract the directory size size = output.split("\t")[0] @@ -115,11 +156,11 @@ def check_synapse_cache_size(directory="/root/.synapseCache") -> Union[float, in return byte_size -def clear_synapse_cache(cache: cache.Cache, minutes: int) -> int: +def clear_synapse_cache(synapse_cache: cache.Cache, minutes: int) -> int: """clear synapse cache before a certain time Args: - cache: an object of synapseclient Cache. + synapse_cache: an object of synapseclient Cache. minutes (int): all files before this minute will be removed Returns: int: number of files that get deleted @@ -128,49 +169,54 @@ def clear_synapse_cache(cache: cache.Cache, minutes: int) -> int: minutes_earlier = calculate_datetime( input_date=current_date, minutes=minutes, before_or_after="before" ) - num_of_deleted_files = cache.purge(before_date=minutes_earlier) + num_of_deleted_files = synapse_cache.purge(before_date=minutes_earlier) return num_of_deleted_files -def convert_gb_to_bytes(gb: int): +def convert_gb_to_bytes(g_bytes: int) -> int: """convert gb to bytes Args: - gb: number of gb + g_bytes: number of gb return: total number of bytes """ - return gb * 1024 * 1024 * 1024 + return g_bytes * 1024 * 1024 * 1024 -def entity_type_mapping(syn, entity_id): - """ - Return the entity type of manifest +def entity_type_mapping(syn: Synapse, entity_id: str) -> str: + """Return the entity type of manifest + Args: - entity_id: id of an entity - Return: - type_entity: type of the manifest being returned + syn (Synapse): Synapse object + entity_id (str): id of an entity + + Raises: + SynapseHTTPError: Re-raised SynapseHTTPError + + Returns: + str: type of the manifest being returned """ # check the type of entity try: entity = syn.get(entity_id, downloadFile=False) - except SynapseHTTPError as e: + except SynapseHTTPError as exc: logger.error( f"cannot get {entity_id} from asset store. Please make sure that {entity_id} exists" ) raise SynapseHTTPError( f"cannot get {entity_id} from asset store. 
Please make sure that {entity_id} exists" - ) from e + ) from exc if isinstance(entity, EntityViewSchema): return "asset view" - elif isinstance(entity, Folder): + if isinstance(entity, Folder): return "folder" - elif isinstance(entity, File): + if isinstance(entity, File): return "file" - elif isinstance(entity, Project): + if isinstance(entity, Project): return "project" - else: - # if there's no matching type, return concreteType - return entity.concreteType + + # if there's no matching type, return concreteType + return entity.concreteType def create_temp_folder(path: str) -> str: @@ -185,59 +231,70 @@ def create_temp_folder(path: str) -> str: def profile( - output_file=None, sort_by="cumulative", lines_to_print=None, strip_dirs=False -): + output_file: Optional[str] = None, + sort_by="cumulative", + lines_to_print: Optional[int] = None, + strip_dirs: bool = False, +) -> Callable: """ - The function was initially taken from: https://towardsdatascience.com/how-to-profile-your-code-in-python-e70c834fad89 + The function was initially taken from: + https://towardsdatascience.com/how-to-profile-your-code-in-python-e70c834fad89 A time profiler decorator. Inspired by and modified the profile decorator of Giampaolo Rodola: http://code.activestate.com/recipes/577817-profile-decorator/ + Args: - output_file: str or None. Default is None + output_file (Optional[str], optional): Path of the output file. If only name of the file is given, it's saved in the current directory. If it's None, the name of the decorated function is used. - sort_by: str or SortKey enum or tuple/list of str/SortKey enum + Defaults to None. + sort_by (str, optional): + str or SortKey enum or tuple/list of str/SortKey enum Sorting criteria for the Stats object. For a list of valid string and SortKey refer to: https://docs.python.org/3/library/profile.html#pstats.Stats.sort_stats - lines_to_print: int or None - Number of lines to print. Default (None) is for all the lines. + Defaults to "cumulative". + lines_to_print (Optional[int], optional): + Number of lines to print. This is useful in reducing the size of the printout, especially that sorting by 'cumulative', the time consuming operations are printed toward the top of the file. - strip_dirs: bool + Default (None) is for all the lines. + strip_dirs (bool, optional): Whether to remove the leading path info from file names. This is also useful in reducing the size of the printout + Defaults to False. 
+ Returns: - Profile of the decorated function + Callable: Profile of the decorated function """ def inner(func): @wraps(func) def wrapper(*args, **kwargs): _output_file = output_file or func.__name__ + ".prof" - pr = Profile() - pr.enable() + profiler = Profile() + profiler.enable() retval = func(*args, **kwargs) - pr.disable() - pr.dump_stats(_output_file) + profiler.disable() + profiler.dump_stats(_output_file) # if we are running the functions on AWS: if "SECRETS_MANAGER_SECRETS" in os.environ: - ps = pstats.Stats(pr) + p_stats = pstats.Stats(profiler) # limit this to 30 line for now otherwise it will be too long for AWS log - ps.sort_stats("cumulative").print_stats(30) + p_stats.sort_stats("cumulative").print_stats(30) else: - with open(_output_file, "w") as f: - ps = pstats.Stats(pr, stream=f) + with open(_output_file, "w", encoding="utf-8") as fle: + p_stats = pstats.Stats(profiler, stream=fle) if strip_dirs: - ps.strip_dirs() + p_stats.strip_dirs() if isinstance(sort_by, (tuple, list)): - ps.sort_stats(*sort_by) + p_stats.sort_stats(*sort_by) else: - ps.sort_stats(sort_by) - ps.print_stats(lines_to_print) + p_stats.sort_stats(sort_by) + p_stats.print_stats(lines_to_print) return retval return wrapper diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index f7862a3a2..05d8e095d 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -1,19 +1,18 @@ +"""Google API utils""" + import os -import pickle import logging import json -import pygsheets as ps - -from typing import Dict, Any +from typing import Any, Union, Optional +import pandas as pd from googleapiclient.discovery import build -from google_auth_oauthlib.flow import InstalledAppFlow -from google.auth.transport.requests import Request from google.oauth2 import service_account -from google.oauth2.credentials import Credentials + from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage -import pandas as pd + +# pylint: disable=logging-fstring-interpolation logger = logging.getLogger(__name__) @@ -25,9 +24,10 @@ ] -# TODO: replace by pygsheets calls? -def build_credentials() -> Dict[str, Any]: - creds = generate_token() +# This function doesn't appear to be used or tested anywhere in schematic. +# TO DO: replace by pygsheets calls? 
+def build_credentials() -> dict[str, Any]: # pylint: disable=missing-function-docstring + creds = generate_token() # pylint: disable=undefined-variable # get a Google Sheet API service sheet_service = build("sheets", "v4", credentials=creds) @@ -41,7 +41,12 @@ def build_credentials() -> Dict[str, Any]: } -def build_service_account_creds() -> Dict[str, Any]: +def build_service_account_creds() -> dict[str, Any]: + """Build Google service account credentials + + Returns: + dict[str, Any]: The credentials + """ if "SERVICE_ACCOUNT_CREDS" in os.environ: dict_creds = json.loads(os.environ["SERVICE_ACCOUNT_CREDS"]) credentials = service_account.Credentials.from_service_account_info( @@ -73,6 +78,7 @@ def build_service_account_creds() -> Dict[str, Any]: def download_creds_file() -> None: + """Download google credentials file""" syn = SynapseStorage.login() # if file path of service_account does not exist @@ -83,13 +89,13 @@ def download_creds_file() -> None: and "SERVICE_ACCOUNT_CREDS" not in os.environ ): # synapse ID of the 'schematic_service_account_creds.json' file - API_CREDS = CONFIG.service_account_credentials_synapse_id + api_creds = CONFIG.service_account_credentials_synapse_id # Download in parent directory of SERVICE_ACCT_CREDS to # ensure same file system for os.rename() creds_dir = os.path.dirname(CONFIG.service_account_credentials_path) - creds_file = syn.get(API_CREDS, downloadLocation=creds_dir) + creds_file = syn.get(api_creds, downloadLocation=creds_dir) os.rename(creds_file.path, CONFIG.service_account_credentials_path) logger.info( @@ -104,16 +110,19 @@ def download_creds_file() -> None: ) -def execute_google_api_requests(service, requests_body, **kwargs): +def execute_google_api_requests(service: Any, requests_body: Any, **kwargs) -> Any: """ Execute google API requests batch; attempt to execute in parallel. + Args: - service: google api service; for now assume google sheets service that is instantiated and authorized - service_type: default batchUpdate; TODO: add logic for values update + service (Any): google api service; for now assume google sheets service that is + instantiated and authorized + requests_body (Any): _description_ kwargs: google API service parameters - Return: google API response - """ + Returns: + Any: google API response or None + """ if ( "spreadsheet_id" in kwargs and "service_type" in kwargs @@ -127,21 +136,23 @@ def execute_google_api_requests(service, requests_body, **kwargs): ) return response + return None -def export_manifest_drive_service(manifest_url, file_path, mimeType): +def export_manifest_drive_service( + manifest_url: str, file_path: str, mime_type: str +) -> None: """ - Export manifest by using google drive api. If export as an Excel spreadsheet, the exported spreasheet would also include a hidden sheet - Args: - manifest_url: google sheet manifest url - file_path: file path of the exported manifest - mimeType: exporting mimetype - + Export manifest by using google drive api. 
If export as an Excel spreadsheet, + the exported spreadsheet would also include a hidden sheet result: Google sheet gets exported in desired format + Args: + manifest_url (str): google sheet manifest url + file_path (str): file path of the exported manifest + mime_type (str): exporting mimetype """ - - # intialize drive service + # initialize drive service services_creds = build_service_account_creds() drive_service = services_creds["drive_service"] @@ -150,45 +161,51 @@ def export_manifest_drive_service(manifest_url, file_path, mimeType): # use google drive data = ( - drive_service.files().export(fileId=spreadsheet_id, mimeType=mimeType).execute() + drive_service.files() + .export(fileId=spreadsheet_id, mimeType=mime_type) + .execute() # pylint: disable=no-member ) # open file and write data - with open(os.path.abspath(file_path), "wb") as f: + with open(os.path.abspath(file_path), "wb") as fle: try: - f.write(data) + fle.write(data) except FileNotFoundError as not_found: logger.error(f"{not_found.filename} could not be found") - f.close - -def export_manifest_csv(file_path, manifest): +def export_manifest_csv(file_path: str, manifest: Union[pd.DataFrame, str]) -> None: """ Export manifest as a CSV by using google drive api - Args: - manifest: could be a dataframe or a manifest url - file_path: file path of the exported manifest - mimeType: exporting mimetype - result: Google sheet gets exported as a CSV - """ + Args: + file_path (str): file path of the exported manifest + manifest (Union[pd.DataFrame, str]): could be a dataframe or a manifest url + """ if isinstance(manifest, pd.DataFrame): manifest.to_csv(file_path, index=False) else: - export_manifest_drive_service(manifest, file_path, mimeType="text/csv") + export_manifest_drive_service(manifest, file_path, mime_type="text/csv") -def export_manifest_excel(manifest, output_excel=None): +# This function doesn't appear to be used or tested +# pd.ExcelWriter is an ABC class which means it SHOULD NOT be instantiated +def export_manifest_excel( + manifest: Union[pd.DataFrame, str], output_excel: Optional[str] = None +) -> None: """ - Export manifest as an Excel spreadsheet by using google sheet API. This approach could export hidden sheet + Export manifest as an Excel spreadsheet by using google sheet API. + This approach could export hidden sheet + Google sheet gets exported as an excel spreadsheet. + If there's a hidden sheet, the hidden sheet also gets exported. + Args: - manifest: could be a dataframe or a manifest url - output_excel: name of the exported manifest sheet - result: Google sheet gets exported as an excel spreadsheet. If there's a hidden sheet, the hidden sheet also gets exported. + manifest (Union[pd.DataFrame, str]): could be a dataframe or a manifest url + output_excel (Optional[str], optional): name of the exported manifest sheet. + Defaults to None. 
""" - # intialize drive service + # initialize drive service services_creds = build_service_account_creds() sheet_service = services_creds["sheet_service"] @@ -200,17 +217,21 @@ def export_manifest_excel(manifest, output_excel=None): # use google sheet api sheet_metadata = ( - sheet_service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute() + sheet_service.spreadsheets() # pylint: disable=no-member + .get(spreadsheetId=spreadsheet_id) + .execute() ) sheets = sheet_metadata.get("sheets") # export to Excel - writer = pd.ExcelWriter(output_excel) + writer = pd.ExcelWriter( + output_excel + ) # pylint: disable=abstract-class-instantiated # export each sheet in manifest for sheet in sheets: dataset = ( - sheet_service.spreadsheets() + sheet_service.spreadsheets() # pylint: disable=no-member .values() .get(spreadsheetId=spreadsheet_id, range=sheet["properties"]["title"]) .execute() diff --git a/schematic/utils/io_utils.py b/schematic/utils/io_utils.py index 016ea5dcd..1651d085e 100644 --- a/schematic/utils/io_utils.py +++ b/schematic/utils/io_utils.py @@ -1,11 +1,12 @@ -import os +"""io utils""" + +from typing import Any import json import urllib.request - from schematic import LOADER -def load_json(file_path): +def load_json(file_path: str) -> Any: """Load json document from file path or url :arg str file_path: The path of the url doc, could be url or file path @@ -16,28 +17,26 @@ def load_json(file_path): return data # handle file path else: - with open(file_path, encoding="utf8") as f: - data = json.load(f) + with open(file_path, encoding="utf8") as fle: + data = json.load(fle) return data -def export_json(json_doc, file_path): +def export_json(json_doc: Any, file_path: str) -> None: """Export JSON doc to file""" - with open(file_path, "w", encoding="utf8") as f: - json.dump(json_doc, f, sort_keys=True, indent=4, ensure_ascii=False) + with open(file_path, "w", encoding="utf8") as fle: + json.dump(json_doc, fle, sort_keys=True, indent=4, ensure_ascii=False) -def load_default(): +def load_default() -> Any: """Load biolink vocabulary""" data_path = "data_models/biothings.model.jsonld" biothings_path = LOADER.filename(data_path) - return load_json(biothings_path) -def load_schemaorg(): - """Load SchemOrg vocabulary""" +def load_schemaorg() -> Any: + """Load SchemaOrg vocabulary""" data_path = "data_models/schema_org.model.jsonld" - schemaorg_path = LOADER.filename(data_path) - - return load_json(schemaorg_path) + schema_org_path = LOADER.filename(data_path) + return load_json(schema_org_path) diff --git a/schematic/utils/validate_rules_utils.py b/schematic/utils/validate_rules_utils.py index f1588ed2e..f2a333ecc 100644 --- a/schematic/utils/validate_rules_utils.py +++ b/schematic/utils/validate_rules_utils.py @@ -1,14 +1,14 @@ -from ast import arg -from jsonschema import ValidationError +"""validate rules utils""" + +from typing import Any import logging -import pandas as pd -from typing import Any, Dict, Optional, Text, List +from jsonschema import ValidationError logger = logging.getLogger(__name__) -def validation_rule_info(): +def validation_rule_info() -> dict[str, dict[str, Any]]: """ Function to return dict that holds information about each rule Will be pulled into validate_single_rule, validate_manifest_rules, validate_schema_rules @@ -127,7 +127,7 @@ def get_error( attribute_name: str, error_type: str, input_filetype: str, -) -> List[str]: +) -> list[str]: """ Generate error message for errors when trying to specify multiple validation rules. 
@@ -137,32 +137,35 @@ def get_error( if error_type == "delimiter": error_str = ( f"The {input_filetype}, has an error in the validation rule " - f"for the attribute: {attribute_name}, the provided validation rules ({validation_rules}) are improperly " - f"specified. Please check your delimiter is '::'" + f"for the attribute: {attribute_name}, the provided validation rules " + f"({validation_rules}) are improperly " + "specified. Please check your delimiter is '::'" ) logging.error(error_str) error_message = error_str - error_val = f"Multiple Rules: Delimiter" + error_val = "Multiple Rules: Delimiter" if error_type == "not_rule": error_str = ( f"The {input_filetype}, has an error in the validation rule " - f"for the attribute: {attribute_name}, the provided validation rules ({validation_rules}) is not " - f"a valid rule. Please check spelling." + f"for the attribute: {attribute_name}, the provided validation rules " + f"({validation_rules}) is not " + "a valid rule. Please check spelling." ) logging.error(error_str) error_message = error_str - error_val = f"Not a Rule" + error_val = "Not a Rule" if error_type == "args_not_allowed": error_str = ( f"The {input_filetype}, has an error in the validation rule " - f"for the attribute: {attribute_name}, the provided validation rules ({validation_rules}) is not" - f"formatted properly. No additional arguments are allowed for this rule." + f"for the attribute: {attribute_name}, the provided validation rules " + f"({validation_rules}) is not" + "formatted properly. No additional arguments are allowed for this rule." ) logging.error(error_str) error_message = error_str - error_val = f"Args not allowed." + error_val = "Args not allowed." if error_type == "incorrect_num_args": rule_type = validation_rules.split(" ")[0] @@ -173,19 +176,22 @@ def get_error( error_str = ( f"The {input_filetype}, has an error in the validation rule " - f"for the attribute: {attribute_name}, the provided validation rules ({validation_rules}) is not " - f"formatted properly. The number of provided arguments does not match the number allowed({no_allowed}) or required({no_required})." + f"for the attribute: {attribute_name}, the provided validation rules " + f"({validation_rules}) is not " + "formatted properly. The number of provided arguments does not match the " + f"number allowed({no_allowed}) or required({no_required})." ) logging.error(error_str) error_message = error_str - error_val = f"Incorrect num arguments." + error_val = "Incorrect num arguments." return ["NA", error_col, error_message, error_val] def validate_single_rule(validation_rule, attribute, input_filetype): """ - Perform validation for a single rule to ensure it is specified correctly with an appropriate number of arguments + Perform validation for a single rule to ensure it is specified + correctly with an appropriate number of arguments Inputs: validation_rule: single rule being validated attribute: attribute validation rule was specified for @@ -213,7 +219,7 @@ def validate_single_rule(validation_rule, attribute, input_filetype): ) ) # Check that the rule is actually a valid rule type. 
- elif rule_type not in validation_types.keys(): + elif rule_type not in validation_types: errors.append( get_error( validation_rule, @@ -263,7 +269,7 @@ def validate_single_rule(validation_rule, attribute, input_filetype): return errors -def validate_schema_rules(validation_rules, attribute, input_filetype): +def validate_schema_rules(validation_rules, attribute, input_filetype: str) -> None: """ validation_rules: list input_filetype: str, used in error generation to aid user in @@ -285,5 +291,3 @@ def validate_schema_rules(validation_rules, attribute, input_filetype): f"for attribute {attribute}. " f"Validation failed with the following errors: {errors}" ) - - return diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index ee64728a4..1e98b79c9 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -1,12 +1,15 @@ -import os -import pandas as pd +"""Validation utils""" + +import re +from typing import Pattern, Union, Iterable +from numbers import Number from jsonschema import validate -from re import compile, search, IGNORECASE +import numpy as np +import pandas as pd from schematic.utils.io_utils import load_json from schematic import LOADER -from typing import List -import numpy as np -from numbers import Number + +# pylint: disable = anomalous-backslash-in-string def validate_schema(schema): @@ -33,32 +36,38 @@ def validate_class_schema(schema): return validate(schema, json_schema) -def comma_separated_list_regex(): - # Regex to match with comma separated list - # Requires at least one element and a comma to be valid - # Does not require a trailing comma - csv_list_regex = compile("([^\,]+\,)(([^\,]+\,?)*)") +def comma_separated_list_regex() -> Pattern[str]: + """ + Regex to match with comma separated list + Requires at least one element and a comma to be valid + Does not require a trailing comma - return csv_list_regex + Returns: + Pattern[str]: + """ + csv_list_regex = re.compile("([^\,]+\,)(([^\,]+\,?)*)") + return csv_list_regex -def rule_in_rule_list(rule: str, rule_list: List[str]): - # Function to standardize - # checking to see if a rule is contained in a list of rules. - # Uses regex to avoid issues arising from validation rules with arguments - # or rules that have arguments updated. - # seperate rule type if arguments are specified +def rule_in_rule_list(rule: str, rule_list: list[str]) -> re.Match: + """ + Function to standardize + checking to see if a rule is contained in a list of rules. + Uses regex to avoid issues arising from validation rules with arguments + or rules that have arguments updated. 
+ """ + # separate rule type if arguments are specified rule_type = rule.split(" ")[0] # Process string and list of strings for regex comparison rule_type = rule_type + "[^\|]*" rule_list = "|".join(rule_list) - return search(rule_type, rule_list, flags=IGNORECASE) + return re.search(rule_type, rule_list, flags=re.IGNORECASE) -def parse_str_series_to_list(col: pd.Series): +def parse_str_series_to_list(col: pd.Series) -> pd.Series: """ Parse a pandas series of comma delimited strings into a series with values that are lists of strings @@ -72,29 +81,25 @@ def parse_str_series_to_list(col: pd.Series): return col -def np_array_to_str_list(np_array): +def np_array_to_str_list(np_array: np.array) -> list[str]: """ Parse a numpy array of ints to a list of strings """ return np.char.mod("%d", np_array).tolist() -def iterable_to_str_list(iterable): +def iterable_to_str_list(obj: Union[str, Number, Iterable]) -> list[str]: """ Parse an object into a list of strings Accepts str, Number, and iterable inputs """ # If object is a string, just return wrapped as a list - if isinstance(iterable, str): - return [iterable] - # If object is numberical, convert to string and wrap as a list - elif isinstance(iterable, Number): - return [str(iterable)] - # If the object is iterable and not a string, convert every element to string and wratp as a list - else: - strlist = [] - for element in iterable: - strlist.append(str(element)) - - return strlist + if isinstance(obj, str): + return [obj] + # If object is numerical, convert to string and wrap as a list + if isinstance(obj, Number): + return [str(obj)] + # If the object is iterable and not a string, convert every element + # to string and wrap as a list + return [str(item) for item in obj] diff --git a/schematic/utils/viz_utils.py b/schematic/utils/viz_utils.py index b62c9be47..262ee2f90 100644 --- a/schematic/utils/viz_utils.py +++ b/schematic/utils/viz_utils.py @@ -1,12 +1,24 @@ +"""viz utils""" + +from typing import Optional import graphviz -def visualize(edges, size=None): +def visualize(edges, size: Optional[float] = None) -> graphviz.Digraph: + """_summary_ + + Args: + edges (_type_): _description_ + size (Optional[float], optional): _description_. Defaults to None. 
+ + Returns: + graphviz.Digraph: _description_ + """ if size: - d = graphviz.Digraph(graph_attr=[("size", size)]) + digraph = graphviz.Digraph(graph_attr=[("size", size)]) else: - d = graphviz.Digraph() + digraph = graphviz.Digraph() for _item in edges: - d.edge(_item[0], _item[1]) - return d + digraph.edge(_item[0], _item[1]) + return digraph From 33227daa2fab98388d5b1d008ac43a549f3cb2c5 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:04:41 -0800 Subject: [PATCH 02/17] pylint utils check in github workflow --- .github/workflows/test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b2adf95f8..534feaac4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -117,6 +117,10 @@ jobs: # ran only on certain files for now # add here when checked poetry run pylint schematic/configuration/*.py schematic/exceptions.py schematic/help.py schematic/loader.py schematic/version.py + # do all utils but schema_utils.py + poetry run pylint schematic/utils/cli_utils.py schematic/utils/curie_utils.py schematic/utils/df_utils.py + poetry run pylint schematic/utils/general.py schematic/utils/google_api_utils.py schematic/utils/io_utils.py + poetry run pylint schematic/utils/validate_rules_utils.py schematic/utils/validate_utils.py schematic/utils/viz_utils.py #---------------------------------------------- # run test suite From 88b7bcf6b015fe10d6f699f8dd48aef47b2ec5c8 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:12:21 -0800 Subject: [PATCH 03/17] fix misnamed argument --- schematic/manifest/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index 79c8ed4af..31a020e08 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -1503,7 +1503,7 @@ def export_sheet_to_excel( export_manifest_drive_service( manifest_url, file_path=output_excel_file_path, - mime_Type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ) return output_excel_file_path From a8cc2ca1e25d66919a703072fe406421eb394da0 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:12:42 -0800 Subject: [PATCH 04/17] ran black --- schematic/utils/general.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 5af2bf6a2..e19f5047d 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -136,7 +136,9 @@ def check_synapse_cache_size(directory="/root/.synapseCache") -> Union[float, in # Note: this command might fail on windows user. # But since this command is primarily for running on AWS, it is fine. 
command = ["du", "-sh", directory] - output = subprocess.run(command, capture_output=True, check=False).stdout.decode("utf-8") + output = subprocess.run(command, capture_output=True, check=False).stdout.decode( + "utf-8" + ) # Parsing the output to extract the directory size size = output.split("\t")[0] From 48f98a29651b426aa69df3877076062ad0bb9c1e Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:19:45 -0800 Subject: [PATCH 05/17] fix sonar lint issue --- schematic/utils/general.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/schematic/utils/general.py b/schematic/utils/general.py index e19f5047d..7c32b6386 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -209,16 +209,17 @@ def entity_type_mapping(syn: Synapse, entity_id: str) -> str: ) from exc if isinstance(entity, EntityViewSchema): - return "asset view" - if isinstance(entity, Folder): - return "folder" - if isinstance(entity, File): - return "file" - if isinstance(entity, Project): - return "project" - - # if there's no matching type, return concreteType - return entity.concreteType + entity_type = "asset view" + elif isinstance(entity, Folder): + entity_type = "folder" + elif isinstance(entity, File): + entity_type = "file" + elif isinstance(entity, Project): + entity_type = "project" + else: + # if there's no matching type, return concreteType + entity_type = entity.concreteType + return entity_type def create_temp_folder(path: str) -> str: From 44b7b135a4ffaa8d83100baf03f43fe6919f107e Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:19:59 -0800 Subject: [PATCH 06/17] fix sonar lint issue --- schematic/utils/general.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 7c32b6386..c1d8d7058 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -235,7 +235,7 @@ def create_temp_folder(path: str) -> str: def profile( output_file: Optional[str] = None, - sort_by="cumulative", + sort_by:Any="cumulative", lines_to_print: Optional[int] = None, strip_dirs: bool = False, ) -> Callable: From 44f03fc2a9521bc24d78f1a0141ecf28ebd0f182 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:22:42 -0800 Subject: [PATCH 07/17] fix pylint/black issue --- schematic/utils/general.py | 2 +- schematic/utils/google_api_utils.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/schematic/utils/general.py b/schematic/utils/general.py index c1d8d7058..3dcbd029f 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -235,7 +235,7 @@ def create_temp_folder(path: str) -> str: def profile( output_file: Optional[str] = None, - sort_by:Any="cumulative", + sort_by: Any = "cumulative", lines_to_print: Optional[int] = None, strip_dirs: bool = False, ) -> Callable: diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 05d8e095d..01ff765ba 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -161,9 +161,9 @@ def export_manifest_drive_service( # use google drive data = ( - drive_service.files() + drive_service.files() # pylint: disable=no-member .export(fileId=spreadsheet_id, mimeType=mime_type) - .execute() # pylint: disable=no-member + .execute() ) # open file and write data @@ -205,6 +205,9 @@ def export_manifest_excel( output_excel (Optional[str], optional): name of the exported manifest sheet. Defaults to None. 
""" + # pylint: disable=abstract-class-instantiated + # pylint: disable=no-member + # initialize drive service services_creds = build_service_account_creds() sheet_service = services_creds["sheet_service"] @@ -217,21 +220,17 @@ def export_manifest_excel( # use google sheet api sheet_metadata = ( - sheet_service.spreadsheets() # pylint: disable=no-member - .get(spreadsheetId=spreadsheet_id) - .execute() + sheet_service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute() ) sheets = sheet_metadata.get("sheets") # export to Excel - writer = pd.ExcelWriter( - output_excel - ) # pylint: disable=abstract-class-instantiated + writer = pd.ExcelWriter(output_excel) # export each sheet in manifest for sheet in sheets: dataset = ( - sheet_service.spreadsheets() # pylint: disable=no-member + sheet_service.spreadsheets() .values() .get(spreadsheetId=spreadsheet_id, range=sheet["properties"]["title"]) .execute() From 1426a6fa9411bc2991cf5d97e4f494ccd329e733 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 12:36:38 -0800 Subject: [PATCH 08/17] sonar cloud fixes --- schematic/utils/validate_rules_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schematic/utils/validate_rules_utils.py b/schematic/utils/validate_rules_utils.py index f2a333ecc..dfb746a29 100644 --- a/schematic/utils/validate_rules_utils.py +++ b/schematic/utils/validate_rules_utils.py @@ -188,7 +188,7 @@ def get_error( return ["NA", error_col, error_message, error_val] -def validate_single_rule(validation_rule, attribute, input_filetype): +def validate_single_rule(validation_rule:str, attribute:str, input_filetype:str): """ Perform validation for a single rule to ensure it is specified correctly with an appropriate number of arguments @@ -269,7 +269,7 @@ def validate_single_rule(validation_rule, attribute, input_filetype): return errors -def validate_schema_rules(validation_rules, attribute, input_filetype: str) -> None: +def validate_schema_rules(validation_rules:list[str], attribute:str, input_filetype: str) -> None: """ validation_rules: list input_filetype: str, used in error generation to aid user in From 35b7c6cc8bb1c256fa3f0c762fce79d65ca4bf57 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 13:20:18 -0800 Subject: [PATCH 09/17] fixed some typing --- schematic/utils/cli_utils.py | 5 ++--- schematic/utils/df_utils.py | 26 ++++++++++++++++++++----- schematic/utils/general.py | 12 ++++++------ schematic/utils/google_api_utils.py | 5 ++--- schematic/utils/validate_rules_utils.py | 6 +++--- schematic/utils/validate_utils.py | 11 +++++------ 6 files changed, 39 insertions(+), 26 deletions(-) diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index ce701834e..03017f5af 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -54,7 +54,7 @@ def parse_synIDs( # pylint: disable=invalid-name synIDs: str, # pylint: disable=invalid-name ) -> Optional[list[str]]: """For backwards compatibility""" - parse_syn_ids(ctx, param, synIDs) + return parse_syn_ids(ctx, param, synIDs) def parse_syn_ids( @@ -88,8 +88,7 @@ def parse_syn_ids( "\nPlease check your list of projects for errors." 
) - syn_ids = syn_ids.split(",") - return syn_ids + return syn_ids.split(",") def parse_comma_str_to_list( diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 2fbd8cc4d..76f403382 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -3,10 +3,12 @@ import logging from copy import deepcopy from time import perf_counter +from typing import Union, Any +from datetime import datetime import dateparser as dp import pandas as pd import numpy as np -from pandarallel import pandarallel +from pandarallel import pandarallel #type: ignore # pylint: disable=logging-fstring-interpolation @@ -39,7 +41,13 @@ def load_df( t_load_df = perf_counter() # Read CSV to df as type specified in kwargs - org_df = pd.read_csv(file_path, keep_default_na=True, encoding="utf8", **load_args) + org_df = pd.read_csv( #type: ignore + file_path, + keep_default_na=True, + encoding="utf8", + **load_args + ) + assert isinstance(org_df, pd.DataFrame) # If type inference not allowed: trim and return if preserve_raw_input: @@ -66,13 +74,13 @@ def load_df( if ( org_df.size < large_manifest_cutoff_size ): # If small manifest, iterate as normal for improved performance - ints = org_df.applymap( + ints: pd.DataFrame = org_df.applymap( lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" ).fillna(False) else: # parallelize iterations for large manfiests pandarallel.initialize(verbose=1) - ints = org_df.parallel_applymap( + ints: pd.DataFrame = org_df.parallel_applymap( #type: ignore lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" ).fillna(False) @@ -96,7 +104,15 @@ def load_df( return processed_df -def _parse_dates(date_string): +def parse_dates(date_string: str) -> Union[datetime, bool]: + """Gets a datetime from a string + + Args: + date_string (str): The string to get the datetime from + + Returns: + Union[datetime, bool]: The parsed datetime or False + """ try: date = dp.parse(date_string=date_string, settings={"STRICT_PARSING": True}) return date if date else False diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 3dcbd029f..5ce4bd1c7 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -10,11 +10,11 @@ from functools import wraps from typing import Union, TypeVar, Any, Optional, Sequence, Callable -from synapseclient.core.exceptions import SynapseHTTPError -from synapseclient.entity import File, Folder, Project -from synapseclient.table import EntityViewSchema -from synapseclient.core import cache -from synapseclient import Synapse +from synapseclient.core.exceptions import SynapseHTTPError #type: ignore +from synapseclient.entity import File, Folder, Project #type: ignore +from synapseclient.table import EntityViewSchema #type: ignore +from synapseclient.core import cache #type: ignore +from synapseclient import Synapse #type: ignore # pylint: disable=logging-fstring-interpolation @@ -124,7 +124,7 @@ def calculate_datetime( return date_time_result -def check_synapse_cache_size(directory="/root/.synapseCache") -> Union[float, int]: +def check_synapse_cache_size(directory:str="/root/.synapseCache") -> Union[float, int]: """use du --sh command to calculate size of .synapseCache. 
Args: diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 01ff765ba..948fb286f 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -6,9 +6,8 @@ from typing import Any, Union, Optional import pandas as pd -from googleapiclient.discovery import build -from google.oauth2 import service_account - +from googleapiclient.discovery import build # type :ignore +from google.oauth2 import service_account # type :ignoreS from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage diff --git a/schematic/utils/validate_rules_utils.py b/schematic/utils/validate_rules_utils.py index dfb746a29..1b0d3c55c 100644 --- a/schematic/utils/validate_rules_utils.py +++ b/schematic/utils/validate_rules_utils.py @@ -1,6 +1,6 @@ """validate rules utils""" -from typing import Any +from typing import Union import logging from jsonschema import ValidationError @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) -def validation_rule_info() -> dict[str, dict[str, Any]]: +def validation_rule_info() -> dict[str, dict[str, Union[tuple[int, int], str, list[str], None]]]: """ Function to return dict that holds information about each rule Will be pulled into validate_single_rule, validate_manifest_rules, validate_schema_rules @@ -123,7 +123,7 @@ def validation_rule_info() -> dict[str, dict[str, Any]]: def get_error( - validation_rules: list, + validation_rules: str, attribute_name: str, error_type: str, input_filetype: str, diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index 1e98b79c9..a7e7be552 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -1,7 +1,7 @@ """Validation utils""" import re -from typing import Pattern, Union, Iterable +from typing import Pattern, Union, Iterable, Any, Optional from numbers import Number from jsonschema import validate import numpy as np @@ -50,7 +50,7 @@ def comma_separated_list_regex() -> Pattern[str]: return csv_list_regex -def rule_in_rule_list(rule: str, rule_list: list[str]) -> re.Match: +def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]]: """ Function to standardize checking to see if a rule is contained in a list of rules. 
@@ -62,9 +62,8 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> re.Match: # Process string and list of strings for regex comparison rule_type = rule_type + "[^\|]*" - rule_list = "|".join(rule_list) - - return re.search(rule_type, rule_list, flags=re.IGNORECASE) + rule_list_str = "|".join(rule_list) + return re.search(rule_type, rule_list_str, flags=re.IGNORECASE) def parse_str_series_to_list(col: pd.Series) -> pd.Series: @@ -81,7 +80,7 @@ def parse_str_series_to_list(col: pd.Series) -> pd.Series: return col -def np_array_to_str_list(np_array: np.array) -> list[str]: +def np_array_to_str_list(np_array: Any) -> list[str]: """ Parse a numpy array of ints to a list of strings """ From 5ab2d9fb80edd0e7029e438f1874b04d1b291c0b Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 13:24:27 -0800 Subject: [PATCH 10/17] ran black --- schematic/utils/df_utils.py | 13 +++++-------- schematic/utils/general.py | 14 ++++++++------ schematic/utils/google_api_utils.py | 4 ++-- schematic/utils/validate_rules_utils.py | 10 +++++++--- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 76f403382..29e70a6ed 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -3,12 +3,12 @@ import logging from copy import deepcopy from time import perf_counter -from typing import Union, Any +from typing import Union from datetime import datetime import dateparser as dp import pandas as pd import numpy as np -from pandarallel import pandarallel #type: ignore +from pandarallel import pandarallel # type: ignore # pylint: disable=logging-fstring-interpolation @@ -41,11 +41,8 @@ def load_df( t_load_df = perf_counter() # Read CSV to df as type specified in kwargs - org_df = pd.read_csv( #type: ignore - file_path, - keep_default_na=True, - encoding="utf8", - **load_args + org_df = pd.read_csv( # type: ignore + file_path, keep_default_na=True, encoding="utf8", **load_args ) assert isinstance(org_df, pd.DataFrame) @@ -80,7 +77,7 @@ def load_df( else: # parallelize iterations for large manfiests pandarallel.initialize(verbose=1) - ints: pd.DataFrame = org_df.parallel_applymap( #type: ignore + ints: pd.DataFrame = org_df.parallel_applymap( # type: ignore lambda x: np.int64(x) if str.isdigit(x) else False, na_action="ignore" ).fillna(False) diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 5ce4bd1c7..92068e109 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -10,11 +10,11 @@ from functools import wraps from typing import Union, TypeVar, Any, Optional, Sequence, Callable -from synapseclient.core.exceptions import SynapseHTTPError #type: ignore -from synapseclient.entity import File, Folder, Project #type: ignore -from synapseclient.table import EntityViewSchema #type: ignore -from synapseclient.core import cache #type: ignore -from synapseclient import Synapse #type: ignore +from synapseclient.core.exceptions import SynapseHTTPError # type: ignore +from synapseclient.entity import File, Folder, Project # type: ignore +from synapseclient.table import EntityViewSchema # type: ignore +from synapseclient.core import cache # type: ignore +from synapseclient import Synapse # type: ignore # pylint: disable=logging-fstring-interpolation @@ -124,7 +124,9 @@ def calculate_datetime( return date_time_result -def check_synapse_cache_size(directory:str="/root/.synapseCache") -> Union[float, int]: +def check_synapse_cache_size( + directory: str = "/root/.synapseCache", +) -> 
Union[float, int]: """use du --sh command to calculate size of .synapseCache. Args: diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 948fb286f..618a7227a 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -6,8 +6,8 @@ from typing import Any, Union, Optional import pandas as pd -from googleapiclient.discovery import build # type :ignore -from google.oauth2 import service_account # type :ignoreS +from googleapiclient.discovery import build # type :ignore +from google.oauth2 import service_account # type :ignoreS from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage diff --git a/schematic/utils/validate_rules_utils.py b/schematic/utils/validate_rules_utils.py index 1b0d3c55c..9a245d470 100644 --- a/schematic/utils/validate_rules_utils.py +++ b/schematic/utils/validate_rules_utils.py @@ -8,7 +8,9 @@ logger = logging.getLogger(__name__) -def validation_rule_info() -> dict[str, dict[str, Union[tuple[int, int], str, list[str], None]]]: +def validation_rule_info() -> ( + dict[str, dict[str, Union[tuple[int, int], str, list[str], None]]] +): """ Function to return dict that holds information about each rule Will be pulled into validate_single_rule, validate_manifest_rules, validate_schema_rules @@ -188,7 +190,7 @@ def get_error( return ["NA", error_col, error_message, error_val] -def validate_single_rule(validation_rule:str, attribute:str, input_filetype:str): +def validate_single_rule(validation_rule: str, attribute: str, input_filetype: str): """ Perform validation for a single rule to ensure it is specified correctly with an appropriate number of arguments @@ -269,7 +271,9 @@ def validate_single_rule(validation_rule:str, attribute:str, input_filetype:str) return errors -def validate_schema_rules(validation_rules:list[str], attribute:str, input_filetype: str) -> None: +def validate_schema_rules( + validation_rules: list[str], attribute: str, input_filetype: str +) -> None: """ validation_rules: list input_filetype: str, used in error generation to aid user in From 318c538a901009aac044ea9fd00f3c8d2cf14116 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 15:12:41 -0800 Subject: [PATCH 11/17] added some typing fixes --- schematic/utils/cli_utils.py | 2 +- schematic/utils/curie_utils.py | 4 ++-- schematic/utils/google_api_utils.py | 4 ++-- schematic/utils/validate_rules_utils.py | 22 ++++++++++++++-------- schematic/utils/validate_utils.py | 7 ++++--- schematic/utils/viz_utils.py | 2 +- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index 03017f5af..58180796e 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -36,7 +36,7 @@ def extract(dictionary: Any, key: Any) -> Union[Any, None]: return reduce(extract, keys, dictionary) -def log_value_from_config(arg_name: str, config_value: Any): +def log_value_from_config(arg_name: str, config_value: Any) -> None: """Logs when getting a value from the config Args: diff --git a/schematic/utils/curie_utils.py b/schematic/utils/curie_utils.py index fd24fd297..9345f2b87 100644 --- a/schematic/utils/curie_utils.py +++ b/schematic/utils/curie_utils.py @@ -6,7 +6,7 @@ logger = logging.getLogger(__name__) -def extract_name_from_uri_or_curie(item): +def extract_name_from_uri_or_curie(item: str) -> str: """Extract name from uri or curie""" if "http" not in item and len(item.split(":")) == 2: return 
item.split(":")[-1] @@ -15,7 +15,7 @@ def extract_name_from_uri_or_curie(item): raise ValueError("Error extracting name from URI or Curie.") -def expand_curie_to_uri(curie, context_info): +def expand_curie_to_uri(curie: str, context_info: dict[str, str]) -> str: """Expand curie to uri based on the context given parmas diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 618a7227a..3ac80ac05 100644 --- a/schematic/utils/google_api_utils.py +++ b/schematic/utils/google_api_utils.py @@ -6,8 +6,8 @@ from typing import Any, Union, Optional import pandas as pd -from googleapiclient.discovery import build # type :ignore -from google.oauth2 import service_account # type :ignoreS +from googleapiclient.discovery import build # type: ignore +from google.oauth2 import service_account # type: ignore from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage diff --git a/schematic/utils/validate_rules_utils.py b/schematic/utils/validate_rules_utils.py index 9a245d470..cce26a7b4 100644 --- a/schematic/utils/validate_rules_utils.py +++ b/schematic/utils/validate_rules_utils.py @@ -21,7 +21,7 @@ def validation_rule_info() -> ( 'complementary_rules': []} } """ - rule_dict = { + return { "int": { "arguments": (1, 0), "type": "type_validation", @@ -121,8 +121,6 @@ def validation_rule_info() -> ( }, } - return rule_dict - def get_error( validation_rules: str, @@ -172,16 +170,20 @@ def get_error( rule_type = validation_rules.split(" ")[0] if rule_type in validation_rule_info(): - no_allowed, no_required = validation_rule_info()[rule_type]["arguments"] + arg_tuple = validation_rule_info()[rule_type]["arguments"] + assert isinstance(arg_tuple, tuple) + assert len(arg_tuple) == 2 + number_allowed = str(arg_tuple[0]) + number_required = str(arg_tuple[1]) else: - no_allowed, no_required = ("", "") + number_allowed, number_required = ("", "") error_str = ( f"The {input_filetype}, has an error in the validation rule " f"for the attribute: {attribute_name}, the provided validation rules " f"({validation_rules}) is not " "formatted properly. The number of provided arguments does not match the " - f"number allowed({no_allowed}) or required({no_required})." + f"number allowed({number_allowed}) or required({number_required})." ) logging.error(error_str) error_message = error_str @@ -232,10 +234,14 @@ def validate_single_rule(validation_rule: str, attribute: str, input_filetype: s ) # if the rule is indeed a rule and formatted correctly, check that arguments are appropriate else: - arguments_allowed, arguments_required = validation_types[rule_type]["arguments"] + arg_tuple = validation_rule_info()[rule_type]["arguments"] + assert isinstance(arg_tuple, tuple) + assert len(arg_tuple) == 2 + arguments_allowed, arguments_required = arg_tuple # Remove any fixed args from our calc. 
- if "fixed_arg" in validation_types[rule_type].keys(): + if "fixed_arg" in validation_types[rule_type]: fixed_args = validation_types[rule_type]["fixed_arg"] + assert isinstance(fixed_args, list) num_args = ( len([vr for vr in validation_rule_with_args if vr not in fixed_args]) - 1 diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index a7e7be552..de7c4e66b 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -1,6 +1,7 @@ """Validation utils""" import re +from collections.abc import Mapping from typing import Pattern, Union, Iterable, Any, Optional from numbers import Number from jsonschema import validate @@ -12,7 +13,7 @@ # pylint: disable = anomalous-backslash-in-string -def validate_schema(schema): +def validate_schema(schema: Union[Mapping, bool]) -> None: """Validate schema against schema.org standard""" data_path = "validation_schemas/model.schema.json" json_schema_path = LOADER.filename(data_path) @@ -20,7 +21,7 @@ def validate_schema(schema): return validate(schema, json_schema) -def validate_property_schema(schema): +def validate_property_schema(schema: Union[Mapping, bool]) -> None: """Validate schema against SchemaORG property definition standard""" data_path = "validation_schemas/property.schema.json" json_schema_path = LOADER.filename(data_path) @@ -28,7 +29,7 @@ def validate_property_schema(schema): return validate(schema, json_schema) -def validate_class_schema(schema): +def validate_class_schema(schema: Union[Mapping, bool]) -> None: """Validate schema against SchemaORG class definition standard""" data_path = "validation_schemas/class.schema.json" json_schema_path = LOADER.filename(data_path) diff --git a/schematic/utils/viz_utils.py b/schematic/utils/viz_utils.py index 262ee2f90..58a904728 100644 --- a/schematic/utils/viz_utils.py +++ b/schematic/utils/viz_utils.py @@ -1,7 +1,7 @@ """viz utils""" from typing import Optional -import graphviz +import graphviz # type: ignore def visualize(edges, size: Optional[float] = None) -> graphviz.Digraph: From 8169f6a6df9857f88f2eeba22d42897e0712560e Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Sun, 4 Feb 2024 15:40:33 -0800 Subject: [PATCH 12/17] remove get_synIDs function --- schematic/manifest/commands.py | 4 ++-- schematic/models/commands.py | 4 ++-- schematic/utils/cli_utils.py | 9 --------- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/schematic/manifest/commands.py b/schematic/manifest/commands.py index 002ada68c..d6b20d3cf 100644 --- a/schematic/manifest/commands.py +++ b/schematic/manifest/commands.py @@ -10,7 +10,7 @@ from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.manifest.generator import ManifestGenerator -from schematic.utils.cli_utils import log_value_from_config, query_dict, parse_synIDs +from schematic.utils.cli_utils import log_value_from_config, query_dict, parse_syn_ids from schematic.utils.google_api_utils import export_manifest_csv from schematic.help import manifest_commands @@ -253,7 +253,7 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None): "-ps", "--project_scope", default=None, - callback=parse_synIDs, + callback=parse_syn_ids, help=query_dict(manifest_commands, ("manifest", "migrate", "project_scope")), ) @click.option( diff --git a/schematic/models/commands.py b/schematic/models/commands.py index 0c1e6e8a3..ac6f4946a 100644 --- a/schematic/models/commands.py +++ b/schematic/models/commands.py @@ -14,7 +14,7 @@ from 
schematic.utils.cli_utils import ( log_value_from_config, query_dict, - parse_synIDs, + parse_syn_ids, parse_comma_str_to_list, ) from schematic.help import model_commands @@ -98,7 +98,7 @@ def model(ctx, config): # use as `schematic model ...` "-ps", "--project_scope", default=None, - callback=parse_synIDs, + callback=parse_syn_ids, help=query_dict(model_commands, ("model", "validate", "project_scope")), ) @click.option( diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index 58180796e..342053600 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -48,15 +48,6 @@ def log_value_from_config(arg_name: str, config_value: Any) -> None: ) -def parse_synIDs( # pylint: disable=invalid-name - ctx: Any, # pylint: disable=unused-argument - param: str, # pylint: disable=unused-argument - synIDs: str, # pylint: disable=invalid-name -) -> Optional[list[str]]: - """For backwards compatibility""" - return parse_syn_ids(ctx, param, synIDs) - - def parse_syn_ids( ctx: Any, # pylint: disable=unused-argument param: str, # pylint: disable=unused-argument From c2151299e8f0a33a5c7a5289a9ce1c5f1add431a Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Mon, 5 Feb 2024 09:12:36 -0800 Subject: [PATCH 13/17] removed unsused functions, moved around linting disables --- schematic/utils/cli_utils.py | 7 +- schematic/utils/df_utils.py | 4 +- schematic/utils/general.py | 4 +- schematic/utils/google_api_utils.py | 80 +------------------ schematic/utils/validate_utils.py | 4 +- tests/data/mock_manifests/test_BulkRNAseq.csv | 6 +- 6 files changed, 16 insertions(+), 89 deletions(-) diff --git a/schematic/utils/cli_utils.py b/schematic/utils/cli_utils.py index 342053600..52debd51c 100644 --- a/schematic/utils/cli_utils.py +++ b/schematic/utils/cli_utils.py @@ -1,5 +1,8 @@ """CLI utils""" +# pylint: disable=logging-fstring-interpolation +# pylint: disable=anomalous-backslash-in-string + import logging from typing import Any, Mapping, Sequence, Union, Optional @@ -8,10 +11,6 @@ logger = logging.getLogger(__name__) -# We are using fstrings in logger methods -# pylint: disable=logging-fstring-interpolation -# pylint: disable = anomalous-backslash-in-string - def query_dict(dictionary: Mapping[Any, Any], keys: Sequence[Any]) -> Union[Any, None]: """Access a nested value in a dictionary corresponding diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 29e70a6ed..a95c280ea 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -1,5 +1,7 @@ """df utils""" +# pylint: disable=logging-fstring-interpolation + import logging from copy import deepcopy from time import perf_counter @@ -10,8 +12,6 @@ import numpy as np from pandarallel import pandarallel # type: ignore -# pylint: disable=logging-fstring-interpolation - logger = logging.getLogger(__name__) diff --git a/schematic/utils/general.py b/schematic/utils/general.py index 92068e109..246d0bc9e 100644 --- a/schematic/utils/general.py +++ b/schematic/utils/general.py @@ -1,5 +1,7 @@ """General utils""" +# pylint: disable=logging-fstring-interpolation + import logging import os import pstats @@ -16,8 +18,6 @@ from synapseclient.core import cache # type: ignore from synapseclient import Synapse # type: ignore -# pylint: disable=logging-fstring-interpolation - logger = logging.getLogger(__name__) T = TypeVar("T") diff --git a/schematic/utils/google_api_utils.py b/schematic/utils/google_api_utils.py index 3ac80ac05..4cc743e6b 100644 --- a/schematic/utils/google_api_utils.py +++ 
b/schematic/utils/google_api_utils.py @@ -1,9 +1,11 @@ """Google API utils""" +# pylint: disable=logging-fstring-interpolation + import os import logging import json -from typing import Any, Union, Optional +from typing import Any, Union import pandas as pd from googleapiclient.discovery import build # type: ignore @@ -11,8 +13,6 @@ from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage -# pylint: disable=logging-fstring-interpolation - logger = logging.getLogger(__name__) @@ -23,23 +23,6 @@ ] -# This function doesn't appear to be used or tested anywhere in schematic. -# TO DO: replace by pygsheets calls? -def build_credentials() -> dict[str, Any]: # pylint: disable=missing-function-docstring - creds = generate_token() # pylint: disable=undefined-variable - - # get a Google Sheet API service - sheet_service = build("sheets", "v4", credentials=creds) - # get a Google Drive API service - drive_service = build("drive", "v3", credentials=creds) - - return { - "sheet_service": sheet_service, - "drive_service": drive_service, - "creds": creds, - } - - def build_service_account_creds() -> dict[str, Any]: """Build Google service account credentials @@ -159,6 +142,7 @@ def export_manifest_drive_service( spreadsheet_id = manifest_url.split("/")[-1] # use google drive + # Pylint seems to have trouble with the google api classes, recognizing their methods data = ( drive_service.files() # pylint: disable=no-member .export(fileId=spreadsheet_id, mimeType=mime_type) @@ -186,59 +170,3 @@ def export_manifest_csv(file_path: str, manifest: Union[pd.DataFrame, str]) -> N manifest.to_csv(file_path, index=False) else: export_manifest_drive_service(manifest, file_path, mime_type="text/csv") - - -# This function doesn't appear to be used or tested -# pd.ExcelWriter is an ABC class which means it SHOULD NOT be instantiated -def export_manifest_excel( - manifest: Union[pd.DataFrame, str], output_excel: Optional[str] = None -) -> None: - """ - Export manifest as an Excel spreadsheet by using google sheet API. - This approach could export hidden sheet - Google sheet gets exported as an excel spreadsheet. - If there's a hidden sheet, the hidden sheet also gets exported. - - Args: - manifest (Union[pd.DataFrame, str]): could be a dataframe or a manifest url - output_excel (Optional[str], optional): name of the exported manifest sheet. - Defaults to None. 
- """ - # pylint: disable=abstract-class-instantiated - # pylint: disable=no-member - - # initialize drive service - services_creds = build_service_account_creds() - sheet_service = services_creds["sheet_service"] - - if isinstance(manifest, pd.DataFrame): - manifest.to_excel(output_excel, index=False) - else: - # get spreadsheet id from url - spreadsheet_id = manifest.split("/")[-1] - - # use google sheet api - sheet_metadata = ( - sheet_service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute() - ) - sheets = sheet_metadata.get("sheets") - - # export to Excel - writer = pd.ExcelWriter(output_excel) - - # export each sheet in manifest - for sheet in sheets: - dataset = ( - sheet_service.spreadsheets() - .values() - .get(spreadsheetId=spreadsheet_id, range=sheet["properties"]["title"]) - .execute() - ) - dataset_df = pd.DataFrame(dataset["values"]) - dataset_df.columns = dataset_df.iloc[0] - dataset_df.drop(dataset_df.index[0], inplace=True) - dataset_df.to_excel( - writer, sheet_name=sheet["properties"]["title"], index=False - ) - writer.save() - writer.close() diff --git a/schematic/utils/validate_utils.py b/schematic/utils/validate_utils.py index de7c4e66b..fed5b422c 100644 --- a/schematic/utils/validate_utils.py +++ b/schematic/utils/validate_utils.py @@ -1,5 +1,7 @@ """Validation utils""" +# pylint: disable = anomalous-backslash-in-string + import re from collections.abc import Mapping from typing import Pattern, Union, Iterable, Any, Optional @@ -10,8 +12,6 @@ from schematic.utils.io_utils import load_json from schematic import LOADER -# pylint: disable = anomalous-backslash-in-string - def validate_schema(schema: Union[Mapping, bool]) -> None: """Validate schema against schema.org standard""" diff --git a/tests/data/mock_manifests/test_BulkRNAseq.csv b/tests/data/mock_manifests/test_BulkRNAseq.csv index facfa3f6a..49e1a38e5 100644 --- a/tests/data/mock_manifests/test_BulkRNAseq.csv +++ b/tests/data/mock_manifests/test_BulkRNAseq.csv @@ -1,3 +1,3 @@ -Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA -TestRNA-seqDataset1/TestRNA-seq-dummy-dataset.rtf,ABCD,BAM,BulkRNA-seqAssay,GRCh38, -TestRNA-seqDataset1/TestRNA-seq-dummy-dataset2.rtf,EFGH,CRAM,BulkRNA-seqAssay,GRCm39, +Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId +TestRNA-seqDataset1/TestRNA-seq-dummy-dataset.rtf,ABCD,BAM,BulkRNA-seqAssay,GRCh38,,dcb30f9c-0810-4159-aead-6aefcec19d36,syn39242580 +TestRNA-seqDataset1/TestRNA-seq-dummy-dataset2.rtf,EFGH,CRAM,BulkRNA-seqAssay,GRCm39,,8ae18fa9-e68e-4c56-b9aa-9a55cb4ccf94,syn51900502 From 84e83e63f1a39ab2405d95ac7b05ff7880824632 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Mon, 5 Feb 2024 09:19:31 -0800 Subject: [PATCH 14/17] fix function call whos name was changed --- schematic/models/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schematic/models/commands.py b/schematic/models/commands.py index ac6f4946a..6e545f95f 100644 --- a/schematic/models/commands.py +++ b/schematic/models/commands.py @@ -204,7 +204,7 @@ def submit_manifest( "-ps", "--project_scope", default=None, - callback=parse_synIDs, + callback=parse_syn_ids, help=query_dict(model_commands, ("model", "validate", "project_scope")), ) @click.pass_obj From 483c9915a3f732b2a0e8ab5eb12a5fffefc0523f Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Mon, 5 Feb 2024 09:44:29 -0800 Subject: [PATCH 15/17] fix mock manifest --- tests/data/mock_manifests/test_BulkRNAseq.csv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tests/data/mock_manifests/test_BulkRNAseq.csv b/tests/data/mock_manifests/test_BulkRNAseq.csv index 49e1a38e5..facfa3f6a 100644 --- a/tests/data/mock_manifests/test_BulkRNAseq.csv +++ b/tests/data/mock_manifests/test_BulkRNAseq.csv @@ -1,3 +1,3 @@ -Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA,Id,entityId -TestRNA-seqDataset1/TestRNA-seq-dummy-dataset.rtf,ABCD,BAM,BulkRNA-seqAssay,GRCh38,,dcb30f9c-0810-4159-aead-6aefcec19d36,syn39242580 -TestRNA-seqDataset1/TestRNA-seq-dummy-dataset2.rtf,EFGH,CRAM,BulkRNA-seqAssay,GRCm39,,8ae18fa9-e68e-4c56-b9aa-9a55cb4ccf94,syn51900502 +Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA +TestRNA-seqDataset1/TestRNA-seq-dummy-dataset.rtf,ABCD,BAM,BulkRNA-seqAssay,GRCh38, +TestRNA-seqDataset1/TestRNA-seq-dummy-dataset2.rtf,EFGH,CRAM,BulkRNA-seqAssay,GRCm39, From 370de428c9c847db722f54c060fdd5e8408bfc78 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Thu, 8 Feb 2024 11:57:30 -0800 Subject: [PATCH 16/17] fix linting issues form merge --- schematic/utils/df_utils.py | 41 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 415fc0bf5..7c578f046 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -10,7 +10,7 @@ import dateparser as dp import pandas as pd import numpy as np -from pandarallel import pandarallel # type: ignore +from pandarallel import pandarallel # type: ignore logger = logging.getLogger(__name__) @@ -67,64 +67,67 @@ def load_df( return processed_df -def find_and_convert_ints(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: +def find_and_convert_ints(dataframe: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ Find strings that represent integers and convert to type int Args: - df: dataframe with nulls masked as empty strings + dataframe: dataframe with nulls masked as empty strings Returns: ints: dataframe with values that were converted to type int is_int: dataframe with boolean values indicating which cells were converted to type int """ + # pylint: disable=unnecessary-lambda large_manifest_cutoff_size = 1000 # Find integers stored as strings and replace with entries of type np.int64 if ( - df.size < large_manifest_cutoff_size + dataframe.size < large_manifest_cutoff_size ): # If small manifest, iterate as normal for improved performance - ints = df.map(lambda x: convert_ints(x), na_action="ignore").fillna(False) + ints = dataframe.map( + lambda cell: convert_ints(cell), na_action="ignore" + ).fillna(False) - else: # parallelize iterations for large manfiests + else: # parallelize iterations for large manifests pandarallel.initialize(verbose=1) - ints = df.parallel_map(lambda x: convert_ints(x), na_action="ignore").fillna( - False - ) + ints = dataframe.parallel_map( + lambda cell: convert_ints(cell), na_action="ignore" + ).fillna(False) - # Identify cells converted to intergers + # Identify cells converted to integers is_int = ints.map(pd.api.types.is_integer) return ints, is_int -def convert_ints(x: str) -> Union[np.int64, bool]: +def convert_ints(string: str) -> Union[np.int64, bool]: """ Lambda function to convert a string to an integer if possible, otherwise returns False Args: - x: string to attempt conversion to int + string: string to attempt conversion to int Returns: - x converted to type int if possible, otherwise False + string converted to type int if possible, otherwise False """ - return np.int64(x) if str.isdigit(x) else False + return 
np.int64(string) if str.isdigit(string) else False -def convert_floats(df: pd.DataFrame) -> pd.DataFrame: +def convert_floats(dataframe: pd.DataFrame) -> pd.DataFrame: """ Convert strings that represent floats to type float Args: - df: dataframe with nulls masked as empty strings + dataframe: dataframe with nulls masked as empty strings Returns: float_df: dataframe with values that were converted to type float. Columns are type object """ # create a separate copy of the manifest # before beginning conversions to store float values - float_df = deepcopy(df) + float_df = deepcopy(dataframe) # convert strings to numerical dtype (float) if possible, preserve non-numerical strings - for col in df.columns: + for col in dataframe.columns: float_df[col] = pd.to_numeric(float_df[col], errors="coerce").astype("object") # replace values that couldn't be converted to float with the original str values - float_df[col].fillna(df[col][float_df[col].isna()], inplace=True) + float_df[col].fillna(dataframe[col][float_df[col].isna()], inplace=True) return float_df From 7b57a04a1557e3b234a177e828e01ce17b5386f1 Mon Sep 17 00:00:00 2001 From: andrewelamb Date: Fri, 9 Feb 2024 09:30:45 -0800 Subject: [PATCH 17/17] add dataframe check --- schematic/utils/df_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py index 7c578f046..222b75713 100644 --- a/schematic/utils/df_utils.py +++ b/schematic/utils/df_utils.py @@ -32,6 +32,9 @@ def load_df( data_model (bool, optional): bool, indicates if importing a data model **load_args(dict): dict of key value pairs to be passed to the pd.read_csv function + Raises: + ValueError: When pd.read_csv on the file path doesn't return as dataframe + Returns: pd.DataFrame: a processed dataframe for manifests or unprocessed df for data models and where indicated @@ -43,7 +46,13 @@ def load_df( org_df = pd.read_csv( # type: ignore file_path, keep_default_na=True, encoding="utf8", **load_args ) - assert isinstance(org_df, pd.DataFrame) + if not isinstance(org_df, pd.DataFrame): + raise ValueError( + ( + "Pandas did not return a dataframe. " + "Pandas will return a TextFileReader if chunksize parameter is used." + ) + ) # only trim if not data model csv if not data_model:
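
Illustrative sketch (editor-added, not part of the patches above): the final hunk's new ValueError in load_df guards against pandas.read_csv returning a TextFileReader rather than a DataFrame, which happens when a chunksize keyword is passed through load_args. A minimal, self-contained reproduction under that assumption (the file name example.csv is only for illustration):

    # Sketch of the behavior the new isinstance check in load_df guards against:
    # read_csv with chunksize yields a TextFileReader, not a DataFrame, so any
    # downstream DataFrame operation in load_df would fail without the check.
    import pandas as pd

    pd.DataFrame({"a": [1, 2]}).to_csv("example.csv", index=False)

    result = pd.read_csv("example.csv", chunksize=1)
    print(type(result))                      # a TextFileReader (exact module path varies by pandas version)
    print(isinstance(result, pd.DataFrame))  # False -> load_df would now raise ValueError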