diff --git a/build_script.py b/build_script.py
index 1640933..f01576c 100644
--- a/build_script.py
+++ b/build_script.py
@@ -97,21 +97,21 @@ def runSampMap(
     """
     drc = ",".join(drcfiles)
     args = (
-        f" --sampId={smap}"
-        f" --chemid={cid}"
-        f" --epMap={emap}"
-        f" --chemClass={cclass}"
-        f" --compToxFile={ctfile}"
-        f" --sampleFiles={fses}"
-        f" --chemDesc={descfile}"
-        f" --sampMap={smap}"
+        f" --sample_id={smap}"
+        f" --chem_id={cid}"
+        f" --ep_map={emap}"
+        f" --chem_class={cclass}"
+        f" --comptox_file={ctfile}"
+        f" --sample_files={fses}"
+        f" --chem_desc={descfile}"
+        f" --sample_map={smap}"
     )
     if is_sample:
-        cmd = f"Rscript sampleChemMapping/mapSamplesToChems.R --sample --drcFiles={drc} {args}"
+        cmd = f"python sampleChemMapping/map_samples_to_chemicals.py --sample --drc_files={drc} {args}"
     elif len(drcfiles) > 0:
-        cmd = f"Rscript sampleChemMapping/mapSamplesToChems.R --chemical --drcFiles={drc} {args}"
+        cmd = f"python sampleChemMapping/map_samples_to_chemicals.py --chemical --drc_files={drc} {args}"
     else:
-        cmd = f"Rscript sampleChemMapping/mapSamplesToChems.R {args}"
+        cmd = f"python sampleChemMapping/map_samples_to_chemicals.py {args}"
     tqdm.write("\nRunning sample mapping with the following parameters:\n")
     tqdm.write(f"{cmd}\n")
diff --git a/sampleChemMapping/map_samples_to_chemicals.py b/sampleChemMapping/map_samples_to_chemicals.py
new file mode 100644
index 0000000..968e946
--- /dev/null
+++ b/sampleChemMapping/map_samples_to_chemicals.py
@@ -0,0 +1,969 @@
"""map_samples_to_chemicals: Python version of mapSamplesToChems.R

Original rewrite using generative AI; modified by @christinehc
"""

# =========================================================
# Imports
# =========================================================
import os
from argparse import ArgumentParser
from typing import Optional

import numpy as np
import pandas as pd

from numpy.typing import ArrayLike

# plain (non-relative) import so this file can be run directly as
# `python sampleChemMapping/map_samples_to_chemicals.py` (see build_script.py)
from params import (
    MASV_CC,
    MASV_SOURCE,
    REQUIRED_BMD_COLUMNS,
    REQUIRED_COMPTOX_COLUMNS,
    REQUIRED_SAMPLE_COLUMNS,
    SAMPLE_CHEM_COLUMNS,
    SAMPLE_COLUMNS,
)

# These paths are absolute paths inside the Docker image; data_dir can
# alternatively point at the GitHub copy of the data.
# data_dir = 'https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data'
out_dir = "/tmp/"
# out_dir = "./"

# =========================================================
# Functions
# =========================================================


# @FIXME: Possibly delete? Seems unused
def get_data_files(
    data_dir: str = "https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data",
    filename: str = "srp_build_files.csv",
) -> pd.DataFrame:
    """Get online data files.

    Note: most data files are stored on GitHub and can be edited there;
    this is then rerun to pick up the changes.

    Parameters
    ----------
    data_dir : str, optional
        Directory or URL containing the build file list,
        by default "https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data"
    filename : str, optional
        Name of the build file list, by default "srp_build_files.csv"

    Returns
    -------
    pd.DataFrame
        Table of build files
    """
    dflist = pd.read_csv(os.path.join(data_dir, filename))
    return dflist


def get_mapping_file(filename: str, map_type: str) -> str:
    """Get mapping filename from master list.

    Parameters
    ----------
    filename : str
        Master list file
    map_type : str
        Type of mapping file

    Returns
    -------
    str
        /path/to/map_type/file
    """
    data = pd.read_csv(filename)
    map_file = list(data.loc[data.name == map_type].location)[0]
    return map_file
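

# Illustrative sketch of get_mapping_file(): it looks up the `location` of
# the row whose `name` matches map_type in the master build-file list.
# The file contents below are hypothetical; this helper is not called
# anywhere in the build.
def _example_get_mapping_file() -> str:
    import io

    master = io.StringIO("name,location\nsampMap,/tmp/sampleIdMapping.csv\n")
    # returns "/tmp/sampleIdMapping.csv"
    return get_mapping_file(master, "sampMap")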


# #################################
# Master ID tables
#
# Note: The database requires Sample_ID and Chemical_ID be unique.
# They are in some files but not others, so tables are
# automatically updated below.
# #################################


def chem_id_master_table(cas_ids: ArrayLike, cmap: str) -> pd.DataFrame:
    """Generate master table for chemical IDs.

    Note: the database requires that Sample_ID and Chemical_ID be unique.
    They are in some files but not others, so the tables below are updated
    automatically.

    Parameters
    ----------
    cas_ids : ArrayLike
        CAS IDs
    cmap : str
        /path/to/chemical_mapping_file.csv

    Returns
    -------
    pd.DataFrame
        Chemical ID master table
    """
    cols = ["cas_number", "zf.cid", "Chemical_ID", "chemical_class"]
    map_df = (
        pd.read_csv(cmap).loc[:, cols].drop_duplicates().dropna(subset=["cas_number"])
    )

    missing = set(cas_ids) - set(map_df["cas_number"])
    if missing:
        print(f"Missing {len(missing)} chemical IDs; adding them now...")
        max_id = map_df["Chemical_ID"].max() + 1

        # create table rows for missing chemical IDs
        missing_df = pd.DataFrame(
            {
                "cas_number": list(missing),
                "zf.cid": [""] * len(missing),
                "Chemical_ID": range(max_id, max_id + len(missing)),
                "chemical_class": [""] * len(missing),
            }
        )

        # add missing rows
        map_df = pd.concat([map_df, missing_df])

    # TODO: how do we update with new ids?
    # write.csv(newMap,paste0(data.dir,'chemicalIdMapping.csv'),row.names = F)
    return map_df


def sample_id_master_table(
    existing_sample_numbers: ArrayLike, smap: str
) -> pd.DataFrame:
    """Generate master table for sample IDs.

    Note: the database requires that Sample_ID and Chemical_ID be unique.
    They are in some files but not others, so the tables below are updated
    automatically.

    Parameters
    ----------
    existing_sample_numbers : ArrayLike
        Sample numbers present in the incoming data
    smap : str
        /path/to/sample_mapping_file.csv

    Returns
    -------
    pd.DataFrame
        Sample ID master table
    """
    map_df = pd.read_csv(smap).loc[:, ["Sample_ID", "SampleNumber"]].drop_duplicates()

    missing = set(existing_sample_numbers) - set(map_df["SampleNumber"])
    if missing:
        print(f"Missing {len(missing)} sample IDs; adding them now...")
        max_id = map_df["Sample_ID"].astype(float).max(skipna=True) + 1
        missing_df = pd.DataFrame(
            {
                "Sample_ID": range(int(max_id), int(max_id) + len(missing)),
                "SampleNumber": list(missing),
            }
        )
        map_df = pd.concat([map_df, missing_df])

    return map_df
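

# Illustrative sketch of the master-table pattern shared by both builders
# above: any ID missing from the mapping file is appended with a fresh,
# autoincremented identifier. The mapping file below is hypothetical; this
# helper is not called anywhere in the build.
def _example_chem_id_master_table() -> pd.DataFrame:
    import io

    cmap = io.StringIO(
        "cas_number,zf.cid,Chemical_ID,chemical_class\n50-00-0,1,101,Industrial\n"
    )
    # "64-17-5" is absent from the mapping, so it is appended as Chemical_ID 102
    return chem_id_master_table(["50-00-0", "64-17-5"], cmap)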


# ##################################
# Duplicate Removal
#
# There are two causes of duplicates in the data
# 1) Samples that are evaluated from multiple files:
#    Duplicated samples need to be removed.
# 2) Multiple chemical ids mapping to a single CAS ID:
#    A single chemical ID must be selected.
# ##################################


def remove_chem_id_duplicates(
    df: pd.DataFrame,
    chem_ids: pd.DataFrame,
    cols: ArrayLike = ["AUC_Norm", "End_Point_Name", "Model"],
) -> np.ndarray:
    """Identify CAS numbers whose Chemical_ID duplicates survive de-duplication.

    Note: despite the name, this function does not modify `df`; it reports the
    CAS numbers that still map to more than one Chemical_ID after duplicate
    rows (as defined by `cols`) are dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Table to be checked for duplicates
    chem_ids : pd.DataFrame
        Mapping of Chemical_IDs to CAS ids
    cols : ArrayLike, optional
        List of columns that make the data distinct,
        by default ["AUC_Norm", "End_Point_Name", "Model"]

    Returns
    -------
    np.ndarray
        CAS numbers that map to multiple Chemical_IDs
    """
    group_by_cas = chem_ids.groupby("cas_number")["Chemical_ID"].nunique()
    dupes = group_by_cas[group_by_cas > 1].index

    chem_ids = (
        chem_ids[chem_ids["cas_number"].isin(dupes)]
        .loc[:, ["cas_number", "Chemical_ID"]]
        # the lambda argument is named `ids` so that `df` below refers to the
        # data table, not to the frame being assigned
        .assign(in_dataset=lambda ids: ids["Chemical_ID"].isin(df["Chemical_ID"]))
    )

    dist_tab = df.merge(chem_ids, on="Chemical_ID", how="left").drop_duplicates(
        subset=["cas_number"] + cols
    )
    dupe_counts = (
        dist_tab.groupby(["cas_number", "End_Point_Name", "Model"])
        .size()
        .reset_index(name="n_ids")
    )

    dupe_cas = dupe_counts[dupe_counts["n_ids"] > 1]["cas_number"].unique()
    return dupe_cas


###################################
# Metadata Collection
###################################


def get_chem_metadata(
    desc_file: str = "ChemicalDescriptions.xlsx",
    chemical_classes: Optional[pd.DataFrame] = None,
    comptox_file: str = "",
    chem_id_file: str = "",
) -> pd.DataFrame:
    """Get chemical metadata, which is stored in `data.dir`.

    Parameters
    ----------
    desc_file : str, optional
        Path to standardized data that enables matching across datasets,
        by default "ChemicalDescriptions.xlsx"
    chemical_classes : Optional[pd.DataFrame], optional
        Table mapping cas_number to chemical class annotations, by default None
    comptox_file : str, optional
        File downloaded from the EPA CompTox website, by default ""
    chem_id_file : str, optional
        Path to the chemical ID mapping file, by default ""

    Returns
    -------
    pd.DataFrame
        Chemical metadata table keyed by Chemical_ID
    """
    # Read curated descriptions for each chemical
    curated_desc = pd.read_excel(desc_file, sheet_name=0)
    curated_desc = curated_desc[["CASRN", "USE CATEGORY/DESCRIPTION"]]
    curated_desc = curated_desc.rename(
        columns={
            "CASRN": "cas_number",
            "USE CATEGORY/DESCRIPTION": "chemDescription",
        }
    )

    # Get chemical metadata from CompTox dashboard
    chem_meta = pd.read_csv(comptox_file)
    chem_meta = chem_meta[REQUIRED_COMPTOX_COLUMNS]
    chem_meta = chem_meta.rename(columns={"INPUT": "cas_number"})
    chem_meta = chem_meta.merge(chemical_classes, on="cas_number", how="left")
    chem_meta = chem_meta.merge(curated_desc, on="cas_number", how="left")
    chem_meta = chem_meta.dropna(subset=["cas_number"])
    chem_meta = chem_meta.replace(["NA", "N/A"], np.nan)

    chem_ids = chem_id_master_table(chem_meta["cas_number"], chem_id_file)
    chem_ids = chem_ids.loc[:, ["cas_number", "Chemical_ID"]].drop_duplicates()
    chem_meta = chem_meta.merge(chem_ids, on="cas_number", how="left")
    chem_meta["PREFERRED_NAME"] = chem_meta["PREFERRED_NAME"].str.replace(
        r"^-$", "Chemical name unknown", regex=True
    )
    chem_meta = chem_meta.fillna(
        {"newClass": "Unclassified", "PREFERRED_NAME": "Chemical name unknown"}
    ).drop(columns=["ParameterName"])

    nocas = chem_meta["cas_number"].str.contains("NOCAS", na=False)
    if nocas.any():
        print(f"removing {nocas.sum()} chems with no cas")
        chem_meta = chem_meta[~nocas]
    return chem_meta
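

# Illustrative sketch of remove_chem_id_duplicates() with hypothetical data
# (not called anywhere in the build): two Chemical_IDs share one CAS number
# and carry distinct AUC values, so that CAS number is reported.
def _example_remove_chem_id_duplicates() -> np.ndarray:
    data = pd.DataFrame(
        {
            "Chemical_ID": [1, 2],
            "AUC_Norm": [0.1, 0.2],
            "End_Point_Name": ["MORT", "MORT"],
            "Model": ["gauss", "gauss"],
        }
    )
    ids = pd.DataFrame({"cas_number": ["50-00-0", "50-00-0"], "Chemical_ID": [1, 2]})
    # returns array(["50-00-0"])
    return remove_chem_id_duplicates(data, ids)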


def get_endpoint_metadata(filename: str) -> pd.DataFrame:
    """Get endpoint metadata from the endpoint mapping file.

    Parameters
    ----------
    filename : str
        Path to endpoint file

    Returns
    -------
    pd.DataFrame
        Endpoint details, with a "NoData" placeholder row appended
    """
    endpoint_details = pd.read_excel(filename, sheet_name=3).rename(
        columns={
            "Abbreviation": "End_Point",
            "Simple name (<20char)": "End_Point_Name",
            "Ontology Link": "endPointLink",
        }
    )
    # append a placeholder row for endpoints with no data
    # (pd.concat, since DataFrame.append was removed in pandas 2.0)
    no_data = pd.DataFrame(
        [
            {
                "IncludeInPortal": "No",
                "End_Point": "NoData",
                "End_Point_Name": None,
                "Description": "No data",
                "endPointLink": "",
            }
        ]
    )
    return pd.concat([endpoint_details, no_data], ignore_index=True)


# getNewChemicalClass
# Reads in the files and processes the chemical class names to be friendly
# for the website.
# @deprecated As we move to the new MASV classes
# @param data_dir
# @return pd.DataFrame
# Map the newClass values based on conditions provided
def _map_classification(x):
    if x in [
        "industrial",
        "industrial; aniline",
        "Industrial",
        "industrial; consumerProduct; phenol",
        "industrial; consumerProduct; aniline",
        "industrial; phenol",
    ]:
        return "Industrial"
    elif x in ["PAH; industrial", "PAH"]:
        return "PAH"
    elif x in [
        "personalCare; personalCare; natural; natural; consumerProduct; consumerProduct",
        "personalCare; natural; consumerProduct",
        "personalCare; natural",
        "pharmacological; personalCare; industrial; natural; consumerProduct",
    ]:
        return "Natural"
    elif x == "pestFungicide":
        return "Fungicide"
    elif pd.isna(x) or x == "NA":
        return "Unclassified"
    else:
        return x


def get_new_chemical_class(data_dir):
    # Read in the Excel sheet for PAHs
    pahs = (
        pd.read_excel(f"{data_dir}/PAH_and_1530_SRP_Summary.xlsx", sheet_name=3)
        .loc[:, ["casrn"]]
        .rename(columns={"casrn": "cas_number"})
        .assign(Classification="PAH")
    )

    # Extras data for particular CAS numbers
    extras = pd.DataFrame(
        {
            "cas_number": ["3074-03-01", "7496-02-08", "6373-11-01"],
            "Classification": "PAH",
        }
    )

    # Read in the Excel sheet for non-PAHs
    non_pahs = (
        pd.read_excel(f"{data_dir}/PAH_and_1530_SRP_Summary.xlsx", sheet_name=4)
        .loc[:, ["casrn", "classification"]]
        .rename(columns={"casrn": "cas_number", "classification": "Classification"})
    )

    # Combine all the dataframes
    full_class = pd.concat([pahs, extras, non_pahs], ignore_index=True)
    full_class["newClass"] = full_class["Classification"].apply(_map_classification)

    return full_class
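

# Illustrative sketch of the classification collapse performed by
# _map_classification() above (not called anywhere in the build):
def _example_map_classification() -> list:
    # returns ["Industrial", "PAH", "Fungicide", "Unclassified"]
    return [
        _map_classification("industrial; phenol"),
        _map_classification("PAH; industrial"),
        _map_classification("pestFungicide"),
        _map_classification(None),
    ]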
chem_meta[["Chemical_ID", "cas_number", "AVERAGE_MASS"]], + on="cas_number", + how="left", + ) + + ids = sample_id_master_table(final_samp_chem["SampleNumber"], sample_ids) + final_samp_chem = final_samp_chem.merge( + ids, on="SampleNumber", how="left" + ).drop_duplicates() + + all_samp_names = ( + final_samp_chem[["Sample_ID", "SampleName"]] + .drop_duplicates() + .assign(is_dupe=lambda df: df.duplicated("SampleName")) + ) + dupe_samp_names = all_samp_names.query("is_dupe").sort_values("SampleName") + num_dupes = ( + dupe_samp_names.groupby("SampleName") + .size() + .reset_index(name="nid") + .merge(dupe_samp_names) + ) + + new_names = ( + num_dupes[["SampleName", "nid"]] + .drop_duplicates() + .assign( + new_name=lambda df: df.apply( + lambda row: ":".join( + [f"{row['SampleName']}:{i+1}" for i in range(int(row["nid"]))] + ), + axis=1, + ) + ) + .drop(columns="nid") + .rename(columns={"SampleName": "old_sn"}) + ) + + full_rep = pd.concat([dupe_samp_names, new_names], axis=1)[ + ["Sample_ID", "new_name"] + ] + new_samp_names = pd.concat( + [ + all_samp_names.query("~is_dupe").drop("is_dupe", axis=1), + full_rep.rename(columns={"new_name": "SampleName"}), + ] + ) + final_samp_chem = ( + final_samp_chem.drop("SampleName", axis=1) + .merge(new_samp_names, on="Sample_ID", how="left") + .copy() + ) + + sample_name_remap = ( + pd.read_excel(samp_mapping, sheet_name=0) + .loc[:, ["Sample_ID", "ProjectName", "NewSampleName", "NewLocationName"]] + .drop_duplicates() + ) + + final_samp_chem = final_samp_chem.merge( + sample_name_remap, on="Sample_ID", how="left" + ).drop_duplicates() + + nas = final_samp_chem["projectName"].isna() + final_samp_chem.loc[nas, "projectName"] = final_samp_chem.loc[nas, "ProjectName"] + final_samp_chem.loc[nas, "LocationName"] = final_samp_chem.loc[ + nas, "NewLocationName" + ] + final_samp_chem.loc[nas, "SampleName"] = final_samp_chem.loc[nas, "NewSampleName"] + + final_samp_chem = final_samp_chem.drop( + columns=["ProjectName", "NewSampleName", "NewLocationName", "AVERAGE_MASS"] + ).dropna(subset=["cas_number"]) + + return final_samp_chem + + +def combine_v2_chemical_endpoint_data( + bmd_files, is_extract=False, samp_chem=None, endpoint_details=None +): + print(f"Combining bmd files: {', '.join(bmd_files)}") + cols = REQUIRED_BMD_COLUMNS["bmd"] + mid_bmd = pd.concat([pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files]) + + dupes = mid_bmd[["Chemical_ID", "End_Point"]].duplicated().values + if dupes.any(): + mid_bmd = mid_bmd[~dupes] + + if is_extract: + sd_samp = ( + samp_chem["Sample_ID"] + .str.split("-", expand=True) + .iloc[:, 0:2] + .rename(columns={0: "tmp_id", 1: "sub"}) + .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left") + ) + + full_bmd = ( + mid_bmd.assign(tmp_id=mid_bmd["Chemical_ID"].astype(str)) + .drop("Chemical_ID", axis=1) + .merge(sd_samp, on="tmp_id", how="left") + ) + + nas = full_bmd["Sample_ID"].isna() + full_bmd.loc[nas, "Sample_ID"] = full_bmd.loc[nas, "tmp_id"] + + new_nas = full_bmd["SampleName"].isna() + if new_nas.any(): + full_bmd.loc[new_nas, "SampleName"] = ( + "Sample " + full_bmd.loc[new_nas, "Sample_ID"] + ) + + full_bmd = ( + full_bmd.fillna({"End_Point": "NoData"}) + .merge(endpoint_details, on="End_Point", how="right") + .drop(columns=["End_Point", "tmp_id"]) + .dropna(subset=["Sample_ID"]) + .drop_duplicates() + .fillna({"LocationName": "None"}) + ) + else: + full_bmd = ( + mid_bmd.fillna({"End_Point": "NoData"}) + .merge(endpoint_details, on="End_Point", how="right") + .drop(columns="End_Point") + 


def combine_v2_chemical_endpoint_data(
    bmd_files, is_extract=False, samp_chem=None, endpoint_details=None
):
    print(f"Combining bmd files: {', '.join(bmd_files)}")
    cols = REQUIRED_BMD_COLUMNS["bmd"]
    mid_bmd = pd.concat([pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files])

    dupes = mid_bmd[["Chemical_ID", "End_Point"]].duplicated().values
    if dupes.any():
        mid_bmd = mid_bmd[~dupes]

    if is_extract:
        sd_samp = (
            samp_chem["Sample_ID"]
            .str.split("-", expand=True)
            .iloc[:, 0:2]
            .rename(columns={0: "tmp_id", 1: "sub"})
            .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left")
        )

        full_bmd = (
            mid_bmd.assign(tmp_id=mid_bmd["Chemical_ID"].astype(str))
            .drop("Chemical_ID", axis=1)
            .merge(sd_samp, on="tmp_id", how="left")
        )

        nas = full_bmd["Sample_ID"].isna()
        full_bmd.loc[nas, "Sample_ID"] = full_bmd.loc[nas, "tmp_id"]

        new_nas = full_bmd["SampleName"].isna()
        if new_nas.any():
            full_bmd.loc[new_nas, "SampleName"] = (
                "Sample " + full_bmd.loc[new_nas, "Sample_ID"]
            )

        full_bmd = (
            full_bmd.fillna({"End_Point": "NoData"})
            .merge(endpoint_details, on="End_Point", how="right")
            .drop(columns=["End_Point", "tmp_id"])
            .dropna(subset=["Sample_ID"])
            .drop_duplicates()
            .fillna({"LocationName": "None"})
        )
    else:
        full_bmd = (
            mid_bmd.fillna({"End_Point": "NoData"})
            .merge(endpoint_details, on="End_Point", how="right")
            # assumption: in chemical mode, samp_chem carries the chemical
            # metadata that supplies the cas_number / chemical_class columns
            # used below
            .merge(
                samp_chem[
                    ["Chemical_ID", "cas_number", "chemical_class"]
                ].drop_duplicates(),
                on="Chemical_ID",
                how="left",
            )
            .drop(columns="End_Point")
            .dropna(subset=["cas_number"])
            .drop_duplicates()
            .fillna({"chemical_class": "Unclassified"})
        )

    full_bmd = (
        full_bmd.rename(columns={"DataQC_Flag": "qc_num"})
        .assign(
            # lambda so the replacement sees the renamed qc_num column
            DataQC_Flag=lambda df: df["qc_num"].replace(
                {0: "Poor", 1: "Poor", 4: "Moderate", 5: "Moderate"}
            )
        )
        .assign(Model=lambda df: df["Model"].str.replace("NULL", "None"))
        .drop(columns="qc_num")
    )

    return full_bmd


def combine_chemical_fit_data(
    bmd_files, is_extract=False, samp_chem=None, endpoint_details=None
):
    print(f"Combining fit files: {', '.join(bmd_files)}")
    cols = REQUIRED_BMD_COLUMNS["fitVals"]
    files = [pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files]

    # (Chemical_ID, End_Point) keys present in each file
    chem_eps = [
        set(
            file.assign(
                combined=lambda df: df["Chemical_ID"].astype(str) + df["End_Point"]
            )["combined"]
        )
        for file in files
    ]
    new_chem_eps = chem_eps.copy()

    # keep each key only in the first file where it appears
    if len(chem_eps) > 1:
        for i in range(1, len(new_chem_eps)):
            previous_eps = set.union(*chem_eps[:i])
            new_chem_eps[i] -= previous_eps

    fixed_files = []
    for file, new_eps in zip(files, new_chem_eps):
        fixed_files.append(
            file.assign(
                combined=lambda df: df["Chemical_ID"].astype(str) + df["End_Point"]
            )
            .query("combined in @new_eps")
            .replace({"X_vals": "NULL"}, np.nan)
            .assign(
                X_vals=lambda df: df["X_vals"].astype(float),
                Y_vals=lambda df: df["Y_vals"].astype(float),
            )
        )

    mid_bmd = pd.concat(fixed_files)
    full_bmd = (
        mid_bmd.merge(endpoint_details, on="End_Point", how="right")
        .drop_duplicates()
        .drop(columns=["End_Point", "Description"])
    )

    if is_extract:
        sd_samp = (
            samp_chem["Sample_ID"]
            .str.split("-", expand=True)
            .iloc[:, 0:2]
            .rename(columns={0: "tmp_id", 1: "sub"})
            .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left")
        )

        full_bmd = (
            full_bmd.assign(tmp_id=full_bmd["Chemical_ID"].astype(str))
            .drop("Chemical_ID", axis=1)
            .merge(sd_samp, on="tmp_id", how="left")
        )

        nas = full_bmd["Sample_ID"].isna()
        full_bmd.loc[nas, "Sample_ID"] = full_bmd.loc[nas, "tmp_id"]
        full_bmd = full_bmd.drop(columns=["tmp_id"])

    return full_bmd
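

# Illustrative sketch of the "first file wins" de-duplication used by
# combine_chemical_fit_data() and combine_chemical_dose_data() (keys below
# are hypothetical; this helper is not called anywhere in the build):
def _example_first_file_wins() -> list:
    # (Chemical_ID + End_Point) keys observed in two hypothetical files
    chem_eps = [{"1MORT", "2MORT"}, {"2MORT", "3MORT"}]
    new_chem_eps = chem_eps.copy()
    for i in range(1, len(new_chem_eps)):
        # drop keys already claimed by an earlier file
        new_chem_eps[i] = new_chem_eps[i] - set.union(*chem_eps[:i])
    # returns [{"1MORT", "2MORT"}, {"3MORT"}]
    return new_chem_eps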


def _flatten_class_df(
    df: pd.DataFrame,
    keep_cols: ArrayLike,
    drop_cols: ArrayLike,
    var_name: str,
    id_cols: ArrayLike = ["CASNumber", "ParameterName"],
) -> pd.DataFrame:
    """Flatten and format long-form presence/absence class data.

    Parameters
    ----------
    df : pd.DataFrame
        Long-form class dataframe
    keep_cols : ArrayLike
        List of columns to keep (e.g. MASV_CC or MASV_SOURCE)
    drop_cols : ArrayLike
        List of columns to drop (e.g. MASV_CC or MASV_SOURCE)
    var_name : str
        Variable name (e.g. "chem_source", "chem_class", ...)
    id_cols : ArrayLike, optional
        List of chemical ID cols, by default ["CASNumber", "ParameterName"]

    Returns
    -------
    pd.DataFrame
        Formatted dataframe
    """

    df = df.drop(columns=drop_cols)  # drop the non-kept class/source cols
    df = df.melt(  # reorg by cas, param name, source/class, and pos/neg
        id_vars=id_cols,
        value_vars=keep_cols,
        var_name=var_name,
        value_name="posNeg",
    )

    # Remove null pos/negs, then remove pos/neg col
    df = df[df["posNeg"] != "NULL"]  # remove nulls
    df = df.drop(columns="posNeg")

    # For sources, combine all pesticides into 1 category
    if "source" in var_name:
        df[var_name] = df[var_name].str.replace(r"^pest.*", "pesticide", regex=True)

    # Compile all values for each compound
    df = df.groupby(id_cols)[var_name].apply(lambda x: ";".join(x)).reset_index()
    return df


def masv_chem_class(
    class_file,
    save_to: str = "MASV_classAndSource.csv",
    id_cols: ArrayLike = ["CASNumber", "ParameterName"],
):
    """Reads full MASV class annotations and assigns values to chemicals.

    Parameters
    ----------
    class_file : str
        Path to the MASV class annotation spreadsheet
    save_to : str, optional
        Output CSV path, by default "MASV_classAndSource.csv"
    id_cols : ArrayLike, optional
        List of chemical ID columns, by default ["CASNumber", "ParameterName"]

    Returns
    -------
    pd.DataFrame
        Combined source and class annotations, one row per chemical
    """
    data = pd.read_excel(class_file, sheet_name=0)

    # Reorganize data by source
    sources = data.copy()
    sources = _flatten_class_df(
        sources,
        keep_cols=MASV_SOURCE,
        drop_cols=MASV_CC,
        var_name="chem_source",
        id_cols=id_cols,
    )

    # Reorganize data by class
    classes = data.copy()
    classes = _flatten_class_df(
        classes,
        keep_cols=MASV_CC,
        drop_cols=MASV_SOURCE,
        var_name="chemical_class",
        id_cols=id_cols,
    )

    # Combine source and class information
    combined = sources.merge(classes, on=id_cols, how="outer")
    combined = combined.fillna({"chemical_class": "Unclassified"})
    combined = combined.rename(columns={"CASNumber": "cas_number"})

    combined.to_csv(save_to, index=False)
    return combined


def combine_chemical_dose_data(
    bmd_files, is_extract=False, samp_chem=None, endpoint_details=None
):
    print(f'Combining dose response files: {", ".join(bmd_files)}')
    cols = REQUIRED_BMD_COLUMNS["doseRep"]
    files = [pd.read_csv(bmd_file).loc[:, cols] for bmd_file in bmd_files]

    # (Chemical_ID, End_Point) keys present in each file
    chem_eps = [
        set(
            file.assign(
                combined=lambda df: df["Chemical_ID"].astype(str) + df["End_Point"]
            )["combined"]
        )
        for file in files
    ]
    new_chem_eps = chem_eps.copy()

    # keep each key only in the first file where it appears
    if len(chem_eps) > 1:
        for i in range(1, len(new_chem_eps)):
            previous_eps = set.union(*chem_eps[:i])
            new_chem_eps[i] -= previous_eps

    fixed_files = []
    for file, new_eps in zip(files, new_chem_eps):
        fixed_files.append(
            file.assign(
                combined=lambda df: df["Chemical_ID"].astype(str) + df["End_Point"]
            )
            .query("combined in @new_eps")
            .loc[:, cols]
        )

    mid_bmd = pd.concat(fixed_files)
    full_bmd = (
        mid_bmd.merge(endpoint_details, on="End_Point", how="right")
        .drop_duplicates()
        .drop(columns=["End_Point", "Description"])
    )

    if is_extract:
        sd_samp = (
            samp_chem["Sample_ID"]
            .str.split("-", expand=True)
            .iloc[:, 0:2]
            .rename(columns={0: "tmp_id", 1: "sub"})
            .merge(samp_chem[["Sample_ID"]].drop_duplicates(), on="sub", how="left")
        )

        full_bmd = (
            full_bmd.assign(tmp_id=full_bmd["Chemical_ID"].astype(str))
            .drop("Chemical_ID", axis=1)
            .merge(sd_samp, on="tmp_id", how="left")
            .assign(Sample_ID=lambda df: df["Sample_ID"].fillna(df["tmp_id"]))
        )

    return full_bmd
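

# Illustrative sketch of _flatten_class_df() above with a tiny hypothetical
# annotation table (not called anywhere in the build):
def _example_flatten_class_df() -> pd.DataFrame:
    classes = pd.DataFrame(
        {
            "CASNumber": ["50-00-0"],
            "ParameterName": ["formaldehyde"],
            "PAH": ["NULL"],
            "industrial": ["positive"],
            "pestFungicide": ["positive"],
        }
    )
    # yields one row per chemical: chem_source == "industrial;pesticide"
    return _flatten_class_df(
        classes,
        keep_cols=["industrial", "pestFungicide"],
        drop_cols=["PAH"],
        var_name="chem_source",
    )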


# =========================================================
# Command Line Interface (CLI)
# =========================================================


def main():
    parser = ArgumentParser()
    parser.add_argument(
        "-s",
        "--sample",
        dest="is_sample",
        action="store_true",
        default=False,
        help="Flag to indicate file is for samples",
    )
    parser.add_argument(
        "-c",
        "--chemical",
        dest="is_chem",
        default=False,
        action="store_true",
        help="Flag to indicate file is for chemicals",
    )
    parser.add_argument(
        "-d",
        "--drc_files",
        dest="dose_res_stat",
        default="",
        help="Dose response curve file",
    )
    parser.add_argument(
        "-p",
        "--sample_id",
        dest="sample_id_file",
        default="",
        help="Sample mapping file location",
    )
    parser.add_argument(
        "-i",
        "--chem_id",
        dest="chem_id_file",
        default="",
        help="Chemical ID file location",
    )
    parser.add_argument(
        "-e",
        "--ep_map",
        dest="endpoint_mapping_file",
        default="",
        help="Endpoint naming file location",
    )
    parser.add_argument(
        "-l",
        "--chem_class",
        dest="chem_class_file",
        default="",
        help="Chemical class file location",
    )
    parser.add_argument(
        "-x",
        "--comptox_file",
        dest="comptox_mapping",
        default="",
        help="CompTox data file location",
    )
    parser.add_argument(
        "-f",
        "--sample_files",
        dest="sample_files",
        default="",
        help="Comma delimited list of FSES files to merge",
    )
    parser.add_argument(
        "-y",
        "--chem_desc",
        dest="chem_desc",
        default="",
        help="Descriptions of chemicals",
    )
    parser.add_argument(
        "-m",
        "--sample_map",
        dest="samp_map",
        default="",
        help="File that maps sample locations",
    )

    args = parser.parse_args()

    chem_class = masv_chem_class(args.chem_class_file)
    chem_meta = get_chem_metadata(
        args.chem_desc, chem_class, args.comptox_mapping, args.chem_id_file
    )

    sample_files_list = args.sample_files.split(",")
    samp_chem = build_sample_data(
        sample_files_list, chem_meta, args.sample_id_file, args.samp_map
    )

    endpoint_details = get_endpoint_metadata(
        args.endpoint_mapping_file
    ).drop_duplicates()

    # defined up front so both branches below can write outputs
    output_dir = "/tmp/"

    if args.is_sample or args.is_chem:
        all_files = args.dose_res_stat.split(",")
        bmd_files = [file for file in all_files if "bmd" in file]
        dose_files = [file for file in all_files if "dose" in file]
        fit_files = [file for file in all_files if "fit" in file]

        meta_file = samp_chem if args.is_sample else chem_meta

        bmds = (
            combine_v2_chemical_endpoint_data(
                bmd_files,
                is_extract=args.is_sample,
                samp_chem=meta_file,
                endpoint_details=endpoint_details,
            )
            .dropna(subset=["BMD_Analysis_Flag"])
            .query("BMD_Analysis_Flag != 'NA'")
        )
        curves = combine_chemical_fit_data(
            fit_files,
            is_extract=args.is_sample,
            samp_chem=meta_file,
            endpoint_details=endpoint_details,
        )
        dose_reps = combine_chemical_dose_data(
            dose_files,
            is_extract=args.is_sample,
            samp_chem=meta_file,
            endpoint_details=endpoint_details,
        ).dropna(subset=["Dose"])

        if args.is_sample:
            bmds.to_csv(
                f"{output_dir}zebrafishSampBMDs.csv", index=False, quotechar='"'
            )
            curves.to_csv(
                f"{output_dir}zebrafishSampXYCoords.csv", index=False, quotechar='"'
            )
            dose_reps.to_csv(
                f"{output_dir}zebrafishSampDoseResponse.csv", index=False, quotechar='"'
            )

        if args.is_chem:
            # drop rows with no Chemical_ID, plus chemicals that never appear
            # in the sample data (assumed intent of the original filter)
            to_remove = set(bmds["Chemical_ID"].dropna()) - set(
                samp_chem["Chemical_ID"]
            )
            bmds = bmds[
                bmds["Chemical_ID"].notna() & ~bmds["Chemical_ID"].isin(to_remove)
            ]
            curves = curves[~curves["Chemical_ID"].isin(to_remove)]
            dose_reps = dose_reps[~dose_reps["Chemical_ID"].isin(to_remove)]

            bmds.to_csv(
                f"{output_dir}zebrafishChemBMDs.csv", index=False, quotechar='"'
            )
            curves.to_csv(
                f"{output_dir}zebrafishChemXYCoords.csv", index=False, quotechar='"'
            )
            dose_reps.to_csv(
                f"{output_dir}zebrafishChemDoseResponse.csv", index=False, quotechar='"'
            )

    else:
        chem_meta.to_csv(f"{output_dir}chemicals.csv", index=False, quotechar='"')
        samp_chem[SAMPLE_COLUMNS].drop_duplicates().to_csv(
            f"{output_dir}samples.csv", index=False, quotechar='"'
        )
        samp_chem[SAMPLE_CHEM_COLUMNS].drop_duplicates().to_csv(
            f"{output_dir}sampleToChemicals.csv", index=False, quotechar='"'
        )


if __name__ == "__main__":
    main()
diff --git a/sampleChemMapping/params.py b/sampleChemMapping/params.py
new file mode 100644
index 0000000..c570fd5
--- /dev/null
+++ b/sampleChemMapping/params.py
@@ -0,0 +1,157 @@
# =========================================================
# Define Columns / Table Schemas
# =========================================================

REQUIRED_SAMPLE_COLUMNS = [
    "ClientName",
    "SampleNumber",
    "date_sampled",
    "sample_matrix",
    "technology",
    "projectName",
    "SampleName",
    "LocationLat",
    "projectLink",
    "LocationLon",
    "LocationName",
    "LocationAlternateDescription",
    "AlternateName",
    "cas_number",
    "date_sample_start",
    "measurement_value",
    "measurement_value_qualifier",
    "measurement_value_unit",
    "measurement_value_molar",
    "measurement_value_molar_unit",
    "environmental_concentration",
    "environmental_concentration_qualifier",
    "environmental_concentration_unit",
    "environmental_concentration_molar",
    "environmental_concentration_molar_unit",
]

# @TODO: need to rename the water columns
# new_sample_columns = {
#     'environmental_concentration': 'water_concentration',
#     'environmental_concentration_qualifier': 'water_concentration_qualifier',
#     'environmental_concentration_unit': 'water_concentration_unit',
#     'environmental_concentration_molar': 'water_concentration_molar',
#     'environmental_concentration_molar_unit': 'water_concentration_molar_unit'
# }


REQUIRED_COMPTOX_COLUMNS = [
    "INPUT",
    "DTXSID",
    "PREFERRED_NAME",
    "INCHIKEY",
    "SMILES",
    "MOLECULAR_FORMULA",
    "AVERAGE_MASS",
    "PUBCHEM_DATA_SOURCES",
]

SAMPLE_CHEM_COLUMNS = [
    "Sample_ID",
    "Chemical_ID",
    "measurement_value",
    "measurement_value_qualifier",
    "measurement_value_unit",
    "measurement_value_molar",
    "measurement_value_molar_unit",
    "environmental_concentration",
    "environmental_concentration_qualifier",
    "environmental_concentration_unit",
    "environmental_concentration_molar",
    "environmental_concentration_molar_unit",
]

SAMPLE_COLUMNS = [
    "Sample_ID",
    "ClientName",
    "SampleNumber",
    "date_sampled",
    "sample_matrix",
    "technology",
    "projectName",
    "SampleName",
    "LocationLat",
    "projectLink",
    "LocationLon",
    "LocationName",
    "LocationAlternateDescription",
    "AlternateName",
    "date_sample_start",
]

REQUIRED_CHEM_COLUMNS = [
    "Chemical_ID",
    "cas_number",
    "DTXSID",
    "PREFERRED_NAME",
    "INCHIKEY",
    "SMILES",
    "MOLECULAR_FORMULA",
    "AVERAGE_MASS",
    "PUBCHEM_DATA_SOURCES",
    "chem_source",
    "chemical_class",
    "chemDescription",
]

REQUIRED_MAPPING_COLUMNS = [
    "SampleNumber",
    "Sample_ID",
    "zf_lims_id",
]  # added these to complete mapping

# required from bmd calculation
# add in Sample_ID or Chemical_ID depending on table
REQUIRED_BMD_COLUMNS = {
    "bmd": [
        "Chemical_ID",
        "End_Point",
        "Model",
        "BMD10",
        "BMD50",
"Min_Dose", + "Max_Dose", + "AUC_Norm", + "DataQC_Flag", + "BMD_Analysis_Flag", + ], + "doseRep": ["Chemical_ID", "End_Point", "Dose", "Response", "CI_Lo", "CI_Hi"], + "fitVals": ["Chemical_ID", "End_Point", "X_vals", "Y_vals"], +} + +# Define MASV mapping params +MASV_SOURCE = [ + "pharmacological", + "personalCare", + "industrial", + "pulpAndPaper", + "pestProduct", + "natural", + "pestRodenticide", + "consumerProduct", + "pestHerbicide", + "pestInsecticide", + "pestFungicide", + "pestGeneral", + "flameRetardant", +] + +MASV_CC = [ + "PAH", + "OPAH", + "PCB", + "PBB", + "PBDE", + "deuterated", + "dioxinsAndFurans", + "haloEthers", + "OPFR", + "phenol", + "aniline", + "uncategorized", +]