Merge pull request #3 from zivy/updateUtilities

Update utilities
IBEXImagingCommunity · Jan 20, 2023 · 5e024e4 · 5e024e4
2 parents 670ceb4 + 3045065
commit 5e024e4
Show file tree

Hide file tree

Showing 10 changed files with 312 additions and 46 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,19 @@ needed. This is equivalent to summarizing all activity on a feature branch versu
 
 ## Unreleased
 
+## v0.3.0
+
+### Added
+* fluorescent_probes_csv_2_md - script for creating the knowledge-base fluorescent_probes markdown page from the fluorescent_probes.csv.
+
+### Changed
+* reagent_resources_csv_2_md_url - In addition to the reagent_resources.csv we now use a template file into which the table is written. Allows us to modify the descriptive text without modifying code. Additionally, the table is sorted on the "Target Name / Protein Biomarker" column.
+* update_index_md_stats - Change the computed statistics to:
+  1. number_of_contributors - count both original contributors and folks that replicated the work.
+  1. number_of_validated_reagents - count rows in the reagent_resources.csv.
+  1. number_of_fluorescent_probes - count number of unique entries in conjugate column of the reagent_resources.csv (ignoring NA, Unconjugated, Biotin, HRP, UT014, UT015, UT016, UT019).
+  1. number_of_tissues - count unique combinations of Target_Species-Target_Tissue-Tissue_State.
+
 ## v0.2.0
 
 ### Added

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ibex_imaging_knowledge_base_utilities"
-version = "0.2.0"
+version = "0.3.0"
 authors = [{ name="Ziv Yaniv", email="[email protected]" },
 ]
 description = "Utility scripts used for maintaining the IBEX Imaging Community Knowledge-Base"
@@ -30,5 +30,6 @@ dependencies = [
 [project.scripts]
 bib2md = "ibex_imaging_knowledge_base_utilities.bib2md:main"
 reagent_resources_csv_2_md_url = "ibex_imaging_knowledge_base_utilities.reagent_resources_csv_2_md_url:main"
+fluorescent_probes_csv_2_md = "ibex_imaging_knowledge_base_utilities.fluorescent_probes_csv_2_md:main"
 update_index_md_stats = "ibex_imaging_knowledge_base_utilities.update_index_md_stats:main"
 validate_zenodo_json = "ibex_imaging_knowledge_base_utilities.validate_zenodo_json:main"
diff --git a/src/ibex_imaging_knowledge_base_utilities/fluorescent_probes_csv_2_md.py b/src/ibex_imaging_knowledge_base_utilities/fluorescent_probes_csv_2_md.py
@@ -0,0 +1,89 @@
+# =========================================================================
+#
+#  Copyright Ziv Yaniv
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0.txt
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# =========================================================================
+
+import pandas as pd
+import argparse
+import sys
+from .argparse_types import file_path, dir_path
+
+
+"""
+This script converts the IBEX knowledge-base fluorescent_probes.csv file to markdown.
+
+This script is automatically run when modifications to the fluorescent_probes.csv file are merged
+into the main branch of the ibex_knowledge_base repository (see .github/workflows/data2md.yml).
+
+Assumption: The fluorescent_probes.csv file is valid. It conforms to the expected format (empty entries denoted
+by the string "NA").
+"""
+
+
+def fluorescent_probe_csv_to_md(template_file_path, csv_file_path, output_dir):
+    """
+    Convert the IBEX knowledge-base fluorescent probe csv file to markdown. Output is written to a
+    file named fluorescent_probes.md in the output directory. The template_file_path file is
+    expected to contain the string {probe_table}, which is replaced with the contents of
+    the actual table.
+    """
+    # Read the dataframe and keep entries that are "NA", don't convert to nan
+    df = pd.read_csv(csv_file_path, dtype=str, keep_default_na=False)
+    df.sort_values(by=["Excitation Max (nm)", "Emission Max (nm)"], inplace=True)
+    with open(template_file_path, "r") as fp:
+        input_md_str = fp.read()
+    with open(output_dir / "fluorescent_probes.md", "w") as fp:
+        fp.write(input_md_str.format(probe_table=df.to_markdown(index=False)))
+
+
+def main(argv=None):
+    if argv is None:  # script was invoked from commandline
+        argv = sys.argv[1:]
+    parser = argparse.ArgumentParser(
+        description="Convert knowledge-base fluorescent probes file from csv to md and sort according to excitation and emission."  # noqa E501
+    )
+    parser.add_argument(
+        "md_template_file",
+        type=file_path,
+        help='Path to template markdown file which contains the string "{probe_table}".',
+    )
+    parser.add_argument(
+        "csv_file", type=file_path, help="Path to the fluorescent_probes.csv file."
+    )
+    parser.add_argument(
+        "output_dir",
+        type=dir_path,
+        help="Path to the output directory (the fluorescent_probes.md file is written to this directory).",
+    )
+    args = parser.parse_args(argv)
+
+    try:
+        return fluorescent_probe_csv_to_md(
+            template_file_path=args.md_template_file,
+            csv_file_path=args.csv_file,
+            output_dir=args.output_dir,
+        )
+    except Exception as e:
+        print(
+            f"{e}",
+            file=sys.stderr,
+        )
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/ibex_imaging_knowledge_base_utilities/reagent_resources_csv_2_md_url.py b/src/ibex_imaging_knowledge_base_utilities/reagent_resources_csv_2_md_url.py
@@ -55,8 +55,6 @@
 by the string "NA").
 """
 
-md_header = "<!-- Do NOT edit this file. It is automatically generated from reagents_resources.csv -->\n\n"
-
 
 def short_circuit_requests_get(url, params=None, **kwargs):
     res = requests.Response()
@@ -161,15 +159,20 @@ def uniprots_to_md(uniprots_str, uniprot_md_str):
 
 
 def csv_to_md_with_url(
-    csv_file_path, supporting_material_root_dir, vendor_to_website_json_file_path
+    template_file_path,
+    csv_file_path,
+    supporting_material_root_dir,
+    vendor_to_website_json_file_path,
 ):
     """
-    Convert the IBEX knowledge-base csv file to markdown and add links to the supporting
+    Convert the IBEX knowledge-base reagent resources csv file to markdown and add links to the supporting
     material files. Output is written to a file named reagent_resources.md in the parent directory
-    of the supporting_material_root_dir.
+    of the supporting_material_root_dir. The template_file_path file is expected to contain the
+    string {reagent_resources_table} which is replaced with the contents of the actual table.
     """
     # Read the dataframe and keep entries that are "NA", don't convert to nan
     df = pd.read_csv(csv_file_path, dtype=str, keep_default_na=False)
+    df.sort_values(by=["Target Name / Protein Biomarker"], inplace=True)
     supporting_material_path = pathlib.PurePath(supporting_material_root_dir).name
     if not df.empty:
         print("Start linking to supporting material...")
@@ -210,8 +213,13 @@ def csv_to_md_with_url(
             print(f"Vendor ({k}) not found in {vendor_to_website_json_file_path}.")
             return 1
         print("Finished linking to vendor websites...")
+
+    with open(template_file_path, "r") as fp:
+        input_md_str = fp.read()
     with open(supporting_material_root_dir.parent / "reagent_resources.md", "w") as fp:
-        fp.write("# Reagent Resources\n\n" + md_header + df.to_markdown(index=False))
+        fp.write(
+            input_md_str.format(reagent_resources_table=df.to_markdown(index=False))
+        )
     return 0
 
 
@@ -222,18 +230,23 @@ def main(argv=None):
         description="Convert knowledge-base reagent resources file from csv to md and add hyperlinks."
     )
     parser.add_argument(
-        "csv_file", type=file_path, help="Path to the reagent_resources.csv file."
+        "md_template_file",
+        type=file_path,
+        help='Path to template markdown file which contains the string "{reagent_resources_table}".',
     )
     parser.add_argument(
-        "supporting_material_root_dir",
-        type=dir_path,
-        help="Path to the directory containing the supporting materials files.",
+        "csv_file", type=file_path, help="Path to the reagent_resources.csv file."
     )
     parser.add_argument(
         "vendor_to_website",
         type=file_path,
         help="JSON file containing the mapping between vendor name and website",
     )
+    parser.add_argument(
+        "supporting_material_root_dir",
+        type=dir_path,
+        help="Path to the directory containing the supporting materials files.",
+    )
     parser.add_argument(
         "--skip_url_validation",
         action="store_true",
@@ -245,7 +258,10 @@ def main(argv=None):
         if args.skip_url_validation:
             requests.get = short_circuit_requests_get
         return csv_to_md_with_url(
-            args.csv_file, args.supporting_material_root_dir, args.vendor_to_website
+            template_file_path=args.md_template_file,
+            csv_file_path=args.csv_file,
+            supporting_material_root_dir=args.supporting_material_root_dir,
+            vendor_to_website_json_file_path=args.vendor_to_website,
         )
     except Exception as e:
         print(

diff --git a/src/ibex_imaging_knowledge_base_utilities/update_index_md_stats.py b/src/ibex_imaging_knowledge_base_utilities/update_index_md_stats.py
@@ -46,36 +46,59 @@ def update_index_stats(input_md, input_csv, output_file):
     with open(input_md, "r") as fp:
         input_md_str = fp.read()
     stats_dictionary = compute_stats_dictionary(input_csv)
-    stats_dictionary[
-        "do_not_edit_message"
-    ] = "<!-- Do NOT edit this file. It is automatically generated from the reagent_resources.csv file -->\n\n"
     with open(output_file, "w") as fp:
         fp.write(input_md_str.format(**stats_dictionary))
 
 
+def entry2list(entry):
+    """
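+    Convert a string entry to a set of strings. If the entry is nan or an
+    empty/whitespace-only string, return an empty set (note that "NA" is not
+    treated as empty). Otherwise, the string is split using the semicolon as
+    the separator character, leading and trailing whitespace is removed from
+    the substrings, empty substrings are dropped, and duplicates raise ValueError.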
+    """
+    if pd.isna(entry) or entry.strip() == "":
+        return set()
+    else:
+        res_list = [v.strip() for v in entry.split(";") if v.strip() != ""]
+        res = set(res_list)
+        if len(res_list) != len(res):
+            raise ValueError(f"entry with duplicate values - {entry}")
+        return res
+
+
 def compute_stats_dictionary(input_csv):
     stats_dict = {}
     df = pd.read_csv(input_csv, dtype=str, keep_default_na=False)
-    stats_dict["number_of_contributors"] = df["Contributor"].nunique()
-    stats_dict["number_of_recommended_antibodies"] = df["Catalog Number"][
-        df["Recommend"] == "Yes"
-    ].nunique()
-    stats_dict["number_of_not_recommended_antibodies"] = df["Catalog Number"][
-        df["Recommend"] == "No"
-    ].nunique()
-    stats_dict["number_of_fluorophores"] = len(
+    # Compute number of contributors, both original and folks that
+    # replicated the validation and either agree or disagree with the
+    # original contribution. The original contributor added the ORCID
+    # to the "Agree" column and the "Contributor" column, so no need to
+    # look at the "Contributor" column.
+    all_contributions = df["Agree"].tolist() + df["Disagree"].tolist()
+    all_unique_contributors = set(
+        [
+            v.strip()
+            for x in all_contributions
+            for v in x.split(";")
+            if v.strip() != "NA"
+        ]
+    )
+    stats_dict["number_of_contributors"] = len(all_unique_contributors)
+    stats_dict["number_of_validated_reagents"] = len(df)
+    stats_dict["number_of_fluorescent_probes"] = len(
         df["Conjugate"][
             ~df["Conjugate"].isin(
                 [
                     "NA",
-                    "AF594",
-                    "eF615",
-                    "Hoechst",
-                    "JOJO-1",
                     "Unconjugated",
-                    "PE/Dazzle AF594",
                     "Biotin",
                     "HRP",
+                    "UT014",
+                    "UT015",
+                    "UT016",
+                    "UT019",
                 ]
             )
         ].unique()

diff --git a/tests/data/fluorescent_probes.csv b/tests/data/fluorescent_probes.csv
@@ -0,0 +1,67 @@
+Fluorescent Probe,Excitation Max (nm),Emission Max (nm),Signal Inactivation Conditions IBEX2D Manual
+Hoechst,350,461,Does not bleach
+VioGreen,388,520,1 mg/ml LiBH4 15 minutes
+VioBlue,400,452,1 mg/ml LiBH4 15 minutes
+Spark Violet 538,400,538,1 mg/ml LiBH4 15 minutes
+StarBright Violet 670,401,667,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+StarBright Violet 710,402,713,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+BV421,405,421,1 mg/ml LiBH4 15 minutes + Light
+eF450,405,450,1 mg/ml LiBH4 15 minutes
+BV510,405,510,1 mg/ml LiBH4 15 minutes + Light
+BV570,405,570,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+Pacific Blue,410,455,1 mg/ml LiBH4 15 minutes
+Cyan Fluorescent Protein (CFP),435,485,Does not bleach
+StarBright Blue 700,473,703,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+PerCP-Vio 700,482,704,Not tested
+Green Fluorescent Protein (GFP),488,510,Does not bleach
+iF488,488,530,1 mg/ml LiBH4 15 minutes
+AF488,490,525,1 mg/ml LiBH4 15 minutes
+AF488 (Plus),490,525,1 mg/ml LiBH4 15 minutes
+FITC,490,525,1 mg/ml LiBH4 30 minutes
+CL490,491,515,1 mg/ml LiBH4 15 minutes
+Spark Blue 574,506,574,1 mg/ml LiBH4 15 minutes
+iF514,511,527,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+Yellow Fluorescent Protein (YFP),513,527,Does not bleach
+JOJO-1,530,544,Does not bleach
+AF532,532,554,1 mg/ml LiBH4 15 minutes
+iF532,537,560,1 mg/ml LiBH4 15 minutes
+CL550,550,575,1 mg/ml LiBH4 15 minutes
+AF555,555,580,1 mg/ml LiBH4 15 minutes
+AF555 (Plus),555,580,1 mg/ml LiBH4 15 minutes
+Spark YG 570,555,570,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+Red Fluorescent Protein (RFP),555,584,Does not bleach
+AF546,556,573,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+eF570,556,569,1 mg/ml LiBH4 15 minutes
+iF555,557,570,1 mg/ml LiBH4 15 minutes
+PE,565,578,1 mg/ml LiBH4 15 minutes
+RY586,565,586,Not tested
+PE/iF594,565,606,1 mg/ml LiBH4 15 minutes
+PE/Dazzle AF594,565,610,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+PE-Vio 615,565,619,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+PE-Vio 770,565,775,1 mg/ml LiBH4 15 minutes
+AF568,578,603,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+iF594,588,604,1 mg/ml LiBH4 15 minutes
+AF594,590,617,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment. 
+CL594,593,614,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+CF594,593,615,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+eF615,595,615,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+Texas Red,596,615,Does not bleach within 15 minutes of 1 mg/ml LiBH4 treatment
+AF633,631,650,1 mg/ml LiBH4 15 minutes
+eF660,633,669,1 mg/ml LiBH4 15 minutes
+AF647,650,665,1 mg/ml LiBH4 15 minutes
+AF647 (Plus),650,665,1 mg/ml LiBH4 15 minutes
+APC-Vio 770,652,775,Not tested
+CL650,655,676,1 mg/ml LiBH4 15 minutes
+iF647,656,670,1 mg/ml LiBH4 15 minutes
+AF660,662,690,1 mg/ml LiBH4 15 minutes
+AF680,679,702,1 mg/ml LiBH4 15 minutes
+iF680,684,701,1 mg/ml LiBH4 15 minutes
+Spark Red 718,687,718,1 mg/ml LiBH4 15 minutes
+Vio Bright R720,695,720,1 mg/ml LiBH4 15 minutes
+AF700,702,723,1 mg/ml LiBH4 15 minutes
+AF750,749,775,1 mg/ml LiBH4 15 minutes
+iF750,757,779,1 mg/ml LiBH4 15 minutes
+BL759/780,759,780,1 mg/ml LiBH4 15 minutes
+DL755,776,754,1 mg/ml LiBH4 15 minutes
+AF790,784,814,1 mg/ml LiBH4 15 minutes
+AF800 (Plus),786,790,1 mg/ml LiBH4 15 minutes
diff --git a/tests/data/fluorescent_probes.md.in b/tests/data/fluorescent_probes.md.in
@@ -0,0 +1,10 @@
+# Fluorescent Probes Tested by the IBEX Imaging Community
+
+<!-- Do NOT edit this file. It is automatically generated from the fluorescent_probes.md.in and fluorescent_probes.csv files. -->
+
+Summary of fluorescent probes tested by the IBEX Imaging Community. Inactivation conditions are method specific. 
+
+For the original IBEX2D manual method that uses 1 mg/ml of LiBH4. The time, concentration of LiBH4, and method (continuous exchange, bleaching in the presence of light) may vary by user. 
+
+
+{probe_table}