Merge pull request #35 from zivy/improveReagentResourcesValidation

Improve reagent resources validation
IBEXImagingCommunity · Jul 19, 2024 · 5d3b3d0 · 5d3b3d0
2 parents 50c4b65 + 82994f8
commit 5d3b3d0
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 10 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
    - id: check-json
    - id: check-yaml
 - repo: https://github.com/psf/black
-  rev: 22.12.0
+  rev: 24.4.2
   hooks: # check conformance to black formatting
    - id: black 
      args: ['--check'] # if run without arguments, will fail and will format the files

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,11 @@ needed. This is equivalent to summarizing all activity on a feature branch versu
 
 ## Unreleased
 
+## v0.8.2
+
+### Changed
+* validate_image_resources - Check that the vendor listed in the reagent_resources.csv is from the list of vendors found in the vendor_urls.csv.
+
 ## v0.8.1
 
 ### Fixed

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ibex_imaging_knowledge_base_utilities"
-version = "0.8.1"
+version = "0.8.2"
 authors = [{ name="Ziv Yaniv", email="[email protected]" },
 ]
 description = "Utility scripts used for managing the IBEX Imaging Community Knowledge-Base"

diff --git a/src/ibex_imaging_knowledge_base_utilities/data_validation/validate_reagent_resources.py b/src/ibex_imaging_knowledge_base_utilities/data_validation/validate_reagent_resources.py
@@ -25,6 +25,7 @@
 from ibex_imaging_knowledge_base_utilities.argparse_types import (
     file_path_endswith,
     dir_path,
+    csv_path,
 )
 from .utilities import validate_df
 
@@ -47,6 +48,7 @@ def validate_reagent_resources(
     csv_file_name,
     json_config_file_name,
     zenodo_json_file_name,
+    vendors_csv_file_name,
     supporting_material_root_dir,
 ):
     MAX_ORCID_ENTRIES = 5
@@ -66,10 +68,18 @@ def validate_reagent_resources(
         zenodo_dict = json.load(fp)
     orcids = [data["orcid"].strip() for data in zenodo_dict["creators"]] + ["NA"]
 
+    # Get the list of vendor names from the vendor_urls.csv file name, column
+    # titled "Vendor"
+    vendor_names = pd.read_csv(vendors_csv_file_name)["Vendor"].to_list()
+
+    # Add the ORCIDs and vendor names to the configuration dictionary to
+    # enforce column content to be in a set of values
     if "column_is_in" in configuration_dict:
         configuration_dict["column_is_in"]["Contributor"] = orcids
+        configuration_dict["column_is_in"]["Vendor"] = vendor_names
     else:
         configuration_dict["column_is_in"] = {"Contributor": orcids}
+        configuration_dict["column_is_in"] = {"Vendor": vendor_names}
 
     if "multi_value_column_is_in" in configuration_dict:
         configuration_dict["multi_value_column_is_in"]["Agree"] = orcids
@@ -101,7 +111,7 @@ def validate_reagent_resources(
     # Check that the Contributor ORCID appears in the Agree or Disagree column.
     # In most cases it will be in the Agree column. When the recommendation
     # is refuted (multiple validators disagreed with the original contributor)
-    # and changed to from Yes to No or vice versa the ORCIDs from the Agree and
+    # and changed from Yes to No or vice versa the ORCIDs from the Agree and
     # Disagree columns are swapped and the original contributors ORCID will appear
     # in the Disagree column.
     df["Agree"] = df["Agree"].apply(
@@ -159,7 +169,7 @@ def validate_reagent_resources(
     unique_target_conjugate = df[
         ["Target Name / Protein Biomarker", "Conjugate"]
     ].drop_duplicates()
-    # md_file_paths_from_csv, status = unique_target_conjugate.apply(
+
     res = unique_target_conjugate.apply(
         lambda target_conjugate: validate_supporting_material(
             target_conjugate, df, supporting_material_root_dir
@@ -314,10 +324,10 @@ def validate_supporting_material(
             ] = supporting_orcid_configurations["Disagree"].apply(
                 lambda x: frozenset([s[1:-1] for s in re.findall(orcid_pattern, x)])
             )
-            supporting_orcid_configurations[
-                "Contributor"
-            ] = supporting_orcid_configurations["Contributor"].apply(
-                lambda x: re.findall(orcid_pattern, x)[0][1:-1]
+            supporting_orcid_configurations["Contributor"] = (
+                supporting_orcid_configurations["Contributor"].apply(
+                    lambda x: re.findall(orcid_pattern, x)[0][1:-1]
+                )
             )
             # Compare the configuration data from the supporting material to that from the reagent_resources file.
             # We don't use DataFrame.equal because that assumes the order of the columns and indexes is the same,
@@ -362,6 +372,11 @@ def main(argv=None):
         type=lambda x: file_path_endswith(x, ".json"),
         help=".zenodo.json file which contains the ORCIDs of all contributors.",
     )
+    parser.add_argument(
+        "vendors_csv_file",
+        type=lambda x: csv_path(x, required_columns={"Vendor"}),
+        help="csv file containing all valid vendor names in a column titled 'Vendor'.",
+    )
     parser.add_argument(
         "supporting_material_root_dir",
         type=dir_path,
@@ -373,6 +388,7 @@ def main(argv=None):
         args.csv_file,
         args.json_config_file,
         args.zenodo_json_file,
+        args.vendors_csv_file,
         args.supporting_material_root_dir,
     )
 

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
@@ -449,25 +449,33 @@ def test_validate_image_resources(
 
 class TestReagentResourcesValidation(BaseTest):
     @pytest.mark.parametrize(
-        "json_config, input_csv, zenodo_json, supporting_material_root_dir, result",
+        "json_config, input_csv, zenodo_json, vendor_csv, supporting_material_root_dir, result",
         [
             (
                 "reagent_resources.json",
                 "reagent_resources.csv",
                 "zenodo.json",
+                "vendors_and_urls.csv",
                 "supporting_material",
                 0,
             ),
         ],
     )
     def test_validate_reagent_resources(
-        self, json_config, input_csv, zenodo_json, supporting_material_root_dir, result
+        self,
+        json_config,
+        input_csv,
+        zenodo_json,
+        vendor_csv,
+        supporting_material_root_dir,
+        result,
     ):
         assert (
             validate_reagent_resources(
                 str(self.data_path / input_csv),
                 str(self.data_path / json_config),
                 str(self.data_path / zenodo_json),
+                str(self.data_path / vendor_csv),
                 str(self.data_path / supporting_material_root_dir),
             )
             == result