From 4e7fea0b0788364444fcfff4050511ffe0ad21a1 Mon Sep 17 00:00:00 2001 From: Ziv Yaniv Date: Wed, 1 Feb 2023 21:13:12 -0500 Subject: [PATCH 1/2] Updating the reagent resources table creation code. --- CHANGELOG.md | 8 +++- pyproject.toml | 2 +- .../argparse_types.py | 10 +++++ .../reagent_resources_csv_2_md_url.py | 44 ++++++++++++++----- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 645787a..1bbec4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,12 +15,18 @@ Each release should describe the changes using the following subsection types: When working on the package, add information under the "Unreleased" heading. In this manner the release notes are created incrementally, and do not require a concerted effort prior to a release. -Using a manual approach to create the release notes instead of automatically deriving them from the +Using a manual approach to create the release notes instead of automatically deriving them from the commits allows us to provide a high level description of the features and issues, yet provide details when those are needed. This is equivalent to summarizing all activity on a feature branch versus reporting all commits on that branch. ## Unreleased +## v0.3.2 + +### Changed +* reagent_resources_csv_2_md_url - Update the automatic path to supporting material creation. The paths cannot include parentheses, so those are replaced with underscores. Additionally, the insertion of the table into the input markdown file is done via the string `replace` and not the `format` method because the use of `format` precludes the presence of curly braces in the template file. We now need to use curly braces in the input markdown file so that the table has an id value when the markdown is converted to html. 
+ + ## v0.3.0 ### Added diff --git a/pyproject.toml b/pyproject.toml index dda30b3..8724233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "ibex_imaging_knowledge_base_utilities" -version = "0.3.0" +version = "0.3.2" authors = [{ name="Ziv Yaniv", email="zivyaniv@nih.gov" }, ] description = "Utility scripts used for maintaining the IBEX Imaging Community Knowledge-Base" diff --git a/src/ibex_imaging_knowledge_base_utilities/argparse_types.py b/src/ibex_imaging_knowledge_base_utilities/argparse_types.py index 660747c..40669e0 100644 --- a/src/ibex_imaging_knowledge_base_utilities/argparse_types.py +++ b/src/ibex_imaging_knowledge_base_utilities/argparse_types.py @@ -22,6 +22,16 @@ # definitions of argparse types, enables argparse to validate the command line parameters +def file_path_endswith_md_in(path): + p = pathlib.Path(path) + if p.is_file() and path.endswith(".md.in"): + return p + else: + raise argparse.ArgumentTypeError( + f'Invalid argument ({path}), not a file path, file does not exist, or path does not end with ".md.in".' 
+ ) + + def file_path(path): p = pathlib.Path(path) if p.is_file(): diff --git a/src/ibex_imaging_knowledge_base_utilities/reagent_resources_csv_2_md_url.py b/src/ibex_imaging_knowledge_base_utilities/reagent_resources_csv_2_md_url.py index ce18e73..de53866 100644 --- a/src/ibex_imaging_knowledge_base_utilities/reagent_resources_csv_2_md_url.py +++ b/src/ibex_imaging_knowledge_base_utilities/reagent_resources_csv_2_md_url.py @@ -20,7 +20,7 @@ import argparse import sys import pathlib -from .argparse_types import file_path, dir_path +from .argparse_types import file_path, file_path_endswith_md_in, dir_path import requests import json from itertools import chain @@ -71,9 +71,9 @@ def short_circuit_requests_get(url, params=None, **kwargs): def json_to_md_str_dict(json_file_path): with open(json_file_path) as fp: - json_dict = json.load(fp) + data_dict = json.load(fp) md_str_dict = {} - for raw_text, url_target in json_dict.items(): + for raw_text, url_target in data_dict.items(): try: res = requests.get( url_target, @@ -95,6 +95,13 @@ def json_to_md_str_dict(json_file_path): return md_str_dict +def replace_char_list(input_str, change_chars_list, replacement_char): + for c in change_chars_list: + if c in input_str: + input_str = input_str.replace(c, replacement_char) + return input_str + + def data_to_md_str(data, supporting_material_root_dir): """ The data parameter is a series with three entries: @@ -110,11 +117,19 @@ def data_to_md_str(data, supporting_material_root_dir): urls_str = "" txt = [v.strip() for v in data[0].split(";") if v.strip() != ""] for v in txt[0:-1]: - # Replace all spaces and slashes with underscores so that the path - # matches the expected supporting material path - tc_subpath = f"{data[1]}_{data[2]}".replace(" ", "_").replace("/", "_") + # Replace spaces, slashes and brackets with underscores assume that the + # file exists, data validation happens prior to conversion of data to markdown. 
+ tc_subpath = replace_char_list( + input_str=f"{data[1]}_{data[2]}", + change_chars_list=[" ", "\t", "/", "\\", "{", "}", "[", "]", "(", ")"], + replacement_char="_", + ) urls_str += f"[{v}]({supporting_material_root_dir}/{tc_subpath}/{v}.md), " - tc_subpath = f"{data[1]}_{data[2]}".replace(" ", "_").replace("/", "_") + tc_subpath = replace_char_list( + input_str=f"{data[1]}_{data[2]}", + change_chars_list=[" ", "\t", "/", "\\", "{", "}", "[", "]", "(", ")"], + replacement_char="_", + ) urls_str += ( f"[{txt[-1]}]({supporting_material_root_dir}/{tc_subpath}/{txt[-1]}.md)" ) @@ -172,7 +187,12 @@ def csv_to_md_with_url( """ # Read the dataframe and keep entries that are "NA", don't convert to nan df = pd.read_csv(csv_file_path, dtype=str, keep_default_na=False) - df.sort_values(by=["Target Name / Protein Biomarker"], inplace=True) + # Sort dataframe according to target, ignoring case. + df.sort_values( + by=["Target Name / Protein Biomarker"], + inplace=True, + key=lambda x: x.str.lower(), + ) supporting_material_path = pathlib.PurePath(supporting_material_root_dir).name if not df.empty: print("Start linking to supporting material...") @@ -216,9 +236,11 @@ def csv_to_md_with_url( with open(template_file_path, "r") as fp: input_md_str = fp.read() - with open(supporting_material_root_dir.parent / "reagent_resources.md", "w") as fp: + with open(supporting_material_root_dir.parent / template_file_path.stem, "w") as fp: fp.write( - input_md_str.format(reagent_resources_table=df.to_markdown(index=False)) + input_md_str.replace( + "{reagent_resources_table}", df.to_markdown(index=False) + ) ) return 0 @@ -231,7 +253,7 @@ def main(argv=None): ) parser.add_argument( "md_template_file", - type=file_path, + type=file_path_endswith_md_in, help='Path to template markdown file which contains the string "{reagent_resources_table}".', ) parser.add_argument( From 097d0883dcf0eac1b3fba7cee2cc0a1981ca1d5d Mon Sep 17 00:00:00 2001 From: Ziv Yaniv Date: Thu, 2 Feb 2023 11:43:09 -0500 
Subject: [PATCH 2/2] Updated md5 hash for bib2md results. Change in pandoc, new version 3.0.1 leads to slightly different results. --- tests/test_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 67d1eb8..049f193 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -94,7 +94,7 @@ def test_fluorescent_probe_csv_to_md( class TestBib2MD(BaseTest): @pytest.mark.parametrize( "bib_file_name, csl_file_name, result_md5hash", - [("publications.bib", "ibex.csl", "61f01467fe88de1f686afcbbd4abaed7")], + [("publications.bib", "ibex.csl", "b95a58740183fb04079027610e3d06c1")], ) def test_bib_2_md(self, bib_file_name, csl_file_name, result_md5hash, tmp_path): # Write the output using the tmp_path fixture