RECETOX · hechth · Nov 7, 2024 · Nov 14, 2024 · Nov 14, 2024 · Nov 14, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 .vscode
 **/tool_test_output.html
 **/tool_test_output.json
+**/tmp*
+**/__pycache__
diff --git a/tools/tables/.shed.yml b/tools/tables/.shed.yml
@@ -0,0 +1,16 @@
+name: tables
+owner: recetox
+remote_repository_url: "https://github.com/RECETOX/galaxytools/tree/master/tools/tables"
+homepage_url: "https://github.com/RECETOX/galaxytools"
+categories:
+  - Metabolomics
+  - Statistics
+description: "Tools to manipulate and analyze data tables."
+long_description: "Tools to manipulate and analyze data tables. Current tools include interpolation using scipy, and arithmetic operations and column renaming with pandas."
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }} tool from the general purpose data analysis suite developed by RECETOX."
+suite:
+  name: suite_table_tools
+  description: This tool suite contains tools for general purpose data analysis built on top of pandas, scipy, dplyr and others.
+  type: repository_suite_definition
diff --git a/tools/tables/table_pandas_arithmetics.py b/tools/tables/table_pandas_arithmetics.py
@@ -0,0 +1,74 @@
+"""Apply an arithmetic operation with a scalar operand to one table column."""
+import argparse
+import operator
+
+from utils import LoadDataAction, StoreOutputAction
+
+# Maps each operation name accepted on the command line to the function that
+# implements it; the keys also define the valid ``--operation`` choices.
+OPERATIONS = {
+    'mul': operator.mul,
+    'sub': operator.sub,
+    'div': operator.truediv,
+    'add': operator.add,
+    'pow': operator.pow,
+}
+
+
+def perform_operation(df, column_index, operation, operand):
+    """Apply an arithmetic operation to one column of ``df`` in place.
+
+    Args:
+        df: Table whose selected column is overwritten with the result.
+        column_index: Base-1 position of the column to operate on.
+        operation: Name of the operation; one of the keys of ``OPERATIONS``.
+        operand: Right-hand operand applied to the whole column.
+
+    Returns:
+        The same ``df`` object that was passed in.
+
+    Raises:
+        IndexError: If ``column_index`` is outside ``1..len(df.columns)``.
+        ValueError: If ``operation`` is not supported.
+    """
+    # Reject 0 and negative values explicitly: after the base-1 to base-0
+    # shift they would wrap around and silently select a column from the end.
+    if not 1 <= column_index <= len(df.columns):
+        raise IndexError(
+            f"Column index {column_index} is out of range; "
+            f"expected a value between 1 and {len(df.columns)}"
+        )
+    if operation not in OPERATIONS:
+        raise ValueError(f"Unsupported operation: {operation}")
+
+    column_name = df.columns[column_index - 1]
+    df[column_name] = OPERATIONS[operation](df[column_name], operand)
+    return df
+
+
+def main(input_dataset, column_index, operation, operand, output_dataset):
+    """Apply the operation to the loaded table and write the result.
+
+    Args:
+        input_dataset: Table stored on the namespace by ``LoadDataAction``.
+        column_index: Base-1 position of the column to operate on.
+        operation: Name of the operation to perform.
+        operand: Operand for the operation.
+        output_dataset: ``(write_func, file_path)`` pair stored on the
+            namespace by ``StoreOutputAction``.
+    """
+    df = perform_operation(input_dataset, column_index, operation, operand)
+    write_func, file_path = output_dataset
+    write_func(df, file_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Perform arithmetic operations on a dataframe column.')
+    parser.add_argument('--input_dataset', nargs=2, action=LoadDataAction, required=True, help='Path to the input dataset and its file extension (csv, tsv, parquet)')
+    parser.add_argument('--column', type=int, required=True, help='Base-1 index of the column to perform the operation on')
+    parser.add_argument('--operation', type=str, choices=list(OPERATIONS), required=True, help='Arithmetic operation to perform')
+    parser.add_argument('--operand', type=float, required=True, help='Operand for the arithmetic operation')
+    parser.add_argument('--output_dataset', nargs=2, action=StoreOutputAction, required=True, help='Path to the output dataset and its file extension (csv, tsv, parquet)')
+
+    args = parser.parse_args()
+    main(args.input_dataset, args.column, args.operation, args.operand, args.output_dataset)
diff --git a/tools/tables/table_pandas_arithmetics.xml b/tools/tables/table_pandas_arithmetics.xml
@@ -0,0 +1,55 @@
+<tool id="pandas_arithmetics" name="pandas arithmetics" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>perform arithmetic operations on a dataframe column</description>
+    <macros>
+        <token name="@TOOL_VERSION@">2.2.3</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">pandas</requirement>
+        <requirement type="package" version="18.0.0">pyarrow</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '$__tool_directory__/table_pandas_arithmetics.py' 
+            --input_dataset '$input_dataset' '$input_dataset.ext' 
+            --column '$column' 
+            --operation '$operation' 
+            --operand '$operand' 
+            --output_dataset '$output_dataset' '$output_dataset.ext'
+    ]]></command>
+    <inputs>
+        <param name="input_dataset" type="data" format="csv,tsv,tabular,parquet" label="Input Dataset"/>
+        <param name="column" type="data_column" data_ref="input_dataset" use_header_names="true" label="Column" help="Column from the dataset to perform the computation on."/>
+        <param name="operation" type="select" label="Arithmetic Operation">
+            <option value="mul">Multiply</option>
+            <option value="sub">Subtract</option>
+            <option value="div">Divide</option>
+            <option value="add">Add</option>
+            <option value="pow">Power</option>
+        </param>
+        <param name="operand" type="float" label="Operand"/>
+    </inputs>
+    <outputs>
+        <data name="output_dataset" format_source="input_dataset" label="${tool.name} on ${on_string}">
+            <change_format>
+                <when input="input_dataset.ext" value="tsv" format="tabular" />
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_dataset" value="query.tabular" ftype="tabular"/>
+            <param name="column" value="3"/>
+            <param name="operation" value="div"/>
+            <param name="operand" value="100"/>
+            <output name="output_dataset" file="arithmetics/query_divide_ri.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help>
+        This tool performs arithmetic operations on a specified column of a dataframe.
+        Supported operations are: multiply, subtract, divide, add, and power.
+    </help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3509134</citation>
+        <citation type="doi">10.25080/Majora-92bf1922-00a</citation>
+    </citations>
+</tool>
diff --git a/tools/tables/table_pandas_rename_column.py b/tools/tables/table_pandas_rename_column.py
@@ -0,0 +1,80 @@
+"""Rename columns of a table selected by their base-1 position."""
+import argparse
+
+import pandas as pd
+
+from utils import LoadDataAction, StoreOutputAction
+
+
+class KeyValuePairsAction(argparse.Action):
+    """Parse ``INDEX=NAME`` arguments into a ``{index: name}`` dictionary."""
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        """Store the parsed pairs on ``namespace`` under ``self.dest``.
+
+        Raises:
+            argparse.ArgumentError: If an item lacks the ``=`` separator or
+                its key is not an integer.
+        """
+        key_value_pairs = {}
+        for item in values:
+            # Split on the first ``=`` only so a malformed item is reported
+            # as a usage error instead of an unpacking traceback.
+            key, separator, value = item.partition('=')
+            if not separator:
+                raise argparse.ArgumentError(self, f"expected INDEX=NAME, got '{item}'")
+            try:
+                index = int(key)
+            except ValueError:
+                raise argparse.ArgumentError(self, f"column index must be an integer, got '{key}'") from None
+            key_value_pairs[index] = value
+        setattr(namespace, self.dest, key_value_pairs)
+
+
+def rename_columns(df: pd.DataFrame, rename_dict: dict) -> pd.DataFrame:
+    """Return ``df`` with the columns at the given positions renamed.
+
+    Args:
+        df: Table whose columns should be renamed.
+        rename_dict: Maps the base-1 position of a column to its new name.
+
+    Returns:
+        The result of ``df.rename`` with the position-derived name mapping.
+
+    Raises:
+        IndexError: If a position is outside ``1..len(df.columns)``.
+    """
+    n_columns = len(df.columns)
+    # Reject 0 and negative positions explicitly: after the base-1 to base-0
+    # shift they would wrap around and silently rename a column from the end.
+    out_of_range = [key for key in rename_dict if not 1 <= key <= n_columns]
+    if out_of_range:
+        raise IndexError(
+            f"Column index out of range (expected 1 to {n_columns}): {out_of_range}"
+        )
+    rename_map = {df.columns[key - 1]: value for key, value in rename_dict.items()}
+    return df.rename(columns=rename_map)
+
+
+def main(input_dataset, rename_dict, output_dataset):
+    """Rename the selected columns of the loaded table and write the result.
+
+    Args:
+        input_dataset: Table stored on the namespace by ``LoadDataAction``.
+        rename_dict: Maps base-1 column positions to new column names.
+        output_dataset: ``(write_func, file_path)`` pair stored on the
+            namespace by ``StoreOutputAction``.
+    """
+    df = rename_columns(input_dataset, rename_dict)
+    write_func, file_path = output_dataset
+    write_func(df, file_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Rename columns in a dataframe.')
+    parser.add_argument('--input_dataset', nargs=2, action=LoadDataAction, required=True, help='Path to the input dataset and its file extension (csv, tsv, parquet)')
+    parser.add_argument('--rename', nargs='+', action=KeyValuePairsAction, required=True, help='List of key=value pairs with 1-based column index as key and new column name as value')
+    parser.add_argument('--output_dataset', nargs=2, action=StoreOutputAction, required=True, help='Path to the output dataset and its file extension (csv, tsv, parquet)')
+
+    args = parser.parse_args()
+    main(args.input_dataset, args.rename, args.output_dataset)
diff --git a/tools/tables/table_pandas_rename_column.xml b/tools/tables/table_pandas_rename_column.xml
@@ -0,0 +1,90 @@
+<tool id="table_pandas_rename_column" name="rename column" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>of a table</description>
+    <!-- <xrefs>
+        <xref type="bio.tools"></xref>
+    </xrefs> -->
+    <macros>
+        <token name="@TOOL_VERSION@">2.2.3</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <!-- TODO: please annotate this tool with topics and operations from http://edamontology.org -->
+    <!-- TODO: for more information see: https://galaxy-iuc-standards.readthedocs.io/en/latest/best_practices/tool_xml.html#edam-topics-and-operations -->
+    <!-- <edam_topics>
+        <edam_topic>topic_TODO</edam_topic>
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_TODO</edam_operation>
+    </edam_operations> -->
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">pandas</requirement>
+        <!-- The inputs below accept parquet; same pyarrow pin as the sibling table tools. -->
+        <requirement type="package" version="18.0.0">pyarrow</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #set rename_dict = " ".join([str($key_value_pair.column) + '=' + str($key_value_pair.new_name) for $key_value_pair in $columns_selection])
+        python3 '$__tool_directory__/table_pandas_rename_column.py'
+            --input_dataset '$input_dataset' '$input_dataset.ext'
+            --rename $rename_dict
+            --output_dataset '$output_dataset' '$output_dataset.ext'
+    ]]></command>
+    <inputs>
+        <param name="input_dataset" type="data" format="csv,tsv,tabular,parquet" label="Input Dataset"/>
+        <repeat name="columns_selection" title="Rename column" min="1">
+            <param name="column" type="data_column" data_ref="input_dataset" use_header_names="true" label="Column" help="Column from the dataset to rename."/>
+            <param argument="new_name" type="text" value="" label="New column name" help="New name for the column">
+                <sanitizer invalid_char="">
+                    <valid initial="string.letters,string.digits">
+                        <add value="_" />
+                    </valid>
+                </sanitizer>
+                <validator type="regex">[0-9a-zA-Z_]+</validator>
+            </param>
+        </repeat>
+    </inputs>
+    <outputs>
+        <data name="output_dataset" format_source="input_dataset" label="${tool.name} on ${on_string}">
+            <change_format>
+                <when input="input_dataset.ext" value="tsv" format="tabular" />
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_dataset" value="reference.txt" ftype="tabular"/>
+            <param name="column" value="2"/>
+            <param name="new_name" value="retention_time"/>
+            <output name="output_dataset" file="rename/reference_rt_renamed.tabular" ftype="tabular"/>
+        </test>
+        <test>
+            <param name="input_dataset" value="reference.txt" ftype="tabular"/>
+            <repeat name="columns_selection">
+                <param name="column" value="2"/>
+                <param name="new_name" value="retention_time"/>
+            </repeat>
+            <repeat name="columns_selection">
+                <param name="column" value="1"/>
+                <param name="new_name" value="retention_index"/>
+            </repeat>
+            <output name="output_dataset" file="rename/reference_both_renamed.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Renames one or more columns of a table.
+
+**Input**
+
+A table and, for each column to rename, the column and its new name.
+New names may only contain letters, digits and underscores.
+
+**Output**
+
+A table in which the selected columns carry their new names.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.5281/zenodo.3509134</citation>
+    </citations>
+</tool>
diff --git a/tools/tables/table_scipy_interpolate.py b/tools/tables/table_scipy_interpolate.py
@@ -0,0 +1,99 @@
+"""Interpolate values for a query table from a reference table."""
+import argparse
+
+import numpy as np
+from scipy.interpolate import Akima1DInterpolator, CubicSpline, PchipInterpolator
+
+from utils import LoadDataAction, StoreOutputAction
+
+# Maps each method name accepted on the command line to its interpolator;
+# the keys also define the valid ``--method`` choices.
+INTERPOLATORS = {
+    "linear": np.interp,
+    "cubic": CubicSpline,
+    "pchip": PchipInterpolator,
+    "akima": Akima1DInterpolator,
+}
+
+
+class InterpolationModelAction(argparse.Action):
+    """Replace the method name from the command line by its interpolator."""
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        """Store the interpolator for ``values`` on ``namespace``.
+
+        Raises:
+            ValueError: If ``values`` is not a key of ``INTERPOLATORS``.
+        """
+        if values not in INTERPOLATORS:
+            raise ValueError(f"Unknown interpolation method: {values}")
+        setattr(namespace, self.dest, INTERPOLATORS[values])
+
+
+def _column_name(table, index):
+    """Return the name of the column at base-1 position ``index``.
+
+    Raises:
+        IndexError: If ``index`` is outside ``1..len(table.columns)``; 0 and
+            negative values would otherwise wrap around to a column counted
+            from the end.
+    """
+    if not 1 <= index <= len(table.columns):
+        raise IndexError(
+            f"Column index {index} is out of range; "
+            f"expected a value between 1 and {len(table.columns)}"
+        )
+    return table.columns[index - 1]
+
+
+def main(reference, query, x_col, y_col, xnew_col, model, output_dataset):
+    """Interpolate the reference y column at the query x values and write it.
+
+    The interpolated values are stored in ``query`` under the name of the
+    reference y column, overwriting a query column of that name if present.
+
+    Args:
+        reference: Table holding the known x and y values.
+        query: Table holding the x values to interpolate at; modified in place.
+        x_col: Base-1 position of the x column in ``reference``.
+        y_col: Base-1 position of the y column in ``reference``.
+        xnew_col: Base-1 position of the x column in ``query``.
+        model: ``np.interp`` or an interpolator class called as
+            ``model(x, y)(xnew)``.
+        output_dataset: ``(write_func, file_path)`` pair stored on the
+            namespace by ``StoreOutputAction``.
+
+    Raises:
+        IndexError: If a column position is out of range.
+    """
+    x_name = _column_name(reference, x_col)
+    y_name = _column_name(reference, y_col)
+    xnew_name = _column_name(query, xnew_col)
+
+    # The interpolators need the reference x values in increasing order;
+    # np.interp does not check this and returns meaningless values otherwise.
+    reference = reference.sort_values(by=x_name)
+    x_values = reference[x_name]
+    y_values = reference[y_name]
+
+    if model is np.interp:
+        query[y_name] = model(query[xnew_name], x_values, y_values)
+    else:
+        query[y_name] = model(x_values, y_values)(query[xnew_name]).astype(float)
+
+    write_func, file_path = output_dataset
+    write_func(query, file_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Interpolate data using various methods.')
+    parser.add_argument('--reference_dataset', nargs=2, action=LoadDataAction, required=True, help='Path to the reference dataset and its file extension (csv, tsv, parquet)')
+    parser.add_argument('--query_dataset', nargs=2, action=LoadDataAction, required=True, help='Path to the query dataset and its file extension (csv, tsv, parquet)')
+    parser.add_argument('--x_col', type=int, required=True, help='Index of the x column in the reference dataset (1-based)')
+    parser.add_argument('--y_col', type=int, required=True, help='Index of the y column in the reference dataset (1-based)')
+    parser.add_argument('--xnew_col', type=int, required=True, help='Index of the x column in the query dataset (1-based)')
+    parser.add_argument('--method', type=str, choices=list(INTERPOLATORS), action=InterpolationModelAction, required=True, help='Interpolation method')
+    parser.add_argument('--output_dataset', nargs=2, action=StoreOutputAction, required=True, help='Path to the output dataset and its file extension (csv, tsv, parquet)')
+
+    args = parser.parse_args()
+    main(args.reference_dataset, args.query_dataset, args.x_col, args.y_col, args.xnew_col, args.method, args.output_dataset)
diff --git a/tools/tables/table_scipy_interpolate.xml b/tools/tables/table_scipy_interpolate.xml
@@ -0,0 +1,88 @@
+<tool id="scipy_interpolate" name="scipy interpolate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>interpolate data using the scipy.interpolate library</description>
+    <!-- <xrefs>
+        <xref type="bio.tools"></xref>
+    </xrefs> -->
+    <macros>
+        <token name="@TOOL_VERSION@">1.14.1</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <!-- TODO: please annotate this tool with topics and operations from http://edamontology.org -->
+    <!-- TODO: for more information see: https://galaxy-iuc-standards.readthedocs.io/en/latest/best_practices/tool_xml.html#edam-topics-and-operations -->
+    <!-- <edam_topics>
+        <edam_topic>topic_TODO</edam_topic>
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_TODO</edam_operation>
+    </edam_operations> -->
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">scipy</requirement>
+        <requirement type="package" version="2.2.3">pandas</requirement>
+        <requirement type="package" version="18.0.0">pyarrow</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python3 '${__tool_directory__}/table_scipy_interpolate.py' 
+        --reference_dataset '$reference_dataset' '$reference_dataset.ext'
+        --query_dataset '$query_dataset' '$query_dataset.ext'
+        --x_col $x_col
+        --y_col $y_col
+        --xnew_col $xnew_col
+        --method '$method'
+        --output_dataset '$output_dataset' '$output_dataset.ext'
+        ]]></command>
+
+    <inputs>
+        <param argument="--reference_dataset" type="data" format="tabular,csv,tsv,parquet" label="Reference data" help="Reference dataset to use for the interpolation" />
+        <param name="x_col" type="data_column" data_ref="reference_dataset" use_header_names="true" label="x (reference)" help="Column from the reference dataset to use as X axis for the interpolator."/>
+        <param name="y_col" type="data_column" data_ref="reference_dataset" use_header_names="true" label="y (reference)" help="Column from the reference dataset to use as Y axis for the interpolator."/>
+        <param argument="--query_dataset" type="data" format="tabular,csv,tsv,parquet" label="Query dataset" help="Query dataset for which to interpolate the values." />
+        <param name="xnew_col" type="data_column" data_ref="query_dataset" use_header_names="true" label="x (query)" help="Column from the query dataset for which to interpolate."/>
+
+        <param name="method" type="select" label="Interpolation method" help="Interpolation method from scipy to use. For more details see [1].">
+            <option value="linear">Piecewise linear</option>
+            <option value="cubic" selected="true">Cubic spline</option>
+            <option value="pchip">Pchip</option>
+            <option value="akima">Akima1D</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output_dataset" format_source="query_dataset" label="${tool.name} on ${on_string}">
+            <change_format>
+                <when input="query_dataset.ext" value="tsv" format="tabular" />
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="reference_dataset" value="reference.txt" ftype="tabular"/>
+            <param name="x_col" value="1"/>
+            <param name="y_col" value="2"/>
+            <param name="query_dataset" value="query.tabular" ftype="tabular"/>
+            <param name="xnew_col" value="3"/>
+            <output name="output_dataset" file="interpolate/query_interpolate_rt.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+This tool interpolates the values for a column to be added
+
+Usage
+.....
+[1] (https://docs.scipy.org/doc/scipy/tutorial/interpolate.html)
+
+**Input**
+Input a reference table and choose the X (source) and Y (target) columns for the interpolation.
+Choose a query table and the X (source) column for which to calculate the new Y values.
+
+
+**Output**
+A table with the interpolated Y column.
+
+    ]]></help>
+    <citations>
+        <citation type="doi">10.25080/Majora-92bf1922-00a</citation>
+        <citation type="doi">10.1038/s41592-019-0686-2</citation>
+    </citations>
+</tool>