Stub changes to support multiple distances for clustering

blab · Aug 7, 2024 · 83abb22 · 83abb22
1 parent 85b2e7a
commit 83abb22
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 1 deletion.
diff --git a/src/pathogen_embed/__main__.py b/src/pathogen_embed/__main__.py
@@ -81,7 +81,7 @@ def make_parser_cluster():
     )
     exclusive_input_group = input_group.add_mutually_exclusive_group(required=True)
     exclusive_input_group.add_argument("--embedding", help="an embedding to assign cluster labels to using Euclidean distance between input records")
-    exclusive_input_group.add_argument("--distance-matrix", help="a distance matrix to assign cluster labels to using the given precomputed values as the distance between input records")
+    exclusive_input_group.add_argument("--distance-matrix", nargs="+", help="one or more distance matrix files to assign cluster labels to using the given precomputed values as the distance between input records")
 
     options_group = parser.add_argument_group(
         "Options",

diff --git a/tests/pathogen-cluster-by-multiple-distances.t b/tests/pathogen-cluster-by-multiple-distances.t
@@ -0,0 +1,28 @@
+Get a distance matrix from an H3N2 HA alignment.
+
+  $ pathogen-distance \
+  >   --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
+  >   --output ha_distances.csv
+
+Get a distance matrix from an H3N2 NA alignment.
+
+  $ pathogen-distance \
+  >   --alignment $TESTDIR/data/h3n2_na_alignment.sorted.fasta \
+  >   --output na_distances.csv
+
+Find clusters from the genetic distances for both HA and NA.
+
+  $ pathogen-cluster \
+  >   --distance-matrix ha_distances.csv na_distances.csv \
+  >   --label-attribute genetic_label \
+  >   --distance-threshold 0.5 \
+  >   --output-dataframe cluster_distances.csv
+
+There should be one record in the cluster output for each record in the input distances.
+
+  $ [[ $(wc -l < cluster_distances.csv) == $(wc -l < ha_distances.csv) ]]
+
+The header should include the index strain, each strain from the distance matrix, and the requested cluster label.
+
+  $ head -n 1 cluster_distances.csv
+  strain,.*,genetic_label (re)