From 32e46880e5979b5cc08c4d55576cccf02e769ce7 Mon Sep 17 00:00:00 2001
From: Eric Kofman <ekofman@tscc-1-13.tscc.sdsc.edu>
Date: Wed, 20 Nov 2024 22:43:43 -0800
Subject: [PATCH] dist filter test

---
 src/read_process.py                           |  4 +-
 src/utils.py                                  |  2 +-
 tests/integration_tests.ipynb                 | 63 +++++++++++++++----
 tests/integration_tests_auto_check.py         | 54 ++++++++++++----
 tests/integration_tests_run.sh                |  2 +-
 .../scripts/edge_case_dist_filter_test.sh     | 20 ++++++
 6 files changed, 116 insertions(+), 29 deletions(-)
 create mode 100644 tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh

diff --git a/src/read_process.py b/src/read_process.py
index b659c8e..9c8fc98 100644
--- a/src/read_process.py
+++ b/src/read_process.py
@@ -229,10 +229,10 @@ def get_read_information(read, contig, barcode_tag='CB', verbose=False, stranded
         distance_from_read_end = np.min([updated_position - reference_start, reference_end - updated_position])
 
         if distance_from_read_end < dist_from_end:
-            return 'read_end_proximal'.format(dist_from_end), [], {}
+            continue
 
         if int(qual) < min_base_quality:
-            return 'baseq_low'.format(min_base_quality), [], {}
+            continue
             
         # If we have been provided with a barcode CB (single-cell), we need to preset our contigs to match
         # the contigs that will be present in the reconfigured bams, ie. 9_GATCCCTCAGTAACGG-1 instead of 9.
diff --git a/src/utils.py b/src/utils.py
index c59a6c3..ff04d72 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -9,7 +9,7 @@
 from collections import OrderedDict, defaultdict
 from itertools import product
 
-cb_n = 3
+cb_n = 2
 
 def generate_permutations_list_for_CB(n):
     """
diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb
index 037d756..25464a8 100644
--- a/tests/integration_tests.ipynb
+++ b/tests/integration_tests.ipynb
@@ -2889,7 +2889,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 16,
    "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387",
    "metadata": {},
    "outputs": [
@@ -2900,6 +2900,9 @@
       "\n",
       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
       "Checking results for edge_case_test\n",
+      "\n",
+      "   >>> Found 2 edits total, as expected <<<\n",
+      "\n",
       "\tExpecting: {'contig': '10', 'barcode': 'AAACGAATCATTCATC-1', 'position': 132330481, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'T>G', 'strand_conversion': 'A>C', 'strand': '-', 'feature_name': 'STK32C', 'feature_strand': '-'}\n",
       "\n",
       "\t >>> edge_case_test passed! <<<\n",
@@ -2910,6 +2913,16 @@
       "\n",
       "\n",
       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+      "Checking results for edge_case_filter_dist_test\n",
+      "\n",
+      "   >>> Found 1 edits total, as expected <<<\n",
+      "\n",
+      "\tExpecting: {'contig': '10', 'barcode': 'AAACGAATCATTCATC-1', 'position': 132330438, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'T>C', 'strand_conversion': 'A>G', 'strand': '-', 'feature_name': 'STK32C', 'feature_strand': '-'}\n",
+      "\n",
+      "\t >>> edge_case_filter_dist_test passed! <<<\n",
+      "\n",
+      "\n",
+      "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
       "Checking results for unstranded_pair_test\n",
       "\tExpecting: {'contig': 'Citrine.dna', 'position': 435, 'count': 22, 'coverage': 22, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+'}\n",
       "\n",
@@ -3012,6 +3025,7 @@
     "test_name_to_expectations = {\n",
     "    \"edge_case_test\": {\n",
     "        \"folder\": \"singlecell_tests\",\n",
+    "        \"total_edit_sites\": 2,\n",
     "        \"expectations\": [{\n",
     "            \"contig\": \"10\",\n",
     "            \"barcode\": \t\"AAACGAATCATTCATC-1\",\n",
@@ -3037,8 +3051,24 @@
     "            \"strand\": \"-\",\n",
     "            \"feature_name\": \"STK32C\",\n",
     "            \"feature_strand\": \"-\"\n",
-    "        }\n",
-    "                        ]\n",
+    "        }]\n",
+    "    },\n",
+    "    \"edge_case_filter_dist_test\": {\n",
+    "        \"folder\": \"singlecell_tests\",\n",
+    "        \"total_edit_sites\": 1,\n",
+    "        \"expectations\": [{\n",
+    "            \"contig\": \"10\",\n",
+    "            \"barcode\": \t\"AAACGAATCATTCATC-1\",\n",
+    "            \"position\": 132330438,\n",
+    "            \"num_rows\": 1,\n",
+    "            \"count\": 1,\n",
+    "            \"coverage\": 1,\n",
+    "            \"conversion\": \"T>C\",\n",
+    "            \"strand_conversion\": \"A>G\",\n",
+    "            \"strand\": \"-\",\n",
+    "            \"feature_name\": \"STK32C\",\n",
+    "            \"feature_strand\": \"-\"\n",
+    "        }]\n",
     "    },\n",
     "    \"unstranded_pair_test\": {\n",
     "        \"folder\": \"strandedness_tests\",\n",
@@ -3251,32 +3281,41 @@
     "failures = 0\n",
     "for test_name, info in test_name_to_expectations.items():\n",
     "    print(\"\\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\\nChecking results for {}\".format(test_name))\n",
-    "    \n",
+    "\n",
+    "    failure = False\n",
+    "\n",
+    "    folder = info.get(\"folder\")\n",
+    "    final_filtered_site_info_annotated = \"{}/{}/final_filtered_site_info_annotated.tsv\".format(folder, test_name)\n",
+    "    final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\\t', index_col=0)\n",
+    "\n",
+    "    if \"total_edit_sites\" in info:\n",
+    "        try:\n",
+    "            assert(len(final_filtered_site_info_annotated_df) == info.get(\"total_edit_sites\"))\n",
+    "            print(\"\\n   >>> Found {} edits total, as expected <<<\\n\".format(info.get(\"total_edit_sites\")))\n",
+    "        except Exception as e:\n",
+    "            print(\"Exception:\\n\\tNum total rows in final_filtered_site_info_annotated.tsv expected: {}, was {}\".format(info.get(\"total_edit_sites\"), len(final_filtered_site_info_annotated_df)))\n",
+    "            failure = True \n",
+    "            failures += 1\n",
+    "            continue \n",
+    "            \n",
     "    expectations_list = info.get(\"expectations\")\n",
     "    for expectations in expectations_list:\n",
+    "        failure = False\n",
     "        print(\"\\tExpecting: {}\".format(expectations))\n",
-    "    \n",
-    "              \n",
-    "        folder = info.get(\"folder\")\n",
     "        \n",
     "        contig = expectations.get(\"contig\")\n",
     "        barcode = expectations.get(\"barcode\", None)\n",
     "        \n",
     "        position = expectations.get(\"position\")\n",
     "        \n",
-    "        final_filtered_site_info_annotated = \"{}/{}/final_filtered_site_info_annotated.tsv\".format(folder, test_name)\n",
-    "        final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\\t', index_col=0)\n",
-    "    \n",
     "        row_of_interest = final_filtered_site_info_annotated_df[\n",
     "            (final_filtered_site_info_annotated_df['position'] == position) &\\\n",
     "            (final_filtered_site_info_annotated_df['contig'].astype(str) == contig)\n",
     "        ]\n",
     "    \n",
-    "    \n",
     "        if barcode:\n",
     "            row_of_interest = row_of_interest[row_of_interest.barcode == barcode]\n",
     "    \n",
-    "        failure = False\n",
     "        try:\n",
     "            assert(len(row_of_interest) == expectations.get(\"num_rows\", 1))\n",
     "        except Exception as e:\n",
diff --git a/tests/integration_tests_auto_check.py b/tests/integration_tests_auto_check.py
index 2089624..d851aa0 100755
--- a/tests/integration_tests_auto_check.py
+++ b/tests/integration_tests_auto_check.py
@@ -5,8 +5,9 @@
 
 
 test_name_to_expectations = {
-    "edge_case_test": {
+"edge_case_test": {
         "folder": "singlecell_tests",
+        "total_edit_sites": 2,
         "expectations": [{
             "contig": "10",
             "barcode": 	"AAACGAATCATTCATC-1",
@@ -32,8 +33,24 @@
             "strand": "-",
             "feature_name": "STK32C",
             "feature_strand": "-"
-        }
-                        ]
+        }]
+    },
+    "edge_case_filter_dist_test": {
+        "folder": "singlecell_tests",
+        "total_edit_sites": 1,
+        "expectations": [{
+            "contig": "10",
+            "barcode": 	"AAACGAATCATTCATC-1",
+            "position": 132330438,
+            "num_rows": 1,
+            "count": 1,
+            "coverage": 1,
+            "conversion": "T>C",
+            "strand_conversion": "A>G",
+            "strand": "-",
+            "feature_name": "STK32C",
+            "feature_strand": "-"
+        }]
     },
     "unstranded_pair_test": {
         "folder": "strandedness_tests",
@@ -245,35 +262,46 @@
 
 print("Current directory: {}".format(os.getcwd()))
 
-# Check results of each test
+# Check out results of each test
 failures = 0
-for test_name, info in test_name_to_expectations.items():        
+for test_name, info in test_name_to_expectations.items():
     print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nChecking results for {}".format(test_name))
-    
+
+    failure = False
+
+    folder = info.get("folder")
+    final_filtered_site_info_annotated = "{}/{}/final_filtered_site_info_annotated.tsv".format(folder, test_name)
+    final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\t', index_col=0)
+
+    if "total_edit_sites" in info:
+        try:
+            assert(len(final_filtered_site_info_annotated_df) == info.get("total_edit_sites"))
+            print("\n   >>> Found {} edits total, as expected <<<\n".format(info.get("total_edit_sites")))
+        except Exception as e:
+            print("Exception:\n\tNum total rows in final_filtered_site_info_annotated.tsv expected: {}, was {}".format(info.get("total_edit_sites"), len(final_filtered_site_info_annotated_df)))
+            failure = True 
+            failures += 1
+            continue 
+            
     expectations_list = info.get("expectations")
     for expectations in expectations_list:
+        failure = False
         print("\tExpecting: {}".format(expectations))
-    
-              
-        folder = info.get("folder")
         
         contig = expectations.get("contig")
         barcode = expectations.get("barcode", None)
         
         position = expectations.get("position")
         
-        final_filtered_site_info_annotated = "{}/{}/final_filtered_site_info_annotated.tsv".format(folder, test_name)
-        final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\t', index_col=0)
-    
         row_of_interest = final_filtered_site_info_annotated_df[
             (final_filtered_site_info_annotated_df['position'] == position) &\
             (final_filtered_site_info_annotated_df['contig'].astype(str) == contig)
         ]
     
-    
         if barcode:
             row_of_interest = row_of_interest[row_of_interest.barcode == barcode]
     
+    
         failure = False
         try:
             assert(len(row_of_interest) == expectations.get("num_rows", 1))
diff --git a/tests/integration_tests_run.sh b/tests/integration_tests_run.sh
index 9f12804..f7ab6be 100755
--- a/tests/integration_tests_run.sh
+++ b/tests/integration_tests_run.sh
@@ -34,7 +34,7 @@ echo "SC tests scripts"
 ls -lh $MARINE/tests/$tests_folder/scripts/
 
 
-for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test"
+for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test"
 
 do
     echo $t
diff --git a/tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh b/tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh
new file mode 100644
index 0000000..33679ce
--- /dev/null
+++ b/tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh
@@ -0,0 +1,20 @@
+# This sample has an edit at the very end of the read. But we want to make sure it is not included if dist from end filter
+# is active.
+
+mypython=$1
+
+$mypython $MARINE/marine.py \
+--bam_filepath \
+$MARINE/tests/singlecell_tests/bams/10_C-1_orig.bam \
+--annotation_bedfile_path \
+$MARINE/annotations/cellranger-GRCh38-3.0.0.annotation.genes.bed \
+--output_folder \
+$MARINE/tests/singlecell_tests/edge_case_filter_dist_test \
+--min_base_quality \
+15 \
+--cores \
+4 \
+--min_dist_from_end 2 \
+--barcode_tag "CB" \
+--strandedness 2 --num_per_sublist 1 --verbose \
+--contigs 10 --interval_length 40000000 --keep_intermediate_files
\ No newline at end of file