From 32e46880e5979b5cc08c4d55576cccf02e769ce7 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Wed, 20 Nov 2024 22:43:43 -0800 Subject: [PATCH] dist filter test --- src/read_process.py | 4 +- src/utils.py | 2 +- tests/integration_tests.ipynb | 63 +++++++++++++++---- tests/integration_tests_auto_check.py | 54 ++++++++++++---- tests/integration_tests_run.sh | 2 +- .../scripts/edge_case_dist_filter_test.sh | 20 ++++++ 6 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh diff --git a/src/read_process.py b/src/read_process.py index b659c8e..9c8fc98 100644 --- a/src/read_process.py +++ b/src/read_process.py @@ -229,10 +229,10 @@ def get_read_information(read, contig, barcode_tag='CB', verbose=False, stranded distance_from_read_end = np.min([updated_position - reference_start, reference_end - updated_position]) if distance_from_read_end < dist_from_end: - return 'read_end_proximal'.format(dist_from_end), [], {} + continue if int(qual) < min_base_quality: - return 'baseq_low'.format(min_base_quality), [], {} + continue # If we have been provided with a barcode CB (single-cell), we need to preset our contigs to match # the contigs that will be present in the reconfigured bams, ie. 9_GATCCCTCAGTAACGG-1 instead of 9. diff --git a/src/utils.py b/src/utils.py index c59a6c3..ff04d72 100644 --- a/src/utils.py +++ b/src/utils.py @@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict from itertools import product -cb_n = 3 +cb_n = 2 def generate_permutations_list_for_CB(n): """ diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index 037d756..25464a8 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2900,6 +2900,9 @@ "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for edge_case_test\n", + "\n", + " >>> Found 2 edits total, as expected <<<\n", + "\n", "\tExpecting: {'contig': '10', 'barcode': 'AAACGAATCATTCATC-1', 'position': 132330481, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'T>G', 'strand_conversion': 'A>C', 'strand': '-', 'feature_name': 'STK32C', 'feature_strand': '-'}\n", "\n", "\t >>> edge_case_test passed! <<<\n", @@ -2910,6 +2913,16 @@ "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "Checking results for edge_case_filter_dist_test\n", + "\n", + " >>> Found 1 edits total, as expected <<<\n", + "\n", + "\tExpecting: {'contig': '10', 'barcode': 'AAACGAATCATTCATC-1', 'position': 132330438, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'T>C', 'strand_conversion': 'A>G', 'strand': '-', 'feature_name': 'STK32C', 'feature_strand': '-'}\n", + "\n", + "\t >>> edge_case_filter_dist_test passed! <<<\n", + "\n", + "\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for unstranded_pair_test\n", "\tExpecting: {'contig': 'Citrine.dna', 'position': 435, 'count': 22, 'coverage': 22, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+'}\n", "\n", @@ -3012,6 +3025,7 @@ "test_name_to_expectations = {\n", " \"edge_case_test\": {\n", " \"folder\": \"singlecell_tests\",\n", + " \"total_edit_sites\": 2,\n", " \"expectations\": [{\n", " \"contig\": \"10\",\n", " \"barcode\": \t\"AAACGAATCATTCATC-1\",\n", @@ -3037,8 +3051,24 @@ " \"strand\": \"-\",\n", " \"feature_name\": \"STK32C\",\n", " \"feature_strand\": \"-\"\n", - " }\n", - " ]\n", + " }]\n", + " },\n", + " \"edge_case_filter_dist_test\": {\n", + " \"folder\": \"singlecell_tests\",\n", + " \"total_edit_sites\": 1,\n", + " \"expectations\": [{\n", + " \"contig\": \"10\",\n", + " \"barcode\": \t\"AAACGAATCATTCATC-1\",\n", + " \"position\": 132330438,\n", + " \"num_rows\": 1,\n", + " \"count\": 1,\n", + " \"coverage\": 1,\n", + " \"conversion\": \"T>C\",\n", + " \"strand_conversion\": \"A>G\",\n", + " \"strand\": \"-\",\n", + " \"feature_name\": \"STK32C\",\n", + " \"feature_strand\": \"-\"\n", + " }]\n", " },\n", " \"unstranded_pair_test\": {\n", " \"folder\": \"strandedness_tests\",\n", @@ -3251,32 +3281,41 @@ "failures = 0\n", "for test_name, info in test_name_to_expectations.items():\n", " print(\"\\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\\nChecking results for {}\".format(test_name))\n", - " \n", + "\n", + " failure = False\n", + "\n", + " folder = info.get(\"folder\")\n", + " final_filtered_site_info_annotated = \"{}/{}/final_filtered_site_info_annotated.tsv\".format(folder, test_name)\n", + " final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\\t', index_col=0)\n", + "\n", + " if \"total_edit_sites\" in info:\n", + " try:\n", + " assert(len(final_filtered_site_info_annotated_df) == info.get(\"total_edit_sites\"))\n", + " print(\"\\n >>> Found {} edits total, as expected <<<\\n\".format(info.get(\"total_edit_sites\")))\n", + " except Exception as e:\n", + " print(\"Exception:\\n\\tNum total rows in final_filtered_site_info_annotated.tsv expected: {}, was {}\".format(info.get(\"total_edit_sites\"), len(final_filtered_site_info_annotated_df)))\n", + " failure = True \n", + " failures += 1\n", + " continue \n", + " \n", " expectations_list = info.get(\"expectations\")\n", " for expectations in expectations_list:\n", + " failure = False\n", " print(\"\\tExpecting: {}\".format(expectations))\n", - " \n", - " \n", - " folder = info.get(\"folder\")\n", " \n", " contig = expectations.get(\"contig\")\n", " barcode = expectations.get(\"barcode\", None)\n", " \n", " position = expectations.get(\"position\")\n", " \n", - " final_filtered_site_info_annotated = \"{}/{}/final_filtered_site_info_annotated.tsv\".format(folder, test_name)\n", - " final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\\t', index_col=0)\n", - " \n", " row_of_interest = final_filtered_site_info_annotated_df[\n", " (final_filtered_site_info_annotated_df['position'] == position) &\\\n", " (final_filtered_site_info_annotated_df['contig'].astype(str) == contig)\n", " ]\n", " \n", - " \n", " if barcode:\n", " row_of_interest = row_of_interest[row_of_interest.barcode == barcode]\n", " \n", - " failure = False\n", " try:\n", " assert(len(row_of_interest) == expectations.get(\"num_rows\", 1))\n", " except Exception as e:\n", diff --git a/tests/integration_tests_auto_check.py b/tests/integration_tests_auto_check.py index 2089624..d851aa0 100755 --- a/tests/integration_tests_auto_check.py +++ b/tests/integration_tests_auto_check.py @@ -5,8 +5,9 @@ test_name_to_expectations = { - "edge_case_test": { +"edge_case_test": { "folder": "singlecell_tests", + "total_edit_sites": 2, "expectations": [{ "contig": "10", "barcode": "AAACGAATCATTCATC-1", @@ -32,8 +33,24 @@ "strand": "-", "feature_name": "STK32C", "feature_strand": "-" - } - ] + }] + }, + "edge_case_filter_dist_test": { + "folder": "singlecell_tests", + "total_edit_sites": 1, + "expectations": [{ + "contig": "10", + "barcode": "AAACGAATCATTCATC-1", + "position": 132330438, + "num_rows": 1, + "count": 1, + "coverage": 1, + "conversion": "T>C", + "strand_conversion": "A>G", + "strand": "-", + "feature_name": "STK32C", + "feature_strand": "-" + }] }, "unstranded_pair_test": { "folder": "strandedness_tests", @@ -245,35 +262,46 @@ print("Current directory: {}".format(os.getcwd())) -# Check results of each test +# Check out results of each test failures = 0 -for test_name, info in test_name_to_expectations.items(): +for test_name, info in test_name_to_expectations.items(): print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nChecking results for {}".format(test_name)) - + + failure = False + + folder = info.get("folder") + final_filtered_site_info_annotated = "{}/{}/final_filtered_site_info_annotated.tsv".format(folder, test_name) + final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\t', index_col=0) + + if "total_edit_sites" in info: + try: + assert(len(final_filtered_site_info_annotated_df) == info.get("total_edit_sites")) + print("\n >>> Found {} edits total, as expected <<<\n".format(info.get("total_edit_sites"))) + except Exception as e: + print("Exception:\n\tNum total rows in final_filtered_site_info_annotated.tsv expected: {}, was {}".format(info.get("total_edit_sites"), len(final_filtered_site_info_annotated_df))) + failure = True + failures += 1 + continue + expectations_list = info.get("expectations") for expectations in expectations_list: + failure = False print("\tExpecting: {}".format(expectations)) - - - folder = info.get("folder") contig = expectations.get("contig") barcode = expectations.get("barcode", None) position = expectations.get("position") - final_filtered_site_info_annotated = "{}/{}/final_filtered_site_info_annotated.tsv".format(folder, test_name) - final_filtered_site_info_annotated_df = pd.read_csv(final_filtered_site_info_annotated, sep='\t', index_col=0) - row_of_interest = final_filtered_site_info_annotated_df[ (final_filtered_site_info_annotated_df['position'] == position) &\ (final_filtered_site_info_annotated_df['contig'].astype(str) == contig) ] - if barcode: row_of_interest = row_of_interest[row_of_interest.barcode == barcode] + failure = False try: assert(len(row_of_interest) == expectations.get("num_rows", 1)) diff --git a/tests/integration_tests_run.sh b/tests/integration_tests_run.sh index 9f12804..f7ab6be 100755 --- a/tests/integration_tests_run.sh +++ b/tests/integration_tests_run.sh @@ -34,7 +34,7 @@ echo "SC tests scripts" ls -lh $MARINE/tests/$tests_folder/scripts/ -for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" +for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test" do echo $t diff --git a/tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh b/tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh new file mode 100644 index 0000000..33679ce --- /dev/null +++ b/tests/singlecell_tests/scripts/edge_case_dist_filter_test.sh @@ -0,0 +1,20 @@ +# This sample has an edit at the very end of the read. But we want to make sure it is not included if dist from end filter +# is active. + +mypython=$1 + +$mypython $MARINE/marine.py \ +--bam_filepath \ +$MARINE/tests/singlecell_tests/bams/10_C-1_orig.bam \ +--annotation_bedfile_path \ +$MARINE/annotations/cellranger-GRCh38-3.0.0.annotation.genes.bed \ +--output_folder \ +$MARINE/tests/singlecell_tests/edge_case_filter_dist_test \ +--min_base_quality \ +15 \ +--cores \ +4 \ +--min_dist_from_end 2 \ +--barcode_tag "CB" \ +--strandedness 2 --num_per_sublist 1 --verbose \ +--contigs 10 --interval_length 40000000 --keep_intermediate_files \ No newline at end of file