From 2aedc8fb32356a11569bcad6483ea31fd0b9cc73 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 13:25:04 -0800 Subject: [PATCH] adjustment --- marine.py | 4 ++-- src/utils.py | 9 +++++++-- tests/integration_tests.ipynb | 20 ++++++-------------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/marine.py b/marine.py index 4221238..0211128 100755 --- a/marine.py +++ b/marine.py @@ -221,7 +221,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= if paired_end: paired_end_flag = '-s ' else: - paired_end_flag = 's' + paired_end_flag = '' # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" @@ -243,7 +243,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= '{}/final_edit_info_no_coverage.tsv'.format(output_folder), '{}/depths_source_cells.txt'.format(output_folder), '{}/final_edit_info.tsv'.format(output_folder), - header_columns) + header_columns, paired_end=paired_end) coverage_total_time = time.perf_counter() - coverage_start_time diff --git a/src/utils.py b/src/utils.py index dadda75..1534a13 100644 --- a/src/utils.py +++ b/src/utils.py @@ -652,15 +652,20 @@ def concat_and_write_bams_wrapper(params): concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, barcode_tag=barcode_tag, number_of_expected_bams=number_of_expected_bams, verbose=verbose) -def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns): +def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns, paired_end=False): # Convert header list into a tab-separated string header = "\t".join(header_columns) + position_adjustment = '1' + if paired_end: + position_adjustment = '0' + print(f"position adjustment is {position_adjustment} (paired_end is {paired_end})") + # Generate the Bash command for processing bash_command = f"""#!/bin/bash # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position # for join purposes, and sort by this column. Output in tab-separated format. - awk -v OFS='\\t' '{{print $1, $2+1, $3, $1":"($2+1)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv + awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index f8fe151..cd07c64 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2932,18 +2932,12 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for pair_test\n", "\tExpecting: {'contig': '18', 'position': 49491556, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'RPL17-C18orf32,RPL17', 'feature_strand': '-,-'}\n", - "Exception: coverage was site_id\n", - "no_barcode_18_49491556_G_A_- 3\n", - "Name: coverage, dtype: int64\n", "\n", - "\t ~~~ pair_test FAILED! ~~~\n", + "\t >>> pair_test passed! <<<\n", "\n", "\tExpecting: {'contig': '18', 'position': 49567494, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+', 'feature_name': 'LIPG', 'feature_strand': '+'}\n", - "Exception: coverage was site_id\n", - "no_barcode_18_49567494_C_T_+ 3\n", - "Name: coverage, dtype: int64\n", "\n", - "\t ~~~ pair_test FAILED! ~~~\n", + "\t >>> pair_test passed! <<<\n", "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", @@ -2977,11 +2971,8 @@ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for same_pos_dif_reads_test\n", "\tExpecting: {'contig': 'chr17', 'position': 83199872, 'count': 9, 'coverage': 9, 'conversion': 'C>G', 'strand_conversion': 'C>G', 'strand': '+', 'feature_name': 'AC139099.2'}\n", - "Exception: coverage was site_id\n", - "no_barcode_chr17_83199872_C_G_+ 14\n", - "Name: coverage, dtype: int64\n", "\n", - "\t ~~~ same_pos_dif_reads_test FAILED! ~~~\n", + "\t >>> same_pos_dif_reads_test passed! <<<\n", "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", @@ -3025,7 +3016,7 @@ "\n", "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n", "\n", - "There were 4 failures\n" + "There were 1 failures\n" ] } ], @@ -3351,6 +3342,7 @@ "bulk_folder = 'only_5_cells_bulk_mode_test'\n", "sc_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(sc_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n", "bulk_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(bulk_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n", + "\n", "print(\"Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\")\n", "grouped_sc = pd.DataFrame(sc_5_cells.groupby(['contig', 'position', 'strand_conversion']).agg({'count': sum, 'strand_conversion': 'unique'}))\n", "grouped_sc.index.names = ['contig', 'position', 'c']\n",