From 2aedc8fb32356a11569bcad6483ea31fd0b9cc73 Mon Sep 17 00:00:00 2001
From: Eric Kofman <ekofman@tscc-14-42.tscc.sdsc.edu>
Date: Fri, 29 Nov 2024 13:25:04 -0800
Subject: [PATCH] adjustment

---
 marine.py                     |  4 ++--
 src/utils.py                  |  9 +++++++--
 tests/integration_tests.ipynb | 20 ++++++--------------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/marine.py b/marine.py
index 4221238..0211128 100755
--- a/marine.py
+++ b/marine.py
@@ -221,7 +221,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=
         if paired_end:
             paired_end_flag = '-s '
         else:
-            paired_end_flag = 's'
+            paired_end_flag = ''
             
         # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed
         samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt"
@@ -243,7 +243,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=
                                 '{}/final_edit_info_no_coverage.tsv'.format(output_folder),
                             '{}/depths_source_cells.txt'.format(output_folder), 
                             '{}/final_edit_info.tsv'.format(output_folder), 
-                                header_columns)
+                                header_columns, paired_end=paired_end)
     
     coverage_total_time = time.perf_counter() - coverage_start_time
     
diff --git a/src/utils.py b/src/utils.py
index dadda75..1534a13 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -652,15 +652,20 @@ def concat_and_write_bams_wrapper(params):
     concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, barcode_tag=barcode_tag, number_of_expected_bams=number_of_expected_bams, verbose=verbose)
 
 
-def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns):
+def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns, paired_end=False):
     # Convert header list into a tab-separated string
     header = "\t".join(header_columns)
 
+    position_adjustment = '1'
+    if paired_end:
+        position_adjustment = '0' 
+    print(f"position adjustment is {position_adjustment} (paired_end is {paired_end})")
+        
     # Generate the Bash command for processing
     bash_command = f"""#!/bin/bash
     # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position
     # for join purposes, and sort by this column. Output in tab-separated format.
-    awk -v OFS='\\t' '{{print $1, $2+1, $3, $1":"($2+1)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv
+    awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv
     
     # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information)
     sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv
diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb
index f8fe151..cd07c64 100644
--- a/tests/integration_tests.ipynb
+++ b/tests/integration_tests.ipynb
@@ -2889,7 +2889,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387",
    "metadata": {},
    "outputs": [
@@ -2932,18 +2932,12 @@
       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
       "Checking results for pair_test\n",
       "\tExpecting: {'contig': '18', 'position': 49491556, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'RPL17-C18orf32,RPL17', 'feature_strand': '-,-'}\n",
-      "Exception: coverage was site_id\n",
-      "no_barcode_18_49491556_G_A_-    3\n",
-      "Name: coverage, dtype: int64\n",
       "\n",
-      "\t ~~~ pair_test FAILED! ~~~\n",
+      "\t >>> pair_test passed! <<<\n",
       "\n",
       "\tExpecting: {'contig': '18', 'position': 49567494, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+', 'feature_name': 'LIPG', 'feature_strand': '+'}\n",
-      "Exception: coverage was site_id\n",
-      "no_barcode_18_49567494_C_T_+    3\n",
-      "Name: coverage, dtype: int64\n",
       "\n",
-      "\t ~~~ pair_test FAILED! ~~~\n",
+      "\t >>> pair_test passed! <<<\n",
       "\n",
       "\n",
       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
@@ -2977,11 +2971,8 @@
       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
       "Checking results for same_pos_dif_reads_test\n",
       "\tExpecting: {'contig': 'chr17', 'position': 83199872, 'count': 9, 'coverage': 9, 'conversion': 'C>G', 'strand_conversion': 'C>G', 'strand': '+', 'feature_name': 'AC139099.2'}\n",
-      "Exception: coverage was site_id\n",
-      "no_barcode_chr17_83199872_C_G_+    14\n",
-      "Name: coverage, dtype: int64\n",
       "\n",
-      "\t ~~~ same_pos_dif_reads_test FAILED! ~~~\n",
+      "\t >>> same_pos_dif_reads_test passed! <<<\n",
       "\n",
       "\n",
       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
@@ -3025,7 +3016,7 @@
       "\n",
       "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n",
       "\n",
-      "There were 4 failures\n"
+      "There were 1 failures\n"
      ]
     }
    ],
@@ -3351,6 +3342,7 @@
     "bulk_folder = 'only_5_cells_bulk_mode_test'\n",
     "sc_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(sc_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n",
     "bulk_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(bulk_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n",
+    "\n",
     "print(\"Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\")\n",
     "grouped_sc = pd.DataFrame(sc_5_cells.groupby(['contig', 'position', 'strand_conversion']).agg({'count': sum, 'strand_conversion': 'unique'}))\n",
     "grouped_sc.index.names = ['contig', 'position', 'c']\n",