Skip to content

Commit

Permalink
adjustment
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Nov 29, 2024
1 parent 1616579 commit 2aedc8f
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 18 deletions.
4 changes: 2 additions & 2 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=
if paired_end:
paired_end_flag = '-s '
else:
paired_end_flag = 's'
paired_end_flag = ''

# Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed
samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt"
Expand All @@ -243,7 +243,7 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=
'{}/final_edit_info_no_coverage.tsv'.format(output_folder),
'{}/depths_source_cells.txt'.format(output_folder),
'{}/final_edit_info.tsv'.format(output_folder),
header_columns)
header_columns, paired_end=paired_end)

coverage_total_time = time.perf_counter() - coverage_start_time

Expand Down
9 changes: 7 additions & 2 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,15 +652,20 @@ def concat_and_write_bams_wrapper(params):
concat_and_write_bams(contig, df_dict, header_string, split_bams_folder, barcode_tag=barcode_tag, number_of_expected_bams=number_of_expected_bams, verbose=verbose)


def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns):
def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_file_path, header_columns, paired_end=False):
# Convert header list into a tab-separated string
header = "\t".join(header_columns)

position_adjustment = '1'
if paired_end:
position_adjustment = '0'
print(f"position adjustment is {position_adjustment} (paired_end is {paired_end})")

# Generate the Bash command for processing
bash_command = f"""#!/bin/bash
# Step 1: Adjust the depths file by adding a new column that incorporates both contig and position
# for join purposes, and sort by this column. Output in tab-separated format.
awk -v OFS='\\t' '{{print $1, $2+1, $3, $1":"($2+1)}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv
awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv
# Step 2: Sort the first file numerically by the join column (the column including both contig and position information)
sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv
Expand Down
20 changes: 6 additions & 14 deletions tests/integration_tests.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2889,7 +2889,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -2932,18 +2932,12 @@
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
"Checking results for pair_test\n",
"\tExpecting: {'contig': '18', 'position': 49491556, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'RPL17-C18orf32,RPL17', 'feature_strand': '-,-'}\n",
"Exception: coverage was site_id\n",
"no_barcode_18_49491556_G_A_- 3\n",
"Name: coverage, dtype: int64\n",
"\n",
"\t ~~~ pair_test FAILED! ~~~\n",
"\t >>> pair_test passed! <<<\n",
"\n",
"\tExpecting: {'contig': '18', 'position': 49567494, 'count': 2, 'coverage': 2, 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '+', 'feature_name': 'LIPG', 'feature_strand': '+'}\n",
"Exception: coverage was site_id\n",
"no_barcode_18_49567494_C_T_+ 3\n",
"Name: coverage, dtype: int64\n",
"\n",
"\t ~~~ pair_test FAILED! ~~~\n",
"\t >>> pair_test passed! <<<\n",
"\n",
"\n",
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
Expand Down Expand Up @@ -2977,11 +2971,8 @@
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
"Checking results for same_pos_dif_reads_test\n",
"\tExpecting: {'contig': 'chr17', 'position': 83199872, 'count': 9, 'coverage': 9, 'conversion': 'C>G', 'strand_conversion': 'C>G', 'strand': '+', 'feature_name': 'AC139099.2'}\n",
"Exception: coverage was site_id\n",
"no_barcode_chr17_83199872_C_G_+ 14\n",
"Name: coverage, dtype: int64\n",
"\n",
"\t ~~~ same_pos_dif_reads_test FAILED! ~~~\n",
"\t >>> same_pos_dif_reads_test passed! <<<\n",
"\n",
"\n",
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
Expand Down Expand Up @@ -3025,7 +3016,7 @@
"\n",
"\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n",
"\n",
"There were 4 failures\n"
"There were 1 failures\n"
]
}
],
Expand Down Expand Up @@ -3351,6 +3342,7 @@
"bulk_folder = 'only_5_cells_bulk_mode_test'\n",
"sc_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(sc_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n",
"bulk_5_cells = pd.read_csv('singlecell_tests/{}/final_filtered_site_info.tsv'.format(bulk_folder), sep='\\t').sort_values(['position', 'strand_conversion'])\n",
"\n",
"print(\"Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\")\n",
"grouped_sc = pd.DataFrame(sc_5_cells.groupby(['contig', 'position', 'strand_conversion']).agg({'count': sum, 'strand_conversion': 'unique'}))\n",
"grouped_sc.index.names = ['contig', 'position', 'c']\n",
Expand Down

0 comments on commit 2aedc8f

Please sign in to comment.