From 00e193b3136b97d3b5b1da07caa0f9c708bae970 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 29 Nov 2024 14:39:33 -0800 Subject: [PATCH] all cells only for specific conversion --- marine.py | 42 +++++++++++++++++++++++++---------- tests/integration_tests.ipynb | 9 ++++---- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/marine.py b/marine.py index ebdce53..3e85b43 100755 --- a/marine.py +++ b/marine.py @@ -289,7 +289,7 @@ def generate_bedgraphs(final_site_level_information_df, conversion_search, outpu sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) -def generate_combined_sites_bed_for_all_cells(output_folder): +def generate_combined_sites_bed_for_all_cells(output_folder, strand_conversion="A>G"): # go from: # # 9_GATCCCTCAGTAACGG-1 3000447 3000448 @@ -305,29 +305,47 @@ def generate_combined_sites_bed_for_all_cells(output_folder): # 2_CATCCCTCAGTAACGA-1 3000451 3000452 # Input and output files - input_file = f"{output_folder}/combined_source_cells.bed" + input_file = f"{output_folder}/final_filtered_site_info.tsv" output_file = f"{output_folder}/combined_all_cells.bed" # Load the input data - df = pd.read_csv(input_file, sep="\t", header=None, names=["Contig", "Start", "End"]) + df = pd.read_csv(input_file, sep="\t") + print(f"Found {len(df)} total edits...") - df['Chromosome'] = [str(i.split('_')[0]) for i in df['Contig']] - unique_chromosomes = df['Chromosome'].unique() - print("Unique chromosomes found: {}".format(unique_chromosomes)) + df = df[df['strand_conversion'] == strand_conversion] + print(f"Running all positions for {len(df)} {strand_conversion} edits") + + df['chromosome'] = df['contig'].astype(str) + df['contig'] = df['contig'].astype(str) + '_' + df['barcode'].astype(str) + unique_chromosomes = df['chromosome'].unique() + print("Unique chromosomes found: {}".format(unique_chromosomes)) + + total_positions = 0 for chromosome in unique_chromosomes: - df_for_chromosome = df[df['Chromosome'] == str(chromosome)] + # Permute only within chromosome + + df_for_chromosome = df[df['chromosome'] == str(chromosome)] # Create unique lists: - unique_contigs = df_for_chromosome["Contig"].unique() # Unique contig pairs, within this chromosome - unique_positions = df_for_chromosome[["Start", "End"]].drop_duplicates().values.tolist() # Unique location pairs + unique_contigs = df_for_chromosome["contig"].unique() # Unique contig pairs, within this chromosome + + print("\n{} unique_contigs for chromosome {}".format(len(unique_contigs), chromosome)) + unique_positions = set(df_for_chromosome["position"].tolist()) # Unique location pairs + + print(f'unique_positions: {len(unique_positions)}, unique_contigs: {len(unique_contigs)}') + + positions_for_chromosome = len(unique_positions) * len(unique_contigs) + total_positions += positions_for_chromosome + print('\ttotal_positions: {}\n'.format(positions_for_chromosome)) # Open the output file and write combinations line by line with open(output_file, "a") as f: for contig in unique_contigs: - for start, end in unique_positions: - f.write(f"{contig}\t{start}\t{end}\n") - + for position in unique_positions: + f.write(f"{contig}\t{position-1}\t{position}\n") + print("Overall positions: {}\n\n".format(total_positions)) + print(f"Output written to {output_file}") diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index 162ff9f..6c9d709 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -3012,12 +3012,11 @@ "\t >>> long_read_sc_test passed! <<<\n", "\n", "Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n", - "grouped_sc_rows: 62, bulk_rows: 26\n", - "Exception: \n", + "grouped_sc_rows: 62, bulk_rows: 62\n", "\n", - "\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n", + "\t >>> single-cell and bulk on same dataset comparison passed! <<<\n", "\n", - "There were 1 failures\n" + "There were 0 failures\n" ] } ],