Skip to content

Commit

Permalink
all cells only for specific conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Nov 29, 2024
1 parent ba8b0c2 commit 00e193b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 17 deletions.
42 changes: 30 additions & 12 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def generate_bedgraphs(final_site_level_information_df, conversion_search, outpu
sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False)


def generate_combined_sites_bed_for_all_cells(output_folder):
def generate_combined_sites_bed_for_all_cells(output_folder, strand_conversion="A>G"):
# go from:
#
# 9_GATCCCTCAGTAACGG-1 3000447 3000448
Expand All @@ -305,29 +305,47 @@ def generate_combined_sites_bed_for_all_cells(output_folder):
# 2_CATCCCTCAGTAACGA-1 3000451 3000452

# Input and output files
input_file = f"{output_folder}/combined_source_cells.bed"
input_file = f"{output_folder}/final_filtered_site_info.tsv"
output_file = f"{output_folder}/combined_all_cells.bed"

# Load the input data
df = pd.read_csv(input_file, sep="\t", header=None, names=["Contig", "Start", "End"])
df = pd.read_csv(input_file, sep="\t")
print(f"Found {len(df)} total edits...")

df['Chromosome'] = [str(i.split('_')[0]) for i in df['Contig']]

unique_chromosomes = df['Chromosome'].unique()
print("Unique chromosomes found: {}".format(unique_chromosomes))
df = df[df['strand_conversion'] == strand_conversion]
print(f"Running all positions for {len(df)} {strand_conversion} edits")

df['chromosome'] = df['contig'].astype(str)
df['contig'] = df['contig'].astype(str) + '_' + df['barcode'].astype(str)

unique_chromosomes = df['chromosome'].unique()
print("Unique chromosomes found: {}".format(unique_chromosomes))

total_positions = 0
for chromosome in unique_chromosomes:
df_for_chromosome = df[df['Chromosome'] == str(chromosome)]
# Permute only within chromosome

df_for_chromosome = df[df['chromosome'] == str(chromosome)]
# Create unique lists:
unique_contigs = df_for_chromosome["Contig"].unique() # Unique contig pairs, within this chromosome
unique_positions = df_for_chromosome[["Start", "End"]].drop_duplicates().values.tolist() # Unique location pairs
unique_contigs = df_for_chromosome["contig"].unique() # Unique contig pairs, within this chromosome

print("\n{} unique_contigs for chromosome {}".format(len(unique_contigs), chromosome))

unique_positions = set(df_for_chromosome["position"].tolist()) # Unique location pairs

print(f'unique_positions: {len(unique_positions)}, unique_contigs: {len(unique_contigs)}')

positions_for_chromosome = len(unique_positions) * len(unique_contigs)
total_positions += positions_for_chromosome
print('\ttotal_positions: {}\n'.format(positions_for_chromosome))
# Open the output file and write combinations line by line
with open(output_file, "a") as f:
for contig in unique_contigs:
for start, end in unique_positions:
f.write(f"{contig}\t{start}\t{end}\n")

for position in unique_positions:
f.write(f"{contig}\t{position-1}\t{position}\n")
print("Overall positions: {}\n\n".format(total_positions))

print(f"Output written to {output_file}")


Expand Down
9 changes: 4 additions & 5 deletions tests/integration_tests.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2889,7 +2889,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -3012,12 +3012,11 @@
"\t >>> long_read_sc_test passed! <<<\n",
"\n",
"Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n",
"grouped_sc_rows: 62, bulk_rows: 26\n",
"Exception: \n",
"grouped_sc_rows: 62, bulk_rows: 62\n",
"\n",
"\t ~~~ single cell vs bulk modes on sc dataset equivalency test FAILED! ~~~\n",
"\t >>> single-cell and bulk on same dataset comparison passed! <<<\n",
"\n",
"There were 1 failures\n"
"There were 0 failures\n"
]
}
],
Expand Down

0 comments on commit 00e193b

Please sign in to comment.