diff --git a/src/core.py b/src/core.py index b7c7ed9..89cdb00 100644 --- a/src/core.py +++ b/src/core.py @@ -20,7 +20,7 @@ from utils import get_contig_lengths_dict, get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, make_edit_finding_jobs, pretty_print, get_contigs_that_need_bams_written, split_bed_file, \ get_coverage_wrapper, write_reads_to_file, sort_bam, rm_bam, suffixes, get_broken_up_contigs, run_command, \ -make_depth_command_script_single_cell, concatenate_files, generate_and_run_bash_merge +make_depth_command_script_single_cell, concatenate_files, generate_and_run_bash_merge, read_barcode_whitelist_file import os, psutil diff --git a/src/utils.py b/src/utils.py index 07b7f1d..900c87c 100644 --- a/src/utils.py +++ b/src/utils.py @@ -14,8 +14,8 @@ from multiprocessing import Pool import multiprocessing import time -import scipy.sparse as sp import anndata as ad +from scipy.sparse import csr_matrix # Number of barcode characters to use as suffix during splitting CB_N = 1 @@ -976,6 +976,7 @@ def merge_files_by_chromosome(args): first_file = files[0] other_files = files[1:] merged_file = os.path.join(output_folder, f"{chromosome}_comprehensive_coverage_matrix.tsv") + h5ad_file = os.path.join(output_folder, f"{chromosome}_comprehensive_coverage_matrix.h5ad") # Prepare the paste command strip_headers_command = " ".join( @@ -987,6 +988,23 @@ def merge_files_by_chromosome(args): run_command(f"bash -c '{paste_command}'") print(f"\tColumnar merge complete for {chromosome}. 
Output saved to {merged_file}.") + # Convert the merged file to an h5ad format with a sparse matrix + print(f"\tConverting {merged_file} to {h5ad_file} as a sparse matrix.") + df = pd.read_csv(merged_file, sep='\t', index_col=0) # Assuming first column is positions + + # Convert DataFrame to sparse matrix + sparse_matrix = csr_matrix(df.values) # csr_matrix is imported by name; the `sp` alias is removed by this patch + + # Create AnnData object with sparse matrix + adata = ad.AnnData(sparse_matrix) # anndata is imported as `ad` in this module + adata.obs_names = df.index # Set row (position) names + adata.var_names = df.columns # Set column (barcode) names + + # Write to .h5ad file + adata.write_h5ad(h5ad_file) + print(f"\th5ad conversion complete. Output saved to {h5ad_file}.") + + def prepare_matrix_files_multiprocess(output_matrix_folder, output_folder,