Skip to content

Commit

Permalink
concatenate files
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Nov 26, 2024
1 parent 65b7b3a commit ad1b0b7
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 52 deletions.
56 changes: 4 additions & 52 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \
write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \
pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \
make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites
make_depth_command_script, generate_and_run_bash_merge, get_sailor_sites, concatenate_files

from core import run_edit_identifier, run_bam_reconfiguration, \
gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information
Expand Down Expand Up @@ -168,56 +168,6 @@ def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag
return total_bam_generation_time, total_seconds_for_bams_df


import subprocess


def concatenate_files(source_folder, file_pattern, output_filepath):
    """Concatenate the files matching a glob, dropping each file's first line.

    A small bash script is written to ``{source_folder}/concat_command.sh``
    and then executed. The script lists the matching files with ``ls -v``
    (natural/version sort), and appends every line except the first of each
    file to ``output_filepath``. The header line of every input file is
    skipped, so the output contains no header.

    Args:
        source_folder: Folder that holds the input files; the helper script
            is also written here.
        file_pattern: Shell glob, relative to ``source_folder``, selecting
            the files to concatenate (e.g. ``"coverage/*.tsv"``). It is
            passed to the shell unquoted so that the glob is expanded.
        output_filepath: Path of the concatenated output file.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    import shlex

    # Quote the literal path parts so that spaces or shell metacharacters in
    # them cannot break the command; only the glob itself is left unquoted.
    quoted_glob = f"{shlex.quote(str(source_folder))}/{file_pattern}"
    quoted_output = shlex.quote(str(output_filepath))

    # Read the listing line by line rather than word-splitting `$(ls ...)`,
    # so file names containing spaces survive. The command is built with
    # f-strings only: calling str.format() on already-interpolated text
    # fails when a path contains '{' or '}'.
    concat_command = (
        f"ls -v {quoted_glob} | while IFS= read -r f; do "
        'tail -n +2 "$f"; '  # Skip the header row for each file
        f"done > {quoted_output}\n"
    )

    # Write the command to a shell script
    concat_bash = f"{source_folder}/concat_command.sh"
    with open(concat_bash, 'w') as f:
        f.write(concat_command)

    print("Concatenating files in numerical order without headers...")
    # check=True surfaces a failed concatenation instead of silently
    # continuing with a missing or partial output file.
    subprocess.run(['bash', concat_bash], check=True)
    print("Done concatenating.")


def coverage_processing(output_folder, barcode_tag='CB', paired_end=False, verbose=False, cores=1, number_of_expected_bams=4,
                        min_read_quality=0, bam_filepath=''):
    """Run the coverage step and merge its per-contig outputs into one file.

    The edit information is first gathered across subcontigs, then handed to
    the coverage calculator. Afterwards every ``coverage/*.tsv`` file under
    ``output_folder`` is concatenated into
    ``{output_folder}/final_edit_info.tsv``.

    Args:
        output_folder: Folder holding the intermediate files; also receives
            the concatenated result.
        barcode_tag: Forwarded to both the gathering and the coverage step.
        paired_end: Forwarded to the coverage calculator.
        verbose: Forwarded to the coverage calculator.
        cores: Passed to the coverage calculator as ``processes``.
        number_of_expected_bams: Forwarded to the gathering step.
        min_read_quality: Accepted but not used in this function.
        bam_filepath: Accepted but not used in this function.

    Returns:
        A 3-tuple ``(results, total_time, timing_df)``. The first two items
        are returned unchanged from ``run_coverage_calculator``. ``timing_df``
        is a DataFrame with a ``'seconds'`` column, a ``'contig sections'``
        column holding the keys of the per-contig timing dict, and a
        0..n-1 integer index.
    """
    # Single-cell or long read version:
    grouped_edit_info = gather_edit_information_across_subcontigs(
        output_folder,
        barcode_tag=barcode_tag,
        number_of_expected_bams=number_of_expected_bams,
    )

    results, total_time, seconds_by_contig = run_coverage_calculator(
        grouped_edit_info,
        output_folder,
        barcode_tag=barcode_tag,
        paired_end=paired_end,
        verbose=verbose,
        processes=cores,
    )

    final_edit_info_path = f"{output_folder}/final_edit_info.tsv"
    concatenate_files(output_folder, "coverage/*.tsv", final_edit_info_path)

    # Reshape the {contig section: seconds} mapping into a flat table.
    timing_df = pd.DataFrame.from_dict(seconds_by_contig, orient='index')
    timing_df.columns = ['seconds']
    timing_df['contig sections'] = timing_df.index
    timing_df.index = range(len(timing_df))

    return results, total_time, timing_df


def print_marine_logo():
logo_lines = [
":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: ",
Expand Down Expand Up @@ -282,7 +232,9 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False):
all_depth_commands=all_depth_commands, output_suffix='', run=True)

print("Concatenating edit info files...")
concatenate_files(output_folder, "edit_info/*edit_info.tsv", "{}/final_edit_info_no_coverage.tsv".format(output_folder))
concatenate_files(output_folder, "edit_info/*edit_info.tsv",
"{}/final_edit_info_no_coverage.tsv".format(output_folder),
run=True)

print("Append the depth columns to the concatenated final_edit_info file...")

Expand Down
20 changes: 20 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import numpy as np
import sys
import subprocess
from collections import OrderedDict, defaultdict
from itertools import product
from scipy.special import betainc
Expand Down Expand Up @@ -779,4 +780,23 @@ def get_sailor_sites(final_site_level_information_df, conversion="C>T", skip_cov

final_site_level_information_df = final_site_level_information_df[['contig', 'start', 'end', 'score', 'combo', 'strand']]
return final_site_level_information_df, weird_sites


def concatenate_files(source_folder, file_pattern, output_filepath, run=False):
    """Write (and optionally run) a script that concatenates header-less files.

    A small bash script is always written to
    ``{source_folder}/concat_command.sh``. The script lists the files
    matching the glob with ``ls -v`` (natural/version sort), and appends
    every line except the first of each file to ``output_filepath``. The
    header line of every input file is skipped, so the output contains no
    header.

    Args:
        source_folder: Folder that holds the input files; the helper script
            is also written here.
        file_pattern: Shell glob, relative to ``source_folder``, selecting
            the files to concatenate (e.g. ``"edit_info/*edit_info.tsv"``).
            It is passed to the shell unquoted so that the glob is expanded.
        output_filepath: Path of the concatenated output file.
        run: When true, execute the script with bash after writing it.
            When false (the default), only the script is written.

    Raises:
        subprocess.CalledProcessError: If ``run`` is true and the script
            exits with a non-zero status.
    """
    import shlex

    # Quote the literal path parts so that spaces or shell metacharacters in
    # them cannot break the command; only the glob itself is left unquoted.
    quoted_glob = f"{shlex.quote(str(source_folder))}/{file_pattern}"
    quoted_output = shlex.quote(str(output_filepath))

    # Read the listing line by line rather than word-splitting `$(ls ...)`,
    # so file names containing spaces survive. The command is built with
    # f-strings only: calling str.format() on already-interpolated text
    # fails when a path contains '{' or '}'.
    concat_command = (
        f"ls -v {quoted_glob} | while IFS= read -r f; do "
        'tail -n +2 "$f"; '  # Skip the header row for each file
        f"done > {quoted_output}\n"
    )

    # Write the command to a shell script
    concat_bash = f"{source_folder}/concat_command.sh"
    with open(concat_bash, 'w') as f:
        f.write(concat_command)

    if run:
        print("Concatenating files in numerical order without headers...")
        # check=True surfaces a failed concatenation instead of silently
        # continuing with a missing or partial output file.
        subprocess.run(['bash', concat_bash], check=True)
        print("Done concatenating.")

0 comments on commit ad1b0b7

Please sign in to comment.