diff --git a/marine.py b/marine.py index b05a8b0..9ebfd3e 100755 --- a/marine.py +++ b/marine.py @@ -270,7 +270,7 @@ def get_broken_up_contigs(contigs, num_per_sublist): broken_up_contigs.append(contig_sublist) return broken_up_contigs -def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_intervals_per_contig=16, strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor=False, min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, +def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_intervals_per_contig=16, strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, keep_intermediate_files=False, num_per_sublist=6, skip_coverage=False): @@ -460,7 +460,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in final_path_already_exists = True - if sailor: + if len(sailor_list) > 0: print("{} sites being converted to SAILOR format...".format(len(final_site_level_information_df))) # Output SAILOR-formatted file for use in FLARE downstream @@ -468,19 +468,21 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in # 1 629275 629276 0.966040688 2,30 + # 1 629309 629310 2.8306e-05 1,1043 + - conversion = 'C>T' - sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion, skip_coverage=skip_coverage) - sailor_sites = sailor_sites.drop_duplicates() - - print("{} final deduplicated SAILOR-formatted sites".format(len(sailor_sites))) - sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( - output_folder, - conversion.replace(">", "-")), - header=False, - index=False, - sep='\t') + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + + print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) + + sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) + sailor_sites = sailor_sites.drop_duplicates() - weird_sites.to_csv('{}/problematic_sites.tsv'.format(output_folder), sep='\t') + print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) + sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, + index=False, + sep='\t') if len(bedgraphs_list) > 0: # Make plot of edit distributions @@ -570,7 +572,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in parser.add_argument('--contigs', type=str, default='all') parser.add_argument('--min_read_quality', type=int, default=0, help='Minimum read quality, default is 0... every aligner assigns mapq scores differently, so double-check the range of qualities in your sample before setting this filter') - parser.add_argument('--sailor', dest='sailor', action='store_true') + parser.add_argument('--sailor', type=str, nargs='?', const='CT', default=None, dest='sailor') + parser.add_argument('--bedgraphs', type=str, default=None, help='Conversions for which to output a bedgraph for non-single cell runs, e.g. CT, AI') parser.add_argument('--verbose', dest='verbose', action='store_true') parser.add_argument('--keep_intermediate_files', dest='keep_intermediate_files', action='store_true') @@ -622,6 +625,17 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in assert(b in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) else: bedgraphs_list = [] + + if sailor: + if barcode_tag in ['CB', 'IB']: + sys.stderr.write("Can only output bedgraphs for bulk sequencing runs of MARINE") + sys.exit(1) + + sailor_list = sailor.upper().replace('I', 'G').split(',') + for s in sailor_list: + assert(s in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) + else: + sailor_list = [] assert(strandedness in [0, 1, 2]) @@ -647,7 +661,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in "\tCoverage only:\t{}".format(coverage_only), "\tFiltering only:\t{}".format(filtering_only), "\tAnnotation only:\t{}".format(annotation_only), - "\tSailor outputs:\t{}".format(sailor), + "\tSailor outputs:\t{}".format(sailor_list), "\tBedgraphs:\t{}".format(bedgraphs_list), "\tMinimum base quality:\t{}".format(min_base_quality), "\tMinimum read quality:\t{}".format(min_read_quality), @@ -683,7 +697,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in coverage_only=coverage_only, filtering_only=filtering_only, annotation_only=annotation_only, - sailor=sailor, + sailor_list=sailor_list, bedgraphs_list=bedgraphs_list, min_base_quality = min_base_quality, min_read_quality = min_read_quality, diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index 95ca3ba..8b9edd3 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "id": "9f2684b1-dbad-45d5-bdb5-b83989bec7dc", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 12, "id": "63b5feff-aa8a-417c-8c93-9b9d74e5dee9", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 13, "id": "f6012705-9d34-4997-8681-c7bbcc4f008b", "metadata": {}, "outputs": [], @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 14, "id": "ba12c2b8-2ccb-4866-86c8-a2284fd1229f", "metadata": { "scrolled": true @@ -73,33 +73,36 @@ "/tscc/projects/ps-yeolab3/ekofman/sailor2/tests\n", "Running\n", "/tscc/projects/ps-yeolab3/ekofman/sailor2\n", - "unstranded_pair_test\n", + "F1R2_pair_test-single_end_mode_sailor\n", "Removing old files...\n", "Running tests...\n", - "Assuming 1 cores available for multiprocessing. Set this to the number of available cores for optimal execution.\n", + "Python is /tscc/nfs/home/ekofman/miniconda3/envs/marine_environment/bin/python\n", + "Assuming 16 cores available for multiprocessing. Set this to the number of available cores for optimal execution.\n", "Arguments:\n", - "\tBAM filepath:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/bams/citrine435.bam\n", + "\tBAM filepath:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/bams/F1R2_pair.bam\n", "\tAnnotation bedfile filepath:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed\n", - "\tOutput folder:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/unstranded_pair_test\n", + "\tOutput folder:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/F1R2_pair_test-single_end_mode_sailor\n", "\tBarcode whitelist:\tNone\n", - "\tStrandedness:\t0\n", + "\tStrandedness:\t2\n", "\tBarcode Tag:\tNone\n", - "\tPaired End:\tTrue\n", + "\tPaired End:\tFalse\n", "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", - "\tBedgraphs:\t['CT']\n", + "\tSailor outputs:\t['CT', 'AG']\n", + "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", "\tMinimum distance from end:\t0\n", "\tMaximum edits per read:\tNone\n", - "\tContigs:\tall\n", - "\tNumber of intervals:\t1\n", - "\tCores:\t1\n", - "\tVerbose:\tFalse\n", + "\tContigs:\tchr17\n", + "\tNumber of intervals:\t16\n", + "\tCores:\t16\n", + "\tVerbose:\tTrue\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -111,57 +114,208 @@ "==================================================================\n", "Multi-core Algorithm for Rapid Identification of Nucleotide Edits\n", "==================================================================\n", - "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "WARNING /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/unstranded_pair_test is not empty\n", - "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "broken_up_contigs [[]]\n", - "Each contig is being split into 1 subsets...\n", - "\tContig Citrine.dna\n", - "1 total jobs\n", - "Reads processed:\t44\n", - "Time to process reads in min:\t0.02895\n", + "Contig groups to be processed: [['chr17']]\n", + "Each contig is being split into 16 subsets...\n", + "contig_lengths_dict:{'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, 'chrM': 16569, 'chr1_KI270706v1_random': 175055, 'chr1_KI270707v1_random': 32032, 'chr1_KI270708v1_random': 127682, 'chr1_KI270709v1_random': 66860, 'chr1_KI270710v1_random': 40176, 'chr1_KI270711v1_random': 42210, 'chr1_KI270712v1_random': 176043, 'chr1_KI270713v1_random': 40745, 'chr1_KI270714v1_random': 41717, 'chr2_KI270715v1_random': 161471, 'chr2_KI270716v1_random': 153799, 'chr3_GL000221v1_random': 155397, 'chr4_GL000008v2_random': 209709, 'chr5_GL000208v1_random': 92689, 'chr9_KI270717v1_random': 40062, 'chr9_KI270718v1_random': 38054, 'chr9_KI270719v1_random': 176845, 'chr9_KI270720v1_random': 39050, 'chr11_KI270721v1_random': 100316, 'chr14_GL000009v2_random': 201709, 'chr14_GL000225v1_random': 211173, 'chr14_KI270722v1_random': 194050, 'chr14_GL000194v1_random': 191469, 'chr14_KI270723v1_random': 38115, 'chr14_KI270724v1_random': 39555, 'chr14_KI270725v1_random': 172810, 'chr14_KI270726v1_random': 43739, 'chr15_KI270727v1_random': 448248, 'chr16_KI270728v1_random': 1872759, 'chr17_GL000205v2_random': 185591, 'chr17_KI270729v1_random': 280839, 'chr17_KI270730v1_random': 112551, 'chr22_KI270731v1_random': 150754, 'chr22_KI270732v1_random': 41543, 'chr22_KI270733v1_random': 179772, 'chr22_KI270734v1_random': 165050, 'chr22_KI270735v1_random': 42811, 'chr22_KI270736v1_random': 181920, 'chr22_KI270737v1_random': 103838, 'chr22_KI270738v1_random': 99375, 'chr22_KI270739v1_random': 73985, 'chrY_KI270740v1_random': 37240, 'chrUn_KI270302v1': 2274, 'chrUn_KI270304v1': 2165, 'chrUn_KI270303v1': 1942, 'chrUn_KI270305v1': 1472, 'chrUn_KI270322v1': 21476, 'chrUn_KI270320v1': 4416, 'chrUn_KI270310v1': 1201, 'chrUn_KI270316v1': 1444, 'chrUn_KI270315v1': 2276, 'chrUn_KI270312v1': 998, 'chrUn_KI270311v1': 12399, 'chrUn_KI270317v1': 37690, 'chrUn_KI270412v1': 1179, 'chrUn_KI270411v1': 2646, 'chrUn_KI270414v1': 2489, 'chrUn_KI270419v1': 1029, 'chrUn_KI270418v1': 2145, 'chrUn_KI270420v1': 2321, 'chrUn_KI270424v1': 2140, 'chrUn_KI270417v1': 2043, 'chrUn_KI270422v1': 1445, 'chrUn_KI270423v1': 981, 'chrUn_KI270425v1': 1884, 'chrUn_KI270429v1': 1361, 'chrUn_KI270442v1': 392061, 'chrUn_KI270466v1': 1233, 'chrUn_KI270465v1': 1774, 'chrUn_KI270467v1': 3920, 'chrUn_KI270435v1': 92983, 'chrUn_KI270438v1': 112505, 'chrUn_KI270468v1': 4055, 'chrUn_KI270510v1': 2415, 'chrUn_KI270509v1': 2318, 'chrUn_KI270518v1': 2186, 'chrUn_KI270508v1': 1951, 'chrUn_KI270516v1': 1300, 'chrUn_KI270512v1': 22689, 'chrUn_KI270519v1': 138126, 'chrUn_KI270522v1': 5674, 'chrUn_KI270511v1': 8127, 'chrUn_KI270515v1': 6361, 'chrUn_KI270507v1': 5353, 'chrUn_KI270517v1': 3253, 'chrUn_KI270529v1': 1899, 'chrUn_KI270528v1': 2983, 'chrUn_KI270530v1': 2168, 'chrUn_KI270539v1': 993, 'chrUn_KI270538v1': 91309, 'chrUn_KI270544v1': 1202, 'chrUn_KI270548v1': 1599, 'chrUn_KI270583v1': 1400, 'chrUn_KI270587v1': 2969, 'chrUn_KI270580v1': 1553, 'chrUn_KI270581v1': 7046, 'chrUn_KI270579v1': 31033, 'chrUn_KI270589v1': 44474, 'chrUn_KI270590v1': 4685, 'chrUn_KI270584v1': 4513, 'chrUn_KI270582v1': 6504, 'chrUn_KI270588v1': 6158, 'chrUn_KI270593v1': 3041, 'chrUn_KI270591v1': 5796, 'chrUn_KI270330v1': 1652, 'chrUn_KI270329v1': 1040, 'chrUn_KI270334v1': 1368, 'chrUn_KI270333v1': 2699, 'chrUn_KI270335v1': 1048, 'chrUn_KI270338v1': 1428, 'chrUn_KI270340v1': 1428, 'chrUn_KI270336v1': 1026, 'chrUn_KI270337v1': 1121, 'chrUn_KI270363v1': 1803, 'chrUn_KI270364v1': 2855, 'chrUn_KI270362v1': 3530, 'chrUn_KI270366v1': 8320, 'chrUn_KI270378v1': 1048, 'chrUn_KI270379v1': 1045, 'chrUn_KI270389v1': 1298, 'chrUn_KI270390v1': 2387, 'chrUn_KI270387v1': 1537, 'chrUn_KI270395v1': 1143, 'chrUn_KI270396v1': 1880, 'chrUn_KI270388v1': 1216, 'chrUn_KI270394v1': 970, 'chrUn_KI270386v1': 1788, 'chrUn_KI270391v1': 1484, 'chrUn_KI270383v1': 1750, 'chrUn_KI270393v1': 1308, 'chrUn_KI270384v1': 1658, 'chrUn_KI270392v1': 971, 'chrUn_KI270381v1': 1930, 'chrUn_KI270385v1': 990, 'chrUn_KI270382v1': 4215, 'chrUn_KI270376v1': 1136, 'chrUn_KI270374v1': 2656, 'chrUn_KI270372v1': 1650, 'chrUn_KI270373v1': 1451, 'chrUn_KI270375v1': 2378, 'chrUn_KI270371v1': 2805, 'chrUn_KI270448v1': 7992, 'chrUn_KI270521v1': 7642, 'chrUn_GL000195v1': 182896, 'chrUn_GL000219v1': 179198, 'chrUn_GL000220v1': 161802, 'chrUn_GL000224v1': 179693, 'chrUn_KI270741v1': 157432, 'chrUn_GL000226v1': 15008, 'chrUn_GL000213v1': 164239, 'chrUn_KI270743v1': 210658, 'chrUn_KI270744v1': 168472, 'chrUn_KI270745v1': 41891, 'chrUn_KI270746v1': 66486, 'chrUn_KI270747v1': 198735, 'chrUn_KI270748v1': 93321, 'chrUn_KI270749v1': 158759, 'chrUn_KI270750v1': 148850, 'chrUn_KI270751v1': 150742, 'chrUn_KI270752v1': 27745, 'chrUn_KI270753v1': 62944, 'chrUn_KI270754v1': 40191, 'chrUn_KI270755v1': 36723, 'chrUn_KI270756v1': 79590, 'chrUn_KI270757v1': 71251, 'chrUn_GL000214v1': 137718, 'chrUn_KI270742v1': 186739, 'chrUn_GL000216v2': 176608, 'chrUn_GL000218v1': 161147, 'chrEBV': 171823}\n", + "\tContig chr17\n", + "16 total jobs\n", + "chr17:000, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "MD tag 26A20G52\n", + "CIGAR tag 1S100M\n", + "is_reverse False\n", + "is_read1 True\n", + "is_read2 False\n", + "is_paired True\n", + "is_proper_pair True\n", + "mate_is_reverse True\n", + "read id VH01429:22:AACFJ5NHV:1:2211:40480:18323\n", + "VH01429:22:AACFJ5NHV:1:2211:40480:18323\t99\t#16\t43044305\t255\t1S100M\t#16\t43044312\t107\tGTGCTACCAAGTTTATTTGCAGTGTTACCAGCACAACATTTACAAAACATATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGA\tarray('B', [12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 12, 34, 34, 34, 34, 34, 26, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34])\t[('MD', '26A20G52'), ('PG', 'MarkDuplicates'), ('RG', 'ET_STAMPExpt_1-CTRL_S2'), ('NH', 1), ('HI', 1), ('NM', 2), ('AS', 190)]\n", + "reverse_or_forward: -\n", + "----------------------------\n", + "MD tag: 26A20G52\n", + "CIGAR string 1S100M\n", + "Reference seq: TGCTACCAAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACGTATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGA\n", + "Aligned seq: GTGCTACCAAGTTTATTTGCAGTGTTACCAGCACAACATTTACAAAACATATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGA\n", + "Qualities: array('B', [12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 12, 34, 34, 34, 34, 34, 26, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34])\n", + "CIGAR tuples before clipping (if needed):\n", + " [(4, 1), (0, 100)]\n", + "Aligned sequence before clipping (if needed):\n", + " GTGCTACCAAGTTTATTTGCAGTGTTACCAGCACAACATTTACAAAACATATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGA\n", + "Qualities before clipping:\n", + " array('B', [12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 12, 34, 34, 34, 34, 34, 26, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34])\n", + "Soft clipping quality scores ...\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "CIGAR tuples after clipping (if needed):\n", + " [(0, 100)]\n", + "Aligned sequence after clipping (if needed):\n", + " TGCTACCAAGTTTATTTGCAGTGTTACCAGCACAACATTTACAAAACATATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGA\n", + "Qualities after clipping:\n", + " array('B', [34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 12, 34, 34, 34, 34, 34, 26, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34])\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "['26', '20', '52']\n", + "[26, 47, 100]\n", + "Indicated reference seq:\n", + " tgctaccaagtttatttgcagtgttaAcagcacaacatttacaaaacGtattttgtacaatcaagtcttcactgcccttgcacactgggggggctaggga\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "Fixed reference seq:\n", + " tgctaccaagtttatttgcagtgttaAcagcacaacatttacaaaacGtattttgtacaatcaagtcttcactgcccttgcacactgggggggctaggga\n", + "Fixed aligned seq:\n", + " TGCTACCAAGTTTATTTGCAGTGTTACCAGCACAACATTTACAAAACATATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGA\n", + "Finalized fixed aligned seq:\n", + " tgctaccaagtttatttgcagtgttaCcagcacaacatttacaaaacAtattttgtacaatcaagtcttcactgcccttgcacactgggggggctaggga\n", + "Indicated qualities:\n", + " 34343434343434343434263434343434343434342634343434261234343434342626343434343434343434341234341234343434343434123434343434343412343434343412343434343434343434342634342634343426343426343434342634343434\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "alt bases ['C', 'A']\n", + "ref bases ['A', 'G']\n", + "Successfully ran get_edit_information_wrapper\n", + "alt bases: ['C', 'A'], ref bases: ['A', 'G']\n", + "Getting info: C A 12 27\n", + "Getting info: A G 12 48\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "MD tag 40G3T55\n", + "CIGAR tag 100M1S\n", + "is_reverse True\n", + "is_read1 False\n", + "is_read2 True\n", + "is_paired True\n", + "is_proper_pair True\n", + "mate_is_reverse False\n", + "read id VH01429:22:AACFJ5NHV:1:2211:40480:18323\n", + "VH01429:22:AACFJ5NHV:1:2211:40480:18323\t147\t#16\t43044312\t255\t100M1S\t#16\t43044305\t-107\tAAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACATATGTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGAAGACCTAC\tarray('B', [34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 12])\t[('MD', '40G3T55'), ('PG', 'MarkDuplicates'), ('RG', 'ET_STAMPExpt_1-CTRL_S2'), ('NH', 1), ('HI', 1), ('NM', 2), ('AS', 190)]\n", + "reverse_or_forward: -\n", + "----------------------------\n", + "MD tag: 40G3T55\n", + "CIGAR string 100M1S\n", + "Reference seq: AAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACGTATTTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGAAGACCTA\n", + "Aligned seq: AAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACATATGTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGAAGACCTAC\n", + "Qualities: array('B', [34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 12])\n", + "CIGAR tuples before clipping (if needed):\n", + " [(0, 100), (4, 1)]\n", + "Aligned sequence before clipping (if needed):\n", + " AAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACATATGTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGAAGACCTAC\n", + "Qualities before clipping:\n", + " array('B', [34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 12])\n", + "Soft clipping quality scores ...\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "CIGAR tuples after clipping (if needed):\n", + " [(0, 100)]\n", + "Aligned sequence after clipping (if needed):\n", + " AAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACATATGTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGAAGACCTA\n", + "Qualities after clipping:\n", + " array('B', [34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34])\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "['40', '3', '55']\n", + "[40, 44, 100]\n", + "Indicated reference seq:\n", + " aagtttatttgcagtgttaacagcacaacatttacaaaacGtatTttgtacaatcaagtcttcactgcccttgcacactgggggggctagggaagaccta\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "Fixed reference seq:\n", + " aagtttatttgcagtgttaacagcacaacatttacaaaacGtatTttgtacaatcaagtcttcactgcccttgcacactgggggggctagggaagaccta\n", + "Fixed aligned seq:\n", + " AAGTTTATTTGCAGTGTTAACAGCACAACATTTACAAAACATATGTTGTACAATCAAGTCTTCACTGCCCTTGCACACTGGGGGGGCTAGGGAAGACCTA\n", + "Finalized fixed aligned seq:\n", + " aagtttatttgcagtgttaacagcacaacatttacaaaacAtatGttgtacaatcaagtcttcactgcccttgcacactgggggggctagggaagaccta\n", + "Indicated qualities:\n", + " 34343434343434263434342634343434341234343434263434343434343434343434343426343434343434341234343434343434343434343434263412343434343434343434342634343434343434343434343434343426343434263434343434343434\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "alt bases ['A', 'G']\n", + "ref bases ['G', 'T']\n", + "Successfully ran get_edit_information_wrapper\n", + "alt bases: ['A', 'G'], ref bases: ['G', 'T']\n", + "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "Reads processed:\t2\n", + "Time to process reads in min:\t0.09813\n", "Read Summary:\n", - "edited 25\n", - "no_edits 19\n", - "total_edits 37\n", + "edited 2\n", + "total_edits 4\n", "dtype: int64\n", "\n", "Deleting overall_label_to_list_of_contents...\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Calculating coverage at edited sites, minimum read quality is 0...\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "\tsplit Citrine.dna_000_0_741, 0/1...\n", + "\tsplit chr17_000_0_5203591, 0/16...\n", + "\tsplit chr17_010_52035910_57239501, 10/16...\n", "Done grouping! Concatenating ...\n", - "Done concatenating!\n" + "Done concatenating!\n", + "edit_info_grouped_per_contig_combined dict_keys(['chr17_000_0_5203591', 'chr17_001_5203591_10407182', 'chr17_002_10407182_15610773', 'chr17_003_15610773_20814364', 'chr17_004_20814364_26017955', 'chr17_005_26017955_31221546', 'chr17_006_31221546_36425137', 'chr17_007_36425137_41628728', 'chr17_008_41628728_46832319', 'chr17_009_46832319_52035910', 'chr17_010_52035910_57239501', 'chr17_011_57239501_62443092', 'chr17_012_62443092_67646683', 'chr17_013_67646683_72850274', 'chr17_014_72850274_78053865', 'chr17_015_78053865_83257456'])\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.70s/it]\n", - "100%|██████████| 1/1 [00:01<00:00, 1.74s/it]\n", - "/tscc/projects/ps-yeolab3/ekofman/sailor2//marine.py:483: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage']\n", - "/tscc/projects/ps-yeolab3/ekofman/sailor2//marine.py:484: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " sites_for_conversion['start'] = sites_for_conversion['position'] - 1\n", - "***** WARNING: File /tmp/pybedtools.96t907wz.tmp has inconsistent naming convention for record:\n", - "Citrine.dna\t436\t438\tno_barcode_Citrine.dna_437_A_G_+\tA>G\t+\n", - "\n", - "***** WARNING: File /tmp/pybedtools.96t907wz.tmp has inconsistent naming convention for record:\n", - "Citrine.dna\t436\t438\tno_barcode_Citrine.dna_437_A_G_+\tA>G\t+\n", - "\n" + "100%|██████████| 16/16 [00:05<00:00, 2.79it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "~~~~~~\n", + "!!!!SINGLE END!!!!!\n", + "~~~~~~~`\n", + "~~~~~~\n", + "!!!!SINGLE END!!!!!\n", + "~~~~~~~`\n", + "~~~~~~\n", + "!!!!SINGLE END!!!!!\n", + "~~~~~~~`\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 16/16 [00:05<00:00, 2.92it/s]\n" ] }, { @@ -170,27 +324,29 @@ "text": [ "Concatenating results...\n", "Done concatenating.\n", - "original 37\n", - "filtered 37\n", + "original 4\n", + "filtered 3\n", "dtype: int64\n", - "Total time to calculate coverage: 0.029 minutes\n", + "Total time to calculate coverage: 0.094 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", - "\t37\n", + "\t3\n", "\tNumber of unique edit sites:\n", - "\t14\n", + "\t3\n", "Writing sites...\n", "\n", "Adding strand-specific conversion...\n", "\n", - "14 sites being converted to SAILOR format...\n", + "3 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "2 final deduplicated SAILOR-formatted sites\n", - "Making bedgraphs for ['CT'] conversions...\n", - "\n", + "1 final deduplicated C>T SAILOR-formatted sites\n", + "Generating SAILOR-style bed outputs for conversion AG...\n", + "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", + "0 final deduplicated A>G SAILOR-formatted sites\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", - "Current memory usage 21.566707MB; Peak: 22.675407MB\n", - "Time elapsed: 4.40s\n", + "Current memory usage 21.288251MB; Peak: 22.400467MB\n", + "Time elapsed: 12.35s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -213,7 +369,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", + "\tSailor outputs:\t['CT']\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -225,6 +381,8 @@ "\tVerbose:\tTrue\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -239,10 +397,7 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['chr17']]\n", + "Contig groups to be processed: [['chr17']]\n", "Each contig is being split into 16 subsets...\n", "contig_lengths_dict:{'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, 'chrM': 16569, 'chr1_KI270706v1_random': 175055, 'chr1_KI270707v1_random': 32032, 'chr1_KI270708v1_random': 127682, 'chr1_KI270709v1_random': 66860, 'chr1_KI270710v1_random': 40176, 'chr1_KI270711v1_random': 42210, 'chr1_KI270712v1_random': 176043, 'chr1_KI270713v1_random': 40745, 'chr1_KI270714v1_random': 41717, 'chr2_KI270715v1_random': 161471, 'chr2_KI270716v1_random': 153799, 'chr3_GL000221v1_random': 155397, 'chr4_GL000008v2_random': 209709, 'chr5_GL000208v1_random': 92689, 'chr9_KI270717v1_random': 40062, 'chr9_KI270718v1_random': 38054, 'chr9_KI270719v1_random': 176845, 'chr9_KI270720v1_random': 39050, 'chr11_KI270721v1_random': 100316, 'chr14_GL000009v2_random': 201709, 'chr14_GL000225v1_random': 211173, 'chr14_KI270722v1_random': 194050, 'chr14_GL000194v1_random': 191469, 'chr14_KI270723v1_random': 38115, 'chr14_KI270724v1_random': 39555, 'chr14_KI270725v1_random': 172810, 'chr14_KI270726v1_random': 43739, 'chr15_KI270727v1_random': 448248, 'chr16_KI270728v1_random': 1872759, 'chr17_GL000205v2_random': 185591, 'chr17_KI270729v1_random': 280839, 'chr17_KI270730v1_random': 112551, 'chr22_KI270731v1_random': 150754, 'chr22_KI270732v1_random': 41543, 'chr22_KI270733v1_random': 179772, 'chr22_KI270734v1_random': 165050, 'chr22_KI270735v1_random': 42811, 'chr22_KI270736v1_random': 181920, 'chr22_KI270737v1_random': 103838, 'chr22_KI270738v1_random': 99375, 'chr22_KI270739v1_random': 73985, 'chrY_KI270740v1_random': 37240, 'chrUn_KI270302v1': 2274, 'chrUn_KI270304v1': 2165, 'chrUn_KI270303v1': 1942, 'chrUn_KI270305v1': 1472, 'chrUn_KI270322v1': 21476, 'chrUn_KI270320v1': 4416, 'chrUn_KI270310v1': 1201, 'chrUn_KI270316v1': 1444, 'chrUn_KI270315v1': 2276, 'chrUn_KI270312v1': 998, 'chrUn_KI270311v1': 12399, 'chrUn_KI270317v1': 37690, 'chrUn_KI270412v1': 1179, 'chrUn_KI270411v1': 2646, 'chrUn_KI270414v1': 2489, 'chrUn_KI270419v1': 1029, 'chrUn_KI270418v1': 2145, 'chrUn_KI270420v1': 2321, 'chrUn_KI270424v1': 2140, 'chrUn_KI270417v1': 2043, 'chrUn_KI270422v1': 1445, 'chrUn_KI270423v1': 981, 'chrUn_KI270425v1': 1884, 'chrUn_KI270429v1': 1361, 'chrUn_KI270442v1': 392061, 'chrUn_KI270466v1': 1233, 'chrUn_KI270465v1': 1774, 'chrUn_KI270467v1': 3920, 'chrUn_KI270435v1': 92983, 'chrUn_KI270438v1': 112505, 'chrUn_KI270468v1': 4055, 'chrUn_KI270510v1': 2415, 'chrUn_KI270509v1': 2318, 'chrUn_KI270518v1': 2186, 'chrUn_KI270508v1': 1951, 'chrUn_KI270516v1': 1300, 'chrUn_KI270512v1': 22689, 'chrUn_KI270519v1': 138126, 'chrUn_KI270522v1': 5674, 'chrUn_KI270511v1': 8127, 'chrUn_KI270515v1': 6361, 'chrUn_KI270507v1': 5353, 'chrUn_KI270517v1': 3253, 'chrUn_KI270529v1': 1899, 'chrUn_KI270528v1': 2983, 'chrUn_KI270530v1': 2168, 'chrUn_KI270539v1': 993, 'chrUn_KI270538v1': 91309, 'chrUn_KI270544v1': 1202, 'chrUn_KI270548v1': 1599, 'chrUn_KI270583v1': 1400, 'chrUn_KI270587v1': 2969, 'chrUn_KI270580v1': 1553, 'chrUn_KI270581v1': 7046, 'chrUn_KI270579v1': 31033, 'chrUn_KI270589v1': 44474, 'chrUn_KI270590v1': 4685, 'chrUn_KI270584v1': 4513, 'chrUn_KI270582v1': 6504, 'chrUn_KI270588v1': 6158, 'chrUn_KI270593v1': 3041, 'chrUn_KI270591v1': 5796, 'chrUn_KI270330v1': 1652, 'chrUn_KI270329v1': 1040, 'chrUn_KI270334v1': 1368, 'chrUn_KI270333v1': 2699, 'chrUn_KI270335v1': 1048, 'chrUn_KI270338v1': 1428, 'chrUn_KI270340v1': 1428, 'chrUn_KI270336v1': 1026, 'chrUn_KI270337v1': 1121, 'chrUn_KI270363v1': 1803, 'chrUn_KI270364v1': 2855, 'chrUn_KI270362v1': 3530, 'chrUn_KI270366v1': 8320, 'chrUn_KI270378v1': 1048, 'chrUn_KI270379v1': 1045, 'chrUn_KI270389v1': 1298, 'chrUn_KI270390v1': 2387, 'chrUn_KI270387v1': 1537, 'chrUn_KI270395v1': 1143, 'chrUn_KI270396v1': 1880, 'chrUn_KI270388v1': 1216, 'chrUn_KI270394v1': 970, 'chrUn_KI270386v1': 1788, 'chrUn_KI270391v1': 1484, 'chrUn_KI270383v1': 1750, 'chrUn_KI270393v1': 1308, 'chrUn_KI270384v1': 1658, 'chrUn_KI270392v1': 971, 'chrUn_KI270381v1': 1930, 'chrUn_KI270385v1': 990, 'chrUn_KI270382v1': 4215, 'chrUn_KI270376v1': 1136, 'chrUn_KI270374v1': 2656, 'chrUn_KI270372v1': 1650, 'chrUn_KI270373v1': 1451, 'chrUn_KI270375v1': 2378, 'chrUn_KI270371v1': 2805, 'chrUn_KI270448v1': 7992, 'chrUn_KI270521v1': 7642, 'chrUn_GL000195v1': 182896, 'chrUn_GL000219v1': 179198, 'chrUn_GL000220v1': 161802, 'chrUn_GL000224v1': 179693, 'chrUn_KI270741v1': 157432, 'chrUn_GL000226v1': 15008, 'chrUn_GL000213v1': 164239, 'chrUn_KI270743v1': 210658, 'chrUn_KI270744v1': 168472, 'chrUn_KI270745v1': 41891, 'chrUn_KI270746v1': 66486, 'chrUn_KI270747v1': 198735, 'chrUn_KI270748v1': 93321, 'chrUn_KI270749v1': 158759, 'chrUn_KI270750v1': 148850, 'chrUn_KI270751v1': 150742, 'chrUn_KI270752v1': 27745, 'chrUn_KI270753v1': 62944, 'chrUn_KI270754v1': 40191, 'chrUn_KI270755v1': 36723, 'chrUn_KI270756v1': 79590, 'chrUn_KI270757v1': 71251, 'chrUn_GL000214v1': 137718, 'chrUn_KI270742v1': 186739, 'chrUn_GL000216v2': 176608, 'chrUn_GL000218v1': 161147, 'chrEBV': 171823}\n", "\tContig chr17\n", @@ -250,27 +405,21 @@ "chr17:000, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "MD tag 26A20G52\n", "CIGAR tag 1S100M\n", @@ -370,39 +519,36 @@ " aagtttatttgcagtgttaacagcacaacatttacaaaacAtatGttgtacaatcaagtcttcactgcccttgcacactgggggggctagggaagaccta\n", "Indicated qualities:\n", " 34343434343434263434342634343434341234343434263434343434343434343434343426343434343434341234343434343434343434343434263412343434343434343434342634343434343434343434343434343426343434263434343434343434\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "alt bases ['A', 'G']\n", - "ref bases ['G', 'T']\n", - "Successfully ran get_edit_information_wrapper\n", - "alt bases: ['A', 'G'], ref bases: ['G', 'T']\n", - "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "alt bases ['A', 'G']\n", + "ref bases ['G', 'T']\n", + "Successfully ran get_edit_information_wrapper\n", + "alt bases: ['A', 'G'], ref bases: ['G', 'T']\n", "Getting info: A G 34 41\n", "Getting info: G T 12 45\n", "chr17:008, total reads: 2, counts_df: chr17\n", "edited 2\n", "total_edits 4\n", + "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", "Reads processed:\t2\n", - "Time to process reads in min:\t0.03892\n", + "Time to process reads in min:\t0.09448\n", "Read Summary:\n", "edited 2\n", "total_edits 4\n", @@ -423,7 +569,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 7.17it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.92it/s]\n" ] }, { @@ -445,7 +591,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 7.04it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.93it/s]\n" ] }, { @@ -457,7 +603,7 @@ "original 4\n", "filtered 3\n", "dtype: int64\n", - "Total time to calculate coverage: 0.039 minutes\n", + "Total time to calculate coverage: 0.094 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t3\n", @@ -468,11 +614,139 @@ "Adding strand-specific conversion...\n", "\n", "3 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", + "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", + "1 final deduplicated C>T SAILOR-formatted sites\n", + "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", + "Current memory usage 21.284538MB; Peak: 22.397199MB\n", + "Time elapsed: 12.21s\n", + "-------------------------------\n", + "Deleting intermediate files...\n", + "-------------------------------\n", + "++++++\n", + "Done!\n", + "++++++\n", + "unstranded_pair_test\n", + "Removing old files...\n", + "Running tests...\n", + "Assuming 1 cores available for multiprocessing. Set this to the number of available cores for optimal execution.\n", + "Arguments:\n", + "\tBAM filepath:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/bams/citrine435.bam\n", + "\tAnnotation bedfile filepath:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed\n", + "\tOutput folder:\t/tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/unstranded_pair_test\n", + "\tBarcode whitelist:\tNone\n", + "\tStrandedness:\t0\n", + "\tBarcode Tag:\tNone\n", + "\tPaired End:\tTrue\n", + "\tCoverage only:\tFalse\n", + "\tFiltering only:\tFalse\n", + "\tAnnotation only:\tFalse\n", + "\tSailor outputs:\t['CT']\n", + "\tBedgraphs:\t['CT']\n", + "\tMinimum base quality:\t0\n", + "\tMinimum read quality:\t0\n", + "\tMinimum distance from end:\t0\n", + "\tMaximum edits per read:\tNone\n", + "\tContigs:\tall\n", + "\tNumber of intervals:\t1\n", + "\tCores:\t1\n", + "\tVerbose:\tFalse\n", + "\tKeep intermediate files:\tFalse\n", + "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", + "\n", + ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", + "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", + "+:+ +:+:+ +:+ +:+ +:+ +:+ +:+ +:+ :+:+:+ +:+ +:+ \n", + "+#+ +:+ +#+ +#++:++#++: +#++:++#: +#+ +#+ +:+ +#+ +#++:++# \n", + "+#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+#+# +#+ \n", + "#+# #+# #+# #+# #+# #+# #+# #+# #+#+# #+# \n", + "### ### ### ### ### ### ########### ### #### ########## \n", + "==================================================================\n", + "Multi-core Algorithm for Rapid Identification of Nucleotide Edits\n", + "==================================================================\n", + "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "WARNING /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/unstranded_pair_test is not empty\n", + "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "~~~~~~~~~~~~~~~~~~\n", + "Identifying edits\n", + "~~~~~~~~~~~~~~~~~~\n", + "Contig groups to be processed: [[]]\n", + "Each contig is being split into 1 subsets...\n", + "\tContig Citrine.dna\n", + "1 total jobs\n", + "Reads processed:\t44\n", + "Time to process reads in min:\t0.01868\n", + "Read Summary:\n", + "edited 25\n", + "no_edits 19\n", + "total_edits 37\n", + "dtype: int64\n", + "\n", + "Deleting overall_label_to_list_of_contents...\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "Calculating coverage at edited sites, minimum read quality is 0...\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "\tsplit Citrine.dna_000_0_741, 0/1...\n", + "Done grouping! Concatenating ...\n", + "Done concatenating!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.09s/it]\n", + "100%|██████████| 1/1 [00:01<00:00, 1.09s/it]\n", + "/tscc/projects/ps-yeolab3/ekofman/sailor2//marine.py:496: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage']\n", + "/tscc/projects/ps-yeolab3/ekofman/sailor2//marine.py:497: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " sites_for_conversion['start'] = sites_for_conversion['position'] - 1\n", + "***** WARNING: File /tmp/pybedtools.5bmfdxml.tmp has inconsistent naming convention for record:\n", + "Citrine.dna\t453\t455\tno_barcode_Citrine.dna_454_T_G_+\tT>G\t+\n", + "\n", + "***** WARNING: File /tmp/pybedtools.5bmfdxml.tmp has inconsistent naming convention for record:\n", + "Citrine.dna\t453\t455\tno_barcode_Citrine.dna_454_T_G_+\tT>G\t+\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concatenating results...\n", + "Done concatenating.\n", + "original 37\n", + "filtered 37\n", + "dtype: int64\n", + "Total time to calculate coverage: 0.018 minutes\n", + "Filtering..\n", + "\tNumber of edits after filtering:\n", + "\t37\n", + "\tNumber of unique edit sites:\n", + "\t14\n", + "Writing sites...\n", + "\n", + "Adding strand-specific conversion...\n", + "\n", + "14 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "1 final deduplicated SAILOR-formatted sites\n", + "2 final deduplicated C>T SAILOR-formatted sites\n", + "Making bedgraphs for ['CT'] conversions...\n", + "\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", - "Current memory usage 21.284636MB; Peak: 22.397353MB\n", - "Time elapsed: 5.56s\n", + "Current memory usage 21.569455MB; Peak: 22.678101MB\n", + "Time elapsed: 3.01s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -494,7 +768,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", + "\tSailor outputs:\t['CT']\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -506,6 +780,8 @@ "\tVerbose:\tTrue\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -523,17 +799,29 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['chr17']]\n", + "Contig groups to be processed: [['chr17']]\n", "Each contig is being split into 16 subsets...\n", "contig_lengths_dict:{'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, 'chrM': 16569, 'chr1_KI270706v1_random': 175055, 'chr1_KI270707v1_random': 32032, 'chr1_KI270708v1_random': 127682, 'chr1_KI270709v1_random': 66860, 'chr1_KI270710v1_random': 40176, 'chr1_KI270711v1_random': 42210, 'chr1_KI270712v1_random': 176043, 'chr1_KI270713v1_random': 40745, 'chr1_KI270714v1_random': 41717, 'chr2_KI270715v1_random': 161471, 'chr2_KI270716v1_random': 153799, 'chr3_GL000221v1_random': 155397, 'chr4_GL000008v2_random': 209709, 'chr5_GL000208v1_random': 92689, 'chr9_KI270717v1_random': 40062, 'chr9_KI270718v1_random': 38054, 'chr9_KI270719v1_random': 176845, 'chr9_KI270720v1_random': 39050, 'chr11_KI270721v1_random': 100316, 'chr14_GL000009v2_random': 201709, 'chr14_GL000225v1_random': 211173, 'chr14_KI270722v1_random': 194050, 'chr14_GL000194v1_random': 191469, 'chr14_KI270723v1_random': 38115, 'chr14_KI270724v1_random': 39555, 'chr14_KI270725v1_random': 172810, 'chr14_KI270726v1_random': 43739, 'chr15_KI270727v1_random': 448248, 'chr16_KI270728v1_random': 1872759, 'chr17_GL000205v2_random': 185591, 'chr17_KI270729v1_random': 280839, 'chr17_KI270730v1_random': 112551, 'chr22_KI270731v1_random': 150754, 'chr22_KI270732v1_random': 41543, 'chr22_KI270733v1_random': 179772, 'chr22_KI270734v1_random': 165050, 'chr22_KI270735v1_random': 42811, 'chr22_KI270736v1_random': 181920, 'chr22_KI270737v1_random': 103838, 'chr22_KI270738v1_random': 99375, 'chr22_KI270739v1_random': 73985, 'chrY_KI270740v1_random': 37240, 'chrUn_KI270302v1': 2274, 'chrUn_KI270304v1': 2165, 'chrUn_KI270303v1': 1942, 'chrUn_KI270305v1': 1472, 'chrUn_KI270322v1': 21476, 'chrUn_KI270320v1': 4416, 'chrUn_KI270310v1': 1201, 'chrUn_KI270316v1': 1444, 'chrUn_KI270315v1': 2276, 'chrUn_KI270312v1': 998, 'chrUn_KI270311v1': 12399, 'chrUn_KI270317v1': 37690, 'chrUn_KI270412v1': 1179, 'chrUn_KI270411v1': 2646, 'chrUn_KI270414v1': 2489, 'chrUn_KI270419v1': 1029, 'chrUn_KI270418v1': 2145, 'chrUn_KI270420v1': 2321, 'chrUn_KI270424v1': 2140, 'chrUn_KI270417v1': 2043, 'chrUn_KI270422v1': 1445, 'chrUn_KI270423v1': 981, 'chrUn_KI270425v1': 1884, 'chrUn_KI270429v1': 1361, 'chrUn_KI270442v1': 392061, 'chrUn_KI270466v1': 1233, 'chrUn_KI270465v1': 1774, 'chrUn_KI270467v1': 3920, 'chrUn_KI270435v1': 92983, 'chrUn_KI270438v1': 112505, 'chrUn_KI270468v1': 4055, 'chrUn_KI270510v1': 2415, 'chrUn_KI270509v1': 2318, 'chrUn_KI270518v1': 2186, 'chrUn_KI270508v1': 1951, 'chrUn_KI270516v1': 1300, 'chrUn_KI270512v1': 22689, 'chrUn_KI270519v1': 138126, 'chrUn_KI270522v1': 5674, 'chrUn_KI270511v1': 8127, 'chrUn_KI270515v1': 6361, 'chrUn_KI270507v1': 5353, 'chrUn_KI270517v1': 3253, 'chrUn_KI270529v1': 1899, 'chrUn_KI270528v1': 2983, 'chrUn_KI270530v1': 2168, 'chrUn_KI270539v1': 993, 'chrUn_KI270538v1': 91309, 'chrUn_KI270544v1': 1202, 'chrUn_KI270548v1': 1599, 'chrUn_KI270583v1': 1400, 'chrUn_KI270587v1': 2969, 'chrUn_KI270580v1': 1553, 'chrUn_KI270581v1': 7046, 'chrUn_KI270579v1': 31033, 'chrUn_KI270589v1': 44474, 'chrUn_KI270590v1': 4685, 'chrUn_KI270584v1': 4513, 'chrUn_KI270582v1': 6504, 'chrUn_KI270588v1': 6158, 'chrUn_KI270593v1': 3041, 'chrUn_KI270591v1': 5796, 'chrUn_KI270330v1': 1652, 'chrUn_KI270329v1': 1040, 'chrUn_KI270334v1': 1368, 'chrUn_KI270333v1': 2699, 'chrUn_KI270335v1': 1048, 'chrUn_KI270338v1': 1428, 'chrUn_KI270340v1': 1428, 'chrUn_KI270336v1': 1026, 'chrUn_KI270337v1': 1121, 'chrUn_KI270363v1': 1803, 'chrUn_KI270364v1': 2855, 'chrUn_KI270362v1': 3530, 'chrUn_KI270366v1': 8320, 'chrUn_KI270378v1': 1048, 'chrUn_KI270379v1': 1045, 'chrUn_KI270389v1': 1298, 'chrUn_KI270390v1': 2387, 'chrUn_KI270387v1': 1537, 'chrUn_KI270395v1': 1143, 'chrUn_KI270396v1': 1880, 'chrUn_KI270388v1': 1216, 'chrUn_KI270394v1': 970, 'chrUn_KI270386v1': 1788, 'chrUn_KI270391v1': 1484, 'chrUn_KI270383v1': 1750, 'chrUn_KI270393v1': 1308, 'chrUn_KI270384v1': 1658, 'chrUn_KI270392v1': 971, 'chrUn_KI270381v1': 1930, 'chrUn_KI270385v1': 990, 'chrUn_KI270382v1': 4215, 'chrUn_KI270376v1': 1136, 'chrUn_KI270374v1': 2656, 'chrUn_KI270372v1': 1650, 'chrUn_KI270373v1': 1451, 'chrUn_KI270375v1': 2378, 'chrUn_KI270371v1': 2805, 'chrUn_KI270448v1': 7992, 'chrUn_KI270521v1': 7642, 'chrUn_GL000195v1': 182896, 'chrUn_GL000219v1': 179198, 'chrUn_GL000220v1': 161802, 'chrUn_GL000224v1': 179693, 'chrUn_KI270741v1': 157432, 'chrUn_GL000226v1': 15008, 'chrUn_GL000213v1': 164239, 'chrUn_KI270743v1': 210658, 'chrUn_KI270744v1': 168472, 'chrUn_KI270745v1': 41891, 'chrUn_KI270746v1': 66486, 'chrUn_KI270747v1': 198735, 'chrUn_KI270748v1': 93321, 'chrUn_KI270749v1': 158759, 'chrUn_KI270750v1': 148850, 'chrUn_KI270751v1': 150742, 'chrUn_KI270752v1': 27745, 'chrUn_KI270753v1': 62944, 'chrUn_KI270754v1': 40191, 'chrUn_KI270755v1': 36723, 'chrUn_KI270756v1': 79590, 'chrUn_KI270757v1': 71251, 'chrUn_GL000214v1': 137718, 'chrUn_KI270742v1': 186739, 'chrUn_GL000216v2': 176608, 'chrUn_GL000218v1': 161147, 'chrEBV': 171823}\n", "\tContig chr17\n", "16 total jobs\n", + "chr17:000, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", + "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "MD tag 26A20G52\n", "CIGAR tag 1S100M\n", @@ -633,53 +921,45 @@ " aagtttatttgcagtgttaacagcacaacatttacaaaacAtatGttgtacaatcaagtcttcactgcccttgcacactgggggggctagggaagaccta\n", "Indicated qualities:\n", " 34343434343434263434342634343434341234343434263434343434343434343434343426343434343434341234343434343434343434343434263412343434343434343434342634343434343434343434343434343426343434263434343434343434\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "alt bases ['A', 'G']\n", - "ref bases ['G', 'T']\n", - "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:000, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "alt bases ['A', 'G']\n", + "ref bases ['G', 'T']\n", + "Successfully ran get_edit_information_wrapper\n", + "alt bases: ['A', 'G'], ref bases: ['G', 'T']\n", + "Getting info: A G 34 41\n", + "Getting info: G T 12 45\n", + "chr17:008, total reads: 2, counts_df: chr17\n", + "edited 2\n", + "total_edits 4\n", "Reads processed:\t2\n", - "Time to process reads in min:\t0.04157\n", + "Time to process reads in min:\t0.09358\n", "Read Summary:\n", "edited 2\n", "total_edits 4\n", @@ -700,7 +980,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.72it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.98it/s]\n" ] }, { @@ -728,7 +1008,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.73it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.85it/s]\n" ] }, { @@ -740,7 +1020,7 @@ "original 4\n", "filtered 3\n", "dtype: int64\n", - "Total time to calculate coverage: 0.042 minutes\n", + "Total time to calculate coverage: 0.097 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t3\n", @@ -751,11 +1031,12 @@ "Adding strand-specific conversion...\n", "\n", "3 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "1 final deduplicated SAILOR-formatted sites\n", + "1 final deduplicated C>T SAILOR-formatted sites\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", - "Current memory usage 21.285267MB; Peak: 22.397912MB\n", - "Time elapsed: 5.93s\n", + "Current memory usage 21.28509MB; Peak: 22.397674MB\n", + "Time elapsed: 12.28s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -777,7 +1058,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", + "\tSailor outputs:\t['CT']\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -789,6 +1070,8 @@ "\tVerbose:\tTrue\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -806,118 +1089,58 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['chr17']]\n", + "Contig groups to be processed: [['chr17']]\n", "Each contig is being split into 16 subsets...\n", "contig_lengths_dict:{'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, 'chrM': 16569, 'chr1_KI270706v1_random': 175055, 'chr1_KI270707v1_random': 32032, 'chr1_KI270708v1_random': 127682, 'chr1_KI270709v1_random': 66860, 'chr1_KI270710v1_random': 40176, 'chr1_KI270711v1_random': 42210, 'chr1_KI270712v1_random': 176043, 'chr1_KI270713v1_random': 40745, 'chr1_KI270714v1_random': 41717, 'chr2_KI270715v1_random': 161471, 'chr2_KI270716v1_random': 153799, 'chr3_GL000221v1_random': 155397, 'chr4_GL000008v2_random': 209709, 'chr5_GL000208v1_random': 92689, 'chr9_KI270717v1_random': 40062, 'chr9_KI270718v1_random': 38054, 'chr9_KI270719v1_random': 176845, 'chr9_KI270720v1_random': 39050, 'chr11_KI270721v1_random': 100316, 'chr14_GL000009v2_random': 201709, 'chr14_GL000225v1_random': 211173, 'chr14_KI270722v1_random': 194050, 'chr14_GL000194v1_random': 191469, 'chr14_KI270723v1_random': 38115, 'chr14_KI270724v1_random': 39555, 'chr14_KI270725v1_random': 172810, 'chr14_KI270726v1_random': 43739, 'chr15_KI270727v1_random': 448248, 'chr16_KI270728v1_random': 1872759, 'chr17_GL000205v2_random': 185591, 'chr17_KI270729v1_random': 280839, 'chr17_KI270730v1_random': 112551, 'chr22_KI270731v1_random': 150754, 'chr22_KI270732v1_random': 41543, 'chr22_KI270733v1_random': 179772, 'chr22_KI270734v1_random': 165050, 'chr22_KI270735v1_random': 42811, 'chr22_KI270736v1_random': 181920, 'chr22_KI270737v1_random': 103838, 'chr22_KI270738v1_random': 99375, 'chr22_KI270739v1_random': 73985, 'chrY_KI270740v1_random': 37240, 'chrUn_KI270302v1': 2274, 'chrUn_KI270304v1': 2165, 'chrUn_KI270303v1': 1942, 'chrUn_KI270305v1': 1472, 'chrUn_KI270322v1': 21476, 'chrUn_KI270320v1': 4416, 'chrUn_KI270310v1': 1201, 'chrUn_KI270316v1': 1444, 'chrUn_KI270315v1': 2276, 'chrUn_KI270312v1': 998, 'chrUn_KI270311v1': 12399, 'chrUn_KI270317v1': 37690, 'chrUn_KI270412v1': 1179, 'chrUn_KI270411v1': 2646, 'chrUn_KI270414v1': 2489, 'chrUn_KI270419v1': 1029, 'chrUn_KI270418v1': 2145, 'chrUn_KI270420v1': 2321, 'chrUn_KI270424v1': 2140, 'chrUn_KI270417v1': 2043, 'chrUn_KI270422v1': 1445, 'chrUn_KI270423v1': 981, 'chrUn_KI270425v1': 1884, 'chrUn_KI270429v1': 1361, 'chrUn_KI270442v1': 392061, 'chrUn_KI270466v1': 1233, 'chrUn_KI270465v1': 1774, 'chrUn_KI270467v1': 3920, 'chrUn_KI270435v1': 92983, 'chrUn_KI270438v1': 112505, 'chrUn_KI270468v1': 4055, 'chrUn_KI270510v1': 2415, 'chrUn_KI270509v1': 2318, 'chrUn_KI270518v1': 2186, 'chrUn_KI270508v1': 1951, 'chrUn_KI270516v1': 1300, 'chrUn_KI270512v1': 22689, 'chrUn_KI270519v1': 138126, 'chrUn_KI270522v1': 5674, 'chrUn_KI270511v1': 8127, 'chrUn_KI270515v1': 6361, 'chrUn_KI270507v1': 5353, 'chrUn_KI270517v1': 3253, 'chrUn_KI270529v1': 1899, 'chrUn_KI270528v1': 2983, 'chrUn_KI270530v1': 2168, 'chrUn_KI270539v1': 993, 'chrUn_KI270538v1': 91309, 'chrUn_KI270544v1': 1202, 'chrUn_KI270548v1': 1599, 'chrUn_KI270583v1': 1400, 'chrUn_KI270587v1': 2969, 'chrUn_KI270580v1': 1553, 'chrUn_KI270581v1': 7046, 'chrUn_KI270579v1': 31033, 'chrUn_KI270589v1': 44474, 'chrUn_KI270590v1': 4685, 'chrUn_KI270584v1': 4513, 'chrUn_KI270582v1': 6504, 'chrUn_KI270588v1': 6158, 'chrUn_KI270593v1': 3041, 'chrUn_KI270591v1': 5796, 'chrUn_KI270330v1': 1652, 'chrUn_KI270329v1': 1040, 'chrUn_KI270334v1': 1368, 'chrUn_KI270333v1': 2699, 'chrUn_KI270335v1': 1048, 'chrUn_KI270338v1': 1428, 'chrUn_KI270340v1': 1428, 'chrUn_KI270336v1': 1026, 'chrUn_KI270337v1': 1121, 'chrUn_KI270363v1': 1803, 'chrUn_KI270364v1': 2855, 'chrUn_KI270362v1': 3530, 'chrUn_KI270366v1': 8320, 'chrUn_KI270378v1': 1048, 'chrUn_KI270379v1': 1045, 'chrUn_KI270389v1': 1298, 'chrUn_KI270390v1': 2387, 'chrUn_KI270387v1': 1537, 'chrUn_KI270395v1': 1143, 'chrUn_KI270396v1': 1880, 'chrUn_KI270388v1': 1216, 'chrUn_KI270394v1': 970, 'chrUn_KI270386v1': 1788, 'chrUn_KI270391v1': 1484, 'chrUn_KI270383v1': 1750, 'chrUn_KI270393v1': 1308, 'chrUn_KI270384v1': 1658, 'chrUn_KI270392v1': 971, 'chrUn_KI270381v1': 1930, 'chrUn_KI270385v1': 990, 'chrUn_KI270382v1': 4215, 'chrUn_KI270376v1': 1136, 'chrUn_KI270374v1': 2656, 'chrUn_KI270372v1': 1650, 'chrUn_KI270373v1': 1451, 'chrUn_KI270375v1': 2378, 'chrUn_KI270371v1': 2805, 'chrUn_KI270448v1': 7992, 'chrUn_KI270521v1': 7642, 'chrUn_GL000195v1': 182896, 'chrUn_GL000219v1': 179198, 'chrUn_GL000220v1': 161802, 'chrUn_GL000224v1': 179693, 'chrUn_KI270741v1': 157432, 'chrUn_GL000226v1': 15008, 'chrUn_GL000213v1': 164239, 'chrUn_KI270743v1': 210658, 'chrUn_KI270744v1': 168472, 'chrUn_KI270745v1': 41891, 'chrUn_KI270746v1': 66486, 'chrUn_KI270747v1': 198735, 'chrUn_KI270748v1': 93321, 'chrUn_KI270749v1': 158759, 'chrUn_KI270750v1': 148850, 'chrUn_KI270751v1': 150742, 'chrUn_KI270752v1': 27745, 'chrUn_KI270753v1': 62944, 'chrUn_KI270754v1': 40191, 'chrUn_KI270755v1': 36723, 'chrUn_KI270756v1': 79590, 'chrUn_KI270757v1': 71251, 'chrUn_GL000214v1': 137718, 'chrUn_KI270742v1': 186739, 'chrUn_GL000216v2': 176608, 'chrUn_GL000218v1': 161147, 'chrEBV': 171823}\n", "\tContig chr17\n", "16 total jobs\n", - "chr17:000, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:000, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "MD tag 10A0G0A84\n", - "CIGAR tag 13M198556N84M1S\n", - "is_reverse False\n", - "is_read1 False\n", - "is_read2 True\n", - "is_paired True\n", - "is_proper_pair True\n", - "mate_is_reverse True\n", - "read id VH01429:22:AACFJ5NHV:1:1112:41030:25422\n", - "VH01429:22:AACFJ5NHV:1:1112:41030:25422\t163\t#16\t43001705\t255\t13M198556N84M1S\t#16\t43200271\t198653\tGGCTGTCATGGAGGAGGAGGAGGATGAGCTCAAAGATGAAGTTCAAAGTCAGTCCTCTGCTTCCTCAGAGGATTACATCATCATCCTGCCTGAGTGCA\tarray('B', [34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34])\t[('MD', '10A0G0A84'), ('PG', 'MarkDuplicates'), ('RG', 'ET_STAMPExpt_1-CTRL_S2'), ('NH', 1), ('HI', 1), ('NM', 3), ('AS', 172), ('XS', '+')]\n", - "reverse_or_forward: +\n", - "----------------------------\n", - "MD tag: 10A0G0A84\n", - "CIGAR string 13M198556N84M1S\n", - "Reference seq: GGCTGTCATGAGAGAGGAGGAGGATGAGCTCAAAGATGAAGTTCAAAGTCAGTCCTCTGCTTCCTCAGAGGATTACATCATCATCCTGCCTGAGTGC\n", - "Aligned seq: GGCTGTCATGGAGGAGGAGGAGGATGAGCTCAAAGATGAAGTTCAAAGTCAGTCCTCTGCTTCCTCAGAGGATTACATCATCATCCTGCCTGAGTGCA\n", - "Qualities: array('B', [34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34])\n", - "CIGAR tuples before clipping (if needed):\n", - " [(0, 13), (3, 198556), (0, 84), (4, 1)]\n", - "Aligned sequence before clipping (if needed):\n", - " GGCTGTCATGGAGGAGGAGGAGGATGAGCTCAAAGATGAAGTTCAAAGTCAGTCCTCTGCTTCCTCAGAGGATTACATCATCATCCTGCCTGAGTGCA\n", - "Qualities before clipping:\n", - " array('B', [34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34])\n", - "Soft clipping quality scores ...\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "CIGAR tuples after clipping (if needed):\n", - " [(0, 13), (3, 198556), (0, 84)]\n", - "Aligned sequence after clipping (if needed):\n", - " GGCTGTCATGGAGGAGGAGGAGGATGAGCTCAAAGATGAAGTTCAAAGTCAGTCCTCTGCTTCCTCAGAGGATTACATCATCATCCTGCCTGAGTGC\n", - "Qualities after clipping:\n", - " array('B', [34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 34, 26, 34, 34, 34, 34, 12, 34, 34, 12, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34])\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "['10', '0', '0', '84']\n", - "[10, 11, 12, 97]\n", - "Indicated reference seq:\n", - " ggctgtcatgAGAgaggaggaggatgagctcaaagatgaagttcaaagtcagtcctctgcttcctcagaggattacatcatcatcctgcctgagtgc\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "Fixed reference seq:\n", - " ggctgtcatgAGA198556*ngaggaggaggatgagctcaaagatgaagttcaaagtcagtcctctgcttcctcagaggattacatcatcatcctgcctgagtgc\n", - "Fixed aligned seq:\n", - " GGCTGTCATGGAG198556*nGAGGAGGAGGATGAGCTCAAAGATGAAGTTCAAAGTCAGTCCTCTGCTTCCTCAGAGGATTACATCATCATCCTGCCTGAGTGC\n", - "Finalized fixed aligned seq:\n", - " ggctgtcatgGAG198556*ngaggaggaggatgagctcaaagatgaagttcaaagtcagtcctctgcttcctcagaggattacatcatcatcctgcctgagtgc\n", - "Indicated qualities:\n", - " 34343434343434343434263434343434343434342634342634343434343434343434343434343434343434343434123434343434343426343434342634343434343434342634343434342634343434123434123434343434343434343434343434\n", - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "alt bases ['G', 'A', 'G']\n", - "ref bases ['A', 'G', 'A']\n", - "Successfully ran get_edit_information_wrapper\n", - "alt bases: ['G', 'A', 'G'], ref bases: ['A', 'G', 'A']\n", - "Getting info: G A 26 11\n", - "Getting info: A G 34 12\n", - "Getting info: G A 34 13\n", - "chr17:008, total reads: 1, counts_df: chr17\n", - "edited 1\n", - "total_edits 3\n", "Reads processed:\t1\n", - "Time to process reads in min:\t0.04317\n", + "Time to process reads in min:\t0.10021\n", "Read Summary:\n", "edited 1\n", "total_edits 3\n", @@ -938,7 +1161,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.50it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.76it/s]\n" ] }, { @@ -966,7 +1189,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.11it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.78it/s]\n" ] }, { @@ -978,7 +1201,7 @@ "original 3\n", "filtered 3\n", "dtype: int64\n", - "Total time to calculate coverage: 0.046 minutes\n", + "Total time to calculate coverage: 0.099 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t3\n", @@ -989,11 +1212,12 @@ "Adding strand-specific conversion...\n", "\n", "3 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "0 final deduplicated SAILOR-formatted sites\n", + "0 final deduplicated C>T SAILOR-formatted sites\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", - "Current memory usage 21.433455MB; Peak: 22.543922MB\n", - "Time elapsed: 6.32s\n", + "Current memory usage 21.435929MB; Peak: 22.546208MB\n", + "Time elapsed: 12.86s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -1015,7 +1239,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", + "\tSailor outputs:\t[]\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -1027,6 +1251,8 @@ "\tVerbose:\tTrue\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -1044,10 +1270,7 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['chr17']]\n", + "Contig groups to be processed: [['chr17']]\n", "Each contig is being split into 16 subsets...\n", "contig_lengths_dict:{'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, 'chrM': 16569, 'chr1_KI270706v1_random': 175055, 'chr1_KI270707v1_random': 32032, 'chr1_KI270708v1_random': 127682, 'chr1_KI270709v1_random': 66860, 'chr1_KI270710v1_random': 40176, 'chr1_KI270711v1_random': 42210, 'chr1_KI270712v1_random': 176043, 'chr1_KI270713v1_random': 40745, 'chr1_KI270714v1_random': 41717, 'chr2_KI270715v1_random': 161471, 'chr2_KI270716v1_random': 153799, 'chr3_GL000221v1_random': 155397, 'chr4_GL000008v2_random': 209709, 'chr5_GL000208v1_random': 92689, 'chr9_KI270717v1_random': 40062, 'chr9_KI270718v1_random': 38054, 'chr9_KI270719v1_random': 176845, 'chr9_KI270720v1_random': 39050, 'chr11_KI270721v1_random': 100316, 'chr14_GL000009v2_random': 201709, 'chr14_GL000225v1_random': 211173, 'chr14_KI270722v1_random': 194050, 'chr14_GL000194v1_random': 191469, 'chr14_KI270723v1_random': 38115, 'chr14_KI270724v1_random': 39555, 'chr14_KI270725v1_random': 172810, 'chr14_KI270726v1_random': 43739, 'chr15_KI270727v1_random': 448248, 'chr16_KI270728v1_random': 1872759, 'chr17_GL000205v2_random': 185591, 'chr17_KI270729v1_random': 280839, 'chr17_KI270730v1_random': 112551, 'chr22_KI270731v1_random': 150754, 'chr22_KI270732v1_random': 41543, 'chr22_KI270733v1_random': 179772, 'chr22_KI270734v1_random': 165050, 'chr22_KI270735v1_random': 42811, 'chr22_KI270736v1_random': 181920, 'chr22_KI270737v1_random': 103838, 'chr22_KI270738v1_random': 99375, 'chr22_KI270739v1_random': 73985, 'chrY_KI270740v1_random': 37240, 'chrUn_KI270302v1': 2274, 'chrUn_KI270304v1': 2165, 'chrUn_KI270303v1': 1942, 'chrUn_KI270305v1': 1472, 'chrUn_KI270322v1': 21476, 'chrUn_KI270320v1': 4416, 'chrUn_KI270310v1': 1201, 'chrUn_KI270316v1': 1444, 'chrUn_KI270315v1': 2276, 'chrUn_KI270312v1': 998, 'chrUn_KI270311v1': 12399, 'chrUn_KI270317v1': 37690, 'chrUn_KI270412v1': 1179, 'chrUn_KI270411v1': 2646, 'chrUn_KI270414v1': 2489, 'chrUn_KI270419v1': 1029, 'chrUn_KI270418v1': 2145, 'chrUn_KI270420v1': 2321, 'chrUn_KI270424v1': 2140, 'chrUn_KI270417v1': 2043, 'chrUn_KI270422v1': 1445, 'chrUn_KI270423v1': 981, 'chrUn_KI270425v1': 1884, 'chrUn_KI270429v1': 1361, 'chrUn_KI270442v1': 392061, 'chrUn_KI270466v1': 1233, 'chrUn_KI270465v1': 1774, 'chrUn_KI270467v1': 3920, 'chrUn_KI270435v1': 92983, 'chrUn_KI270438v1': 112505, 'chrUn_KI270468v1': 4055, 'chrUn_KI270510v1': 2415, 'chrUn_KI270509v1': 2318, 'chrUn_KI270518v1': 2186, 'chrUn_KI270508v1': 1951, 'chrUn_KI270516v1': 1300, 'chrUn_KI270512v1': 22689, 'chrUn_KI270519v1': 138126, 'chrUn_KI270522v1': 5674, 'chrUn_KI270511v1': 8127, 'chrUn_KI270515v1': 6361, 'chrUn_KI270507v1': 5353, 'chrUn_KI270517v1': 3253, 'chrUn_KI270529v1': 1899, 'chrUn_KI270528v1': 2983, 'chrUn_KI270530v1': 2168, 'chrUn_KI270539v1': 993, 'chrUn_KI270538v1': 91309, 'chrUn_KI270544v1': 1202, 'chrUn_KI270548v1': 1599, 'chrUn_KI270583v1': 1400, 'chrUn_KI270587v1': 2969, 'chrUn_KI270580v1': 1553, 'chrUn_KI270581v1': 7046, 'chrUn_KI270579v1': 31033, 'chrUn_KI270589v1': 44474, 'chrUn_KI270590v1': 4685, 'chrUn_KI270584v1': 4513, 'chrUn_KI270582v1': 6504, 'chrUn_KI270588v1': 6158, 'chrUn_KI270593v1': 3041, 'chrUn_KI270591v1': 5796, 'chrUn_KI270330v1': 1652, 'chrUn_KI270329v1': 1040, 'chrUn_KI270334v1': 1368, 'chrUn_KI270333v1': 2699, 'chrUn_KI270335v1': 1048, 'chrUn_KI270338v1': 1428, 'chrUn_KI270340v1': 1428, 'chrUn_KI270336v1': 1026, 'chrUn_KI270337v1': 1121, 'chrUn_KI270363v1': 1803, 'chrUn_KI270364v1': 2855, 'chrUn_KI270362v1': 3530, 'chrUn_KI270366v1': 8320, 'chrUn_KI270378v1': 1048, 'chrUn_KI270379v1': 1045, 'chrUn_KI270389v1': 1298, 'chrUn_KI270390v1': 2387, 'chrUn_KI270387v1': 1537, 'chrUn_KI270395v1': 1143, 'chrUn_KI270396v1': 1880, 'chrUn_KI270388v1': 1216, 'chrUn_KI270394v1': 970, 'chrUn_KI270386v1': 1788, 'chrUn_KI270391v1': 1484, 'chrUn_KI270383v1': 1750, 'chrUn_KI270393v1': 1308, 'chrUn_KI270384v1': 1658, 'chrUn_KI270392v1': 971, 'chrUn_KI270381v1': 1930, 'chrUn_KI270385v1': 990, 'chrUn_KI270382v1': 4215, 'chrUn_KI270376v1': 1136, 'chrUn_KI270374v1': 2656, 'chrUn_KI270372v1': 1650, 'chrUn_KI270373v1': 1451, 'chrUn_KI270375v1': 2378, 'chrUn_KI270371v1': 2805, 'chrUn_KI270448v1': 7992, 'chrUn_KI270521v1': 7642, 'chrUn_GL000195v1': 182896, 'chrUn_GL000219v1': 179198, 'chrUn_GL000220v1': 161802, 'chrUn_GL000224v1': 179693, 'chrUn_KI270741v1': 157432, 'chrUn_GL000226v1': 15008, 'chrUn_GL000213v1': 164239, 'chrUn_KI270743v1': 210658, 'chrUn_KI270744v1': 168472, 'chrUn_KI270745v1': 41891, 'chrUn_KI270746v1': 66486, 'chrUn_KI270747v1': 198735, 'chrUn_KI270748v1': 93321, 'chrUn_KI270749v1': 158759, 'chrUn_KI270750v1': 148850, 'chrUn_KI270751v1': 150742, 'chrUn_KI270752v1': 27745, 'chrUn_KI270753v1': 62944, 'chrUn_KI270754v1': 40191, 'chrUn_KI270755v1': 36723, 'chrUn_KI270756v1': 79590, 'chrUn_KI270757v1': 71251, 'chrUn_GL000214v1': 137718, 'chrUn_KI270742v1': 186739, 'chrUn_GL000216v2': 176608, 'chrUn_GL000218v1': 161147, 'chrEBV': 171823}\n", "\tContig chr17\n", @@ -1064,33 +1287,30 @@ "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:008, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "MD tag 55C43\n", "CIGAR tag 1S97M17144N2M\n", @@ -1820,29 +2040,19 @@ "Indicated reference seq:\n", " tttattaatttttgcaagacttcggagggtgctCtgtggcgattaatattgctgagctgagcacagctgcaGtactttcctgtaaaactgactaaaggaga\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "Fixed reference seq:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.78it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " chr17:008, total reads: 0, counts_df: Empty DataFrame\n", + "Fixed reference seq:\n", + " tttattaatttttgcaagacttcggagggtgctCtgtggcgattaatattgctgagctgagcacagctgcaGtactttcctgtaaaactgactaaaggaga\n", + "Fixed aligned seq:\n", + " chr17:010, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "tttattaatttttgcaagacttcggagggtgctCtgtggcgattaatattgctgagctgagcacagctgcaGtactttcctgtaaaactgactaaaggaga\n", - "Fixed aligned seq:\n", - " TTTATTAATTTTTGCAAGACTTCGGAGGGTGCTGTGTGGCGATTAATATTGCTGAGCTGAGCACAGCTGCAATACTTTCCTGTAAAACTGACTAAAGGAGA\n", + "TTTATTAATTTTTGCAAGACTTCGGAGGGTGCTGTGTGGCGATTAATATTGCTGAGCTGAGCACAGCTGCAATACTTTCCTGTAAAACTGACTAAAGGAGA\n", "Finalized fixed aligned seq:\n", " tttattaatttttgcaagacttcggagggtgctGtgtggcgattaatattgctgagctgagcacagctgcaAtactttcctgtaaaactgactaaaggaga\n", "Indicated qualities:\n", @@ -1912,7 +2122,7 @@ "no_edits 1\n", "total_edits 32\n", "Reads processed:\t16\n", - "Time to process reads in min:\t0.04113\n", + "Time to process reads in min:\t0.0969\n", "Read Summary:\n", "edited 15\n", "no_edits 1\n", @@ -1927,7 +2137,20 @@ "\tsplit chr17_010_52035910_57239501, 10/16...\n", "Done grouping! Concatenating ...\n", "Done concatenating!\n", - "edit_info_grouped_per_contig_combined dict_keys(['chr17_000_0_5203591', 'chr17_001_5203591_10407182', 'chr17_002_10407182_15610773', 'chr17_003_15610773_20814364', 'chr17_004_20814364_26017955', 'chr17_005_26017955_31221546', 'chr17_006_31221546_36425137', 'chr17_007_36425137_41628728', 'chr17_008_41628728_46832319', 'chr17_009_46832319_52035910', 'chr17_010_52035910_57239501', 'chr17_011_57239501_62443092', 'chr17_012_62443092_67646683', 'chr17_013_67646683_72850274', 'chr17_014_72850274_78053865', 'chr17_015_78053865_83257456'])\n", + "edit_info_grouped_per_contig_combined dict_keys(['chr17_000_0_5203591', 'chr17_001_5203591_10407182', 'chr17_002_10407182_15610773', 'chr17_003_15610773_20814364', 'chr17_004_20814364_26017955', 'chr17_005_26017955_31221546', 'chr17_006_31221546_36425137', 'chr17_007_36425137_41628728', 'chr17_008_41628728_46832319', 'chr17_009_46832319_52035910', 'chr17_010_52035910_57239501', 'chr17_011_57239501_62443092', 'chr17_012_62443092_67646683', 'chr17_013_67646683_72850274', 'chr17_014_72850274_78053865', 'chr17_015_78053865_83257456'])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 16/16 [00:05<00:00, 2.86it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "~~~~~~\n", "!!!!PAIRED END!!!!!\n", "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/same_pos_dif_reads_test/split_bams/chr17/chr17_015_78053865_83257456.bam.sorted.bam\n", @@ -1984,7 +2207,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.39it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.75it/s]\n" ] }, { @@ -1996,7 +2219,7 @@ "original 32\n", "filtered 24\n", "dtype: int64\n", - "Total time to calculate coverage: 0.044 minutes\n", + "Total time to calculate coverage: 0.101 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t24\n", @@ -2006,12 +2229,9 @@ "\n", "Adding strand-specific conversion...\n", "\n", - "10 sites being converted to SAILOR format...\n", - "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "2 final deduplicated SAILOR-formatted sites\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", - "Current memory usage 21.432284MB; Peak: 22.541949MB\n", - "Time elapsed: 6.11s\n", + "Current memory usage 21.395534MB; Peak: 22.504722MB\n", + "Time elapsed: 12.82s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -2033,7 +2253,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", + "\tSailor outputs:\t['CT']\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -2045,6 +2265,8 @@ "\tVerbose:\tTrue\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -2059,47 +2281,11 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['chr17']]\n", + "Contig groups to be processed: [['chr17']]\n", "Each contig is being split into 16 subsets...\n", "contig_lengths_dict:{'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259, 'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422, 'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr14': 107043718, 'chr15': 101991189, 'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr19': 58617616, 'chr20': 64444167, 'chr21': 46709983, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415, 'chrM': 16569, 'chr1_KI270706v1_random': 175055, 'chr1_KI270707v1_random': 32032, 'chr1_KI270708v1_random': 127682, 'chr1_KI270709v1_random': 66860, 'chr1_KI270710v1_random': 40176, 'chr1_KI270711v1_random': 42210, 'chr1_KI270712v1_random': 176043, 'chr1_KI270713v1_random': 40745, 'chr1_KI270714v1_random': 41717, 'chr2_KI270715v1_random': 161471, 'chr2_KI270716v1_random': 153799, 'chr3_GL000221v1_random': 155397, 'chr4_GL000008v2_random': 209709, 'chr5_GL000208v1_random': 92689, 'chr9_KI270717v1_random': 40062, 'chr9_KI270718v1_random': 38054, 'chr9_KI270719v1_random': 176845, 'chr9_KI270720v1_random': 39050, 'chr11_KI270721v1_random': 100316, 'chr14_GL000009v2_random': 201709, 'chr14_GL000225v1_random': 211173, 'chr14_KI270722v1_random': 194050, 'chr14_GL000194v1_random': 191469, 'chr14_KI270723v1_random': 38115, 'chr14_KI270724v1_random': 39555, 'chr14_KI270725v1_random': 172810, 'chr14_KI270726v1_random': 43739, 'chr15_KI270727v1_random': 448248, 'chr16_KI270728v1_random': 1872759, 'chr17_GL000205v2_random': 185591, 'chr17_KI270729v1_random': 280839, 'chr17_KI270730v1_random': 112551, 'chr22_KI270731v1_random': 150754, 'chr22_KI270732v1_random': 41543, 'chr22_KI270733v1_random': 179772, 'chr22_KI270734v1_random': 165050, 'chr22_KI270735v1_random': 42811, 'chr22_KI270736v1_random': 181920, 'chr22_KI270737v1_random': 103838, 'chr22_KI270738v1_random': 99375, 'chr22_KI270739v1_random': 73985, 'chrY_KI270740v1_random': 37240, 'chrUn_KI270302v1': 2274, 'chrUn_KI270304v1': 2165, 'chrUn_KI270303v1': 1942, 'chrUn_KI270305v1': 1472, 'chrUn_KI270322v1': 21476, 'chrUn_KI270320v1': 4416, 'chrUn_KI270310v1': 1201, 'chrUn_KI270316v1': 1444, 'chrUn_KI270315v1': 2276, 'chrUn_KI270312v1': 998, 'chrUn_KI270311v1': 12399, 'chrUn_KI270317v1': 37690, 'chrUn_KI270412v1': 1179, 'chrUn_KI270411v1': 2646, 'chrUn_KI270414v1': 2489, 'chrUn_KI270419v1': 1029, 'chrUn_KI270418v1': 2145, 'chrUn_KI270420v1': 2321, 'chrUn_KI270424v1': 2140, 'chrUn_KI270417v1': 2043, 'chrUn_KI270422v1': 1445, 'chrUn_KI270423v1': 981, 'chrUn_KI270425v1': 1884, 'chrUn_KI270429v1': 1361, 'chrUn_KI270442v1': 392061, 'chrUn_KI270466v1': 1233, 'chrUn_KI270465v1': 1774, 'chrUn_KI270467v1': 3920, 'chrUn_KI270435v1': 92983, 'chrUn_KI270438v1': 112505, 'chrUn_KI270468v1': 4055, 'chrUn_KI270510v1': 2415, 'chrUn_KI270509v1': 2318, 'chrUn_KI270518v1': 2186, 'chrUn_KI270508v1': 1951, 'chrUn_KI270516v1': 1300, 'chrUn_KI270512v1': 22689, 'chrUn_KI270519v1': 138126, 'chrUn_KI270522v1': 5674, 'chrUn_KI270511v1': 8127, 'chrUn_KI270515v1': 6361, 'chrUn_KI270507v1': 5353, 'chrUn_KI270517v1': 3253, 'chrUn_KI270529v1': 1899, 'chrUn_KI270528v1': 2983, 'chrUn_KI270530v1': 2168, 'chrUn_KI270539v1': 993, 'chrUn_KI270538v1': 91309, 'chrUn_KI270544v1': 1202, 'chrUn_KI270548v1': 1599, 'chrUn_KI270583v1': 1400, 'chrUn_KI270587v1': 2969, 'chrUn_KI270580v1': 1553, 'chrUn_KI270581v1': 7046, 'chrUn_KI270579v1': 31033, 'chrUn_KI270589v1': 44474, 'chrUn_KI270590v1': 4685, 'chrUn_KI270584v1': 4513, 'chrUn_KI270582v1': 6504, 'chrUn_KI270588v1': 6158, 'chrUn_KI270593v1': 3041, 'chrUn_KI270591v1': 5796, 'chrUn_KI270330v1': 1652, 'chrUn_KI270329v1': 1040, 'chrUn_KI270334v1': 1368, 'chrUn_KI270333v1': 2699, 'chrUn_KI270335v1': 1048, 'chrUn_KI270338v1': 1428, 'chrUn_KI270340v1': 1428, 'chrUn_KI270336v1': 1026, 'chrUn_KI270337v1': 1121, 'chrUn_KI270363v1': 1803, 'chrUn_KI270364v1': 2855, 'chrUn_KI270362v1': 3530, 'chrUn_KI270366v1': 8320, 'chrUn_KI270378v1': 1048, 'chrUn_KI270379v1': 1045, 'chrUn_KI270389v1': 1298, 'chrUn_KI270390v1': 2387, 'chrUn_KI270387v1': 1537, 'chrUn_KI270395v1': 1143, 'chrUn_KI270396v1': 1880, 'chrUn_KI270388v1': 1216, 'chrUn_KI270394v1': 970, 'chrUn_KI270386v1': 1788, 'chrUn_KI270391v1': 1484, 'chrUn_KI270383v1': 1750, 'chrUn_KI270393v1': 1308, 'chrUn_KI270384v1': 1658, 'chrUn_KI270392v1': 971, 'chrUn_KI270381v1': 1930, 'chrUn_KI270385v1': 990, 'chrUn_KI270382v1': 4215, 'chrUn_KI270376v1': 1136, 'chrUn_KI270374v1': 2656, 'chrUn_KI270372v1': 1650, 'chrUn_KI270373v1': 1451, 'chrUn_KI270375v1': 2378, 'chrUn_KI270371v1': 2805, 'chrUn_KI270448v1': 7992, 'chrUn_KI270521v1': 7642, 'chrUn_GL000195v1': 182896, 'chrUn_GL000219v1': 179198, 'chrUn_GL000220v1': 161802, 'chrUn_GL000224v1': 179693, 'chrUn_KI270741v1': 157432, 'chrUn_GL000226v1': 15008, 'chrUn_GL000213v1': 164239, 'chrUn_KI270743v1': 210658, 'chrUn_KI270744v1': 168472, 'chrUn_KI270745v1': 41891, 'chrUn_KI270746v1': 66486, 'chrUn_KI270747v1': 198735, 'chrUn_KI270748v1': 93321, 'chrUn_KI270749v1': 158759, 'chrUn_KI270750v1': 148850, 'chrUn_KI270751v1': 150742, 'chrUn_KI270752v1': 27745, 'chrUn_KI270753v1': 62944, 'chrUn_KI270754v1': 40191, 'chrUn_KI270755v1': 36723, 'chrUn_KI270756v1': 79590, 'chrUn_KI270757v1': 71251, 'chrUn_GL000214v1': 137718, 'chrUn_KI270742v1': 186739, 'chrUn_GL000216v2': 176608, 'chrUn_GL000218v1': 161147, 'chrEBV': 171823}\n", "\tContig chr17\n", "16 total jobs\n", - "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:008, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "MD tag 2C1C6G3A1G4G47\n", "CIGAR tag 28S70M\n", @@ -2160,20 +2346,53 @@ "chr17:000, total reads: 1, counts_df: chr17\n", "edited 1\n", "total_edits 6\n", - "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:001, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", - "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", + "chr17:002, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:003, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:004, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:005, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:006, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", "chr17:007, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", + "chr17:008, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:009, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:010, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:011, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:012, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", "chr17:013, total reads: 0, counts_df: Empty DataFrame\n", "Columns: []\n", "Index: []\n", + "chr17:014, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "chr17:015, total reads: 0, counts_df: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", "Reads processed:\t1\n", - "Time to process reads in min:\t0.04167\n", + "Time to process reads in min:\t0.09886\n", "Read Summary:\n", "edited 1\n", "total_edits 6\n", @@ -2194,50 +2413,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.73it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "~~~~~~\n", - "!!!!PAIRED END!!!!!\n", - "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/tax1bp3_chr17_3665556_read_test/split_bams/chr17/chr17_000_0_5203591.bam.sorted.bam\n", - " chr17:3665541~~~~~~~`\n", - "coverage_at_pos 1\n", - "~~~~~~\n", - "!!!!PAIRED END!!!!!\n", - "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/tax1bp3_chr17_3665556_read_test/split_bams/chr17/chr17_000_0_5203591.bam.sorted.bam\n", - " chr17:3665543~~~~~~~`\n", - "coverage_at_pos 1\n", - "~~~~~~\n", - "!!!!PAIRED END!!!!!\n", - "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/tax1bp3_chr17_3665556_read_test/split_bams/chr17/chr17_000_0_5203591.bam.sorted.bam\n", - " chr17:3665550~~~~~~~`\n", - "coverage_at_pos 1\n", - "~~~~~~\n", - "!!!!PAIRED END!!!!!\n", - "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/tax1bp3_chr17_3665556_read_test/split_bams/chr17/chr17_000_0_5203591.bam.sorted.bam\n", - " chr17:3665554~~~~~~~`\n", - "coverage_at_pos 1\n", - "~~~~~~\n", - "!!!!PAIRED END!!!!!\n", - "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/tax1bp3_chr17_3665556_read_test/split_bams/chr17/chr17_000_0_5203591.bam.sorted.bam\n", - " chr17:3665556~~~~~~~`\n", - "coverage_at_pos 1\n", - "~~~~~~\n", - "!!!!PAIRED END!!!!!\n", - "pos: /tscc/projects/ps-yeolab3/ekofman/sailor2//tests/strandedness_tests/tax1bp3_chr17_3665556_read_test/split_bams/chr17/chr17_000_0_5203591.bam.sorted.bam\n", - " chr17:3665561~~~~~~~`\n", - "coverage_at_pos 1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.51it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.77it/s]\n", + "100%|██████████| 16/16 [00:05<00:00, 2.97it/s]\n" ] }, { @@ -2249,7 +2426,7 @@ "original 6\n", "filtered 6\n", "dtype: int64\n", - "Total time to calculate coverage: 0.043 minutes\n", + "Total time to calculate coverage: 0.093 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t6\n", @@ -2260,11 +2437,12 @@ "Adding strand-specific conversion...\n", "\n", "6 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "1 final deduplicated SAILOR-formatted sites\n", + "1 final deduplicated C>T SAILOR-formatted sites\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/hg38_gencode.v35.annotation.genes.bed...\n", - "Current memory usage 21.403118MB; Peak: 22.513332MB\n", - "Time elapsed: 5.99s\n", + "Current memory usage 21.404582MB; Peak: 22.514748MB\n", + "Time elapsed: 12.41s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -2286,7 +2464,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tTrue\n", + "\tSailor outputs:\t['CT']\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -2298,6 +2476,8 @@ "\tVerbose:\tFalse\n", "\tKeep intermediate files:\tTrue\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -2315,15 +2495,12 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['18']]\n", + "Contig groups to be processed: [['18']]\n", "Each contig is being split into 1 subsets...\n", "\tContig 18\n", "1 total jobs\n", "Reads processed:\t10\n", - "Time to process reads in min:\t0.02795\n", + "Time to process reads in min:\t0.01945\n", "Read Summary:\n", "edited 6\n", "no_edits 2\n", @@ -2344,8 +2521,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:01<00:00, 1.64s/it]\n", - "100%|██████████| 1/1 [00:01<00:00, 1.68s/it]\n" + "100%|██████████| 1/1 [00:01<00:00, 1.13s/it]\n", + "100%|██████████| 1/1 [00:01<00:00, 1.10s/it]\n" ] }, { @@ -2357,7 +2534,7 @@ "original 6\n", "filtered 4\n", "dtype: int64\n", - "Total time to calculate coverage: 0.028 minutes\n", + "Total time to calculate coverage: 0.019 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t4\n", @@ -2368,11 +2545,12 @@ "Adding strand-specific conversion...\n", "\n", "2 sites being converted to SAILOR format...\n", + "Generating SAILOR-style bed outputs for conversion CT...\n", "0 rows had coverage of 0 or more edits than coverage... filtering these out, but look into them...\n", - "2 final deduplicated SAILOR-formatted sites\n", + "2 final deduplicated C>T SAILOR-formatted sites\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/cellranger-GRCh38-3.0.0.annotation.genes.bed...\n", - "Current memory usage 21.317943MB; Peak: 22.4279MB\n", - "Time elapsed: 4.15s\n", + "Current memory usage 21.316828MB; Peak: 22.426777MB\n", + "Time elapsed: 3.06s\n", "++++++\n", "Done!\n", "++++++\n" @@ -2393,7 +2571,7 @@ "export MARINE=/tscc/projects/ps-yeolab3/ekofman/sailor2/\n", "export mypython=/tscc/nfs/home/ekofman/miniconda3/envs/marine_environment/bin/python\n", "\n", - "for t in \"unstranded_pair_test\" \"F1R2_pair_test-single_end_mode\" \"F1R2_pair_test\" \"F2R1_end_second_in_pair_test\" \"same_pos_dif_reads_test\" \"tax1bp3_chr17_3665556_read_test\" \"pair_test\"\n", + "for t in \"F1R2_pair_test-single_end_mode_sailor\" \"F1R2_pair_test-single_end_mode\" \"unstranded_pair_test\" \"F1R2_pair_test\" \"F2R1_end_second_in_pair_test\" \"same_pos_dif_reads_test\" \"tax1bp3_chr17_3665556_read_test\" \"pair_test\"\n", "do\n", " echo $t\n", " echo \"Removing old files...\"\n", @@ -2408,7 +2586,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 15, "id": "57192b28-b498-4f81-bdcd-0d4d14f644ff", "metadata": { "scrolled": true @@ -2436,7 +2614,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tFalse\n", + "\tSailor outputs:\t[]\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -2448,6 +2626,8 @@ "\tVerbose:\tFalse\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -2465,15 +2645,12 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['9']]\n", + "Contig groups to be processed: [['9']]\n", "Each contig is being split into 16 subsets...\n", "\tContig 9\n", "16 total jobs\n", "Reads processed:\t31\n", - "Time to process reads in min:\t0.0404\n", + "Time to process reads in min:\t0.09979\n", "Read Summary:\n", "edited 31\n", "total_edits 99\n", @@ -2492,14 +2669,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 16/16 [00:02<00:00, 6.95it/s]\n" + "100%|██████████| 16/16 [00:05<00:00, 2.77it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Total time to concat and write bams: 0.041 minutes\n", + "\t4 suffixes\n", + "Total time to concat and write bams: 0.097 minutes\n", "Deleting overall_label_to_list_of_contents...\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Calculating coverage at edited sites, minimum read quality is 0...\n", @@ -2514,8 +2692,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:02<00:00, 2.39s/it]\n", - "100%|██████████| 4/4 [00:02<00:00, 1.74it/s]\n" + "100%|██████████| 1/1 [00:05<00:00, 5.66s/it]\n", + "100%|██████████| 4/4 [00:05<00:00, 1.44s/it]\n" ] }, { @@ -2527,7 +2705,7 @@ "original 99\n", "filtered 99\n", "dtype: int64\n", - "Total time to calculate coverage: 0.04 minutes\n", + "Total time to calculate coverage: 0.099 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t99\n", @@ -2538,8 +2716,8 @@ "Adding strand-specific conversion...\n", "\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/cellranger-mm10-3.0.0.annotation.genes.bed...\n", - "Current memory usage 21.626729MB; Peak: 22.733417MB\n", - "Time elapsed: 8.33s\n", + "Current memory usage 21.628384MB; Peak: 22.735117MB\n", + "Time elapsed: 18.71s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -2561,7 +2739,7 @@ "\tCoverage only:\tFalse\n", "\tFiltering only:\tFalse\n", "\tAnnotation only:\tFalse\n", - "\tSailor outputs:\tFalse\n", + "\tSailor outputs:\t[]\n", "\tBedgraphs:\t[]\n", "\tMinimum base quality:\t0\n", "\tMinimum read quality:\t0\n", @@ -2573,6 +2751,8 @@ "\tVerbose:\tFalse\n", "\tKeep intermediate files:\tFalse\n", "\tSkip coverage?:\tFalse\n", + "\tFor single-cell: \t6 contigs at at time\n", + "\n", "\n", ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: \n", "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: \n", @@ -2591,15 +2771,12 @@ "~~~~~~~~~~~~~~~~~~\n", "Identifying edits\n", "~~~~~~~~~~~~~~~~~~\n", - "i_options range(0, 2)\n", - "j_options range(0, 3)\n", - "j_options range(3, 6)\n", - "broken_up_contigs [['6']]\n", + "Contig groups to be processed: [['6']]\n", "Each contig is being split into 4 subsets...\n", "\tContig 6\n", "4 total jobs\n", "Reads processed:\t13\n", - "Time to process reads in min:\t0.03921\n", + "Time to process reads in min:\t0.08753\n", "Read Summary:\n", "edited 3\n", "no_edits 10\n", @@ -2619,7 +2796,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4/4 [00:02<00:00, 1.78it/s]\n" + "100%|██████████| 4/4 [00:05<00:00, 1.26s/it]\n" ] }, { @@ -2627,7 +2804,7 @@ "output_type": "stream", "text": [ "\t400 suffixes\n", - "Total time to concat and write bams: 0.282 minutes\n", + "Total time to concat and write bams: 0.247 minutes\n", "Deleting overall_label_to_list_of_contents...\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Calculating coverage at edited sites, minimum read quality is 0...\n", @@ -2641,8 +2818,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:16<00:00, 16.82s/it]\n", - "100%|██████████| 400/400 [00:04<00:00, 80.74it/s] \n" + "100%|██████████| 1/1 [00:14<00:00, 14.49s/it]\n", + "100%|██████████| 400/400 [00:08<00:00, 48.91it/s] \n" ] }, { @@ -2654,7 +2831,7 @@ "original 3\n", "filtered 3\n", "dtype: int64\n", - "Total time to calculate coverage: 0.085 minutes\n", + "Total time to calculate coverage: 0.141 minutes\n", "Filtering..\n", "\tNumber of edits after filtering:\n", "\t3\n", @@ -2665,8 +2842,8 @@ "Adding strand-specific conversion...\n", "\n", "Annotating sites with GTF information from /tscc/projects/ps-yeolab3/ekofman/sailor2//annotations/cellranger-mm10-3.0.0.annotation.genes.bed...\n", - "Current memory usage 21.518014MB; Peak: 22.629871MB\n", - "Time elapsed: 26.42s\n", + "Current memory usage 21.519682MB; Peak: 22.63154MB\n", + "Time elapsed: 29.99s\n", "-------------------------------\n", "Deleting intermediate files...\n", "-------------------------------\n", @@ -2712,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 16, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2747,6 +2924,13 @@ "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", + "Checking results for F1R2_pair_test-single_end_mode_sailor\n", + "\tExpecting: {'contig': 'chr17', 'position': 43044352, 'count': 1, 'coverage': 2, 'conversion': 'G>A', 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'BRCA1', 'feature_strand': '-'}\n", + "\n", + "\t >>> F1R2_pair_test-single_end_mode_sailor passed! <<<\n", + "\n", + "\n", + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", "Checking results for F1R2_pair_test-single_end_mode\n", "\tExpecting: {'contig': 'chr17', 'position': 43044352, 'count': 1, 'coverage': 2, 'conversion': 'G>A', 'num_rows': 1, 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'BRCA1', 'feature_strand': '-'}\n", "\n", @@ -2864,6 +3048,24 @@ " }]\n", " },\n", "\n", + " \"F1R2_pair_test-single_end_mode_sailor\": {\n", + " \"folder\": \"strandedness_tests\",\n", + " \"expectations\": [{\n", + " \"contig\": \"chr17\",\n", + " \"position\": 43044352,\n", + " \"count\": 1,\n", + " \"coverage\": 2,\n", + " \"conversion\": \"G>A\",\n", + " \"num_rows\": 1,\n", + " \"conversion\": \"G>A\",\n", + " \"strand_conversion\": \"C>T\",\n", + " \"strand\": \"-\",\n", + " \"feature_name\": \"BRCA1\",\n", + " \"feature_strand\": \"-\"\n", + " }]\n", + " },\n", + "\n", + " \n", " \"F1R2_pair_test-single_end_mode\": {\n", " \"folder\": \"strandedness_tests\",\n", " \"expectations\": [{\n", diff --git a/tests/integration_tests_auto_check.py b/tests/integration_tests_auto_check.py index 8c49e6a..045c736 100755 --- a/tests/integration_tests_auto_check.py +++ b/tests/integration_tests_auto_check.py @@ -56,6 +56,23 @@ "feature_strand": "-" }] }, + + "F1R2_pair_test-single_end_mode_sailor": { + "folder": "strandedness_tests", + "expectations": [{ + "contig": "chr17", + "position": 43044352, + "count": 1, + "coverage": 2, + "conversion": "G>A", + "num_rows": 1, + "conversion": "G>A", + "strand_conversion": "C>T", + "strand": "-", + "feature_name": "BRCA1", + "feature_strand": "-" + }] + }, "F1R2_pair_test-single_end_mode": { "folder": "strandedness_tests", diff --git a/tests/integration_tests_run.sh b/tests/integration_tests_run.sh index 93bad69..33e77f1 100755 --- a/tests/integration_tests_run.sh +++ b/tests/integration_tests_run.sh @@ -13,7 +13,7 @@ tests_folder="strandedness_tests/" echo "Bulk tests scripts" ls -lh $MARINE/tests/$tests_folder/scripts/ -for t in "F1R2_pair_test-single_end_mode" "F1R2_pair_test" "F2R1_end_second_in_pair_test" "same_pos_dif_reads_test" "tax1bp3_chr17_3665556_read_test" "pair_test" "unstranded_pair_test" +for t in "F1R2_pair_test-single_end_mode_sailor" "F1R2_pair_test-single_end_mode" "F1R2_pair_test" "F2R1_end_second_in_pair_test" "same_pos_dif_reads_test" "tax1bp3_chr17_3665556_read_test" "pair_test" "unstranded_pair_test" do echo $t echo "Removing old files..." diff --git a/tests/strandedness_tests/scripts/F1R2_pair_test-single_end_mode_sailor.sh b/tests/strandedness_tests/scripts/F1R2_pair_test-single_end_mode_sailor.sh new file mode 100644 index 0000000..e295447 --- /dev/null +++ b/tests/strandedness_tests/scripts/F1R2_pair_test-single_end_mode_sailor.sh @@ -0,0 +1,21 @@ +mypython=$1 +echo "Python is $mypython" + +$mypython $MARINE/marine.py \ +--bam_filepath \ +$MARINE/tests/strandedness_tests/bams/F1R2_pair.bam \ +--annotation_bedfile_path \ +$MARINE/annotations/hg38_gencode.v35.annotation.genes.bed \ +--output_folder \ +$MARINE/tests/strandedness_tests/F1R2_pair_test-single_end_mode_sailor \ +--min_dist_from_end \ +0 \ +--min_base_quality \ +0 \ +--cores \ +16 \ +--strandedness 2 \ +--contigs "chr17" \ +--sailor "CT,AG" \ +--verbose \ +--num_intervals_per_contig 16 \ No newline at end of file diff --git a/tests/strandedness_tests/scripts/same_pos_dif_reads_test.sh b/tests/strandedness_tests/scripts/same_pos_dif_reads_test.sh index 653a68c..e56429e 100644 --- a/tests/strandedness_tests/scripts/same_pos_dif_reads_test.sh +++ b/tests/strandedness_tests/scripts/same_pos_dif_reads_test.sh @@ -16,6 +16,5 @@ $MARINE/tests/strandedness_tests/same_pos_dif_reads_test \ --paired_end \ --strandedness 2 \ --contigs "chr17" \ ---sailor \ --verbose \ --num_intervals_per_contig 16 \ No newline at end of file diff --git a/tests/strandedness_tests/unstranded_pair_test/final_filtered_site_info.tsv b/tests/strandedness_tests/unstranded_pair_test/final_filtered_site_info.tsv index a554c94..8d56148 100644 --- a/tests/strandedness_tests/unstranded_pair_test/final_filtered_site_info.tsv +++ b/tests/strandedness_tests/unstranded_pair_test/final_filtered_site_info.tsv @@ -1,15 +1,15 @@ site_id barcode contig position ref alt strand count coverage conversion strand_conversion -no_barcode_Citrine.dna_437_A_G_+ no_barcode Citrine.dna 437 A G + 1 22 A>G A>G -no_barcode_Citrine.dna_21_G_A_+ no_barcode Citrine.dna 21 G A + 2 2 G>A G>A no_barcode_Citrine.dna_454_T_G_+ no_barcode Citrine.dna 454 T G + 1 14 T>G T>G no_barcode_Citrine.dna_431_A_G_+ no_barcode Citrine.dna 431 A G + 1 22 A>G A>G -no_barcode_Citrine.dna_428_A_G_+ no_barcode Citrine.dna 428 A G + 1 20 A>G A>G -no_barcode_Citrine.dna_432_C_T_+ no_barcode Citrine.dna 432 C T + 1 22 C>T C>T -no_barcode_Citrine.dna_439_A_G_+ no_barcode Citrine.dna 439 A G + 1 22 A>G A>G -no_barcode_Citrine.dna_441_C_A_+ no_barcode Citrine.dna 441 C A + 1 22 C>A C>A no_barcode_Citrine.dna_430_T_G_+ no_barcode Citrine.dna 430 T G + 2 22 T>G T>G +no_barcode_Citrine.dna_428_A_G_+ no_barcode Citrine.dna 428 A G + 1 20 A>G A>G no_barcode_Citrine.dna_435_C_T_+ no_barcode Citrine.dna 435 C T + 22 22 C>T C>T -no_barcode_Citrine.dna_149_C_G_+ no_barcode Citrine.dna 149 C G + 1 2 C>G C>G +no_barcode_Citrine.dna_439_A_G_+ no_barcode Citrine.dna 439 A G + 1 22 A>G A>G +no_barcode_Citrine.dna_432_C_T_+ no_barcode Citrine.dna 432 C T + 1 22 C>T C>T +no_barcode_Citrine.dna_21_G_A_+ no_barcode Citrine.dna 21 G A + 2 2 G>A G>A no_barcode_Citrine.dna_411_C_A_+ no_barcode Citrine.dna 411 C A + 1 18 C>A C>A no_barcode_Citrine.dna_438_C_A_+ no_barcode Citrine.dna 438 C A + 1 22 C>A C>A no_barcode_Citrine.dna_414_G_C_+ no_barcode Citrine.dna 414 G C + 1 18 G>C G>C +no_barcode_Citrine.dna_441_C_A_+ no_barcode Citrine.dna 441 C A + 1 22 C>A C>A +no_barcode_Citrine.dna_149_C_G_+ no_barcode Citrine.dna 149 C G + 1 2 C>G C>G +no_barcode_Citrine.dna_437_A_G_+ no_barcode Citrine.dna 437 A G + 1 22 A>G A>G