From d4d4f223e624d9dd3c431494ed91439f8c715418 Mon Sep 17 00:00:00 2001
From: David Kelley
Date: Thu, 16 May 2024 13:17:24 -0700
Subject: [PATCH] port dataset code

---
 src/baskerville/data.py                     | 322 +++++++
 src/baskerville/helpers/utils.py            |  58 ++
 src/baskerville/scripts/hound_data.py       | 905 ++++++++++++++++++
 src/baskerville/scripts/hound_data_align.py | 983 ++++++++++++++++++++
 src/baskerville/scripts/hound_data_read.py  | 382 ++++++++
 src/baskerville/scripts/hound_data_write.py | 280 ++++++
 6 files changed, 2930 insertions(+)
 create mode 100644 src/baskerville/data.py
 create mode 100755 src/baskerville/scripts/hound_data.py
 create mode 100755 src/baskerville/scripts/hound_data_align.py
 create mode 100755 src/baskerville/scripts/hound_data_read.py
 create mode 100755 src/baskerville/scripts/hound_data_write.py

diff --git a/src/baskerville/data.py b/src/baskerville/data.py
new file mode 100644
index 0000000..03fc4d0
--- /dev/null
+++ b/src/baskerville/data.py
@@ -0,0 +1,322 @@
+# Copyright 2023 Calico LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========================================================================
+import collections
+import heapq
+import math
+import subprocess
+import sys
+import tempfile
+
+import numpy as np
+import pysam
+
+"""
+data.py
+
+Helper methods for hound_data*
+"""
+
+
+def annotate_unmap(mseqs, unmap_bed, seq_length, pool_width):
+    """Intersect the sequence segments with unmappable regions
+    and annotate the segments as NaN to possibly be ignored.
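+
+    Example (a sketch; the BED path and sizes are illustrative):
+        # seqs_unmap[i, j] is True where bin j of sequence i overlaps unmap_bed
+        seqs_unmap = annotate_unmap(mseqs, "unmap.bed", seq_length=131072, pool_width=32)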
+ + Args: + mseqs: list of ModelSeq's + unmap_bed: unmappable regions BED file + seq_length: sequence length (after cropping) + pool_width: pooled bin width + + Returns: + seqs_unmap: NxL binary NA indicators + """ + + # print sequence segments to file + seqs_temp = tempfile.NamedTemporaryFile() + seqs_bed_file = seqs_temp.name + write_seqs_bed(seqs_bed_file, mseqs) + + # hash segments to indexes + chr_start_indexes = {} + for i in range(len(mseqs)): + chr_start_indexes[(mseqs[i].chr, mseqs[i].start)] = i + + # initialize unmappable array + pool_seq_length = seq_length // pool_width + seqs_unmap = np.zeros((len(mseqs), pool_seq_length), dtype="bool") + + # intersect with unmappable regions + p = subprocess.Popen( + "bedtools intersect -wo -a %s -b %s" % (seqs_bed_file, unmap_bed), + shell=True, + stdout=subprocess.PIPE, + ) + for line in p.stdout: + line = line.decode("utf-8") + a = line.split() + + seq_chrom = a[0] + seq_start = int(a[1]) + seq_end = int(a[2]) + seq_key = (seq_chrom, seq_start) + + unmap_start = int(a[4]) + unmap_end = int(a[5]) + + overlap_start = max(seq_start, unmap_start) + overlap_end = min(seq_end, unmap_end) + + pool_seq_unmap_start = math.floor((overlap_start - seq_start) / pool_width) + pool_seq_unmap_end = math.ceil((overlap_end - seq_start) / pool_width) + + # skip minor overlaps to the first + first_start = seq_start + pool_seq_unmap_start * pool_width + first_end = first_start + pool_width + first_overlap = first_end - overlap_start + if first_overlap < 0.1 * pool_width: + pool_seq_unmap_start += 1 + + # skip minor overlaps to the last + last_start = seq_start + (pool_seq_unmap_end - 1) * pool_width + last_overlap = overlap_end - last_start + if last_overlap < 0.1 * pool_width: + pool_seq_unmap_end -= 1 + + seqs_unmap[ + chr_start_indexes[seq_key], pool_seq_unmap_start:pool_seq_unmap_end + ] = True + assert ( + seqs_unmap[ + chr_start_indexes[seq_key], pool_seq_unmap_start:pool_seq_unmap_end + ].sum() + == pool_seq_unmap_end - pool_seq_unmap_start + ) + + return seqs_unmap + + +################################################################################ +def break_large_contigs(contigs, break_t, verbose=False): + """Break large contigs in half until all contigs are under + the size threshold.""" + + # initialize a heapq of contigs and lengths + contig_heapq = [] + for ctg in contigs: + ctg_len = ctg.end - ctg.start + heapq.heappush(contig_heapq, (-ctg_len, ctg)) + + ctg_len = break_t + 1 + while ctg_len > break_t: + + # pop largest contig + ctg_nlen, ctg = heapq.heappop(contig_heapq) + ctg_len = -ctg_nlen + + # if too large + if ctg_len > break_t: + if verbose: + print( + "Breaking %s:%d-%d (%d nt)" % (ctg.chr, ctg.start, ctg.end, ctg_len) + ) + + # break in two + ctg_mid = ctg.start + ctg_len // 2 + ctg_left = Contig(ctg.genome, ctg.chr, ctg.start, ctg_mid) + ctg_right = Contig(ctg.genome, ctg.chr, ctg_mid, ctg.end) + + # add left + ctg_left_len = ctg_left.end - ctg_left.start + heapq.heappush(contig_heapq, (-ctg_left_len, ctg_left)) + + # add right + ctg_right_len = ctg_right.end - ctg_right.start + heapq.heappush(contig_heapq, (-ctg_right_len, ctg_right)) + + # return to list + contigs = [len_ctg[1] for len_ctg in contig_heapq] + + return contigs + + +def contig_sequences(contigs, seq_length, stride, snap=1, label=None): + """Break up a list of Contig's into a list of model length + and stride sequence contigs.""" + mseqs = [] + + for ctg in contigs: + seq_start = int(np.ceil(ctg.start / snap) * snap) + seq_end = seq_start + seq_length + + while 
seq_end < ctg.end: + # record sequence + mseqs.append(ModelSeq(ctg.genome, ctg.chr, seq_start, seq_end, label)) + + # update + seq_start += stride + seq_end += stride + + return mseqs + + +def load_chromosomes(genome_file): + """Load genome segments from either a FASTA file or + chromosome length table.""" + + # is genome_file FASTA or (chrom,start,end) table? + file_fasta = open(genome_file).readline()[0] == ">" + + chrom_segments = {} + + if file_fasta: + fasta_open = pysam.Fastafile(genome_file) + for i in range(len(fasta_open.references)): + chrom_segments[fasta_open.references[i]] = [(0, fasta_open.lengths[i])] + fasta_open.close() + + else: + for line in open(genome_file): + a = line.split() + chrom_segments[a[0]] = [(0, int(a[1]))] + + return chrom_segments + + +def rejoin_large_contigs(contigs): + """Rejoin large contigs that were broken up before alignment comparison.""" + + # split list by genome/chromosome + gchr_contigs = {} + for ctg in contigs: + gchr = (ctg.genome, ctg.chr) + gchr_contigs.setdefault(gchr, []).append(ctg) + + contigs = [] + for gchr in gchr_contigs: + # sort within chromosome + gchr_contigs[gchr].sort(key=lambda x: x.start) + # gchr_contigs[gchr] = sorted(gchr_contigs[gchr], key=lambda ctg: ctg.start) + + ctg_ongoing = gchr_contigs[gchr][0] + for i in range(1, len(gchr_contigs[gchr])): + ctg_this = gchr_contigs[gchr][i] + if ctg_ongoing.end == ctg_this.start: + # join + # ctg_ongoing.end = ctg_this.end + ctg_ongoing = ctg_ongoing._replace(end=ctg_this.end) + else: + # conclude ongoing + contigs.append(ctg_ongoing) + + # move to next + ctg_ongoing = ctg_this + + # conclude final + contigs.append(ctg_ongoing) + + return contigs + + +def split_contigs(chrom_segments, gaps_file): + """Split the assembly up into contigs defined by the gaps. + + Args: + chrom_segments: dict mapping chromosome names to lists of (start,end) + gaps_file: file specifying assembly gaps + + Returns: + chrom_segments: same, with segments broken by the assembly gaps. 
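+
+    Example (a sketch; values are illustrative):
+      chrom_segments = {"chr1": [(0, 1000)]}
+      # with a single gap line "chr1 400 500" in gaps_file, the result is
+      # {"chr1": [(0, 400), (500, 1000)]}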
+ """ + + chrom_events = {} + + # add known segments + for chrom in chrom_segments: + if len(chrom_segments[chrom]) > 1: + print( + "I've made a terrible mistake...regarding the length of chrom_segments[%s]" + % chrom, + file=sys.stderr, + ) + exit(1) + cstart, cend = chrom_segments[chrom][0] + chrom_events.setdefault(chrom, []).append((cstart, "Cstart")) + chrom_events[chrom].append((cend, "cend")) + + # add gaps + for line in open(gaps_file): + a = line.split() + chrom = a[0] + gstart = int(a[1]) + gend = int(a[2]) + + # consider only if its in our genome + if chrom in chrom_events: + chrom_events[chrom].append((gstart, "gstart")) + chrom_events[chrom].append((gend, "Gend")) + + for chrom in chrom_events: + # sort + chrom_events[chrom].sort() + + # read out segments + chrom_segments[chrom] = [] + for i in range(len(chrom_events[chrom]) - 1): + pos1, event1 = chrom_events[chrom][i] + pos2, event2 = chrom_events[chrom][i + 1] + + event1 = event1.lower() + event2 = event2.lower() + + shipit = False + if event1 == "cstart" and event2 == "cend": + shipit = True + elif event1 == "cstart" and event2 == "gstart": + shipit = True + elif event1 == "gend" and event2 == "gstart": + shipit = True + elif event1 == "gend" and event2 == "cend": + shipit = True + elif event1 == "gstart" and event2 == "gend": + pass + else: + print( + "I'm confused by this event ordering: %s - %s" % (event1, event2), + file=sys.stderr, + ) + exit(1) + + if shipit and pos1 < pos2: + chrom_segments[chrom].append((pos1, pos2)) + + return chrom_segments + + +def write_seqs_bed(bed_file, seqs, labels=False): + """Write sequences to BED file.""" + bed_out = open(bed_file, "w") + for i in range(len(seqs)): + line = "%s\t%d\t%d" % (seqs[i].chr, seqs[i].start, seqs[i].end) + if labels: + line += "\t%s" % seqs[i].label + print(line, file=bed_out) + bed_out.close() + + +################################################################################ +Contig = collections.namedtuple("Contig", ["genome", "chr", "start", "end"]) +ModelSeq = collections.namedtuple( + "ModelSeq", ["genome", "chr", "start", "end", "label"] +) diff --git a/src/baskerville/helpers/utils.py b/src/baskerville/helpers/utils.py index d1549dc..9892179 100644 --- a/src/baskerville/helpers/utils.py +++ b/src/baskerville/helpers/utils.py @@ -1,4 +1,62 @@ +import os import pickle +import sys +import subprocess +import time + + +def exec_par(cmds, max_proc=None, verbose=False): + """ + Execute the commands in the list 'cmds' in parallel, but + only running 'max_proc' at a time. 
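+
+    Example (a sketch; commands and file names are illustrative):
+        exec_par(
+            ["hound_data_read.py cov0.bw sequences.bed 0.h5",
+             "hound_data_read.py cov1.bw sequences.bed 1.h5"],
+            max_proc=2,
+            verbose=True,
+        )
+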
+ Args: + cmds: list of commands to execute + max_proc: maximum number of processes to run in parallel + verbose: print command to stderr + """ + total = len(cmds) + finished = 0 + running = 0 + p = [] + + if max_proc == None: + max_proc = len(cmds) + + if max_proc == 1: + while finished < total: + if verbose: + print(cmds[finished], file=sys.stderr) + op = subprocess.Popen(cmds[finished], shell=True) + os.waitpid(op.pid, 0) + finished += 1 + + else: + while finished + running < total: + # launch jobs up to max + while running < max_proc and finished + running < total: + if verbose: + print(cmds[finished + running], file=sys.stderr) + p.append(subprocess.Popen(cmds[finished + running], shell=True)) + # print 'Running %d' % p[running].pid + running += 1 + + # are any jobs finished + new_p = [] + for i in range(len(p)): + if p[i].poll() != None: + running -= 1 + finished += 1 + else: + new_p.append(p[i]) + + # if none finished, sleep + if len(new_p) == len(p): + time.sleep(1) + p = new_p + + # wait for all to finish + for i in range(len(p)): + p[i].wait() def load_extra_options(options_pkl_file, options): diff --git a/src/baskerville/scripts/hound_data.py b/src/baskerville/scripts/hound_data.py new file mode 100755 index 0000000..a95284a --- /dev/null +++ b/src/baskerville/scripts/hound_data.py @@ -0,0 +1,905 @@ +#!/usr/bin/env python +# Copyright 2017 Calico LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +from optparse import OptionParser +import gzip +import json +import pdb +import os +import random +import shutil +import subprocess +import sys +import tempfile + +import numpy as np +import pandas as pd + +from baskerville import data +from baskerville.helpers import utils + +try: + import slurm +except ModuleNotFoundError: + pass + +""" +hound_data.py + +Compute model sequences from the genome, extracting DNA coverage values. 
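+
+Example invocation (a sketch; file names and option values are illustrative):
+
+  hound_data.py --local -p 8 -l 131072 -w 32 -t 0.05 -v 0.05 -o data_out \
+    hg38.fa targets.txt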
+""" + + +################################################################################ +def main(): + usage = "usage: %prog [options] " + parser = OptionParser(usage) + parser.add_option( + "-b", + dest="blacklist_bed", + help="Set blacklist nucleotides to a baseline value.", + ) + parser.add_option( + "--break", + dest="break_t", + default=786432, + type="int", + help="Break in half contigs above length [Default: %default]", + ) + parser.add_option( + "-c", + "--crop", + dest="crop_bp", + default=0, + type="int", + help="Crop bp off each end [Default: %default]", + ) + parser.add_option( + "-d", + dest="decimals", + default=None, + type="int", + help="Round values to given decimals [Default: %default]", + ) + parser.add_option( + "-f", + dest="folds", + default=None, + type="int", + help="Generate cross fold split [Default: %default]", + ) + parser.add_option( + "-g", dest="gaps_file", help="Genome assembly gaps BED [Default: %default]" + ) + parser.add_option( + "-i", + dest="interp_nan", + default=False, + action="store_true", + help="Interpolate NaNs [Default: %default]", + ) + parser.add_option( + "-l", + dest="seq_length", + default=131072, + type="int", + help="Sequence length [Default: %default]", + ) + parser.add_option( + "--limit", + dest="limit_bed", + help="Limit to segments that overlap regions in a BED file", + ) + parser.add_option( + "--local", + dest="run_local", + default=False, + action="store_true", + help="Run jobs locally as opposed to on SLURM [Default: %default]", + ) + parser.add_option( + "-o", + dest="out_dir", + default="data_out", + help="Output directory [Default: %default]", + ) + parser.add_option( + "-p", + dest="processes", + default=None, + type="int", + help="Number parallel processes [Default: %default]", + ) + parser.add_option( + "--peaks", + dest="peaks_only", + default=False, + action="store_true", + help="Create contigs only from peaks [Default: %default]", + ) + parser.add_option( + "-r", + dest="seqs_per_tfr", + default=256, + type="int", + help="Sequences per TFRecord file [Default: %default]", + ) + parser.add_option( + "--restart", + dest="restart", + default=False, + action="store_true", + help="Continue progress from midpoint. [Default: %default]", + ) + parser.add_option( + "-s", + dest="sample_pct", + default=1.0, + type="float", + help="Down-sample the segments", + ) + parser.add_option( + "--seed", + dest="seed", + default=44, + type="int", + help="Random seed [Default: %default]", + ) + parser.add_option( + "--snap", + dest="snap", + default=1, + type="int", + help="Snap sequences to multiple of the given value [Default: %default]", + ) + parser.add_option( + "--st", + "--split_test", + dest="split_test", + default=False, + action="store_true", + help="Exit after split. 
[Default: %default]", + ) + parser.add_option( + "--stride", + "--stride_train", + dest="stride_train", + default=1.0, + type="float", + help="Stride to advance train sequences [Default: seq_length]", + ) + parser.add_option( + "--stride_test", + dest="stride_test", + default=1.0, + type="float", + help="Stride to advance valid and test sequences [Default: seq_length]", + ) + parser.add_option( + "-t", + dest="test_pct_or_chr", + default=0.05, + type="str", + help="Proportion of the data for testing [Default: %default]", + ) + parser.add_option("-u", dest="umap_bed", help="Unmappable regions in BED format") + parser.add_option( + "--umap_t", + dest="umap_t", + default=0.5, + type="float", + help="Remove sequences with more than this unmappable bin % [Default: %default]", + ) + parser.add_option( + "--umap_clip", + dest="umap_clip", + default=1, + type="float", + help="Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]", + ) + parser.add_option( + "--umap_tfr", + dest="umap_tfr", + default=False, + action="store_true", + help="Save umap array into TFRecords [Default: %default]", + ) + parser.add_option( + "-w", + dest="pool_width", + default=32, + type="int", + help="Sum pool width [Default: %default]", + ) + parser.add_option( + "-v", + dest="valid_pct_or_chr", + default=0.05, + type="str", + help="Proportion of the data for validation [Default: %default]", + ) + (options, args) = parser.parse_args() + + if len(args) != 2: + parser.error("Must provide FASTA and sample coverage labels and paths.") + else: + fasta_file = args[0] + targets_file = args[1] + + random.seed(options.seed) + np.random.seed(options.seed) + + if options.break_t is not None and options.break_t < options.seq_length: + print( + "Maximum contig length --break cannot be less than sequence length.", + file=sys.stderr, + ) + exit(1) + + # transform proportion strides to base pairs + if options.stride_train <= 1: + print("stride_train %.f" % options.stride_train, end="") + options.stride_train = options.stride_train * options.seq_length + print(" converted to %f" % options.stride_train) + options.stride_train = int(np.round(options.stride_train)) + if options.stride_test <= 1: + if options.folds is None: + print("stride_test %.f" % options.stride_test, end="") + options.stride_test = options.stride_test * options.seq_length + print(" converted to %f" % options.stride_test) + options.stride_test = int(np.round(options.stride_test)) + + # check snap + if options.snap is not None: + if np.mod(options.seq_length, options.snap) != 0: + raise ValueError("seq_length must be a multiple of snap") + if np.mod(options.stride_train, options.snap) != 0: + raise ValueError("stride_train must be a multiple of snap") + if np.mod(options.stride_test, options.snap) != 0: + raise ValueError("stride_test must be a multiple of snap") + + # setup output directory + if os.path.isdir(options.out_dir) and not options.restart: + print("Remove output directory %s or use --restart option." 
% options.out_dir) + exit(1) + elif not os.path.isdir(options.out_dir): + os.mkdir(options.out_dir) + + # read target datasets + targets_df = pd.read_csv(targets_file, index_col=0, sep="\t") + + ################################################################ + # define genomic contigs + ################################################################ + if not options.restart: + chrom_contigs = data.load_chromosomes(fasta_file) + + # remove gaps + if options.gaps_file: + chrom_contigs = data.split_contigs(chrom_contigs, options.gaps_file) + + # ditch the chromosomes for contigs + contigs = [] + for chrom in chrom_contigs: + contigs += [ + data.Contig(0, chrom, ctg_start, ctg_end) + for ctg_start, ctg_end in chrom_contigs[chrom] + ] + + # limit to a BED file + if options.limit_bed is not None: + contigs = limit_contigs(contigs, options.limit_bed) + + # limit to peaks + if options.peaks_only: + peaks_bed = curate_peaks( + targets_df, options.out_dir, options.pool_width, options.crop_bp + ) + contigs = limit_contigs(contigs, peaks_bed) + + # filter for large enough + seq_tlength = options.seq_length - 2 * options.crop_bp + contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= seq_tlength] + + # break up large contigs + if options.break_t is not None: + contigs = data.break_large_contigs(contigs, options.break_t) + + # print contigs to BED file + # ctg_bed_file = '%s/contigs.bed' % options.out_dir + # write_seqs_bed(ctg_bed_file, contigs) + + ################################################################ + # divide between train/valid/test + ################################################################ + # label folds + if options.folds is not None: + fold_labels = ["fold%d" % fi for fi in range(options.folds)] + num_folds = options.folds + else: + fold_labels = ["train", "valid", "test"] + num_folds = 3 + + if not options.restart: + if options.folds is not None: + # divide by fold pct + fold_contigs = divide_contigs_folds(contigs, options.folds) + + else: + try: + # convert to float pct + valid_pct = float(options.valid_pct_or_chr) + test_pct = float(options.test_pct_or_chr) + assert 0 <= valid_pct <= 1 + assert 0 <= test_pct <= 1 + + # divide by pct + fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct) + + except (ValueError, AssertionError): + # divide by chr + valid_chrs = options.valid_pct_or_chr.split(",") + test_chrs = options.test_pct_or_chr.split(",") + fold_contigs = divide_contigs_chr(contigs, test_chrs, valid_chrs) + + # rejoin broken contigs within set + for fi in range(len(fold_contigs)): + fold_contigs[fi] = data.rejoin_large_contigs(fold_contigs[fi]) + + # write labeled contigs to BED file + ctg_bed_file = "%s/contigs.bed" % options.out_dir + ctg_bed_out = open(ctg_bed_file, "w") + for fi in range(len(fold_contigs)): + for ctg in fold_contigs[fi]: + line = "%s\t%d\t%d\t%s" % (ctg.chr, ctg.start, ctg.end, fold_labels[fi]) + print(line, file=ctg_bed_out) + ctg_bed_out.close() + + if options.split_test: + exit() + + ################################################################ + # define model sequences + ################################################################ + if not options.restart: + fold_mseqs = [] + for fi in range(num_folds): + if fold_labels[fi] in ["valid", "test"]: + stride_fold = options.stride_test + else: + stride_fold = options.stride_train + + # stride sequences across contig + fold_mseqs_fi = data.contig_sequences( + fold_contigs[fi], + seq_tlength, + stride_fold, + options.snap, + fold_labels[fi], + ) + 
fold_mseqs.append(fold_mseqs_fi) + + # shuffle + random.shuffle(fold_mseqs[fi]) + + # down-sample + if options.sample_pct < 1.0: + fold_mseqs[fi] = random.sample( + fold_mseqs[fi], int(options.sample_pct * len(fold_mseqs[fi])) + ) + + # merge into one list + mseqs = [ms for fm in fold_mseqs for ms in fm] + + ################################################################ + # mappability + ################################################################ + if not options.restart: + if options.umap_bed is not None: + if shutil.which("bedtools") is None: + print("Install Bedtools to annotate unmappable sites", file=sys.stderr) + exit(1) + + # annotate unmappable positions + mseqs_unmap = data.annotate_unmap( + mseqs, options.umap_bed, seq_tlength, options.pool_width + ) + + # filter unmappable + mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t + mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]] + mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] + + # write to file + unmap_npy = "%s/mseqs_unmap.npy" % options.out_dir + np.save(unmap_npy, mseqs_unmap) + + # write sequences to BED + seqs_bed_file = "%s/sequences.bed" % options.out_dir + data.write_seqs_bed(seqs_bed_file, mseqs, True) + + else: + # read from directory + seqs_bed_file = "%s/sequences.bed" % options.out_dir + unmap_npy = "%s/mseqs_unmap.npy" % options.out_dir + mseqs = [] + fold_mseqs = [] + for fi in range(num_folds): + fold_mseqs.append([]) + for line in open(seqs_bed_file): + a = line.split() + msg = data.ModelSeq(0, a[0], int(a[1]), int(a[2]), a[3]) + mseqs.append(msg) + if a[3] == "train": + fi = 0 + elif a[3] == "valid": + fi = 1 + elif a[3] == "test": + fi = 2 + else: + fi = int(a[3].replace("fold", "")) + fold_mseqs[fi].append(msg) + + ################################################################ + # read sequence coverage values + ################################################################ + seqs_cov_dir = "%s/seqs_cov" % options.out_dir + if not os.path.isdir(seqs_cov_dir): + os.mkdir(seqs_cov_dir) + + read_jobs = [] + + for ti in range(targets_df.shape[0]): + genome_cov_file = targets_df["file"].iloc[ti] + seqs_cov_stem = "%s/%d" % (seqs_cov_dir, ti) + seqs_cov_file = "%s.h5" % seqs_cov_stem + + clip_ti = None + if "clip" in targets_df.columns: + clip_ti = targets_df["clip"].iloc[ti] + + clipsoft_ti = None + if "clip_soft" in targets_df.columns: + clipsoft_ti = targets_df["clip_soft"].iloc[ti] + + scale_ti = 1 + if "scale" in targets_df.columns: + scale_ti = targets_df["scale"].iloc[ti] + + if options.restart and os.path.isfile(seqs_cov_file): + print("Skipping existing %s" % seqs_cov_file, file=sys.stderr) + else: + cmd = "hound_data_read.py" + cmd += " -w %d" % options.pool_width + cmd += " -u %s" % targets_df["sum_stat"].iloc[ti] + if clip_ti is not None: + cmd += " -c %f" % clip_ti + if clipsoft_ti is not None: + cmd += " --clip_soft %f" % clipsoft_ti + cmd += " -s %f" % scale_ti + if options.blacklist_bed: + cmd += " -b %s" % options.blacklist_bed + if options.interp_nan: + cmd += " -i" + cmd += " %s" % genome_cov_file + cmd += " %s" % seqs_bed_file + cmd += " %s" % seqs_cov_file + + if options.run_local: + # breaks on some OS + # cmd += ' &> %s.err' % seqs_cov_stem + read_jobs.append(cmd) + else: + j = slurm.Job( + cmd, + name="read_t%d" % ti, + out_file="%s.out" % seqs_cov_stem, + err_file="%s.err" % seqs_cov_stem, + queue="standard", + mem=15000, + time="12:0:0", + ) + read_jobs.append(j) + + if options.run_local: + utils.exec_par(read_jobs, options.processes, 
verbose=True) + else: + slurm.multi_run( + read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5 + ) + + ################################################################ + # write TF Records + ################################################################ + # copy targets file + shutil.copy(targets_file, "%s/targets.txt" % options.out_dir) + + # initialize TF Records dir + tfr_dir = "%s/tfrecords" % options.out_dir + if not os.path.isdir(tfr_dir): + os.mkdir(tfr_dir) + + write_jobs = [] + + for fold_set in fold_labels: + fold_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == fold_set] + fold_set_start = fold_set_indexes[0] + fold_set_end = fold_set_indexes[-1] + 1 + + tfr_i = 0 + tfr_start = fold_set_start + tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end) + + while tfr_start <= fold_set_end: + tfr_stem = "%s/%s-%d" % (tfr_dir, fold_set, tfr_i) + + tfr_file = "%s.tfr" % tfr_stem + if options.restart and os.path.isfile(tfr_file): + print("Skipping existing %s" % tfr_file, file=sys.stderr) + else: + cmd = "hound_data_write.py" + cmd += " -s %d" % tfr_start + cmd += " -e %d" % tfr_end + cmd += " --umap_clip %f" % options.umap_clip + cmd += " -x %d" % options.crop_bp + if options.decimals is not None: + cmd += " -d %d" % options.decimals + if options.umap_tfr: + cmd += " --umap_tfr" + if options.umap_bed is not None: + cmd += " -u %s" % unmap_npy + + cmd += " %s" % fasta_file + cmd += " %s" % seqs_bed_file + cmd += " %s" % seqs_cov_dir + cmd += " %s" % tfr_file + + if options.run_local: + # breaks on some OS + # cmd += ' &> %s.err' % tfr_stem + write_jobs.append(cmd) + else: + j = slurm.Job( + cmd, + name="write_%s-%d" % (fold_set, tfr_i), + out_file="%s.out" % tfr_stem, + err_file="%s.err" % tfr_stem, + queue="standard", + mem=15000, + time="12:0:0", + ) + write_jobs.append(j) + + # update + tfr_i += 1 + tfr_start += options.seqs_per_tfr + tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end) + + if options.run_local: + utils.exec_par(write_jobs, options.processes, verbose=True) + else: + slurm.multi_run( + write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5 + ) + + ################################################################ + # stats + ################################################################ + stats_dict = {} + stats_dict["num_targets"] = targets_df.shape[0] + stats_dict["seq_length"] = options.seq_length + stats_dict["seq_1hot"] = True + stats_dict["pool_width"] = options.pool_width + stats_dict["crop_bp"] = options.crop_bp + + target_length = options.seq_length - 2 * options.crop_bp + target_length = target_length // options.pool_width + stats_dict["target_length"] = target_length + + for fi in range(num_folds): + stats_dict["%s_seqs" % fold_labels[fi]] = len(fold_mseqs[fi]) + + with open("%s/statistics.json" % options.out_dir, "w") as stats_json_out: + json.dump(stats_dict, stats_json_out, indent=4) + + +################################################################################ +def curate_peaks(targets_df, out_dir, pool_width, crop_bp): + """Merge all peaks, round to nearest pool_width, and add cropped bp.""" + + # concatenate and extend peaks + cat_bed_file = "%s/peaks_cat.bed" % out_dir + cat_bed_out = open(cat_bed_file, "w") + for bed_file in targets_df.file: + if bed_file[-3:] == ".gz": + bed_in = gzip.open(bed_file, "rt") + else: + bed_in = open(bed_file, "r") + + for line in bed_in: + a = line.rstrip().split("\t") + chrm = a[0] + start = int(a[1]) + end = int(a[2]) + + # extend to 
pool width + length = end - start + if length < pool_width: + mid = (start + end) // 2 + start = mid - pool_width // 2 + end = start + pool_width + + # add cropped bp + start = max(0, start - crop_bp) + end += crop_bp + + # print + print("%s\t%d\t%d" % (chrm, start, end), file=cat_bed_out) + + bed_in.close() + cat_bed_out.close() + + # merge + merge_bed_file = "%s/peaks_merge.bed" % out_dir + bedtools_cmd = "bedtools sort -i %s" % cat_bed_file + bedtools_cmd += " | bedtools merge -i - > %s" % merge_bed_file + subprocess.call(bedtools_cmd, shell=True) + + # round and add crop_bp + full_bed_file = "%s/peaks_full.bed" % out_dir + full_bed_out = open(full_bed_file, "w") + + for line in open(merge_bed_file): + a = line.rstrip().split("\t") + chrm = a[0] + start = int(a[1]) + end = int(a[2]) + mid = (start + end) // 2 + length = end - start + + # round length to nearest pool_width + bins = int(np.round(length / pool_width)) + assert bins > 0 + start = mid - (bins * pool_width) // 2 + start = max(0, start) + end = start + (bins * pool_width) + + # write + print("%s\t%d\t%d" % (chrm, start, end), file=full_bed_out) + + full_bed_out.close() + + return full_bed_file + + +################################################################################ +def divide_contigs_chr(contigs, test_chrs, valid_chrs): + """Divide list of contigs into train/valid/test lists + by chromosome.""" + + # initialize current train/valid/test nucleotides + train_nt = 0 + valid_nt = 0 + test_nt = 0 + + # initialize train/valid/test contig lists + train_contigs = [] + valid_contigs = [] + test_contigs = [] + + # process contigs + for ctg in contigs: + ctg_len = ctg.end - ctg.start + + if ctg.chr in test_chrs: + test_contigs.append(ctg) + test_nt += ctg_len + elif ctg.chr in valid_chrs: + valid_contigs.append(ctg) + valid_nt += ctg_len + else: + train_contigs.append(ctg) + train_nt += ctg_len + + total_nt = train_nt + valid_nt + test_nt + + print("Contigs divided into") + print( + " Train: %5d contigs, %10d nt (%.4f)" + % (len(train_contigs), train_nt, train_nt / total_nt) + ) + print( + " Valid: %5d contigs, %10d nt (%.4f)" + % (len(valid_contigs), valid_nt, valid_nt / total_nt) + ) + print( + " Test: %5d contigs, %10d nt (%.4f)" + % (len(test_contigs), test_nt, test_nt / total_nt) + ) + + return [train_contigs, valid_contigs, test_contigs] + + +################################################################################ +def divide_contigs_folds(contigs, folds): + """Divide list of contigs into cross fold lists.""" + + # sort contigs descending by length + length_contigs = [(ctg.end - ctg.start, ctg) for ctg in contigs] + length_contigs.sort(reverse=True) + + # compute total nucleotides + total_nt = sum([lc[0] for lc in length_contigs]) + + # compute aimed fold nucleotides + fold_nt_aim = int(np.ceil(total_nt / folds)) + + # initialize current fold nucleotides + fold_nt = np.zeros(folds) + + # initialize fold contig lists + fold_contigs = [] + for fi in range(folds): + fold_contigs.append([]) + + # process contigs + for ctg_len, ctg in length_contigs: + + # compute gap between current and aim + fold_nt_gap = fold_nt_aim - fold_nt + fold_nt_gap = np.clip(fold_nt_gap, 0, np.inf) + + # compute sample probability + fold_prob = fold_nt_gap / fold_nt_gap.sum() + + # sample train/valid/test + fi = np.random.choice(folds, p=fold_prob) + fold_contigs[fi].append(ctg) + fold_nt[fi] += ctg_len + + print("Contigs divided into") + for fi in range(folds): + print( + " Fold%d: %5d contigs, %10d nt (%.4f)" + % (fi, 
len(fold_contigs[fi]), fold_nt[fi], fold_nt[fi] / total_nt) + ) + + return fold_contigs + + +################################################################################ +def divide_contigs_pct(contigs, test_pct, valid_pct, pct_abstain=0.2): + """Divide list of contigs into train/valid/test lists, + aiming for the specified nucleotide percentages.""" + + # sort contigs descending by length + length_contigs = [(ctg.end - ctg.start, ctg) for ctg in contigs] + length_contigs.sort(reverse=True) + + # compute total nucleotides + total_nt = sum([lc[0] for lc in length_contigs]) + + # compute aimed train/valid/test nucleotides + test_nt_aim = test_pct * total_nt + valid_nt_aim = valid_pct * total_nt + train_nt_aim = total_nt - valid_nt_aim - test_nt_aim + + # initialize current train/valid/test nucleotides + train_nt = 0 + valid_nt = 0 + test_nt = 0 + + # initialize train/valid/test contig lists + train_contigs = [] + valid_contigs = [] + test_contigs = [] + + # process contigs + for ctg_len, ctg in length_contigs: + + # compute gap between current and aim + test_nt_gap = max(0, test_nt_aim - test_nt) + valid_nt_gap = max(0, valid_nt_aim - valid_nt) + train_nt_gap = max(1, train_nt_aim - train_nt) + + # skip if too large + if ctg_len > pct_abstain * test_nt_gap: + test_nt_gap = 0 + if ctg_len > pct_abstain * valid_nt_gap: + valid_nt_gap = 0 + + # compute remaining % + gap_sum = train_nt_gap + valid_nt_gap + test_nt_gap + test_pct_gap = test_nt_gap / gap_sum + valid_pct_gap = valid_nt_gap / gap_sum + train_pct_gap = train_nt_gap / gap_sum + + # sample train/valid/test + ri = np.random.choice( + range(3), 1, p=[train_pct_gap, valid_pct_gap, test_pct_gap] + )[0] + if ri == 0: + train_contigs.append(ctg) + train_nt += ctg_len + elif ri == 1: + valid_contigs.append(ctg) + valid_nt += ctg_len + elif ri == 2: + test_contigs.append(ctg) + test_nt += ctg_len + else: + print("TVT random number beyond 0,1,2", file=sys.stderr) + exit(1) + + print("Contigs divided into") + print( + " Train: %5d contigs, %10d nt (%.4f)" + % (len(train_contigs), train_nt, train_nt / total_nt) + ) + print( + " Valid: %5d contigs, %10d nt (%.4f)" + % (len(valid_contigs), valid_nt, valid_nt / total_nt) + ) + print( + " Test: %5d contigs, %10d nt (%.4f)" + % (len(test_contigs), test_nt, test_nt / total_nt) + ) + + return [train_contigs, valid_contigs, test_contigs] + + +################################################################################ +def limit_contigs(contigs, filter_bed): + """Limit to contigs overlapping the given BED. 
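+
+    Example (a sketch; the BED path is illustrative):
+        contigs = limit_contigs(contigs, "tissue_peaks.bed")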
+ + Args + contigs: list of Contigs + filter_bed: BED file to filter by + + Returns: + fcontigs: list of Contigs + """ + + # print ctgments to BED + ctg_fd, ctg_bed_file = tempfile.mkstemp() + ctg_bed_out = open(ctg_bed_file, "w") + for ctg in contigs: + print("%s\t%d\t%d" % (ctg.chr, ctg.start, ctg.end), file=ctg_bed_out) + ctg_bed_out.close() + + # intersect w/ filter_bed + fcontigs = [] + p = subprocess.Popen( + "bedtools intersect -a %s -b %s" % (ctg_bed_file, filter_bed), + shell=True, + stdout=subprocess.PIPE, + ) + for line in p.stdout: + a = line.decode("utf-8").split() + chrom = a[0] + ctg_start = int(a[1]) + ctg_end = int(a[2]) + fcontigs.append(data.Contig(0, chrom, ctg_start, ctg_end)) + + p.communicate() + + os.close(ctg_fd) + os.remove(ctg_bed_file) + + return fcontigs + + +if __name__ == "__main__": + main() diff --git a/src/baskerville/scripts/hound_data_align.py b/src/baskerville/scripts/hound_data_align.py new file mode 100755 index 0000000..b32a1c7 --- /dev/null +++ b/src/baskerville/scripts/hound_data_align.py @@ -0,0 +1,983 @@ +#!/usr/bin/env python +# Copyright 2017 Calico LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +from optparse import OptionParser +import collections +import gzip +import heapq +import pdb +import os +import random +import sys + +import networkx as nx +import numpy as np +import pybedtools + +from baskerville import data + +""" +hound_data_align.py + +Partition sequences from multiple aligned genomes into train/valid/test splits +that respect homology. 
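+
+Example invocation (a sketch; file names are illustrative):
+
+  hound_data_align.py -a hg38,mm10 -g hg38_gaps.bed,mm10_gaps.bed \
+    -u hg38_unmap.bed,mm10_unmap.bed -o align_out hg38.mm10.net.gz hg38.fa,mm10.fa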
+""" + + +################################################################################ +def main(): + usage = "usage: %prog [options] " + parser = OptionParser(usage) + parser.add_option( + "-a", dest="genome_labels", default=None, help="Genome labels in output" + ) + parser.add_option( + "--break", + dest="break_t", + default=None, + type="int", + help="Break in half contigs above length [Default: %default]", + ) + parser.add_option( + "-c", + "--crop", + dest="crop_bp", + default=0, + type="int", + help="Crop bp off each end [Default: %default]", + ) + parser.add_option( + "-f", + dest="folds", + default=None, + type="int", + help="Generate cross fold split [Default: %default]", + ) + parser.add_option( + "-g", + dest="gap_files", + help="Comma-separated list of assembly gaps BED files [Default: %default]", + ) + parser.add_option( + "-l", + dest="seq_length", + default=131072, + type="int", + help="Sequence length [Default: %default]", + ) + parser.add_option( + "--nf", + dest="net_fill_min", + default=100000, + type="int", + help="Alignment net fill size minimum [Default: %default]", + ) + parser.add_option( + "--no", + dest="net_olap_min", + default=1024, + type="int", + help="Alignment net and contig overlap minimum [Default: %default]", + ) + parser.add_option( + "-o", + dest="out_dir", + default="align_out", + help="Output directory [Default: %default]", + ) + parser.add_option( + "-s", + dest="sample_pct", + default=1.0, + type="float", + help="Down-sample the segments", + ) + parser.add_option( + "--seed", + dest="seed", + default=44, + type="int", + help="Random seed [Default: %default]", + ) + parser.add_option( + "--snap", + dest="snap", + default=1, + type="int", + help="Snap sequences to multiple of the given value [Default: %default]", + ) + parser.add_option( + "--stride", + "--stride_train", + dest="stride_train", + default=1.0, + type="float", + help="Stride to advance train sequences [Default: seq_length]", + ) + parser.add_option( + "--stride_test", + dest="stride_test", + default=1.0, + type="float", + help="Stride to advance valid and test sequences [Default: %default]", + ) + parser.add_option( + "-t", + dest="test_pct", + default=0.1, + type="float", + help="Proportion of the data for testing [Default: %default]", + ) + parser.add_option( + "-u", + dest="umap_beds", + help="Comma-separated genome unmappable segments to set to NA", + ) + parser.add_option( + "--umap_t", + dest="umap_t", + default=0.5, + type="float", + help="Remove sequences with more than this unmappable bin % [Default: %default]", + ) + parser.add_option( + "-w", + dest="pool_width", + default=32, + type="int", + help="Sum pool width [Default: %default]", + ) + parser.add_option( + "-v", + dest="valid_pct", + default=0.1, + type="float", + help="Proportion of the data for validation [Default: %default]", + ) + (options, args) = parser.parse_args() + + if len(args) != 2: + parser.error("Must provide alignment and FASTA files.") + else: + align_net_file = args[0] + fasta_files = args[1].split(",") + + # there is still some source of stochasticity + random.seed(options.seed) + np.random.seed(options.seed) + + # transform proportion strides to base pairs + if options.stride_train <= 1: + print("stride_train %.f" % options.stride_train, end="") + options.stride_train = options.stride_train * options.seq_length + print(" converted to %f" % options.stride_train) + options.stride_train = int(np.round(options.stride_train)) + if options.stride_test <= 1: + print("stride_test %.f" % options.stride_test, 
end="") + options.stride_test = options.stride_test * options.seq_length + print(" converted to %f" % options.stride_test) + options.stride_test = int(np.round(options.stride_test)) + + # check snap + if options.snap is not None: + if np.mod(options.seq_length, options.snap) != 0: + raise ValueError("seq_length must be a multiple of snap") + if np.mod(options.stride_train, options.snap) != 0: + raise ValueError("stride_train must be a multiple of snap") + if np.mod(options.stride_test, options.snap) != 0: + raise ValueError("stride_test must be a multiple of snap") + + # count genomes + num_genomes = len(fasta_files) + + # parse gap files + if options.gap_files is not None: + options.gap_files = options.gap_files.split(",") + assert len(options.gap_files) == num_genomes + + # parse unmappable files + if options.umap_beds is not None: + options.umap_beds = options.umap_beds.split(",") + assert len(options.umap_beds) == num_genomes + + # label genomes + if options.genome_labels is None: + options.genome_labels = ["genome%d" % (gi + 1) for gi in range(num_genomes)] + else: + options.genome_labels = options.genome_labels.split(",") + assert len(options.genome_labels) == num_genomes + + # create output directorys + if not os.path.isdir(options.out_dir): + os.mkdir(options.out_dir) + genome_out_dirs = [] + for gi in range(num_genomes): + gout_dir = "%s/%s" % (options.out_dir, options.genome_labels[gi]) + if not os.path.isdir(gout_dir): + os.mkdir(gout_dir) + genome_out_dirs.append(gout_dir) + + ################################################################ + # define genomic contigs + ################################################################ + genome_chr_contigs = [] + for gi in range(num_genomes): + genome_chr_contigs.append(data.load_chromosomes(fasta_files[gi])) + + # remove gaps + if options.gap_files[gi]: + genome_chr_contigs[gi] = data.split_contigs( + genome_chr_contigs[gi], options.gap_files[gi] + ) + + # ditch the chromosomes + contigs = [] + for gi in range(num_genomes): + for chrom in genome_chr_contigs[gi]: + contigs += [ + data.Contig(gi, chrom, ctg_start, ctg_end) + for ctg_start, ctg_end in genome_chr_contigs[gi][chrom] + ] + + # filter for large enough + seq_tlength = options.seq_length - 2 * options.crop_bp + contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= seq_tlength] + + # break up large contigs + if options.break_t is not None: + contigs = break_large_contigs(contigs, options.break_t) + + # print contigs to BED file + for gi in range(num_genomes): + contigs_i = [ctg for ctg in contigs if ctg.genome == gi] + ctg_bed_file = "%s/contigs.bed" % genome_out_dirs[gi] + data.write_seqs_bed(ctg_bed_file, contigs_i) + + ################################################################ + # divide between train/valid/test + ################################################################ + + # connect contigs across genomes by alignment + contig_components = connect_contigs( + contigs, + align_net_file, + options.net_fill_min, + options.net_olap_min, + options.out_dir, + genome_out_dirs, + ) + + if options.folds is not None: + # divide by fold + fold_contigs = divide_components_folds(contig_components, options.folds) + + else: + # divide by train/valid/test pct + fold_contigs = divide_components_pct( + contig_components, options.test_pct, options.valid_pct + ) + + # rejoin broken contigs within set + for fi in range(len(fold_contigs)): + fold_contigs[fi] = data.rejoin_large_contigs(fold_contigs[fi]) + + # label folds + if options.folds is not None: + fold_labels = 
["fold%d" % fi for fi in range(options.folds)] + num_folds = options.folds + else: + fold_labels = ["train", "valid", "test"] + num_folds = 3 + + if options.folds is None: + # quantify leakage across sets + quantify_leakage( + align_net_file, + fold_contigs[0], + fold_contigs[1], + fold_contigs[2], + options.out_dir, + ) + + ################################################################ + # define model sequences + ################################################################ + + fold_mseqs = [] + for fi in range(num_folds): + if fold_labels[fi] in ["valid", "test"]: + stride_fold = options.stride_test + else: + stride_fold = options.stride_train + + # stride sequences across contig + fold_mseqs_fi = data.contig_sequences( + fold_contigs[fi], seq_tlength, stride_fold, options.snap, fold_labels[fi] + ) + fold_mseqs.append(fold_mseqs_fi) + + # shuffle + random.shuffle(fold_mseqs[fi]) + + # down-sample + if options.sample_pct < 1.0: + fold_mseqs[fi] = random.sample( + fold_mseqs[fi], int(options.sample_pct * len(fold_mseqs[fi])) + ) + + # merge into one list + mseqs = [ms for fm in fold_mseqs for ms in fm] + + # separate by genome + mseqs_genome = [] + for gi in range(num_genomes): + mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi] + mseqs_genome.append(mseqs_gi) + + ################################################################ + # filter for sufficient mappability + ################################################################ + for gi in range(num_genomes): + if options.umap_beds[gi] is not None: + # annotate unmappable positions + mseqs_unmap = data.annotate_unmap( + mseqs_genome[gi], options.umap_beds[gi], seq_tlength, options.pool_width + ) + + # filter unmappable + mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t + mseqs_genome[gi] = [ + mseqs_genome[gi][si] + for si in range(len(mseqs_genome[gi])) + if mseqs_map_mask[si] + ] + mseqs_unmap = mseqs_unmap[mseqs_map_mask, :] + + # write to file + unmap_npy_file = "%s/mseqs_unmap.npy" % genome_out_dirs[gi] + np.save(unmap_npy_file, mseqs_unmap) + + seqs_bed_files = [] + for gi in range(num_genomes): + # write sequences to BED + seqs_bed_files.append("%s/sequences.bed" % genome_out_dirs[gi]) + data.write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True) + + +################################################################################ +GraphSeq = collections.namedtuple("GraphSeq", ["genome", "net", "chr", "start", "end"]) + + +################################################################################ +def quantify_leakage( + align_net_file, train_contigs, valid_contigs, test_contigs, out_dir +): + """Quanitfy the leakage across sequence sets.""" + + def split_genome(contigs): + genome_contigs = [] + for ctg in contigs: + while len(genome_contigs) <= ctg.genome: + genome_contigs.append([]) + genome_contigs[ctg.genome].append((ctg.chr, ctg.start, ctg.end)) + genome_bedtools = [pybedtools.BedTool(ctgs) for ctgs in genome_contigs] + return genome_bedtools + + def bed_sum(overlaps): + osum = 0 + for overlap in overlaps: + osum += int(overlap[2]) - int(overlap[1]) + return osum + + train0_bt, train1_bt = split_genome(train_contigs) + valid0_bt, valid1_bt = split_genome(valid_contigs) + test0_bt, test1_bt = split_genome(test_contigs) + + assign0_sums = {} + assign1_sums = {} + + if os.path.splitext(align_net_file)[-1] == ".gz": + align_net_open = gzip.open(align_net_file, "rt") + else: + align_net_open = open(align_net_file, "r") + + for net_line in align_net_open: + if 
net_line.startswith("net"): + net_a = net_line.split() + chrom0 = net_a[1] + + elif net_line.startswith(" fill"): + net_a = net_line.split() + + # extract genome1 interval + start0 = int(net_a[1]) + size0 = int(net_a[2]) + end0 = start0 + size0 + align0_bt = pybedtools.BedTool([(chrom0, start0, end0)]) + + # extract genome2 interval + chrom1 = net_a[3] + start1 = int(net_a[5]) + size1 = int(net_a[6]) + end1 = start1 + size1 + align1_bt = pybedtools.BedTool([(chrom1, start1, end1)]) + + # count interval overlap + align0_train_bp = bed_sum(align0_bt.intersect(train0_bt)) + align0_valid_bp = bed_sum(align0_bt.intersect(valid0_bt)) + align0_test_bp = bed_sum(align0_bt.intersect(test0_bt)) + align0_max_bp = max(align0_train_bp, align0_valid_bp, align0_test_bp) + + align1_train_bp = bed_sum(align1_bt.intersect(train1_bt)) + align1_valid_bp = bed_sum(align1_bt.intersect(valid1_bt)) + align1_test_bp = bed_sum(align1_bt.intersect(test1_bt)) + align1_max_bp = max(align1_train_bp, align1_valid_bp, align1_test_bp) + + # assign to class + if align0_max_bp == 0: + assign0 = None + elif align0_train_bp == align0_max_bp: + assign0 = "train" + elif align0_valid_bp == align0_max_bp: + assign0 = "valid" + elif align0_test_bp == align0_max_bp: + assign0 = "test" + else: + print("Bad logic") + exit(1) + + if align1_max_bp == 0: + assign1 = None + elif align1_train_bp == align1_max_bp: + assign1 = "train" + elif align1_valid_bp == align1_max_bp: + assign1 = "valid" + elif align1_test_bp == align1_max_bp: + assign1 = "test" + else: + print("Bad logic") + exit(1) + + # increment + assign0_sums[(assign0, assign1)] = ( + assign0_sums.get((assign0, assign1), 0) + align0_max_bp + ) + assign1_sums[(assign0, assign1)] = ( + assign1_sums.get((assign0, assign1), 0) + align1_max_bp + ) + + # sum contigs + splits0_bp = {} + splits0_bp["train"] = bed_sum(train0_bt) + splits0_bp["valid"] = bed_sum(valid0_bt) + splits0_bp["test"] = bed_sum(test0_bt) + splits1_bp = {} + splits1_bp["train"] = bed_sum(train1_bt) + splits1_bp["valid"] = bed_sum(valid1_bt) + splits1_bp["test"] = bed_sum(test1_bt) + + leakage_out = open("%s/leakage.txt" % out_dir, "w") + print("Genome0", file=leakage_out) + for split0 in ["train", "valid", "test"]: + print(" %5s: %10d nt" % (split0, splits0_bp[split0]), file=leakage_out) + for split1 in ["train", "valid", "test", None]: + ss_bp = assign0_sums.get((split0, split1), 0) + print( + " %5s: %10d (%.5f)" % (split1, ss_bp, ss_bp / splits0_bp[split0]), + file=leakage_out, + ) + print("\nGenome1", file=leakage_out) + for split1 in ["train", "valid", "test"]: + print(" %5s: %10d nt" % (split1, splits1_bp[split1]), file=leakage_out) + for split0 in ["train", "valid", "test", None]: + ss_bp = assign1_sums.get((split0, split1), 0) + print( + " %5s: %10d (%.5f)" % (split0, ss_bp, ss_bp / splits1_bp[split1]), + file=leakage_out, + ) + leakage_out.close() + + +################################################################################ +def break_large_contigs(contigs, break_t, verbose=False): + """Break large contigs in half until all contigs are under + the size threshold.""" + + # initialize a heapq of contigs and lengths + contig_heapq = [] + for ctg in contigs: + ctg_len = ctg.end - ctg.start + heapq.heappush(contig_heapq, (-ctg_len, ctg)) + + ctg_len = break_t + 1 + while ctg_len > break_t: + + # pop largest contig + ctg_nlen, ctg = heapq.heappop(contig_heapq) + ctg_len = -ctg_nlen + + # if too large + if ctg_len > break_t: + if verbose: + print( + "Breaking %s:%d-%d (%d nt)" % (ctg.chr, ctg.start, 
ctg.end, ctg_len) + ) + + # break in two + ctg_mid = ctg.start + ctg_len // 2 + + try: + ctg_left = Contig(ctg.genome, ctg.chr, ctg.start, ctg_mid) + ctg_right = Contig(ctg.genome, ctg.chr, ctg_mid, ctg.end) + except AttributeError: + ctg_left = Contig(ctg.chr, ctg.start, ctg_mid) + ctg_right = Contig(ctg.chr, ctg_mid, ctg.end) + + # add left + ctg_left_len = ctg_left.end - ctg_left.start + heapq.heappush(contig_heapq, (-ctg_left_len, ctg_left)) + + # add right + ctg_right_len = ctg_right.end - ctg_right.start + heapq.heappush(contig_heapq, (-ctg_right_len, ctg_right)) + + # return to list + contigs = [len_ctg[1] for len_ctg in contig_heapq] + + return contigs + + +################################################################################ +def connect_contigs( + contigs, align_net_file, net_fill_min, net_olap_min, out_dir, genome_out_dirs +): + """Connect contigs across genomes by forming a graph that includes + net format aligning regions and contigs. Compute contig components + as connected components of that graph.""" + + # construct align net graph and write net BEDs + if align_net_file is None: + graph_contigs_nets = nx.Graph() + else: + graph_contigs_nets = make_net_graph(align_net_file, net_fill_min, out_dir) + + # add contig nodes + for ctg in contigs: + ctg_node = GraphSeq(ctg.genome, False, ctg.chr, ctg.start, ctg.end) + graph_contigs_nets.add_node(ctg_node) + + # intersect contigs BED w/ nets BED, adding graph edges. + intersect_contigs_nets( + graph_contigs_nets, 0, out_dir, genome_out_dirs[0], net_olap_min + ) + intersect_contigs_nets( + graph_contigs_nets, 1, out_dir, genome_out_dirs[1], net_olap_min + ) + + # find connected components + contig_components = [] + for contig_net_component in nx.connected_components(graph_contigs_nets): + # extract only the contigs + cc_contigs = [ + contig_or_net + for contig_or_net in contig_net_component + if contig_or_net.net is False + ] + + if cc_contigs: + # add to list + contig_components.append(cc_contigs) + + # write summary stats + comp_out = open("%s/contig_components.txt" % out_dir, "w") + for ctg_comp in contig_components: + ctg_comp0 = [ctg for ctg in ctg_comp if ctg.genome == 0] + ctg_comp1 = [ctg for ctg in ctg_comp if ctg.genome == 1] + ctg_comp0_nt = sum([ctg.end - ctg.start for ctg in ctg_comp0]) + ctg_comp1_nt = sum([ctg.end - ctg.start for ctg in ctg_comp1]) + ctg_comp_nt = ctg_comp0_nt + ctg_comp1_nt + cols = [len(ctg_comp), len(ctg_comp0), len(ctg_comp1)] + cols += [ctg_comp0_nt, ctg_comp1_nt, ctg_comp_nt] + cols = [str(c) for c in cols] + print("\t".join(cols), file=comp_out) + comp_out.close() + + return contig_components + + +################################################################################ +def contig_stats_genome(contigs): + """Compute contig statistics within each genome.""" + contigs_count_genome = [] + contigs_nt_genome = [] + + contigs_genome_found = True + gi = 0 + while contigs_genome_found: + contigs_genome = [ctg for ctg in contigs if ctg.genome == gi] + + if len(contigs_genome) == 0: + contigs_genome_found = False + + else: + contigs_nt = [ctg.end - ctg.start for ctg in contigs_genome] + + contigs_count_genome.append(len(contigs_genome)) + contigs_nt_genome.append(sum(contigs_nt)) + + gi += 1 + + return contigs_count_genome, contigs_nt_genome + + +################################################################################ +def divide_components_folds(contig_components, folds): + """Divide contig connected components into cross fold lists.""" + + # sort contig components descending 
by length + length_contig_components = [] + for cc_contigs in contig_components: + cc_len = sum([ctg.end - ctg.start for ctg in cc_contigs]) + length_contig_components.append((cc_len, cc_contigs)) + length_contig_components.sort(reverse=True) + + # compute total nucleotides + total_nt = sum([lc[0] for lc in length_contig_components]) + + # compute aimed fold nucleotides + fold_nt_aim = int(np.ceil(total_nt / folds)) + + # initialize current fold nucleotides + fold_nt = np.zeros(folds) + + # initialize fold contig lists + fold_contigs = [] + for fi in range(folds): + fold_contigs.append([]) + + # process contigs + for ctg_comp_len, ctg_comp in length_contig_components: + # compute gap between current and aim + fold_nt_gap = fold_nt_aim - fold_nt + fold_nt_gap = np.clip(fold_nt_gap, 0, np.inf) + + # compute sample probability + fold_prob = fold_nt_gap / fold_nt_gap.sum() + + # sample train/valid/test + fi = np.random.choice(folds, p=fold_prob) + fold_nt[fi] += ctg_comp_len + for ctg in ctg_comp: + fold_contigs[fi].append(ctg) + + # report genome-specific train/valid/test stats + report_divide_stats(fold_contigs) + + return fold_contigs + + +################################################################################ +def divide_components_pct(contig_components, test_pct, valid_pct, pct_abstain=0.5): + """Divide contig connected components into train/valid/test, + and aiming for the specified nucleotide percentages.""" + + # sort contig components descending by length + length_contig_components = [] + for cc_contigs in contig_components: + cc_len = sum([ctg.end - ctg.start for ctg in cc_contigs]) + length_contig_components.append((cc_len, cc_contigs)) + length_contig_components.sort(reverse=True) + + # compute total nucleotides + total_nt = sum([lc[0] for lc in length_contig_components]) + + # compute aimed train/valid/test nucleotides + test_nt_aim = test_pct * total_nt + valid_nt_aim = valid_pct * total_nt + train_nt_aim = total_nt - valid_nt_aim - test_nt_aim + + # initialize current train/valid/test nucleotides + train_nt = 0 + valid_nt = 0 + test_nt = 0 + + # initialie train/valid/test contig lists + train_contigs = [] + valid_contigs = [] + test_contigs = [] + + # process contigs + for ctg_comp_len, ctg_comp in length_contig_components: + # compute gap between current and aim + test_nt_gap = max(0, test_nt_aim - test_nt) + valid_nt_gap = max(0, valid_nt_aim - valid_nt) + train_nt_gap = max(1, train_nt_aim - train_nt) + + # skip if too large + if ctg_comp_len > pct_abstain * test_nt_gap: + test_nt_gap = 0 + if ctg_comp_len > pct_abstain * valid_nt_gap: + valid_nt_gap = 0 + + # compute remaining % + gap_sum = train_nt_gap + valid_nt_gap + test_nt_gap + test_pct_gap = test_nt_gap / gap_sum + valid_pct_gap = valid_nt_gap / gap_sum + train_pct_gap = train_nt_gap / gap_sum + + # sample train/valid/test + ri = np.random.choice( + range(3), 1, p=[train_pct_gap, valid_pct_gap, test_pct_gap] + )[0] + + # collect contigs (sorted is required for deterministic sequence order) + if ri == 0: + for ctg in sorted(ctg_comp): + train_contigs.append(ctg) + train_nt += ctg_comp_len + elif ri == 1: + for ctg in sorted(ctg_comp): + valid_contigs.append(ctg) + valid_nt += ctg_comp_len + elif ri == 2: + for ctg in sorted(ctg_comp): + test_contigs.append(ctg) + test_nt += ctg_comp_len + else: + print("TVT random number beyond 0,1,2", file=sys.stderr) + exit(1) + + # report genome-specific train/valid/test stats + report_divide_stats([train_contigs, valid_contigs, test_contigs]) + + return [train_contigs, 
valid_contigs, test_contigs] + + +################################################################################ +def intersect_contigs_nets( + graph_contigs_nets, genome_i, out_dir, genome_out_dir, min_olap=128 +): + """Intersect the contigs and nets from genome_i, adding the + overlaps as edges to graph_contigs_nets.""" + + contigs_file = "%s/contigs.bed" % genome_out_dir + nets_file = "%s/nets%d.bed" % (out_dir, genome_i) + + contigs_bed = pybedtools.BedTool(contigs_file) + nets_bed = pybedtools.BedTool(nets_file) + + for overlap in contigs_bed.intersect(nets_bed, wo=True): + ctg_chr = overlap[0] + ctg_start = int(overlap[1]) + ctg_end = int(overlap[2]) + net_chr = overlap[3] + net_start = int(overlap[4]) + net_end = int(overlap[5]) + olap_len = int(overlap[6]) + + if olap_len > min_olap: + # create node objects + ctg_node = GraphSeq(genome_i, False, ctg_chr, ctg_start, ctg_end) + net_node = GraphSeq(genome_i, True, net_chr, net_start, net_end) + + # add edge / verify we found nodes + gcn_size_pre = graph_contigs_nets.number_of_nodes() + graph_contigs_nets.add_edge(ctg_node, net_node) + gcn_size_post = graph_contigs_nets.number_of_nodes() + assert gcn_size_pre == gcn_size_post + + +################################################################################ +def make_net_graph(align_net_file, net_fill_min, out_dir): + """Construct a Graph with aligned net intervals connected + by edges.""" + + graph_nets = nx.Graph() + + nets1_bed_out = open("%s/nets0.bed" % out_dir, "w") + nets2_bed_out = open("%s/nets1.bed" % out_dir, "w") + + if os.path.splitext(align_net_file)[-1] == ".gz": + align_net_open = gzip.open(align_net_file, "rt") + else: + align_net_open = open(align_net_file, "r") + + for net_line in align_net_open: + if net_line.startswith("net"): + net_a = net_line.split() + chrom1 = net_a[1] + + elif net_line.startswith(" fill"): + net_a = net_line.split() + + # extract genome1 interval + start1 = int(net_a[1]) + size1 = int(net_a[2]) + end1 = start1 + size1 + + # extract genome2 interval + chrom2 = net_a[3] + start2 = int(net_a[5]) + size2 = int(net_a[6]) + end2 = start2 + size2 + + if min(size1, size2) >= net_fill_min: + # add edge + net1_node = GraphSeq(0, True, chrom1, start1, end1) + net2_node = GraphSeq(1, True, chrom2, start2, end2) + graph_nets.add_edge(net1_node, net2_node) + + # write interval1 + cols = [chrom1, str(start1), str(end1)] + print("\t".join(cols), file=nets1_bed_out) + + # write interval2 + cols = [chrom2, str(start2), str(end2)] + print("\t".join(cols), file=nets2_bed_out) + + nets1_bed_out.close() + nets2_bed_out.close() + + return graph_nets + + +################################################################################ +def report_divide_stats(fold_contigs): + """Report genome-specific statistics about the division of contigs + between sets.""" + + fold_counts_genome = [] + fold_nts_genome = [] + for fi in range(len(fold_contigs)): + fcg, fng = contig_stats_genome(fold_contigs[fi]) + fold_counts_genome.append(fcg) + fold_nts_genome.append(fng) + num_genomes = len(fold_counts_genome[0]) + + # sum nt across genomes + fold_nts = [sum(fng) for fng in fold_nts_genome] + total_nt = sum(fold_nts) + + # compute total sum nt per genome + total_nt_genome = [] + print("Total nt") + for gi in range(num_genomes): + total_nt_gi = sum([fng[gi] for fng in fold_nts_genome]) + total_nt_genome.append(total_nt_gi) + print(" Genome%d: %10d nt" % (gi, total_nt_gi)) + + # label folds and guess that 3 is train/valid/test + fold_labels = [] + if len(fold_contigs) == 3: + 
fold_labels = ["Train", "Valid", "Test"] + else: + fold_labels = ["Fold%d" % fi for fi in range(len(fold_contigs))] + + print("Contigs divided into") + for fi in range(len(fold_contigs)): + print( + " %s: %5d contigs, %10d nt (%.4f)" + % ( + fold_labels[fi], + len(fold_contigs[fi]), + fold_nts[fi], + fold_nts[fi] / total_nt, + ) + ) + for gi in range(num_genomes): + print( + " Genome%d: %5d contigs, %10d nt (%.4f)" + % ( + gi, + fold_counts_genome[fi][gi], + fold_nts_genome[fi][gi], + fold_nts_genome[fi][gi] / total_nt_genome[gi], + ) + ) + + +################################################################################ +def report_divide_stats_v1(train_contigs, valid_contigs, test_contigs): + """Report genome-specific statistics about the division of contigs + between train/valid/test sets.""" + + # compute genome-specific stats + train_count_genome, train_nt_genome = contig_stats_genome(train_contigs) + valid_count_genome, valid_nt_genome = contig_stats_genome(valid_contigs) + test_count_genome, test_nt_genome = contig_stats_genome(test_contigs) + num_genomes = len(train_count_genome) + + # sum nt across genomes + train_nt = sum(train_nt_genome) + valid_nt = sum(valid_nt_genome) + test_nt = sum(test_nt_genome) + total_nt = train_nt + valid_nt + test_nt + + # compute total sum nt per genome + total_nt_genome = [] + for gi in range(num_genomes): + total_nt_gi = train_nt_genome[gi] + valid_nt_genome[gi] + test_nt_genome[gi] + total_nt_genome.append(total_nt_gi) + + print("Contigs divided into") + print( + " Train: %5d contigs, %10d nt (%.4f)" + % (len(train_contigs), train_nt, train_nt / total_nt) + ) + for gi in range(num_genomes): + print( + " Genome%d: %5d contigs, %10d nt (%.4f)" + % ( + gi, + train_count_genome[gi], + train_nt_genome[gi], + train_nt_genome[gi] / total_nt_genome[gi], + ) + ) + + print( + " Valid: %5d contigs, %10d nt (%.4f)" + % (len(valid_contigs), valid_nt, valid_nt / total_nt) + ) + for gi in range(num_genomes): + print( + " Genome%d: %5d contigs, %10d nt (%.4f)" + % ( + gi, + valid_count_genome[gi], + valid_nt_genome[gi], + valid_nt_genome[gi] / total_nt_genome[gi], + ) + ) + + print( + " Test: %5d contigs, %10d nt (%.4f)" + % (len(test_contigs), test_nt, test_nt / total_nt) + ) + for gi in range(num_genomes): + print( + " Genome%d: %5d contigs, %10d nt (%.4f)" + % ( + gi, + test_count_genome[gi], + test_nt_genome[gi], + test_nt_genome[gi] / total_nt_genome[gi], + ) + ) + + +################################################################################ +if __name__ == "__main__": + main() diff --git a/src/baskerville/scripts/hound_data_read.py b/src/baskerville/scripts/hound_data_read.py new file mode 100755 index 0000000..5b6ec35 --- /dev/null +++ b/src/baskerville/scripts/hound_data_read.py @@ -0,0 +1,382 @@ +#!/usr/bin/env python +# Copyright 2017 Calico LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ========================================================================= +from optparse import OptionParser + +import os +import sys + +import h5py +import intervaltree +import numpy as np +import pandas as pd +import scipy.interpolate + +import pyBigWig + +from baskerville import data + +""" +hound_data_read.py + +Read sequence values from coverage files. +""" + + +################################################################################ +# main +################################################################################ +def main(): + usage = "usage: %prog [options] " + parser = OptionParser(usage) + parser.add_option( + "-b", + dest="blacklist_bed", + help="Set blacklist nucleotides to a baseline value.", + ) + parser.add_option( + "--black_pct", + dest="blacklist_pct", + default=0.5, + type="float", + help="Clip blacklisted regions to this distribution value [Default: %default", + ) + parser.add_option( + "-c", + dest="clip", + default=None, + type="float", + help="Clip values post-summary to a maximum [Default: %default]", + ) + parser.add_option( + "--clip_soft", + dest="clip_soft", + default=None, + type="float", + help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]", + ) + parser.add_option( + "--clip_pct", + dest="clip_pct", + default=0.9999999, + type="float", + help="Clip extreme values to this distribution value [Default: %default", + ) + parser.add_option( + "--crop", + dest="crop_bp", + default=0, + type="int", + help="Crop bp off each end [Default: %default]", + ) + parser.add_option( + "-i", + dest="interp_nan", + default=False, + action="store_true", + help="Interpolate NaNs [Default: %default]", + ) + parser.add_option( + "-s", + dest="scale", + default=1.0, + type="float", + help="Scale values by [Default: %default]", + ) + parser.add_option( + "-u", + dest="sum_stat", + default="sum", + help="Summary statistic to compute in windows [Default: %default]", + ) + parser.add_option( + "-w", + dest="pool_width", + default=1, + type="int", + help="Average pooling width [Default: %default]", + ) + (options, args) = parser.parse_args() + + if len(args) != 3: + parser.error("") + else: + genome_cov_file = args[0] + seqs_bed_file = args[1] + seqs_cov_file = args[2] + + assert options.crop_bp >= 0 + + # read model sequences + model_seqs = [] + for line in open(seqs_bed_file): + a = line.split() + model_seqs.append(data.ModelSeq(0, a[0], int(a[1]), int(a[2]), None)) + + # read blacklist regions + black_chr_trees = read_blacklist(options.blacklist_bed) + + # compute dimensions + num_seqs = len(model_seqs) + seq_len_nt = model_seqs[0].end - model_seqs[0].start + seq_len_nt -= 2 * options.crop_bp + target_length = seq_len_nt // options.pool_width + assert target_length > 0 + + # collect targets + targets = [] + + # open genome coverage file + genome_cov_open = CovFace(genome_cov_file) + + # for each model sequence + for si in range(num_seqs): + mseq = model_seqs[si] + + # read coverage + seq_cov_nt = genome_cov_open.read(mseq.chr, mseq.start, mseq.end) + seq_cov_nt = seq_cov_nt.astype("float32") + + # interpolate NaN + if options.interp_nan: + seq_cov_nt = interp_nan(seq_cov_nt) + + # determine baseline coverage + if target_length >= 8: + baseline_cov = np.percentile(seq_cov_nt, 100 * options.blacklist_pct) + baseline_cov = np.nan_to_num(baseline_cov) + else: + baseline_cov = 0 + + # set blacklist to baseline + if mseq.chr in black_chr_trees: + for black_interval in black_chr_trees[mseq.chr][mseq.start : mseq.end]: + # adjust for 
sequence indexes + black_seq_start = black_interval.begin - mseq.start + black_seq_end = black_interval.end - mseq.start + black_seq_values = seq_cov_nt[black_seq_start:black_seq_end] + seq_cov_nt[black_seq_start:black_seq_end] = np.clip( + black_seq_values, -baseline_cov, baseline_cov + ) + # seq_cov_nt[black_seq_start:black_seq_end] = baseline_cov + + # set NaN's to baseline + if not options.interp_nan: + nan_mask = np.isnan(seq_cov_nt) + seq_cov_nt[nan_mask] = baseline_cov + + # crop + if options.crop_bp > 0: + seq_cov_nt = seq_cov_nt[options.crop_bp : -options.crop_bp] + + # scale + seq_cov_nt = options.scale * seq_cov_nt + + # sum pool + seq_cov = seq_cov_nt.reshape(target_length, options.pool_width) + if options.sum_stat == "sum": + seq_cov = seq_cov.sum(axis=1, dtype="float32") + elif options.sum_stat == "sum_sqrt": + seq_cov = seq_cov.sum(axis=1, dtype="float32") + seq_cov = -1 + np.sqrt(1 + seq_cov) + elif options.sum_stat == "sum_exp75": + seq_cov = seq_cov.sum(axis=1, dtype="float32") + seq_cov = -1 + (1 + seq_cov) ** 0.75 + elif options.sum_stat in ["mean", "avg"]: + seq_cov = seq_cov.mean(axis=1, dtype="float32") + elif options.sum_stat in ["mean_sqrt", "avg_sqrt"]: + seq_cov = seq_cov.mean(axis=1, dtype="float32") + seq_cov = -1 + np.sqrt(1 + seq_cov) + elif options.sum_stat == "median": + seq_cov = seq_cov.median(axis=1) + elif options.sum_stat == "max": + seq_cov = seq_cov.max(axis=1) + elif options.sum_stat == "peak": + seq_cov = seq_cov.mean(axis=1, dtype="float32") + seq_cov = np.clip(np.sqrt(seq_cov * 4), 0, 1) + else: + print( + 'ERROR: Unrecognized summary statistic "%s".' % options.sum_stat, + file=sys.stderr, + ) + exit(1) + + # clip + if options.clip_soft is not None: + clip_mask = seq_cov > options.clip_soft + seq_cov[clip_mask] = ( + options.clip_soft + - 1 + + np.sqrt(seq_cov[clip_mask] - options.clip_soft + 1) + ) + if options.clip is not None: + seq_cov = np.clip(seq_cov, -options.clip, options.clip) + + # clip float16 min/max + seq_cov = np.clip(seq_cov, np.finfo(np.float16).min, np.finfo(np.float16).max) + + # save + targets.append(seq_cov.astype("float16")) + + # close genome coverage file + genome_cov_open.close() + + # clip extreme values + targets = np.array(targets, dtype="float16") + extreme_clip = np.percentile(targets, 100 * options.clip_pct) + targets = np.clip(targets, -extreme_clip, extreme_clip) + print("Targets sum: %.3f" % targets.sum(dtype="float64")) + + # write + with h5py.File(seqs_cov_file, "w") as seqs_cov_open: + seqs_cov_open.create_dataset( + "targets", data=targets, dtype="float16", compression="gzip" + ) + + +def interp_nan(x, kind="linear"): + """Linearly interpolate to fill NaN.""" + + # pad zeroes + xp = np.zeros(len(x) + 2) + xp[1:-1] = x + + # find NaN + x_nan = np.isnan(xp) + + if np.sum(x_nan) == 0: + # unnecessary + return x + + else: + # interpolate + inds = np.arange(len(xp)) + interpolator = scipy.interpolate.interp1d( + inds[~x_nan], xp[~x_nan], kind=kind, bounds_error=False + ) + + loc = np.where(x_nan) + xp[loc] = interpolator(loc) + + # slice off pad + return xp[1:-1] + + +def read_blacklist(blacklist_bed, black_buffer=20): + """Construct interval trees of blacklist + regions for each chromosome.""" + black_chr_trees = {} + + if blacklist_bed is not None and os.path.isfile(blacklist_bed): + for line in open(blacklist_bed): + a = line.split() + chrm = a[0] + start = max(0, int(a[1]) - black_buffer) + end = int(a[2]) + black_buffer + + if chrm not in black_chr_trees: + black_chr_trees[chrm] = intervaltree.IntervalTree() 
+ + black_chr_trees[chrm][start:end] = True + + return black_chr_trees + + +class CovFace: + def __init__(self, cov_file): + self.cov_file = cov_file + self.bigwig = False + self.bed = False + + cov_ext = os.path.splitext(self.cov_file)[1].lower() + if cov_ext == ".gz": + cov_ext = os.path.splitext(self.cov_file[:-3])[1].lower() + + if cov_ext in [".bed", ".narrowpeak"]: + self.bed = True + self.preprocess_bed() + + elif cov_ext in [".bw", ".bigwig"]: + self.cov_open = pyBigWig.open(self.cov_file, "r") + self.bigwig = True + + elif cov_ext in [".h5", ".hdf5", ".w5", ".wdf5"]: + self.cov_open = h5py.File(self.cov_file, "r") + + else: + print( + 'Cannot identify coverage file extension "%s".' % cov_ext, + file=sys.stderr, + ) + exit(1) + + def preprocess_bed(self): + # read BED + bed_df = pd.read_csv( + self.cov_file, sep="\t", usecols=range(3), names=["chr", "start", "end"] + ) + + # for each chromosome + self.cov_open = {} + for chrm in bed_df.chr.unique(): + bed_chr_df = bed_df[bed_df.chr == chrm] + + # find max pos + pos_max = bed_chr_df.end.max() + + # initialize array + self.cov_open[chrm] = np.zeros(pos_max, dtype="bool") + + # set peaks + for peak in bed_chr_df.itertuples(): + self.cov_open[peak.chr][peak.start : peak.end] = 1 + + def read(self, chrm, start, end): + if self.bigwig: + cov = self.cov_open.values(chrm, start, end, numpy=True).astype("float16") + + else: + if chrm in self.cov_open: + cov = self.cov_open[chrm][start:end] + + # handle mysterious inf's + cov = np.clip(cov, np.finfo(np.float16).min, np.finfo(np.float16).max) + + # pad + pad_zeros = end - start - len(cov) + if pad_zeros > 0: + cov_pad = np.zeros(pad_zeros, dtype="bool") + cov = np.concatenate([cov, cov_pad]) + + else: + print( + "WARNING: %s doesn't see %s:%d-%d. Setting to all zeros." + % (self.cov_file, chrm, start, end), + file=sys.stderr, + ) + cov = np.zeros(end - start, dtype="float16") + + return cov + + def close(self): + if not self.bed: + self.cov_open.close() + + +################################################################################ +# __main__ +################################################################################ +if __name__ == "__main__": + main() diff --git a/src/baskerville/scripts/hound_data_write.py b/src/baskerville/scripts/hound_data_write.py new file mode 100755 index 0000000..bf55065 --- /dev/null +++ b/src/baskerville/scripts/hound_data_write.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python +# Copyright 2019 Calico LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +from optparse import OptionParser +import os +import sys + +import h5py +import numpy as np +import pdb +import pysam +import tensorflow as tf + +from baskerville import data +from baskerville import dna + +""" +basenji_data_write.py + +Write TF Records for batches of model sequences. 
+""" + + +################################################################################ +# main +################################################################################ +def main(): + usage = ( + "usage: %prog [options] " + ) + parser = OptionParser(usage) + parser.add_option( + "-d", + dest="decimals", + default=None, + type="int", + help="Round values to given decimals [Default: %default]", + ) + parser.add_option( + "-s", + dest="start_i", + default=0, + type="int", + help="Sequence start index [Default: %default]", + ) + parser.add_option( + "-e", + dest="end_i", + default=None, + type="int", + help="Sequence end index [Default: %default]", + ) + parser.add_option( + "--te", + dest="target_extend", + default=None, + type="int", + help="Extend targets vector [Default: %default]", + ) + parser.add_option("-u", dest="umap_npy", help="Unmappable array numpy file") + parser.add_option( + "--umap_clip", + dest="umap_clip", + default=1, + type="float", + help="Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]", + ) + parser.add_option( + "--umap_tfr", + dest="umap_tfr", + default=False, + action="store_true", + help="Save umap array into TFRecords [Default: %default]", + ) + parser.add_option( + "-x", + dest="extend_bp", + default=0, + type="int", + help="Extend sequences on each side [Default: %default]", + ) + (options, args) = parser.parse_args() + + if len(args) != 4: + parser.error("Must provide input arguments.") + else: + fasta_file = args[0] + seqs_bed_file = args[1] + seqs_cov_dir = args[2] + tfr_file = args[3] + + ################################################################ + # read model sequences + + model_seqs = [] + for line in open(seqs_bed_file): + a = line.split() + model_seqs.append(data.ModelSeq(0, a[0], int(a[1]), int(a[2]), None)) + + if options.end_i is None: + options.end_i = len(model_seqs) + + num_seqs = options.end_i - options.start_i + + ################################################################ + # determine sequence coverage files + + seqs_cov_files = [] + ti = 0 + seqs_cov_file = "%s/%d.h5" % (seqs_cov_dir, ti) + while os.path.isfile(seqs_cov_file): + seqs_cov_files.append(seqs_cov_file) + ti += 1 + seqs_cov_file = "%s/%d.h5" % (seqs_cov_dir, ti) + + if len(seqs_cov_files) == 0: + print( + "Sequence coverage files not found, e.g. 
%s" % seqs_cov_file, + file=sys.stderr, + ) + exit(1) + + seq_pool_len = h5py.File(seqs_cov_files[0], "r")["targets"].shape[1] + num_targets = len(seqs_cov_files) + + ################################################################ + # read targets + + # initialize targets + targets = np.zeros((num_seqs, seq_pool_len, num_targets), dtype="float16") + + # read each target + for ti in range(num_targets): + seqs_cov_open = h5py.File(seqs_cov_files[ti], "r") + targets[:, :, ti] = seqs_cov_open["targets"][options.start_i : options.end_i, :] + seqs_cov_open.close() + + ################################################################ + # modify unmappable + + if options.umap_npy is not None and options.umap_clip < 1: + unmap_mask = np.load(options.umap_npy) + + for si in range(num_seqs): + msi = options.start_i + si + + # determine unmappable null value + seq_target_null = np.percentile( + targets[si], q=[100 * options.umap_clip], axis=0 + )[0] + + # set unmappable positions to null + targets[si, unmap_mask[msi, :], :] = np.minimum( + targets[si, unmap_mask[msi, :], :], seq_target_null + ) + + elif options.umap_npy is not None and options.umap_tfr: + unmap_mask = np.load(options.umap_npy) + + ################################################################ + # write TFRecords + + # open FASTA + fasta_open = pysam.Fastafile(fasta_file) + + # define options + tf_opts = tf.io.TFRecordOptions(compression_type="ZLIB") + + with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer: + for si in range(num_seqs): + msi = options.start_i + si + mseq = model_seqs[msi] + mseq_start = mseq.start - options.extend_bp + mseq_end = mseq.end + options.extend_bp + + # read FASTA + seq_dna = fetch_dna(fasta_open, mseq.chr, mseq_start, mseq_end) + + # one hot code (N's as zero) + # seq_1hot = dna.dna_1hot(seq_dna, n_uniform=False, n_sample=False) + seq_1hot = dna.dna_1hot_index(seq_dna) # more efficient + + # truncate decimals (which aids compression) + if options.decimals is not None: + targets_si = targets[si].astype("float32") + targets_si = np.around(targets_si, decimals=options.decimals) + targets_si = targets_si.astype("float16") + # targets_si = rround(targets[si], decimals=options.decimals) + else: + targets_si = targets[si] + + assert np.isinf(targets_si).sum() == 0 + + # hash to bytes + features_dict = { + "sequence": feature_bytes(seq_1hot), + "target": feature_bytes(targets_si), + } + + # add unmappability + if options.umap_tfr: + features_dict["umap"] = feature_bytes(unmap_mask[msi, :]) + + # write example + example = tf.train.Example( + features=tf.train.Features(feature=features_dict) + ) + writer.write(example.SerializeToString()) + + fasta_open.close() + + +def tround(a, decimals): + """Truncate to the specified number of decimals.""" + return np.true_divide(np.floor(a * 10**decimals), 10**decimals) + + +def rround(a, decimals): + """Round to the specified number of decimals, randomly sampling + the last digit according to a bernoulli RV.""" + a_dtype = a.dtype + a = a.astype("float32") + dec_probs = (a - tround(a, decimals)) * 10**decimals + dec_bin = np.random.binomial(n=1, p=dec_probs) + a_dec = tround(a, decimals) + dec_bin / 10**decimals + return np.around(a_dec.astype(a_dtype), decimals) + + +def fetch_dna(fasta_open, chrm, start, end): + """Fetch DNA when start/end may reach beyond chromosomes.""" + + # initialize sequence + seq_len = end - start + seq_dna = "" + + # add N's for left over reach + if start < 0: + seq_dna = "N" * (-start) + start = 0 + + # get dna + seq_dna += fasta_open.fetch(chrm, 
start, end)
+
+    # add N's for right over reach
+    if len(seq_dna) < seq_len:
+        seq_dna += "N" * (seq_len - len(seq_dna))
+
+    return seq_dna
+
+
+def feature_bytes(values):
+    """Convert numpy arrays to bytes features."""
+    values = values.flatten().tobytes()
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+
+
+def feature_floats(values):
+    """Convert numpy arrays to float features.
+    Requires more space than bytes for float16."""
+    values = values.flatten().tolist()
+    return tf.train.Feature(float_list=tf.train.FloatList(value=values))
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == "__main__":
+    main()
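
For reference, a minimal numpy-only sketch of the gap-proportional sampling that divide_components_folds() in data.py uses to assign connected components to cross-validation folds; the component lengths and fold count below are made up for illustration:

import numpy as np

# toy connected-component lengths (nt), already sorted descending
# as in divide_components_folds()
comp_lens = [9000, 5000, 3000, 2000, 1000]
folds = 3

total_nt = sum(comp_lens)
fold_nt_aim = int(np.ceil(total_nt / folds))
fold_nt = np.zeros(folds)
fold_assign = []

for comp_len in comp_lens:
    # remaining room in each fold, floored at zero
    fold_nt_gap = np.clip(fold_nt_aim - fold_nt, 0, np.inf)
    # sample a fold in proportion to how far it sits below its nucleotide target
    fold_prob = fold_nt_gap / fold_nt_gap.sum()
    fi = np.random.choice(folds, p=fold_prob)
    fold_nt[fi] += comp_len
    fold_assign.append(fi)

print(fold_assign, fold_nt)

Because each component is drawn toward whichever folds are furthest below fold_nt_aim, the fold sizes stay close to total_nt / folds without a deterministic greedy split.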
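
Similarly, a small standalone sketch of the per-window transforms in hound_data_read.py: the sum_sqrt summary statistic and the --clip_soft option, which passes values at or below the threshold through unchanged and compresses the excess above it with a square root (toy values only):

import numpy as np

# toy pooled coverage for one sequence (illustrative values only)
seq_cov = np.array([0.0, 2.0, 10.0, 50.0, 400.0], dtype="float32")

# sum_sqrt summary statistic: compress large counts while mapping zero to zero
sqrt_cov = -1 + np.sqrt(1 + seq_cov)

# --clip_soft: keep values <= clip_soft, square-root the excess above it
clip_soft = 32.0
soft_cov = seq_cov.copy()
clip_mask = seq_cov > clip_soft
soft_cov[clip_mask] = clip_soft - 1 + np.sqrt(seq_cov[clip_mask] - clip_soft + 1)

print(sqrt_cov)  # [0.0, ~0.73, ~2.32, ~6.14, ~19.02]
print(soft_cov)  # values <= 32 unchanged; 50 -> ~35.4, 400 -> ~50.2

The soft clip is continuous at the threshold (a value exactly at clip_soft maps to itself), so moderate counts pass through untouched while extreme values are bounded before the float16 cast and the --clip_pct percentile clip.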