# TreeMer.py

#!/usr/bin/env python3

"""
Takes a set of nucleotide sequences in FASTA format, and generates kmer count files,
structured as:

    kmer0 count
    kmer1 count
    ...
    kmern count

in tab separated format (denoting the kmer spectrum of the sequence). These kmer spectra
are used to compute pairwise distances, from which a Hierarchical Clustering tree is generated.

Dependencies:
python3
argparse
scipy
numpy
matplotlib
seaborn
"""

## ASCII-art banner printed at start-up and alongside the help text.
## Declared as a raw string: the art is full of backslashes that are not escape
## sequences, and a non-raw literal makes Python emit invalid-escape warnings.
splash_text = r"""
 _____            ___  ___
|_   _|           |  \/  |
  | |_ __ ___  ___| .  . | ___ _ __
  | | '__/ _ \/ _ \ |\/| |/ _ \ '__|
  | | | |  __/  __/ |  | |  __/ |
  \_/_|  \___|\___\_|  |_/\___|_|    v1.0

Arthur V. Morris Feb 2020
Version 1.0
"""

import argparse
import sys
import time
import subprocess
import re
import matplotlib.pyplot as plt
import collections as c
import numpy as np
import seaborn as sns
from os import path, system
from scipy.cluster.hierarchy import dendrogram, linkage, to_tree
from scipy.spatial.distance import pdist, squareform

def main(argv):
    """ Entry point: parse the arguments, load one kmer spectrum per input file, then compute distances and plot. """

    args = parse_args(argv)

    ## Master store of kmer counts, structured as:
    ##
    ##     { seq0 : { kmer0 : count, kmer1 : count, ... , kmern : count},
    ##       seq1 : { kmer0 : count, kmer1 : count, ... , kmern : count}
    ##       ... ,
    ##       seqn : { kmer0 : count, kmer1 : count, ... , kmern : count} }
    kmer_master_dict = c.defaultdict()

    print(splash_text)
    print(f"""
    Args:
    fa files    : {len(args.fa_files)}
    kmer size   : {args.k}
    max count   : {args.m}
    KS interval : {args.i[0]}-{args.i[1]}
    Skip kmerise: {args.s}
    Dist metric : {args.d}
    Clus method : {args.c}

    """)

    if args.s is not False:
        print("Kmer files provided, skipping kmerisation.")
    else:
        print("Kmerising FASTAs...")

    for fa in args.fa_files:
        ## the format check runs in both modes; check_format itself branches on args.s
        check_format(fa, args)

        if args.s is False:
            ## FASTA input: digest it into a kmer-spectrum file first
            spectrum_file = kmerise(fa, args)
        else:
            ## input is assumed to be a pre-digested kmer-spectrum file
            spectrum_file = fa

        ## sequences are keyed by the file name up to its first dot
        fname = path.basename(spectrum_file).split(".")[0]
        kmer_master_dict[fname] = read_kmer_array(spectrum_file, args)

    if args.n:
        print("\nCalculating distance using kmer count vectors rather than frequency.\nI hope you know what you're doing...")

    print(f"\nCalculating {args.k}mer spectrum distance...\n")
    distances = gen_Dn_dict(kmer_master_dict, args)

    print(f"Generating hierarchical clustering dendrogram...\n")
    plot_main(distances, args)

def kmerise(fasta, args):
    """ Takes a sequence file in FASTA format and digests it into a counted kmer spectrum using genKmerCount.
    The standard output of genKmerCount is written to './<fasta basename>.k<k>'.

    The executable is invoked with an argument list (no shell), so paths containing spaces
    or shell metacharacters are passed through verbatim.

    Source arguments used in this function:
    script = the path to this script, used to locate the genKmerCount executable (<script dir>/bin/genKmerCount)
    k = kmer size used to digest the fasta file
    m = maximum kmer count to return kmers
    v = verbose flag; when True the command line is echoed before it is run

    Return:
    the path of the kmer count file that was written.

    A failure to launch genKmerCount, or a non-zero exit status, is reported on stderr
    rather than raised; the output path is returned regardless.
    """

    gKC_exec = path.join(path.dirname(args.script), "bin", "genKmerCount")
    ks_outfile = path.join("./", f"{path.basename(fasta)}.k{args.k}")
    gKC_argv = [gKC_exec, str(fasta), str(args.k), str(args.m)]

    if args.v is True:
        ## verbose mode
        print(f"execline: {' '.join(gKC_argv)} > {ks_outfile}")

    ## Spawn subprocess, capturing its stdout straight into the kmer count file
    with open(ks_outfile, "w") as ks_f:
        try:
            status = subprocess.run(gKC_argv, stdout=ks_f).returncode
        except OSError as err:
            ## e.g. executable missing or not executable
            print(f"Warning: could not run {gKC_exec}: {err}", file=sys.stderr)
            status = None

    if status not in (0, None):
        print(f"Warning: genKmerCount exited with status {status} for {fasta}", file=sys.stderr)

    return ks_outfile

def read_kmer_array(kmer_count_file, args):
    """ Reads the kmer count file generated by genKmerCount, structured as:

        kmer0 count
        kmer1 count
        ...
        kmern count

    in tab separated format. The first two lines of the file are skipped, blank lines are
    ignored, and any kmer containing 'N' is discarded.

    The remaining kmers are ordered by count, highest first, and only the window between
    the two percentile bounds (inclusive) is kept.

    Source arguments used in this function:
    i = [lbc, ubc], where
        lbc = lower bound percentile into the count-ordered kmer list at which the window starts
        ubc = upper bound percentile into the count-ordered kmer list at which the window ends

    Return:
    a defaultdict(int) of kmer counts structured as:
        {kmer0 : count, kmer1 : count, ... , kmern : count}

    Raises:
    ValueError if a data line does not hold exactly two tab separated fields, or if a
    count is not an integer.
    """

    lbc, ubc = args.i[0], args.i[1]

    ksa = []
    with open(kmer_count_file, "r") as f:
        ## skip the two leading non-data lines
        f.readline()
        f.readline()

        for line in f:
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            if len(fields) != 2:
                raise ValueError(f"Malformed kmer count line in {kmer_count_file}: {line!r}")
            kmer, count = fields
            if "N" in kmer:
                continue
            ## counts are converted here so that the sort below is numeric, not lexicographic
            ksa.append((kmer, int(count)))

    ksa_s = sorted(ksa, key=lambda pair: pair[1], reverse=True)

    ## multiply before dividing so the percentile index does not suffer float round-off
    n_kmers = len(ksa_s)
    lb = int(n_kmers * lbc / 100)
    ub = int(n_kmers * ubc / 100)

    ## defaultdict so that kmers absent from this spectrum read back as a count of 0
    return c.defaultdict(int, ksa_s[lb:ub + 1])

def gen_Dn_dict(kmer_master_dict, args):
    """ Generates an n by n dictionary of pairwise distances (D) between the kmer
    count/frequency vectors of every pair of sequences. Structured as:
        { G0 : { G0 : D, G1 : D, ... , Gn : D},
          G1 : { G0 : D, G1 : D, ... , Gn : D},
          ... ,
          Gn : { G0 : D, G1 : D, ... , Gn : D } }

    Every sequence is projected onto the union of all kmers seen in any sequence; a kmer
    missing from a sequence contributes 0. The input dictionaries are not modified.

    Source arguments used in this function:
    n = if False, counts are divided by the sequence's total kmer count (frequency vector);
        otherwise raw counts are used
    d = distance metric, consumed by calc_distance

    Raises:
    ZeroDivisionError in frequency mode when a sequence has a total kmer count of 0.
    """

    ## one fixed kmer ordering shared by every vector; sorted so that the result does not
    ## depend on set iteration order
    kmers = sorted({kmer for ks in kmer_master_dict.values() for kmer in ks})

    ## build each observation vector once, rather than once per pair
    vectors = {}
    for header, ks in kmer_master_dict.items():
        counts = [ks.get(kmer, 0) for kmer in kmers]
        if args.n is False:
            ## kmer frequency observation vector
            total = float(sum(ks.values()))
            vectors[header] = [count / total for count in counts]
        else:
            ## kmer count observation vector
            vectors[header] = counts

    dist_dict = c.defaultdict(dict)
    for header1, vec1 in vectors.items():
        for header2, vec2 in vectors.items():
            dist_dict[header1][header2] = calc_distance([vec1, vec2], args)[0]

    return dist_dict

def calc_distance(obs_vec, args):
    """ Returns the pairwise distances between the observations (rows) of obs_vec, as produced by pdist with the metric named in args.d """

    chosen_metric = args.d
    distances = pdist(obs_vec, metric=chosen_metric)
    return distances

def get_geolocs(args):
    """ Extracts the geographic locations of provided sequences from the file named by args.g.
    Each line is expected in the format:
        'ID  geoloc'
    in tab separated format. Only the first two columns are used; lines without a second
    column (including blank lines) are skipped.

    Return:
    a defaultdict(str) mapping sequence ID to geographic location.
    """

    geolocs = c.defaultdict(str)
    with open(args.g, "r") as tsv:
        for line in tsv:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                ## no location column on this line (e.g. a trailing blank line)
                continue
            geolocs[fields[0]] = fields[1]

    return geolocs

def plot_main(dist_dict, args):
    """ Drives every plot and raw data output produced from the distance dictionary. """

    ## leaf labels: the sequence ids, extended with a geolocation where one is known
    if args.g != False:
        geolocs = get_geolocs(args)
        header_set = []
        for h in dist_dict:
            header_set.append(f"{h} | {geolocs[h]}" if h in geolocs else h)
    else:
        header_set = list(dist_dict)

    ## ---- Pre-process and reformat data ----
    ## 2D distance array from distance dictionary
    D_array = [list(row.values()) for row in dist_dict.values()]
    ## convert square 2D array into a condensed array for HC tree building
    condensed = squareform(D_array, force='tovector')
    Z = linkage(condensed, args.c)

    ## ---- Generate plots and raw data outputs ----
    ## output simple heatmap in tsv format
    heatmap_tsv = f"./heatmap.{args.d}.tsv"
    output_heatmap_tsv(header_set, D_array, heatmap_tsv)

    ## plot heatmap
    plot_headmap(D_array, header_set, args)

    ## generate a scipy tree object and output to file in Newick format
    tree = to_tree(Z, False)
    newick = getNewick(tree, "", tree.dist, header_set)
    with open("./HC_tree.nwk", "w") as nwk_f:
        nwk_f.write(newick)

    ## plot HC tree
    plot_kmer_tree(Z, header_set, args)

def output_heatmap_tsv(header_set, D_array, out_tsv="./heatmap.tsv"):
    """ Writes the square distance matrix D_array to out_tsv in tsv format: a header row of
    sequence labels, then one labelled row of distances per sequence. """

    print(f"Writing heatmap tsv to {out_tsv}...")

    column_titles = "\t" + "\t".join(header_set) + "\n"
    with open(out_tsv, "w") as tsv:
        tsv.write(column_titles)
        for row_index, distances in enumerate(D_array):
            cells = "\t".join(map(str, distances))
            tsv.write(header_set[row_index] + "\t" + cells + "\n")

def plot_headmap(D_array, header_set, args, style="ggplot"):
    """ Draws the n by n matrix of sequence distances as a heatmap, titled with the
    metric in args.d, and saves it to ./heatmap.png """

    heatmap_options = dict(
        xticklabels=header_set,
        yticklabels=header_set,
        linewidth=0.0,
        annot=False,
    )

    plt.style.use(style)
    plt.figure("HM", figsize=[20,12])
    plt.title(f"{args.d} distance")
    sns.heatmap(D_array, **heatmap_options)

    plt.savefig("./heatmap.png")

def plot_kmer_tree(Z, header_set, args, style="ggplot"):
    """ Draws the hierarchical clustering dendrogram for linkage matrix Z, labelled with
    header_set, annotates each merge with its height, and saves it to ./HC_dendro.png """

    plt.style.use(style)

    plt.figure("HCD", figsize=[20,12])
    plt.title(f"Hierarchical Clustering Dendrogram ({args.c})")
    plt.xlabel("Sequence ID")
    plt.ylabel(f"{args.d} distance")

    ddata = dendrogram(Z, labels=header_set, leaf_rotation=90)

    ## Add distances at branch roots
    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in links:
        ## midpoint of the link's horizontal bar, at the bar's height
        mid_x = 0.5 * (xs[1] + xs[2])
        height = ys[1]
        plt.plot(mid_x, height, 'o', c=colour)
        plt.annotate(f"{height:.3g}", (mid_x, height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')

    plt.savefig("./HC_dendro.png")

def getNewick(node, newick, parentdist, leaf_names):
    """ Builds a newick format string from a scipy tree object.

    node        = the (sub)tree to serialise
    newick      = text already built that must follow this subtree ("" at the root)
    parentdist  = dist of the parent node; the branch length is parentdist - node.dist
    leaf_names  = labels, indexed by leaf id
    """

    if node.is_leaf():
        return f"{leaf_names[node.id]}:{parentdist - node.dist:.2f}{newick}"

    ## text that closes this clade: the root ends the tree, any other clade carries a branch length
    if newick:
        closing = f"):{parentdist - node.dist:.2f}{newick}"
    else:
        closing = ");"

    ## the string is grown right to left: the left child is emitted first, then the right child in front of it
    with_left = getNewick(node.get_left(), closing, node.dist, leaf_names)
    with_right = getNewick(node.get_right(), "," + with_left, node.dist, leaf_names)
    return "(" + with_right

def check_format(f, args):
    """ Checks the format of f is as expected given the provided arguments.

    When args.s is False, f must carry a FASTA file extension (.fa, .fna or .fasta,
    compared case-insensitively). When args.s is True, the first line of f must start
    with "fasta".

    Raises:
    AssertionError if the check fails. It is raised explicitly rather than through an
    assert statement, so that the check still runs under 'python -O'.
    """

    ## checks if f is in FASTA format by extension
    if args.s is False:
        if not f.lower().endswith((".fa", ".fna", ".fasta")):
            raise AssertionError("FASTA format not detected. File extensions for sequence files must be one of the following: .fa .fna .fasta")
    ## checks if f is in accepted kmer count format
    elif args.s is True:
        with open(f, "r") as _f:
            first_line = _f.readline()
        if not first_line.startswith("fasta"):
            raise AssertionError("Kmer spectrum format not detected. Please use the genKmerCount tool to generate a kmer count file.")

def is_file(filename):
    """ Checks if a path is a file, for use as an argparse 'type'.

    A leading '~' is expanded before the check, so the path that is validated is the
    same path that is returned.

    Return:
    the absolute, symlink-resolved path of the file.

    Raises:
    argparse.ArgumentTypeError if the path is not an existing file.
    """

    expanded = path.expanduser(filename)
    if not path.isfile(expanded):
        msg = "{0} is not a file".format(filename)
        raise argparse.ArgumentTypeError(msg)

    return path.abspath(path.realpath(expanded))

def is_dir(direname):
    """ Checks if a path is a directory, for use as an argparse 'type'.

    A leading '~' is expanded before the check, so the path that is validated is the
    same path that is returned.

    Return:
    the absolute, symlink-resolved path of the directory.

    Raises:
    argparse.ArgumentTypeError if the path is not an existing directory.
    """

    expanded = path.expanduser(direname)
    if not path.isdir(expanded):
        msg = "{0} is not a directory".format(direname)
        raise argparse.ArgumentTypeError(msg)

    return path.abspath(path.realpath(expanded))

def parse_args(argv):
    """ Builds the command line parser, parses argv, and validates the numeric options.

    argv is parsed in full, so its first element is consumed by the hidden 'script'
    positional (main passes sys.argv through unchanged).

    Return:
    the populated argparse.Namespace.

    Exits through parser.error() when -k is not a positive integer, or when the -i
    bounds do not satisfy 0 <= lower <= upper <= 100.
    """

    usage_line = """TreeMer.py [-h] [-i I I] [-k K] [-m M] [-n] [-s] [-v]
                  [-d {distance metric}]
                  [-c {clustering method}]
                  [-g G]
                  [fa_files [fa_files ...]]"""

    parser = argparse.ArgumentParser(usage=usage_line)

    parser.add_argument('script', type=path.abspath, action='store', help=argparse.SUPPRESS)
    parser.add_argument('fa_files', type=is_file, nargs='*', action='store',
                        help='An arbitrary number of sequence files in FASTA format.')

    parser.add_argument('-k', type=int, action='store', default=7,
                        help='Kmer size to use in constructing genome comparison. Default=7.')
    parser.add_argument('-m', type=int, action='store', default=0,
                        help='The maximum count to return a kmer, e.g. return only kmers with count <=10 if m=10. Default=return ALL.')
    parser.add_argument('-i', type=int, nargs=2, default=[0,100], action='store',
                        help='Lower and upper bound percentiles to construct the tree. \
                        E.g. 25 75 will generate a tree from kmers from the 25th to the 75th percentiles in the total set of kmers ordered by count.')
    parser.add_argument('-n', action='store_true', default=False,
                        help='Calculate distance using kmer count vector rather than frequency vector. Default=False. NOT RECOMMENDED.')
    parser.add_argument('-s', action='store_true', default=False,
                        help='Suppress the generation of kmer-spectra from sequence files. \
                        This assumes that all positional arguments provided to this tool are already kmer-spectra files generated by genKmerCount. Default=False.')
    parser.add_argument('-d', type=str, action='store',
                        choices=[
                        'euclidean',
                        'minkowski',
                        'cityblock',
                        'sqeuclidean',
                        'hamming',
                        'jaccard',
                        'chebyshev',
                        'canberra',
                        'braycurtis',
                        'yule'],
                        default='euclidean', help='Metric used in calculating distance between kmer spectra. Default=euclidean.')
    parser.add_argument('-c', type=str, action='store',
                        choices=[
                        'ward',
                        'single',
                        'complete',
                        'average',
                        'weighted',
                        'centroid',
                        'median'],
                        default='ward', help='Clustering method utilised to build the tree. Default=ward.')
    parser.add_argument('-g', type=is_file, action='store', default=False,
                        help='A tab separated text file containing geographic locations for each sequence, with the sequence ID in col0 and geolocation in col1. Default=False.')
    parser.add_argument('-v', action='store_true', default=False,
                        help='Verbose output mode. Default=False.')

    args = parser.parse_args(argv)

    ## reject values that can only produce an empty or meaningless kmer window further on
    if args.k < 1:
        parser.error("-k must be a positive integer")
    lower, upper = args.i
    if not 0 <= lower <= upper <= 100:
        parser.error("-i bounds must satisfy 0 <= lower <= upper <= 100")

    return args

if __name__ == "__main__":
    ## with no arguments at all, or an explicit -h, show the banner and the module
    ## documentation before handing over to the parser's own help output
    help_requested = "-h" in sys.argv or len(sys.argv) < 2

    if not help_requested:
        main(sys.argv)
    else:
        print(splash_text)
        print(__doc__)
        main([sys.argv[0], "-h"])
        ## NOTE(review): argparse presumably exits while handling -h, which would make this a fallback only - confirm
        sys.exit(1)