extract_regions

#!/usr/bin/env python

from __future__ import division
import os
import errno
import sys
import argparse
import shlex
import shutil
import time
import subprocess
import glob
import multiprocessing
import tempfile


# position of the script -------------------------------------------------------
# Resolve the real location of this script (symlinks followed). Everything up
# to the last "/" is the installation directory; it is stored with a trailing
# slash so that resource paths ("bin/...", "DATA/...") can be appended as-is.
path_DB = os.path.realpath(__file__)
path_array = path_DB.split("/")
relative_path = "/".join(path_array[:-1]) + "/"

# ------------------------------------------------------------------------------
#       print the help information
# ------------------------------------------------------------------------------
class CapitalisedHelpFormatter(argparse.HelpFormatter):
    """Help formatter that prints the usage text without a default prefix."""

    def add_usage(self, usage, actions, groups, prefix=None):
        """Register the usage section, using an empty prefix when none is given.

        An explicitly supplied ``prefix`` is forwarded unchanged to the base
        class; only the ``None`` default is replaced by the empty string.
        """
        effective_prefix = '' if prefix is None else prefix
        base = super(CapitalisedHelpFormatter, self)
        return base.add_usage(usage, actions, groups, effective_prefix)


def msg(name=None):
    """Return the full help/usage text of the program.

    ``name`` is accepted but not used. The text is returned verbatim; it is
    used both as the argparse usage string and as the message printed when no
    input is given.
    """
    return '''
\00
Program: extract_regions - a tool to extract variable regions from 16S gene sequences

Usage: extract_regions [-i/-a] <file> [options]

Input option:
  -i  FILE    fasta file containing the 16S sequences
  -a  FILE    provide the alignment generated by -A [None]
  -f  STR     instead of extracting a region, specify forward primer [None]
  -r  STR     instead of extracting a region, specify reverse primer [None]
  -c  STR     which clade to use [Bacteria]

Output option:
  -o  FILE    output file name [stdout]
  -A  FILE    save the intermediate alignment [None]
  -s          print output as MSA

Algorithm options:
  -t  INT     number of threads [1]
  -v  INT     verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3]

Cmalign options:
  --mxsize  INT  maximum allowable DP matrix size [4000]

Possible forward primers: 8F, 27F, 68F, 341F, 515F, 967F, 1237F
Possible reverse primers: 338R, 519R, 785R, 806R, 907R, 926R, 1100R, 1391R, 1492R
        '''

# ------------------------------------------------------------------------------
# function to check if a specific tool exists
def is_tool(name):
    """Return True if the executable ``name`` can be launched, False otherwise.

    The program is started without arguments and its output is discarded.
    Only a "file not found" failure (``ENOENT``) is reported as a missing
    tool; any other ``OSError`` (for example a permission problem) still
    counts as "present", which matches the previous behaviour.

    All three standard streams are connected to the null device, so no file
    handle is left open in this process and a tool that reads from stdin
    sees end-of-file immediately instead of waiting on the terminal.
    """
    try:
        subprocess.Popen(
            [name],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        ).communicate()
    except OSError as e:
        if e.errno == errno.ENOENT:
            return False
    return True

# ------------------------------------------------------------------------------
# function to align fasta sequences with cmalign and return the stream of data
def cm_align_fun(relative_path,fasta_input,mxsize,threads,selected_cmfile,output_alignment,save_alignment):
    """Align the sequences of a fasta file with cmalign and stream the result.

    Three processes are chained through pipes:

    1. ``bin/print_converted.py`` receives the reference sequence file and
       the fasta input,
    2. ``cmalign`` reads the converted sequences from stdin and writes the
       alignment (Pfam format) to stdout,
    3. ``bin/parse_alignment.py`` reads that alignment and receives
       ``output_alignment`` and ``save_alignment`` as arguments.

    This is a generator: every line printed by the last process is yielded
    as soon as it is available.

    Parameters
    ----------
    relative_path : str
        Installation directory, ending with "/".
    fasta_input : str
        Path of the fasta file to align.
    mxsize, threads
        Values passed to cmalign as ``--mxsize`` and ``--cpu``.
    selected_cmfile : str
        File name of the covariance model inside
        ``DATA/RDPinfernal1.1Traindata/``.
    output_alignment : str
        Passed unchanged as first argument of ``parse_alignment.py``.
    save_alignment : str
        Passed unchanged as second argument of ``parse_alignment.py``.

    The process exits with status 1 (after a message on stderr) if any of
    the three commands returns a non-zero exit code.
    """
    # NOTE1: the alignment uses U instead of T, so the input has to be
    # converted before calling the alignment
    # NOTE2: a reference sequence is added at the beginning. The variable
    # regions and primers are defined on this sequence
    # both are taken care of in bin/print_converted.py

    # The commands are built as argument lists (not as strings that are split
    # afterwards), so paths containing spaces or quotes are passed intact.
    # The helper scripts run with the interpreter executing this script.
    cmd_convert = [
        sys.executable,
        relative_path+"bin/print_converted.py",
        relative_path+"DATA/reference_sequence.fasta",
        fasta_input,
    ]
    # cmalign reads the sequences from stdin ("-") and writes to stdout
    cmd_cmalign = [
        "cmalign",
        "--mxsize", str(mxsize),
        "--outformat", "Pfam",
        "--cpu", str(threads),
        relative_path+"DATA/RDPinfernal1.1Traindata/"+selected_cmfile,
        "-",
    ]
    cmd_parse_al = [
        sys.executable,
        relative_path+"bin/parse_alignment.py",
        output_alignment,
        save_alignment,
    ]

    # start the pipeline; once a pipe end has been handed to the next process
    # our own copy is closed, so that a producer is notified (SIGPIPE) if its
    # consumer terminates early
    convert_ex = subprocess.Popen(cmd_convert,stdout=subprocess.PIPE,encoding='utf8')
    cmalign_ex = subprocess.Popen(cmd_cmalign,stdin=convert_ex.stdout,stdout=subprocess.PIPE,encoding='utf8')
    convert_ex.stdout.close()
    parse_al_ex = subprocess.Popen(cmd_parse_al,stdin=cmalign_ex.stdout,stdout=subprocess.PIPE,encoding='utf8')
    cmalign_ex.stdout.close()

    # return the lines
    for line in parse_al_ex.stdout:
        yield line
    parse_al_ex.stdout.close()

    # check, in pipeline order, that the commands finished correctly
    steps = (
        (convert_ex, "[E::cmalign] Error. parse the fasta file failed\n"),
        (cmalign_ex, "[E::cmalign] Error. cmalign failed\n"),
        (parse_al_ex, "[E::cmalign] Error. parsing the alignment file failed\n"),
    )
    for process, error_message in steps:
        if process.wait():
            sys.stderr.write(error_message)
            sys.exit(1)


# ------------------------------------------------------------------------------
# function to stream the rows of a previously saved alignment file
def create_stream_al(alignment_file):
    """Yield the lines of ``alignment_file`` one at a time.

    Lines are returned unchanged (including the trailing newline). The file
    is opened lazily, when the first line is requested, and is closed when
    the generator is exhausted, closed early, or an error occurs.
    """
    with open(alignment_file,"r") as o:
        for line in o:
            yield line

# ------------------------------------------------------------------------------
# function to find the positions to extract
def find_al_position(forPrimer, reversePrimer, aligned_ref):
    """Translate region boundaries on the reference into MSA slice bounds.

    Parameters
    ----------
    forPrimer, reversePrimer : str
        Both empty: all regions listed in ``DATA/variable_regions.tsv`` are
        used (the first two lines of that file are skipped).
        Otherwise each side is resolved on its own: a forward value ending
        with "F" / a reverse value ending with "R" is looked up by name in
        ``DATA/primers_info.tsv`` (second column for the forward primer,
        third column for the reverse primer, first line skipped); any other
        value is taken as a nucleotide position on the reference.
    aligned_ref : str
        The aligned reference sequence; "." and "-" are gap columns.

    Returns
    -------
    dict
        ``{region_name: [left, right]}`` such that ``row[left:right]`` of an
        aligned row starts right after the column of reference nucleotide
        ``left_position - 1`` and ends with the column of reference
        nucleotide ``right_position``. A boundary that cannot be located on
        the reference (e.g. a position beyond its length, or a left
        position of 1) is reported as 0.

    Exits with status 1 and a message on stderr when a primer name is not
    found in the primers file or a position is not an integer.
    """
    # region name -> (first position, last position), still as text
    pos_to_check = dict()

    if forPrimer == "" and reversePrimer == "":
        # FIND VARIABLE REGIONS --------------------------
        with open(relative_path+"DATA/variable_regions.tsv","r") as o:
            o.readline()
            o.readline()
            for line in o:
                if not line.strip():
                    continue # tolerate blank lines
                vals = line.rstrip().split("\t")
                pos_to_check[vals[0]] = (vals[1], vals[2])
    else:
        # FIND REGION BETWEEN PRIMERS AND/OR POSITIONS ---
        left_by_name = forPrimer.endswith("F")
        right_by_name = reversePrimer.endswith("R")
        left_margin = None if left_by_name else forPrimer
        right_margin = None if right_by_name else reversePrimer
        if left_by_name or right_by_name:
            with open(relative_path+"DATA/primers_info.tsv","r") as o:
                o.readline()
                for line in o:
                    vals = line.rstrip("\r\n").split("\t")
                    if left_by_name and vals[0] == forPrimer:
                        left_margin = vals[1]
                    if right_by_name and vals[0] == reversePrimer:
                        right_margin = vals[2]
        if left_margin is None:
            sys.stderr.write("[E::find_al_position] Error: unknown forward primer: "+forPrimer+"\n")
            sys.exit(1)
        if right_margin is None:
            sys.stderr.write("[E::find_al_position] Error: unknown reverse primer: "+reversePrimer+"\n")
            sys.exit(1)
        pos_to_check[forPrimer+"-"+reversePrimer] = (left_margin, right_margin)

    # 1-based MSA column of every nucleotide of the reference, computed once:
    # entry k-1 is the column of the k-th nucleotide
    nucleotide_columns = [column for column, p in enumerate(aligned_ref, 1) if p != "." and p != "-"]
    n_nucleotides = len(nucleotide_columns)

    pos_in_msa = dict()
    for region, (left_text, right_text) in pos_to_check.items():
        try:
            left_margin = int(left_text)
            right_margin = int(right_text)
        except ValueError:
            sys.stderr.write("[E::find_al_position] Error: invalid positions for region '"+region+"'\n")
            sys.exit(1)
        # left bound: column of the nucleotide just before the region; used
        # as a 0-based slice start it points to the column right after it
        if 2 <= left_margin <= n_nucleotides + 1:
            left_msa = nucleotide_columns[left_margin - 2]
        else:
            left_msa = 0
        # right bound: column of the last nucleotide of the region; used as
        # an exclusive 0-based slice end it includes that column
        if 1 <= right_margin <= n_nucleotides:
            right_msa = nucleotide_columns[right_margin - 1]
        else:
            right_msa = 0
        pos_in_msa[region] = [left_msa,right_msa]

    return pos_in_msa


# ------------------------------------------------------------------------------
# MAIN
# ------------------------------------------------------------------------------
def _read_first_line(path):
    """Return the first line of ``path``; exit with an error if unreadable."""
    try:
        with open(path,"r") as handle:
            return handle.readline()
    except (IOError, OSError, UnicodeError):
        sys.stderr.write("[E::main] Error: Cannot open file: "+path+"\n")
        sys.exit(1)


def main(argv=None):
    """Command-line entry point: extract regions from 16S gene sequences.

    The input is either a fasta file (-i), which is aligned with cmalign,
    or an alignment saved by a previous run (-a). The first row starting
    with "REFERENCE_Escherichia-coli-K-12" is used to locate the requested
    region(s); every following row is written in fasta format, either to
    stdout or to the file given with -o (written to a temporary file first
    and then moved to its destination).

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments without the program name. When None, the
        arguments of the running process (``sys.argv[1:]``) are used.

    Returns
    -------
    int
        0 on success. Errors are reported on stderr and end the program
        through ``sys.exit(1)``; invalid options exit through argparse.
    """
    parser = argparse.ArgumentParser(usage=msg(), formatter_class=CapitalisedHelpFormatter,add_help=False)
    parser.add_argument('-v', action='store', type=int, default=None, dest='verbose', help='Verbose levels')
    parser.add_argument('-t', type=int, action="store", dest='threads', default=None, help='Number of threads to be used.')
    parser.add_argument('-o', action="store", dest='output', default=None, help='name of output file')
    parser.add_argument('-A', action="store", dest='output_alignment', default=None, help='save alignment')
    parser.add_argument('-r', action="store", default=None,dest='reversePrimer', help='which reverse primer to use')
    parser.add_argument('-f', action="store", default=None,dest='forPrimer', help='which forward primer to use')
    parser.add_argument('-c', action="store", default=None,dest='clade', help='which clade',choices=['Bacteria',"Archaea","Fungal"])
    parser.add_argument('-i', action="store", default=None,dest='fasta_input', help='fasta input')
    parser.add_argument('-a', action="store", default=None,dest='al_input', help='alignment input')
    parser.add_argument('-s', action='store_true', default=None, dest='out_as_alignment', help='print output as an alignment')
    parser.add_argument('--mxsize', action="store", default=None, dest='mxsize', help='to input to cmalign')
    # honour an explicitly passed argument list (None -> sys.argv[1:])
    args = parser.parse_args(argv)

    # without any input there is nothing to do: show the help text
    if (args.fasta_input is None) and (args.al_input is None):
        print(msg())
        sys.exit(1)

    # ------------------------------ defaults ----------------------------------
    if (args.verbose is None): args.verbose = 3
    output = "" if args.output is None else args.output
    if (args.threads is None): args.threads = 1
    if (args.mxsize is None): args.mxsize = 4000
    # both values are passed as plain strings to the alignment parsing script
    if (args.output_alignment is None):
        args.output_alignment = "NA"
        save_alignment = "False"
    else:
        save_alignment = "True"
    if (args.reversePrimer is None): args.reversePrimer = ""
    if (args.forPrimer is None): args.forPrimer = ""
    if (args.fasta_input is None): args.fasta_input = ""
    if (args.al_input is None): args.al_input = ""
    if (args.out_as_alignment is None): args.out_as_alignment = False
    if (args.clade is None): args.clade = "Bacteria"

    # ---------------------- check general parameters --------------------------
    if args.verbose < 1:
        sys.stderr.write("[E::main] Error: verbose level (-v) is less than 1\n")
        sys.exit(1)
    if args.threads < 1:
        sys.stderr.write("[E::main] Error: number of threads (-t) is less than 1\n")
        sys.exit(1)
    # there cannot be -a and -i
    if args.fasta_input != "" and args.al_input != "":
        sys.stderr.write("[E::main] Error: Provide -i OR -a (not both)\n")
        sys.exit(1)
    # there should be either -a or -i
    if args.fasta_input == "" and args.al_input == "":
        sys.stderr.write("[E::main] Error: Missing input file (provide either -i or -a)\n")
        sys.exit(1)
    # check that if one primer is present, then they both are
    if (args.forPrimer != "" and args.reversePrimer == ""):
        sys.stderr.write("[E::main] Error: -r missing\n")
        sys.exit(1)
    if (args.forPrimer == "" and args.reversePrimer != ""):
        sys.stderr.write("[E::main] Error: -f missing\n")
        sys.exit(1)
    # check that the primers are correct (regarding the selection of the region)
    if args.forPrimer != "" and args.reversePrimer != "":
        # a primer name carries its position before the final F/R letter
        number_F = args.forPrimer[:-1] if args.forPrimer[-1] == "F" else args.forPrimer
        number_R = args.reversePrimer[:-1] if args.reversePrimer[-1] == "R" else args.reversePrimer
        try:
            pos_F = int(number_F)
            pos_R = int(number_R)
        except ValueError:
            sys.stderr.write("[E::main] Error: -f and -r must be primer names (e.g. 341F, 806R) or integer positions\n")
            sys.exit(1)

        if pos_F > pos_R:
            sys.stderr.write("[E::main] Error: primers don't represent any region\n")
            sys.exit(1)


    # --------------------------- set the clade --------------------------------
    selected_cmfile = "bacteria_model.cm"
    if(args.clade != "Bacteria"):
        sys.stderr.write("[W::main] Warning: only Bacteria is implemented. Running with -c Bacteria\n")


    # --------------------------------------------------------------------------
    #   0. CHECK DEPENDENCIES
    # --------------------------------------------------------------------------
    # Infernal is needed (and therefore probed) only if the input is a fasta file
    if args.fasta_input != "" and not is_tool("cmalign"):
        sys.stderr.write("[E::main] Error: cmalign is not in the path. Please install Infernal.\n")
        sys.exit(1)

    # --------------------------------------------------------------------------
    #   0. CHECK FILE
    # --------------------------------------------------------------------------
    # The format checks are done outside of any try block, so that a format
    # error is reported once and is not mistaken for an unreadable file.
    # check that -i is a fasta file
    if args.fasta_input != "":
        if not _read_first_line(args.fasta_input).startswith(">"):
            sys.stderr.write("[E::main] Error. Not a fasta file: "+args.fasta_input+"\n")
            sys.stderr.write("          Fasta file is expected to start with '>'\n")
            sys.exit(1)
    # check that -a is alignment file
    if args.al_input != "":
        if not _read_first_line(args.al_input).startswith("REFERENCE"):
            sys.stderr.write("[E::main] Error. Not an alignment file: "+args.al_input+"\n")
            sys.exit(1)

    # --------------------------------------------------------------------------
    #   1. ALIGN FASTA INPUTS
    # --------------------------------------------------------------------------
    if args.fasta_input != "":
        # we use cmalign to align the 16S sequences given as input;
        # al_lines is a stream of rows that represent aligned 16S sequences
        al_lines = cm_align_fun(relative_path,args.fasta_input,args.mxsize,args.threads,selected_cmfile,args.output_alignment,save_alignment)
    else:
        # otherwise we load the alignment saved before
        al_lines = create_stream_al(args.al_input)


    # --------------------------------------------------------------------------
    #   2. EXTRACT COLUMNS FROM THE MSA
    # --------------------------------------------------------------------------
    # general print
    if output != "":
        outfile = tempfile.NamedTemporaryFile(delete=False, mode="w")
        os.chmod(outfile.name, 0o644)
    else:
        outfile = sys.stdout

    # region name -> [left, right] slice bounds in the alignment; stays None
    # until the reference row has been seen (rows before it are skipped)
    dict_positions = None
    single_region = False
    try:
        for row in al_lines:
            if dict_positions is None:
                # the reference sequence is used to find the correct columns
                if row.startswith("REFERENCE_Escherichia-coli-K-12"):
                    al_ref_seq = row.rstrip().split("\t")[1]
                    dict_positions = find_al_position(args.forPrimer, args.reversePrimer,al_ref_seq)
                    if not dict_positions:
                        sys.stderr.write("[E::main] Error: no region to extract for the given -f/-r\n")
                        sys.exit(1)
                    # with only one region the headers are not changed
                    single_region = len(dict_positions) == 1
                continue

            vals = row.rstrip().split("\t")
            for region, (left, right) in dict_positions.items():
                # with several regions, the region name is added to the header
                header = vals[0] if single_region else vals[0]+"__"+region
                sequence = vals[1][left:right]
                if not args.out_as_alignment:
                    # plain sequence: drop the gap characters
                    sequence = sequence.replace(".","").replace("-","").upper()
                outfile.write(">"+header+"\n")
                outfile.write(sequence+"\n")
    except BaseException:
        # do not leave a partial temporary file behind when the run fails
        if output != "":
            outfile.close()
            try:
                os.remove(outfile.name)
            except OSError:
                pass
        raise

    if dict_positions is None and args.verbose >= 2:
        sys.stderr.write("[W::main] Warning: reference sequence not found in the alignment, nothing was extracted\n")

    # move the temp file to the final destination ---------
    if output != "":
        try:
            outfile.flush()
            os.fsync(outfile.fileno())
            outfile.close()
        except (IOError, OSError, ValueError):
            sys.stderr.write("[E::main] Error: failed to save the output file\n")
            sys.exit(1)
        try:
            shutil.move(outfile.name,output) #It is not atomic if the files are on different filesystems.
        except (IOError, OSError, shutil.Error):
            sys.stderr.write("[E::main] Error: failed to save the output file\n")
            sys.stderr.write("[E::main] you can find the file here:\n"+outfile.name+"\n")
            sys.exit(1)


    return 0        # success


#-------------------------------- run main -------------------------------------
if __name__ == '__main__':
    # use the value returned by main() as the exit status of the process
    sys.exit(main())