"""Script for extracting mutational paths from BEAST *.trees file.

Written by Jesse Bloom, 2011."""


import re
import gzip
import tree
import parse_tree


def GetPaths(treefile, burnin, startname, endname, f_out, gzipped, seqtype):
    """Extracts mutational paths from a *.trees file, writes to specified output.

    'treefile' is the name of a *.trees file generated by BEAST.  This file
        can be gzipped if the gzipped switch is set to True
    'burnin' is an integer >= 0 specifying the number of trees in
        'treefile' that are treated as burnin, and not used to generate
        a path.  Note that this is the number of trees, not the number of
        steps, treated as burnin.
    'startname' is the name of the sequence for which we start the path
        tracing.
    'endname' is the name of the sequence for which we end the path tracing.
    'f_out' is a writeable file-like object to which we write the paths.
        One line is written (and flushed) per non-burnin tree, formatted as
        comma-separated "mutation:time" entries.  An empty path produces
        an empty line.
    'gzipped' is a boolean switch, set to true if and only if 'treefile'
        is compressed with gzip.
    'seqtype' is "DNA" or "PROTEIN".  NOTE: this argument is accepted but
        not currently used; the calls into 'parse_tree' below are
        hard-coded to 'PROTEIN'.

    Raises ValueError if more than one line preceding the trees assigns a
    code to 'startname' (or to 'endname'), or if the first tree is reached
    before codes for both names have been found.
    """
    replace_with_null = [  # patterns to remove from node and branch annotations
        re.compile(r'history\_\d+\=\{\}\,'),
        re.compile(r'\,history\_\d+\=\{\}'),
        re.compile(r'history\_\d+\=\{\}'),
    ]
    # A code-assignment line, once stripped, starts with an integer code
    # followed by a space.
    intmatch = re.compile(r'^\d+ ')
    time_label = "time_since_root"
    print("\nExtracting mutational paths from %s." % treefile)
    if gzipped:
        f_in = gzip.open(treefile)
    else:
        f_in = open(treefile)
    startcode = endcode = None
    start_trees = False  # becomes True once the first 'tree' line is seen
    itree = 0
    try:
        for line in f_in:
            line = line.strip()
            if not start_trees:
                # Still before the trees: look for the integer codes that
                # this file assigns to the start and end sequence names.
                codematch = intmatch.search(line)
                if codematch:
                    code = codematch.group(0).strip()
                    if startname in line:
                        if startcode is not None:
                            raise ValueError("Duplicate codes for %s" % startname)
                        startcode = code
                        print("The path will start from %s, which has code %s in this file." % (startname, startcode))
                    if endname in line:
                        if endcode is not None:
                            raise ValueError("Duplicate codes for %s" % endname)
                        endcode = code
                        print("The path will end at %s, which has code %s in this file." % (endname, endcode))
            if line.startswith('tree'):
                # From here on, stop scanning lines for name codes.
                start_trees = True
                # Both codes are required before any tree can be traced.
                if startcode is None or endcode is None:
                    raise ValueError("Starting trees, but have not found start and end code for %s and %s." % (startname, endname))
                itree += 1
                if itree <= burnin:
                    print("Disregarding tree %d as burnin." % itree)
                    continue
                print("Parsing tree %d." % itree)
                newick_tree = parse_tree.GetTreeString(line, replace_with_null)
                t = tree.Tree(newick_tree)
                total_time = tree.AssignNodeTimes(t.GetRoot(), 0.0, time_label)
                parse_tree.AssignMutations(t.GetRoot(), total_time, time_label, 'PROTEIN')
                (mutationpath, cumulative_time, forward_time) = parse_tree.ProteinMutationPath(t, startcode, endcode, 'PROTEIN')
                # Each path entry is a (mutation, time) pair.
                f_out.write(", ".join(["%s:%f" % (mut, mut_time) for (mut, mut_time) in mutationpath]))
                f_out.write('\n')
                f_out.flush()
    finally:
        # Close the input even if parsing fails part-way through.
        f_in.close()


def main():
    """Runs GetPaths with hard-coded input and output settings.

    Reads the gzipped file 'temp.trees.gz', treats the first 100 trees as
    burnin, and writes the paths between the two named sequences to
    'temppath.txt'.
    """
    treefile = 'temp.trees.gz'
    outfile = 'temppath.txt'
    startname = 'A/Aichi/2/1968_1968.00'
    endname = 'A/Brisbane/10/2007_2007.10'
    burnin = 100
    # 'with' ensures the output file is closed even if GetPaths raises.
    with open(outfile, 'w') as f_out:
        GetPaths(treefile, burnin, startname, endname, f_out, gzipped=True, seqtype="PROTEIN")

# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
