require(phyloseq)
require(magrittr)
require(ape)
require(tidyr)
require(dplyr)

# Given a tree and a tax table (corresponding to that tree) runs 
# tax2tree (the python scripts) and outputs a tree decorated with the 
# taxonomy - will strip existing node names on the tree. 
# tax should be a filename - should not have headers
# Cleanup removes temporary tax2tree output files.
# Note path to nlevel is hardcoded currently TODO:  
tax2tree <- function(tr, tax, cleanup=TRUE){
  df <- read.delim(tax, header=F)[,c(1,2)]
  df <- clean.tax(df)
  
  ## Subset to only contain the tips in tr
  df %<>% filter(V1 %in% tr$tip.label)
  
  # Write cleaned to file
  write.table(df, file='tmp.tax.cleaned.txt', sep='\t',
              quote=F, col.names=F, row.names=F)
  
  # Write tree with labels removed
  tr$node.label <- NULL
  write.tree(tr, file='tmp.tree')
  
  # Run tax2tree
  system("/Users/Justin/Library/Enthought/Canopy_64bit/User/bin/nlevel -t tmp.tree -m tmp.tax.cleaned.txt -o tax2tree_output", wait = T)
  while(!file.exists('tax2tree_output')){Sys.sleep(1)}
  
  # Note Documentation of problem 
  # https://groups.google.com/forum/#!msg/qiime-forum/v-TfjP20uws/IGkeWpUO6rwJ
  # This is the reason I am using phyloseq's tree reader for greengenes style trees
  # That apparently tax2tree also outputs in. 
  tr <- phyloseq::read_tree_greengenes('tax2tree_output')
  
  # remove weirdness
  tr$node.label <- gsub('><-><','.',tr$node.label)
  
  # Remove extra quotes around names:
  # http://joey711.github.io/phyloseq-demo/HMP_import_example.html
  tr$node.label <- gsub("'","", tr$node.label)
  tr$tip.label <- gsub("'","", tr$tip.label)
  
  # Remove tax2tree output, and tmp.tax, tmp.tree
  system('rm tmp.tax.cleaned.txt')
  system('rm tmp.tree')
  if (cleanup)system('rm tax2tree_output*')
  
  # return annotated tree
  tr
}

# called from tax2tree - input is data.frame to be 'cleaned' for 
# tax2table - e.g., make sure everything has the same number of ranks 
# (even if they are empty) and ensure that stuff uses the "Unclassified" 
# keyword. 
clean.tax <- function(df) {
  df %<>% separate(V2, paste('c', 1:7,sep=''), sep=';')
  
  # Fix the NAs
  df[['c7']][is.na(df[['c7']])] <- ' s__'
  df[['c6']][is.na(df[['c6']])] <- ' g__'
  df[['c5']][is.na(df[['c5']])] <- ' f__'
  df[['c4']][is.na(df[['c4']])] <- ' o__'
  df[['c3']][is.na(df[['c3']])] <- ' c__'
  df[['c2']][is.na(df[['c2']])] <- ' p__'
  df[['c1']][is.na(df[['c1']])] <- ' k__'
  
  # Fix "Unassigned"
  if (any(df$c1=='Unassigned')){
    df[df$c1=='Unassigned',c(3,8)] <- rep(NA, 6)
    df[df$c1=='Unassigned','c1'] <- 'Unclassified'
  }
  
  # Strip Whitespace
  # returns string w/o leading or trailing whitespace
  trim <- function (x) gsub("^\\s+|\\s+$", "", x)
  df %<>% sapply(trim) %>% as.data.frame(stringsAsFactors=F)
  
  # Collapse taxonomy to a single column 
  df %<>% unite(c, c1, c2, c3, c4, c5, c6, c7, sep='; ')
  df
}


extract.rank.data <- function(tr) {
  # calculate mean.dist.to.tips for each node
  blw <- mean_dist_to_tips(tr)
  
  # If a given node has multiple ranks assigned take the lowest. 
  blw.lowest.names <- strsplit(names(blw), '.', fixed=TRUE) %>% 
    map(~(ifelse(length(.x)>0, .x[length(.x)], ""))) %>% 
    as_vector()
  names(blw) <- blw.lowest.names
  
  # Depth of species
  blw.byrank <- list()
  blw.byrank[['species']] <- blw[grep('s__[A-Za-z]+', names(blw))]
  blw.byrank[['genus']] <- blw[grep('g__[A-Za-z]+', names(blw))]
  blw.byrank[['family']] <- blw[grep('f__[A-Za-z]+', names(blw))]
  blw.byrank[['order']] <- blw[grep('o__[A-Za-z]+', names(blw))]
  blw.byrank[['phylum']]<- blw[grep('p__[A-Za-z]+', names(blw))]
  blw.byrank[['kingdom']]<- blw[grep('k__[A-Za-z]+', names(blw))]
  
  blw.byrank %<>% 
    map(~data.frame(mean.dist.to.tips=.x)) %>% 
    bind_rows(.id='rank')
  
  # Reorder factors 
  blw.byrank$rank <- factor(blw.byrank$rank, 
                            levels=c('species', 'genus','family', 'order', 
                                     'phylum', 'kingdom'))
  blw.byrank
}
