# Assign updated taxonomy information to all the sequences in the HMP dataset
# that we are using.
# Also output the tree for use by tax2tree.

library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
library(magrittr)
library(purrr)
library(ape)

# Load the saved workspace; it is expected to define the object `HMP`, whose
# taxa names are the OTU ids we want to keep.
load("../HMP.RData")

otus <- taxa_names(HMP)

refs <- readDNAStringSet(
  "~/Research/data/_data_derived/HMP16s/rep_set_v35.fna.gz"
)

# Keep only the text before the first space of each FASTA header so the
# sequence names can be matched against the OTU ids.
names(refs) <- vapply(
  strsplit(names(refs), " "),
  function(parts) parts[1],
  character(1)
)

# Fail with an explicit list of ids, rather than an opaque subscript error,
# when some OTUs have no sequence in the reference set.
missing_otus <- setdiff(otus, names(refs))
if (length(missing_otus) > 0) {
  stop(
    length(missing_otus), " OTU(s) not found in the reference set, e.g. ",
    paste(head(missing_otus), collapse = ", "),
    call. = FALSE
  )
}

# Subset (and reorder) the reference sequences to the OTUs present in HMP.
refs <- refs[otus]

# Write this much smaller fasta file.
# `filepath` is spelled out in full instead of relying on partial matching
# of `file`.
writeXStringSet(
  refs,
  filepath = "rep_set_v35_phiselected.fna.gz",
  compress = "gzip"
)

# Follow with assign_taxonomy.sh


# Export the tree stored in HMP for tax2tree.
# Internal node labels are removed before the tree is written to disk.
tr <- HMP %>% phy_tree()
tr[["node.label"]] <- NULL
ape::write.tree(tr, file = "hmp.selected.tree")
