#analysis of MJ11 transcriptomic changes in evolved and delta-binK strains
# used to generate Figure 9 and Table 3

#source("http://www.bioconductor.org/biocLite.R")
#biocLite("edgeR")

#DE between species, for each tissue
library(edgeR)
dat1<-read.table("~/Source code 1.txt",header=T, row.names=1)
colnames(dat1)<-substring(colnames(dat1),first=15,last=19)
dat2<-read.table("~/Source code 2.txt",header=T, row.names=1)
colnames(dat2)<-substring(colnames(dat2),first=15,last=19)
data<-rbind(dat1,dat2)
data<-round(data)
colnames(data)
data<-data[,c(1,8,9,10,11,12,2,3,4,5,6,7)]

colSums(data)
#drop<-c(2); data<-data[,-drop]


#specify strain factor 
strains<-factor(c(rep("MJ11",4),rep("p2p4i1",4),rep("delta",4)))
#strains<-factor(c(rep("MJ11",3),rep("p2p4i1",4),rep("delta",4)))

#par(mfrow=c(1,2))

#for exactTest: only one factor is considered
#ie, MJ vs delta comparison
#can calc dispersion using qCML instead of GLM
MJdeltadat<-data[,which(strains == "MJ11" | strains=="delta")]
strains<-factor(c(rep("MJ11",4),rep("delta",4))) #strains<-factor(c(rep("MJ11",3),rep("delta",4)))


b<-DGEList(counts= MJdeltadat,group=strains)
b<-calcNormFactors(b)
b<-estimateCommonDisp(b, verbose=T)
b<-estimateTrendedDisp(b)
b<-estimateTagwiseDisp(b)
plotBCV(b,cex=0.5)

et<-exactTest(b)
toptagstable<-topTags(et,n=nrow(b))$table  #adds a FDR col
toptagstable<-toptagstable[order(toptagstable$FDR),]
head(toptagstable) 
signif<-0.05

toptop<-toptagstable[toptagstable$FDR<signif,]
toptop[order(rownames(toptop)),]
detags<-rownames(topTags(et,n=20))
cpm(b)[detags,]
summary(de<-decideTestsDGE(et, p= signif,adjust="BH"))
detags<-rownames(b)[as.logical(de)] #could set FDR lower above

MJ_deltal_detags<-rownames(b)[as.logical(de)] #could set FDR lower above

#log-fold change/MA plot
plotSmear(et,de.tags=detags)
title("MJ vs delta: DE transcripts")
#abline(h=c(-2,2), col="blue")
legend("topright",legend=c(paste("FDR p <", signif)),text.col="red")
#with(toptagstable, plot(logCPM,logFC)) #same as plotsmear
with(subset(toptagstable, FDR<signif),points(logCPM,logFC, col="red")) 
with(subset(toptagstable, FDR<signif),text(logCPM,logFC,labels=rownames(subset(toptagstable, FDR<signif)), col="red",adj=1.5,cex=0.5))
text(toptagstable$logCPM[rownames(toptagstable)=="VFMJ11_A1146"],toptagstable$logFC[rownames(toptagstable)=="VFMJ11_A1146"],labels="A1146")

#highlight specfic sets of genes
geneset<-c("VFMJ11_A0104","VFMJ11_A0105","VFMJ11_A0106","VFMJ11_A0107") #dmso reductatse
geneset<-c("VFMJ11_A1038","VFMJ11_A1039","VFMJ11_A1040","VFMJ11_A1041") #lum biosynthesis related
geneset<-c("VFMJ11_A1058","VFMJ11_A1059","VFMJ11_A1060","VFMJ11_A1061") #fructose related
geneset<-c("VFMJ11_A0388","VFMJ11_A0389","VFMJ11_A0390","VFMJ11_A0391","VFMJ11_A0392","VFMJ11_A0393","VFMJ11_A0394","VFMJ11_A0395","VFMJ11_A0396","VFMJ11_A0397","VFMJ11_A0398") #hhk region related
geneset<-c("VFMJ11_A0487") #NAG related
geneset<-c("VFMJ11_A1000","VFMJ11_A1001","VFMJ11_A1007") # cellulose synthesis
subdat<-toptagstable[rownames(toptagstable) %in% geneset,]
with(subdat,text(logCPM,logFC,labels=rownames(subdat), col="red",adj=1.5,cex=0.5))


mjdeltatoptags<-toptagstable



#volcano plot  
#plot(et$table$logFC,-log(FDR),pch=20,cex=0.5)
with(toptagstable,plot(logFC,-log2(FDR),main="MJ vs delta log-fold change vs significance", pch=20, cex=0.5, col="grey"))
with(subset(toptagstable, FDR<signif),points(logFC,-log2(FDR), pch=20, cex=0.5,col="red"))
legend("topright",legend=c(paste("FDR p <", signif)),text.col="red")
abline(v=c(-2,2),col="blue")


#for exactTest: only one factor is considered
#ie, MJ vs p2p4 comparison
strains<-factor(c(rep("MJ11",3),rep("p2p4i1",4),rep("delta",4)))
MJp2dat<-data[,which(strains == "MJ11" | strains=="p2p4i1")]
strains<-factor(c(rep("MJ11",3),rep("evolved p2p4i1",4)))
signif<-0.05
b<-DGEList(counts= MJp2dat,group=strains)
b<-calcNormFactors(b)
b<-estimateCommonDisp(b, verbose=T)
b<-estimateTrendedDisp(b)
b<-estimateTagwiseDisp(b)
plotBCV(b,cex=0.5)

et<-exactTest(b)
toptagstable<-topTags(et,n=nrow(b))$table  #adds a FDR col
toptagstable<-toptagstable[order(toptagstable$FDR),]
head(toptagstable) 
toptop<-toptagstable[toptagstable$FDR<signif,]

detags<-rownames(topTags(et,n=20))
cpm(b)[detags,]
summary(de<-decideTestsDGE(et, p=0.05,adjust="BH"))
detags<-rownames(b)[as.logical(de)] #could set FDR lower above

#log-fold change/MA plot
plotSmear(et,de.tags=detags)
title("MJ vs p2p4i1: DE transcripts")
#abline(h=c(-2,2), col="blue")
legend("topright",legend=c(paste("FDR p < 0.05")),text.col="red")
#with(toptagstable, plot(logCPM,logFC)) #same as plotsmear
with(subset(toptagstable, FDR<0.05),points(logCPM,logFC, col="red")) #could use to show gradations in FDR values by changing cutoff and colors sequentially adding to open plot
with(subset(toptagstable, FDR<signif),text(logCPM,logFC,labels=rownames(subset(toptagstable, FDR<signif)), col="red",adj=1.5,cex=0.5))


with(toptagstable,plot(logFC,-log2(FDR),main="MJ vs p2p4 log-fold change vs significance", pch=20, cex=0.5, col="grey"))
with(subset(toptagstable, FDR<0.05),points(logFC,-log2(FDR), pch=20, cex=0.5,col="red"))
legend("topright",legend=c(paste("FDR p < 0.05")),text.col="red")
#abline(v=c(-2,2),col="blue")

#generate heat map of zscore expression for top DE genes (Figure 10)
strains<-factor(c(rep("MJ11",4),rep("p2p4i1",4),rep("delta",4)))
b<-DGEList(counts=data, group=strains)
normcounts<-cpm(b, normalized.lib.sizes=T)
zscored<-matrix(dat=NA,ncol=12,nrow=nrow(normcounts),dimnames=list(rownames(normcounts),colnames(normcounts)));
for (i in 1:nrow(normcounts)){zscored[i,]<-scale(normcounts[i,], center=T,scale=T)}

MJ_deltal_detags
zmygenes<-zscored[MJ_deltal_detags,]

tzscored<-t(zmygenes)
rownames(tzscored)<-strains

#heat plot for expression of top 2000 DE genes
library(lattice)

rgb.palette <- colorRampPalette(c( "red","black","green"), space = "Lab")
library(gplots)
heatmap.2(tzscored,trace="none",dendrogram="none",key=F,col=rgb.palette(120),density.info=NULL)

par(oma=c(1,1,1,1))
heatmap.2(tzscored, trace="none", dendrogram="none",key=F, col=rgb.palette(120), density.info=NULL, Rowv=NULL)


