#!/usr/bin/Rscript

##### This script was written by Serkan Erdin to perform differential expression analysis between selected groups

#################################################################################
##### SessionInfo		              					#
##### R version 3.2.2 (2015-08-14 )						#
##### Platform: x86_64-pc-linux-gnu (64-bit)					#
##### Running under: Red Hat Enterprise Linux Server release 6.5 (Santiago)	#
#####										#	
##### locale:									#	
##### [1] LC_CTYPE=en_US.UTF-8	LC_NUMERIC=C					#
#####  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8			#	
#####  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8			#
#####  [7] LC_PAPER=en_US.UTF-8	LC_NAME=C					#
#####  [9] LC_ADDRESS=C               LC_TELEPHONE=C				#
##### [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C			#
##### 										#
##### attached base packages:							#
##### [1] methods   stats     graphics  grDevices utils     datasets  base	#	
#####										#
##### other attached packages:							#
##### [1] sva_3.18.0        genefilter_1.52.1 mgcv_1.8-17	  nlme_3.1-131	#
##### [5] gap_1.1-16        edgeR_3.12.0	limma_3.26.3			#	
#####										#
##### loaded via a namespace (and not attached):				#
#####  [1] locfit_1.5-9.1	  IRanges_2.4.8        lattice_0.20-34		#
#####  [4] XML_3.98-1.3         grid_3.2.2           xtable_1.8-0		#	
#####  [7] DBI_0.5-1            stats4_3.2.2         RSQLite_1.0.0       	#
##### [10] annotate_1.48.0	  S4Vectors_0.8.11     Matrix_1.2-8             #
##### [13] splines_3.2.2        statmod_1.4.22       Biobase_2.30.0             #
##### [16] survival_2.39-5	  parallel_3.2.2       BiocGenerics_0.16.1      #
##### [19] AnnotationDbi_1.32.3                                                 #
#################################################################################

##### REQUIRED LIBRARIES #####
library(edgeR)
library(gap)
library(sva)

##### FUNCTION #####

source("cleaningP.R")


##### INPUT FILES AND PARAMETERS #####

countdataFile <- "Counts/countdata.csv"
metadataFile <- "MetaData/metadata.csv"
shortGeneFile <- "Mus_musculus_.GRCm38.75.genes_to_remove.txt"

# Minimum read count used by de_analysis() when filtering genes.
readthreshold <- 6
# Set to "y" to also drop the genes listed in shortGeneFile.
filter_short_genes <- "y"

# Count table: one row per gene, keyed by the "gene" column.
countdata <- read.table(
  file = countdataFile, header = TRUE, sep = ",",
  stringsAsFactors = FALSE, check.names = FALSE
)
rownames(countdata) <- countdata$gene
# Drop the first four columns; presumably gene annotation rather than
# per-sample counts -- TODO confirm against the count file layout.
countdata <- countdata[, -c(1:4)]

print(dim(countdata))

metadata <- read.table(
  file = metadataFile, header = TRUE, sep = ",", stringsAsFactors = FALSE
)

print(dim(metadata))

# Reorder the count columns to follow the metadata row order. Validate the
# lookup first: an unmatched ID would otherwise surface as an opaque
# "undefined columns selected" error from the subsetting below.
sample_columns <- match(metadata$ID, colnames(countdata))
if (anyNA(sample_columns)) {
  stop(
    "Column names of count data did not match with metadata IDs correctly! ",
    "IDs missing from count data: ",
    paste(metadata$ID[is.na(sample_columns)], collapse = ", "),
    call. = FALSE
  )
}
countdata <- countdata[, sample_columns, drop = FALSE]

# identical() yields a single TRUE/FALSE; all.equal() returns a character
# description on mismatch, which cannot be negated with `!`. This also catches
# duplicated metadata IDs, which make the selected column names non-unique.
if (!identical(colnames(countdata), as.character(metadata$ID))) {
  stop(
    "Column names of count data did not match with metadata IDs correctly!",
    call. = FALSE
  )
}

# Genes to exclude, keyed as "<geneSymbol>|<ensemblID>" to be compared with the
# count table row names.
shortGenes <- read.table(shortGeneFile, sep = "\t", stringsAsFactors = FALSE)
colnames(shortGenes) <- c("ensemblID", "geneSymbol", "genetype")
shortGenes$gene <- paste(shortGenes$geneSymbol, shortGenes$ensemblID, sep = "|")

# Remove ERCC rows, then (optionally) the listed short genes.
noERCC <- countdata[!grepl("ERCC-", rownames(countdata)), , drop = FALSE]
noShortGenes <- noERCC[!(rownames(noERCC) %in% shortGenes$gene), , drop = FALSE]
if (filter_short_genes == "y") {
  noERCC <- noShortGenes
}

print(dim(noERCC))

### differential expression analysis function

# Run SVA-adjusted edgeR quasi-likelihood differential expression for one
# comparison between two of the four sample groups.
#
# Arguments:
#   input_count    Count table (genes in rows, samples in columns); columns
#                  must be in the same order as the rows of `metadata`.
#   metadata       Data frame with a `Group` column holding 1, 2, 3 or 4 for
#                  every sample.
#   comparison     One of "1vs2", "3vs1", "4vs2", "3vs4", "3vs2", or "all".
#                  "all" only writes the SVA-corrected counts and skips the
#                  differential expression test.
#   case_group,
#   control_group  Group numbers used for the expression filter of a pairwise
#                  comparison (ignored when comparison is "all").
#   prefix         Prefix for every output file name.
#   min_reads      Minimum count a gene must reach in every sample of at least
#                  one of the filtered groups; defaults to the global
#                  `readthreshold`.
#
# Side effects: writes <prefix>.SVAcorrectedcount.txt and, for pairwise
# comparisons, the BCV / mean-variance / QL dispersion PDFs, the QQ and smear
# PNGs and <prefix>_edgeR_quasilikelihoodFtest.SVA.txt.
#
# Returns NULL invisibly.
de_analysis <- function(input_count, metadata, comparison, case_group,
                        control_group, prefix, min_reads = readthreshold) {

  known_groups <- 1:4

  # Contrast over the four group coefficients (columns groupInfo1..groupInfo4
  # of the design matrix) for each supported pairwise comparison.
  group_contrasts <- list(
    "1vs2" = c(1, -1, 0, 0),
    "3vs1" = c(-1, 0, 1, 0),
    "4vs2" = c(0, -1, 0, 1),
    "3vs4" = c(0, 0, 1, -1),
    "3vs2" = c(0, -1, 1, 0)
  )

  # Validate up front so a bad call fails before the expensive SVA step and
  # before any output file is written.
  if (!is.character(comparison) || length(comparison) != 1L ||
      !(comparison %in% c(names(group_contrasts), "all"))) {
    stop(
      "Unknown comparison; expected one of: ",
      paste(c(names(group_contrasts), "all"), collapse = ", "),
      call. = FALSE
    )
  }
  pairwise <- comparison != "all"

  sample_groups <- metadata$Group
  if (length(sample_groups) != ncol(input_count)) {
    stop("metadata must have one row per column of input_count", call. = FALSE)
  }
  if (anyNA(sample_groups) || !all(sample_groups %in% known_groups)) {
    stop("metadata$Group must only contain the values 1, 2, 3 and 4",
         call. = FALSE)
  }
  if (!all(known_groups %in% sample_groups)) {
    stop("every one of the groups 1-4 needs at least one sample", call. = FALSE)
  }
  if (pairwise &&
      !(all(c(case_group, control_group) %in% known_groups))) {
    stop("case_group and control_group must each be one of 1, 2, 3, 4",
         call. = FALSE)
  }

  # Take the group labels per sample straight from the metadata so they stay
  # aligned with the count columns even when metadata is not sorted by group.
  groupInfo <- factor(sample_groups, levels = known_groups)

  # TRUE for genes with at least `min_reads` reads in every sample of a group.
  expressed_in_group <- function(group_id) {
    cols <- which(sample_groups == group_id)
    # drop = FALSE keeps a single-sample group two-dimensional for rowSums().
    rowSums(input_count[, cols, drop = FALSE] >= min_reads) == length(cols)
  }

  # Keep a gene when it passes in any of the groups relevant to the run.
  filter_groups <- if (pairwise) c(case_group, control_group) else known_groups
  keep <- Reduce(`|`, lapply(filter_groups, expressed_in_group))

  countselected <- input_count[keep, , drop = FALSE]

  y <- DGEList(countselected, group = groupInfo)
  y <- calcNormFactors(y, method = "TMM")

  # Full model: one coefficient per group; null model: intercept only.
  mod <- model.matrix(~0 + groupInfo, data = y$samples)
  mod0 <- model.matrix(~1, data = y$samples)
  svobj <- svaseq(cpm(y), mod, mod0)

  # Append the surrogate variables to the design. When none are found, keep
  # the plain group design rather than binding a placeholder column.
  designSVA <- if (svobj$n.sv > 0) cbind(mod, svobj$sv) else mod

  # NOTE(review): cleaningP() comes from cleaningP.R, which is not visible
  # here; its behaviour when no surrogate variables are found is unverified.
  cleanCount <- cleaningP(cpm(y, normalized.lib.sizes = TRUE, log = FALSE), mod, svobj)
  write.table(cleanCount, file = paste0(prefix, ".SVAcorrectedcount.txt"), sep = "\t", quote = F)

  print(y$samples)

  # The "all" run exists only to produce the SVA-corrected counts.
  if (!pairwise) {
    return(invisible(NULL))
  }

  # Group contrast padded with one zero per surrogate-variable column. The
  # count is derived from the design itself so it is right whether svobj$sv
  # is a matrix or a plain vector.
  DEcontrast <- c(
    group_contrasts[[comparison]],
    rep(0, times = ncol(designSVA) - ncol(mod))
  )

  y <- estimateDisp(y, designSVA, robust = TRUE)
  print(y$common.dispersion)

  pdf(paste0(prefix, "_BCVplot.pdf"), width = 7, height = 7)
  plotBCV(y)
  dev.off()

  pdf(paste0(prefix, "_MeanVariance.pdf"), width = 7, height = 7)
  meanvar <- plotMeanVar(y, show.raw.vars = TRUE, show.tagwise.vars = TRUE,
                         show.ave.raw.vars = FALSE, NBline = TRUE,
                         main = "Mean-Variance Plot")
  plotMeanVar(y, meanvar = meanvar, show.tagwise.vars = TRUE, NBline = TRUE)
  dev.off()

  fit <- glmQLFit(y, designSVA)

  pdf(paste0(prefix, "_QLDisplot.pdf"), width = 7, height = 7)
  plotQLDisp(fit)
  dev.off()

  qlf <- glmQLFTest(fit, contrast = DEcontrast)
  qlf$table$BH <- p.adjust(qlf$table$PValue, "BH")
  qlf$table$bonferroni <- p.adjust(qlf$table$PValue, "bonferroni")

  is.de <- decideTestsDGE(qlf, p.value = 0.05)
  # Explicit print(): a bare expression is not auto-printed inside a function.
  print(summary(is.de))

  png(filename = paste0(prefix, "_edgeR_qlf_qqplot.SVA.png"), width = 800, height = 600)
  qqunif(qlf$table$PValue)
  dev.off()

  png(filename = paste0(prefix, "_edgeR_qlf_plotsmear.SVA.png"), width = 800, height = 600)
  plotSmear(qlf, de.tags = rownames(qlf)[is.de != 0])
  dev.off()

  write.table(qlf$table, file = paste0(prefix, "_edgeR_quasilikelihoodFtest.SVA.txt"), sep = "\t", quote = FALSE)

  invisible(NULL)
}

# Settings for every de_analysis() run, executed in the order listed.
# The final entry is not a pairwise test: it generates the SVA-corrected
# counts for all the samples used in this study.
analysis_runs <- list(
  list(case_group = 1, control_group = 2, comparison = "1vs2",
       prefix = "HttQ111vsWT_at_Hdac2WT"),
  list(case_group = 3, control_group = 1, comparison = "3vs1",
       prefix = "Hdac2KOvsWT_at_HttQ111"),
  list(case_group = 4, control_group = 2, comparison = "4vs2",
       prefix = "Hdac2KOvsWT_at_HttWT"),
  list(case_group = 3, control_group = 4, comparison = "3vs4",
       prefix = "HttQ111vsWT_at_Hdac2KO"),
  list(case_group = 3, control_group = 2, comparison = "3vs2",
       prefix = "HttQ111atHdac2KOvsHttWTHdac2WT"),
  list(case_group = NA, control_group = NA, comparison = "all",
       prefix = "ThisStudy")
)

for (run_settings in analysis_runs) {
  # Keep the per-run settings in the same top-level variables as before.
  case_group <- run_settings$case_group
  control_group <- run_settings$control_group
  comparison <- run_settings$comparison
  prefix <- run_settings$prefix

  de_analysis(noERCC, metadata, comparison, case_group, control_group, prefix)
}

sessionInfo()



