#!/usr/bin/Rscript

##################################################################################################
#### This script was written by Serkan Erdin to assess the statistical significance of        ####
#### overlap of differentially expressed genes (DEGs) between this study and Langfelder study ####
#### and to generate expression heatmap of overlapping DEGs                                   ####
##################################################################################################

##################################################################################################
##### R version 3.4.3 (2017-11-30)                                                            ####
##### Platform: x86_64-apple-darwin15.6.0 (64-bit)                                            ####
##### Running under: macOS Sierra 10.12.6                                                     ####
#####                                                                                         ####
##### Matrix products: default                                                                ####
##### BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib       ####
##### LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib     ####
#####                                                                                         ####
##### locale:                                                                                 ####
##### [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8                       ####
#####                                                                                         ####
##### attached base packages:                                                                 ####
##### [1] grid      stats     graphics  grDevices utils     datasets  base                    ####
#####                                                                                         ####
##### other attached packages:                                                                ####
##### [1] RColorBrewer_1.1-2 pheatmap_1.0.8                                                   ####
#####                                                                                         ####
##### loaded via a namespace (and not attached):                                              ####
##### [1] colorspace_1.3-2 scales_0.5.0     compiler_3.4.3   plyr_1.8.4                       ####
##### [5] gtable_0.2.0     Rcpp_0.12.17     methods_3.4.3    munsell_0.4.3                    ####
##################################################################################################

library(pheatmap)
library(RColorBrewer)
library(grid)

countDEG <- function(x){
   no_analyzed <- nrow(x)
   no_nominal <- nrow(x[x$PValue < 0.05,])
   no_nominal_up <- nrow(x[x$PValue < 0.05 & x$logFC > 0,])
   no_nominal_down <- nrow(x[x$PValue < 0.05 & x$logFC < 0,])
   no_FDR <- nrow(x[x$BH < 0.05,])
   no_FDR_up <- nrow(x[x$BH < 0.05 & x$logFC > 0,])
   no_FDR_down <- nrow(x[x$BH < 0.05 & x$logFC < 0,])
   return(list("no_analyzed"=no_analyzed,"no_nominal"=no_nominal,"no_nominal_up"=no_nominal_up,"no_nominal_down"=no_nominal_down,
   "no_FDR"=no_FDR,"no_FDR_up"=no_FDR_up,"no_FDR_down"=no_FDR_down))
}

overlap_significance <- function(merged_results,statistics,threshold=0.05){
    background <- nrow(merged_results)

    if(statistics == "pvalue"){
        overlap <- merged_results[merged_results$PValue.x < threshold & merged_results$PValue.y < threshold,]
        n11 <- nrow(overlap)
        n1 <- nrow(merged_results[merged_results$PValue.x < threshold,])
        n2 <- nrow(merged_results[merged_results$PValue.y < threshold,])
    }else if(statistics == "BH"){
        overlap <- merged_results[merged_results$BH.x < threshold & merged_results$BH.y < threshold,]
        n11 <- nrow(overlap)
        n1 <- nrow(merged_results[merged_results$BH.x < threshold,])
        n2 <- nrow(merged_results[merged_results$BH.y < threshold,])
    }else if(statistics == "bonferroni"){
        overlap <- merged_results[merged_results$bonferroni.x < threshold & merged_results$bonferroni.y < threshold,]
        n11 <- nrow(overlap)
        n1 <- nrow(merged_results[merged_results$bonferroni.x < threshold,])
        n2 <- nrow(merged_results[merged_results$bonferroni.y < threshold,])
    }

    uu <- nrow(overlap[overlap$logFC.x > 0 & overlap$logFC.y > 0,])
    ud <- nrow(overlap[overlap$logFC.x > 0 & overlap$logFC.y < 0,])
    du <- nrow(overlap[overlap$logFC.x < 0 & overlap$logFC.y > 0,])
    dd <- nrow(overlap[overlap$logFC.x < 0 & overlap$logFC.y < 0,])

    print("Table of overlap")

    print(matrix(c(uu,du,ud,dd),nrow=2))
    table <- matrix(c(n11,n2-n11,n1-n11,background-n1-n2+n11),nrow=2)
    pvalue <- fisher.test(table,alternative="greater")$p.value
    conf_int_lower <- fisher.test(table,alternative="greater")$conf.int[1]
    conf_int_upper <- fisher.test(table,alternative="greater")$conf.int[2]
    oddsratio <- fisher.test(table,alternative="greater")$estimate
    result_list <- list("pvalue"=pvalue,"conf_int_lw"=conf_int_lower,"conf_int_up"=conf_int_upper,"OddsRatio"=oddsratio)
    return(result_list)
}

thisStudy <- "DEG_lists/HttQ111vsWT_WTHdac2_2vs1_edgeR_quasilikelihoodFtest.SVA.txt"
LangfelderStudy <- "DEG_lists/LangfelderStudy_edgeR_quasilikelihoodFtest.txt"

thisStudy_data <- read.table(file=thisStudy,head=F,sep="\t",stringsAsFactors=F,skip=1)
colnames(thisStudy_data) <- c("id","logFC","logCPM","F","PValue","BH","bonferroni")
elems <- unlist(strsplit(as.character(thisStudy_data$id),"\\|"))
m <- matrix(elems,ncol=2,byrow=T)
thisStudy_data$symbol <- m[,1]
thisStudy_data$ensemblid <- m[,2]

LangfelderStudy_data <- read.table(file=LangfelderStudy,head=F,sep="\t",stringsAsFactors=F,skip=1)
colnames(LangfelderStudy_data) <- c("ensemblid","logFC","logCPM","F","PValue","BH","bonferroni")

cat("This study differential expression results:\n")
cat(paste0("Number of analyzed genes: ", countDEG(thisStudy_data)$no_analyzed),"\n")
cat(paste0("Number of DEGs at p < 0.05: ", countDEG(thisStudy_data)$no_nominal),"\n")
cat(paste0("Number of up regulated DEGs at p < 0.05: ", countDEG(thisStudy_data)$no_nominal_up),"\n")
cat(paste0("Number of down regulated  DEGs at p < 0.05: ", countDEG(thisStudy_data)$no_nominal_down),"\n")
cat(paste0("Number of DEGs at FDR < 0.05: ", countDEG(thisStudy_data)$no_FDR),"\n")
cat(paste0("Number of up regulated DEGs at FDR < 0.05: ", countDEG(thisStudy_data)$no_FDR_up),"\n")
cat(paste0("Number of down regulated  DEGs at FDR < 0.05: ", countDEG(thisStudy_data)$no_FDR_down),"\n")

cat("Langfelder study differential expression results:\n")
cat(paste0("Number of analyzed genes: ", countDEG(LangfelderStudy_data)$no_analyzed),"\n")
cat(paste0("Number of DEGs at p < 0.05: ", countDEG(LangfelderStudy_data)$no_nominal),"\n")
cat(paste0("Number of up regulated DEGs at p < 0.05: ", countDEG(LangfelderStudy_data)$no_nominal_up),"\n")
cat(paste0("Number of down regulated  DEGs at p < 0.05: ", countDEG(LangfelderStudy_data)$no_nominal_down),"\n")
cat(paste0("Number of DEGs at FDR < 0.05: ", countDEG(LangfelderStudy_data)$no_FDR),"\n")
cat(paste0("Number of up regulated DEGs at FDR < 0.05: ", countDEG(LangfelderStudy_data)$no_FDR_up),"\n")
cat(paste0("Number of down regulated  DEGs at FDR < 0.05: ", countDEG(LangfelderStudy_data)$no_FDR_down),"\n")

merged <- merge(thisStudy_data,LangfelderStudy_data,by.x="ensemblid",by.y="ensemblid")

#### comparison of two studies:

cat("Comparison of two studies at p < 0.05\n")
background <- nrow(merged)
cat(paste0("Number of compared genes in both studies: ",background),"\n")

n_up_1 <- nrow(merged[merged$PValue.x < 0.05 & merged$logFC.x > 0,])
n_down_1 <- nrow(merged[merged$PValue.x < 0.05 & merged$logFC.x < 0,])

n_up_2 <- nrow(merged[merged$PValue.y < 0.05 & merged$logFC.y > 0,])
n_down_2 <- nrow(merged[merged$PValue.y < 0.05 & merged$logFC.y < 0,])

cat(paste0("Number of up regulated DEGs from this study analyzed in both studies (p<0.05): ",n_up_1),"\n")
cat(paste0("Number of down regulated DEGs from this study analyzed in both studies (p<0.05): ",n_down_1),"\n")

cat(paste0("Number of up regulated DEGs from Langfelder study analyzed in both studies (p<0.05): ",n_up_2),"\n")
cat(paste0("Number of down regulated DEGs Langfelder study analyzed in both studies (p<0.05): ",n_down_2),"\n")

print("Overlap statistics at p < 0.05")
cat(paste0("Pvalue =",overlap_significance(merged,"pvalue")$pvalue),"\n")
cat(paste0("Odss ratio =",overlap_significance(merged,"pvalue")$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",overlap_significance(merged,"pvalue")$conf_int_up),"\n")
cat(paste0("Conf Int Lower =",overlap_significance(merged,"pvalue")$conf_int_lw),"\n")

cat("Comparison of two studies at FDR < 0.05\n")

n_up_1 <- nrow(merged[merged$BH.x < 0.05 & merged$logFC.x > 0,])
n_down_1 <- nrow(merged[merged$BH.x < 0.05 & merged$logFC.x < 0,])

n_up_2 <- nrow(merged[merged$BH.y < 0.05 & merged$logFC.y > 0,])
n_down_2 <- nrow(merged[merged$BH.y < 0.05 & merged$logFC.y < 0,])

cat(paste0("Number of up regulated DEGs from this study analyzed in both studies (FDR<0.05): ",n_up_1),"\n")
cat(paste0("Number of down regulated DEGs from this study analyzed in both studies (FDR<0.05): ",n_down_1),"\n")

cat(paste0("Number of up regulated DEGs from Langfelder study analyzed in both studies (FDR<0.05): ",n_up_2),"\n")
cat(paste0("Number of down regulated DEGs from Langfelder study analyzed in both studies (FDR<0.05): ",n_down_2),"\n")

print("Overlap statistics at FDR < 0.05")
cat(paste0("Pvalue =",overlap_significance(merged,"BH")$pvalue),"\n")
cat(paste0("Odss ratio =",overlap_significance(merged,"BH")$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",overlap_significance(merged,"BH")$conf_int_up),"\n")
cat(paste0("Conf Int Lower =",overlap_significance(merged,"BH")$conf_int_lw),"\n")

data <- merged

data$Significance <- ifelse((data$BH.x < 0.05 & data$BH.y < 0.05),"sig","nonsig")
data$Size <- ifelse((data$BH.x < 0.05 & data$BH.y < 0.05),6,2)

hits= data[data$Significance=="sig",]
print(dim(hits))

selected <- hits[,colnames(hits) %in% c("symbol","logFC.x","logFC.y")]
selected <- selected[,match(c("symbol","logFC.x","logFC.y"),colnames(selected))]

selected$mean <- apply(selected[,c(2:3)],1,mean)
selected <- selected[order(-selected$mean),]
print(selected)

rownames(selected) <- selected$symbol
selected <- selected[,-c(1,4)]

thisStudy_counts <- read.table(file="Counts/ThisStudy.SVAcorrectedcount.txt",head=T,sep="\t",check.names=F)
LangfelderStudy_counts <- read.table(file="Counts/LangfelderStudy.SVAcorrectedcount.txt",head=T,sep="\t",check.names=F)

thisStudy_counts <- thisStudy_counts[,c(1:15)]
thisStudy_rowzscore <- as.data.frame(t(scale(t(thisStudy_counts))))
thisStudy_rowzscore$id <- rownames(thisStudy_rowzscore)
elems <- unlist(strsplit(as.character(thisStudy_rowzscore$id),"\\|"))
m <- matrix(elems,ncol=2,byrow=T)
thisStudy_rowzscore$symbol <- m[,1]
thisStudy_rowzscore$ensemblid <- m[,2]

LangfelderStudy_rowzscore <- as.data.frame(t(scale(t(LangfelderStudy_counts))))
LangfelderStudy_rowzscore$ensemblid <- rownames(LangfelderStudy_rowzscore)

merged_rowzscore <- merge(thisStudy_rowzscore,LangfelderStudy_rowzscore,by.x="ensemblid",by.y="ensemblid")

selected_merged_rowzscore <- merged_rowzscore[merged_rowzscore$id %in% hits$id,]

rownames(selected_merged_rowzscore) <- selected_merged_rowzscore$symbol

selected_merged_rowzscore <- selected_merged_rowzscore[,!(colnames(selected_merged_rowzscore) %in% c("id","symbol","ensemblid"))]

selected_merged_rowzscore <- selected_merged_rowzscore[match(rownames(selected),rownames(selected_merged_rowzscore)),]

annotation_col <- data.frame(Htt_genotype=factor(c(rep("Q111/+",7),rep("+/+",8),rep("Q111/+",8),rep("+/+",8))))
row.names(annotation_col) <- colnames(selected_merged_rowzscore)

legend_col <- list(Htt_genotype=c("+/+"="#ef8a62","Q111/+"="#67a9cf"))

pdf(file="OverlappingGenes_LangfelderThisStudy_heatmap.pdf",height=8,width=12)
pheatmap(selected_merged_rowzscore,cluster_col=F,cluster_row=F,annotation_col=annotation_col,show_colnames=F,scale="none",
fontsize=15,gaps_row=11,gaps_col=c(15),annotation_colors=legend_col[1],annotation_legend=F,cellheight=16,cellwidth=16)
#color = colorRampPalette((brewer.pal(n = 7, name ="YlOrRd")))(100),gaps_col=c(15))
grid.text(c("This study"),x=0.26,y=0.95,gp=gpar(fontsize=18))
grid.text(c("Langfelder et al."),x=0.57,y=0.95,gp=gpar(fontsize=18))
grid.text(c("Z score"),x=0.83,y=0.94,gp=gpar(fontsize=14,fontface=2))
grid.text(c("Q111/+"),x=0.2,y=0.91,gp=gpar(fontsize=18))
grid.text(c("+/+"),x=0.335,y=0.91,gp=gpar(fontsize=18))
grid.text(c("Q111/+"),x=0.49,y=0.91,gp=gpar(fontsize=18))
grid.text(c("+/+"),x=0.635,y=0.91,gp=gpar(fontsize=18))
dev.off()

sessionInfo()

