#!/usr/bin/Rscript

##### This script was written by Serkan Erdin to assess the statistical significance of overlap 
##### of differentially regulated pathways for up regulated and down regulated differntially 
##### expressed genes (DEGs) from two different contrasts. 


################################################################################################
##### R version 3.4.3 (2017-11-30)                                                             #
##### Platform: x86_64-apple-darwin15.6.0 (64-bit)                                             #
##### Running under: macOS Sierra 10.12.6                                                      #
#####                                                                                          #
##### Matrix products: default                                                                 #
##### BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib        #
##### LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib      #
#####                                                                                          #
##### locale:                                                                                  #              
##### [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8                        #
#####                                                                                          #
##### attached base packages:                                                                  #
##### [1] stats     graphics  grDevices utils     datasets  base                               #
#####                                                                                          #
##### other attached packages:                                                                 #
##### [1] forcats_0.3.0   stringr_1.3.0   dplyr_0.7.5     purrr_0.2.5                          #
##### [5] readr_1.1.1     tidyr_0.8.1     tibble_2.1.3    ggplot2_2.2.1                        #
##### [9] tidyverse_1.2.1                                                                      #
#####                                                                                          #
##### loaded via a namespace (and not attached):                                               #
#####  [1] Rcpp_0.12.17     cellranger_1.1.0 pillar_1.4.2     compiler_3.4.3                   #
#####  [5] plyr_1.8.4       bindr_0.1.1      methods_3.4.3    tools_3.4.3                      #
#####  [9] lubridate_1.7.4  jsonlite_1.5     nlme_3.1-131     gtable_0.2.0                     #
##### [13] lattice_0.20-35  pkgconfig_2.0.1  rlang_0.4.0      psych_1.8.4                      #
##### [17] cli_1.1.0        rstudioapi_0.8   parallel_3.4.3   haven_1.1.2                      #
##### [21] bindrcpp_0.2.2   xml2_1.2.0       httr_1.3.1       hms_0.4.2                        #
##### [25] grid_3.4.3       tidyselect_0.2.4 glue_1.3.1       R6_2.2.2                         #
##### [29] readxl_1.1.0     foreign_0.8-69   modelr_0.1.8     reshape2_1.4.3                   #
##### [33] magrittr_1.5     scales_0.5.0     rvest_0.3.6      assertthat_0.2.0                 #
##### [37] mnormt_1.5-5     colorspace_1.3-2 stringi_1.1.6    lazyeval_0.2.1                   #
##### [41] munsell_0.4.3    broom_0.4.4      crayon_1.3.4                                      #
################################################################################################

library(tidyverse)

args<-commandArgs(TRUE)
results_upreg_1 <- args[1] #### DAVID's results for up regulated DEGs from contrast 1 in Pathways directory
results_downreg_1 <- args[2] #### DAVID's results for down regulated DEGs from contrast 1 in Pathways directory
results_upreg_2 <- args[3]  #### DAVID's results for up regulated DEGs from contrast 2 in Pathways directory
results_downreg_2 <- args[4] #### DAVID's results for down regulated DEGs from contrast 2 in Pathways directory
statistics <- args[5]  #### How to define differentially regulated pathways: pvalue or Benjamini (we use it for FDR) 
threshold <- as.numeric(args[6]) #### pvalue of Benjamini threshold 

selected_columns <- c("Term","Count","PValue","Benjamini")

upreg_1 <- read.table(file=results_upreg_1,head=T,sep="\t",check.names=F,quote="")
upreg_1 <- upreg_1[,colnames(upreg_1) %in% selected_columns]
colnames(upreg_1) <- c("Term","Count_up1","PValue_up1","Benjamini_up1")

downreg_1 <- read.table(file=results_downreg_1,head=T,sep="\t",check.names=F,quote="")
downreg_1 <- downreg_1[,colnames(downreg_1) %in% selected_columns]
colnames(downreg_1) <- c("Term","Count_down1","PValue_down1","Benjamini_down1")

upreg_2 <- read.table(file=results_upreg_2,head=T,sep="\t",check.names=F,quote="")
upreg_2 <- upreg_2[,colnames(upreg_2) %in% selected_columns]
colnames(upreg_2) <- c("Term","Count_up2","PValue_up2","Benjamini_up2")

downreg_2 <- read.table(file=results_downreg_2,head=T,sep="\t",check.names=F,quote="")
downreg_2 <- downreg_2[,colnames(downreg_2) %in% selected_columns]
colnames(downreg_2) <- c("Term","Count_down2","PValue_down2","Benjamini_down2")

upup <- full_join(upreg_1,upreg_2,by="Term")
upupdown <- full_join(upup,downreg_1,by="Term")
all <- full_join(upupdown,downreg_2,by="Term")


all$PValue_up1[is.na(all$PValue_up1)] <- 1
all$PValue_down1[is.na(all$PValue_down1)] <- 1
all$PValue_up2[is.na(all$PValue_up2)] <- 1   
all$PValue_down2[is.na(all$PValue_down2)] <- 1

all$Benjamini_up1[is.na(all$Benjamini_up1)] <- 1
all$Benjamini_down1[is.na(all$Benjamini_down1)] <- 1
all$Benjamini_up2[is.na(all$Benjamini_up2)] <- 1
all$Benjamini_down2[is.na(all$Benjamini_down2)] <- 1

background <- nrow(all)

if(statistics == "pvalue"){
   n_uu <- nrow(all[all$PValue_up1 < threshold &  all$PValue_up2 < threshold,]) 
   n_ud <- nrow(all[all$PValue_up1 < threshold &  all$PValue_down2 < threshold,])
   n_du <- nrow(all[all$PValue_down1 < threshold &  all$PValue_up2 < threshold,])
   n_dd <- nrow(all[all$PValue_down1 < threshold &  all$PValue_down2 < threshold,])

   n1_u <- nrow(all[all$PValue_up1 < threshold,])
   n1_d <- nrow(all[all$PValue_down1 < threshold,])
   n2_u <- nrow(all[all$PValue_up2 < threshold,])
   n2_d <- nrow(all[all$PValue_down2 < threshold,])
}else if(statistics == "Benjamini"){
   n_uu <- nrow(all[all$Benjamini_up1 < threshold &  all$Benjamini_up2 < threshold,])
   n_ud <- nrow(all[all$Benjamini_up1 < threshold &  all$Benjamini_down2 < threshold,])
   n_du <- nrow(all[all$Benjamini_down1 < threshold &  all$Benjamini_up2 < threshold,])
   n_dd <- nrow(all[all$Benjamini_down1 < threshold &  all$Benjamini_down2 < threshold,])

   n1_u <- nrow(all[all$Benjamini_up1 < threshold,])
   n1_d <- nrow(all[all$Benjamini_down1 < threshold,])
   n2_u <- nrow(all[all$Benjamini_up2 < threshold,])
   n2_d <- nrow(all[all$Benjamini_down2 < threshold,])
}

n1 <- n1_u + n1_d
n2 <- n2_u + n2_d
n11 <- n_uu + n_ud + n_du + n_dd

cat(paste(n1_u,n1_d,n2_u,n2_d,sep="\t"),"\n")

cat(paste(n_uu,n_ud,n_du,n_dd,sep="\t"),"\n")

enrichment_stat <- function(n11,n1,n2,background){
   table <- matrix(c(n11,n2-n11,n1-n11,background-n1-n2+n11),nrow=2)
   pvalue <- fisher.test(table,alternative="greater")$p.value
   conf_int_lower <- fisher.test(table,alternative="greater")$conf.int[1]
   conf_int_upper <- fisher.test(table,alternative="greater")$conf.int[2]
   oddsratio <- fisher.test(table,alternative="greater")$estimate
   result_list <- list("pvalue"=pvalue,"conf_int_lw"=conf_int_lower,"conf_int_up"=conf_int_upper,"OddsRatio"=oddsratio)
   return(result_list)
}

print("Overall overlap statistics")
cat(paste0("Pvalue =",enrichment_stat(n11,n1,n2,background)$pvalue),"\n")
cat(paste0("Odss ratio =",enrichment_stat(n11,n1,n2,background)$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",enrichment_stat(n11,n1,n2,background)$conf_int_up),"\n")
cat(paste0("Conf Int Lower =",enrichment_stat(n11,n1,n2,background)$conf_int_lw),"\n")

print("On diagonall overlap statistics")

n11 <- n_uu + n_dd

cat(paste0("Pvalue =",enrichment_stat(n11,n1,n2,background)$pvalue),"\n")
cat(paste0("Odss ratio =",enrichment_stat(n11,n1,n2,background)$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",enrichment_stat(n11,n1,n2,background)$conf_int_up),"\n")
cat(paste0("Conf Int Lower =",enrichment_stat(n11,n1,n2,background)$conf_int_lw),"\n")

print("Off diagonal overlap statistics")

n11 <- n_ud + n_du

cat(paste0("Pvalue =",enrichment_stat(n11,n1,n2,background)$pvalue),"\n")
cat(paste0("Odss ratio =",enrichment_stat(n11,n1,n2,background)$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",enrichment_stat(n11,n1,n2,background)$conf_int_up),"\n")
cat(paste0("Conf Int Lower =",enrichment_stat(n11,n1,n2,background)$conf_int_lw),"\n")





sessionInfo()

