#!/usr/bin/Rscript

##################################################################################################
#### This script was written by Serkan Erdin to assess the statistical significance of        ####
#### overlap of differentially expressed genes at a selected p value or FDR threshold using   ####
#### one-tailed Fisher's exact test.                                                          ####
##################################################################################################

##############################################################################
##### R version 3.6.0 (2019-04-26)                                           #
##### Platform: x86_64-apple-darwin13.4.0 (64-bit)                           #
##### Running under: macOS Sierra 10.12.6                                    #
#####                                                                        #
##### Matrix products: default                                               #
##### BLAS/LAPACK: /anaconda3/lib/R/lib/libRblas.dylib                       #
#####                                                                        #
##### locale:                                                                #
##### [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8      #
#####                                                                        #
##### attached base packages:                                                #
##### [1] stats     graphics  grDevices utils     datasets  methods   base   #  
#####                                                                        #
##### loaded via a namespace (and not attached):                             #
##### [1] compiler_3.6.0                                                     #
##############################################################################

args<-commandArgs(TRUE)
DEresults_1 <- args[1]  #### differential expression results from comparison 1 in DEG_lists dierctory
DEresults_2 <- args[2]  #### differential expression results from comparison 2 in DEG_lists directory
statistics <- args[3]  #### how to define differentially expressed genes: pvalue, BH (we use this for FDR), Bonferroni
threshold <- as.numeric(args[4])  #### pvalue or FDR threshold

DE_1 <- read.table(file=DEresults_1,head=F,sep="\t",skip=1,stringsAsFactors=F)
colnames(DE_1) <- c("id","logFC","logCPM","F","PValue","BH","bonferroni")

DE_2 <- read.table(file=DEresults_2,head=F,sep="\t",skip=1,stringsAsFactors=F)
colnames(DE_2) <- c("id","logFC","logCPM","F","PValue","BH","bonferroni") 

merged_results <- merge(DE_1,DE_2,by.x="id",by.y="id")

background <- nrow(merged_results) 

if(statistics == "pvalue"){
    overlap <- merged_results[merged_results$PValue.x < threshold & merged_results$PValue.y < threshold,]
    n11 <- nrow(overlap)
    n1 <- nrow(merged_results[merged_results$PValue.x < threshold,])
    n2 <- nrow(merged_results[merged_results$PValue.y < threshold,])
}else if(statistics == "BH"){
    overlap <- merged_results[merged_results$BH.x < threshold & merged_results$BH.y < threshold,]
    n11 <- nrow(overlap)
    n1 <- nrow(merged_results[merged_results$BH.x < threshold,])
    n2 <- nrow(merged_results[merged_results$BH.y < threshold,])
}else if(statistics == "bonferroni"){
    overlap <- merged_results[merged_results$bonferroni.x < threshold & merged_results$bonferroni.y < threshold,]
    n11 <- nrow(overlap)
    n1 <- nrow(merged_results[merged_results$bonferroni.x < threshold,])
    n2 <- nrow(merged_results[merged_results$bonferroni.y < threshold,])
}   

uu <- nrow(overlap[overlap$logFC.x > 0 & overlap$logFC.y > 0,])
ud <- nrow(overlap[overlap$logFC.x > 0 & overlap$logFC.y < 0,])
du <- nrow(overlap[overlap$logFC.x < 0 & overlap$logFC.y > 0,])
dd <- nrow(overlap[overlap$logFC.x < 0 & overlap$logFC.y < 0,])

print("Table of overlap")

print(matrix(c(uu,du,ud,dd),nrow=2))

enrichment_stat <- function(n11,n1,n2,background){
   table <- matrix(c(n11,n2-n11,n1-n11,background-n1-n2+n11),nrow=2)
   pvalue <- fisher.test(table,alternative="greater")$p.value
   conf_int_lower <- fisher.test(table,alternative="greater")$conf.int[1]
   conf_int_upper <- fisher.test(table,alternative="greater")$conf.int[2]
   oddsratio <- fisher.test(table,alternative="greater")$estimate
   result_list <- list("pvalue"=pvalue,"conf_int_lw"=conf_int_lower,"conf_int_up"=conf_int_upper,"OddsRatio"=oddsratio)
   return(result_list)
}

print("Overall overlap statistics") 
cat(paste0("Pvalue =",enrichment_stat(n11,n1,n2,background)$pvalue),"\n")
cat(paste0("Odss ratio =",enrichment_stat(n11,n1,n2,background)$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",enrichment_stat(n11,n1,n2,background)$conf_int_up),"\n")  
cat(paste0("Conf Int Lower =",enrichment_stat(n11,n1,n2,background)$conf_int_lw),"\n") 

print("On diagonall overlap statistics")

n11 <- uu + dd

cat(paste0("Pvalue =",enrichment_stat(n11,n1,n2,background)$pvalue),"\n") 
cat(paste0("Odss ratio =",enrichment_stat(n11,n1,n2,background)$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",enrichment_stat(n11,n1,n2,background)$conf_int_up),"\n")  
cat(paste0("Conf Int Lower =",enrichment_stat(n11,n1,n2,background)$conf_int_lw),"\n") 

print("Off diagonal overlap statistics")

n11 <- ud + du

cat(paste0("Pvalue =",enrichment_stat(n11,n1,n2,background)$pvalue),"\n") 
cat(paste0("Odss ratio =",enrichment_stat(n11,n1,n2,background)$OddsRatio),"\n")
cat(paste0("Conf Int Upper =",enrichment_stat(n11,n1,n2,background)$conf_int_up),"\n")  
cat(paste0("Conf Int Lower =",enrichment_stat(n11,n1,n2,background)$conf_int_lw),"\n") 

sessionInfo()
