library(tidyverse)
library(readxl);
library(rstan);
library(bayesplot)
library(gridExtra)
library(sqldf)

## Let rstan write compiled models to disk so they can be reused between runs.
rstan_options(auto_write = TRUE)
## Use every available core for the chains.  parallel::detectCores() is
## documented to return NA when the core count cannot be determined; fall back
## to a single core in that case instead of storing NA in mc.cores.
options(mc.cores = max(1L, parallel::detectCores(), na.rm = TRUE))


## Read the pHH3 summary workbook; cells holding the text "nd" become NA.
prolifdat <- read_excel("pHH3_data_summary.xlsx", na = "nd")
## Replace the spreadsheet headers with short, code-friendly column names.
names(prolifdat) <- c("GT", "litter", "sample", "section", "side",
                      "dapicount", "phh3count", "dapipix", "sompix", "phh3pixdens")
## Fix the genotype order so that "nor" is always level 1.
prolifdat$GT <- factor(prolifdat$GT, levels = c("nor", "ako", "sko", "dko"))

## The scale is arbitrary here, so just bring every measure to O(1).
## First choice: pHH3 count as a percentage of the DAPI count.
prolifdat$meas <- prolifdat$phh3count / prolifdat$dapicount * 100
## Where that is missing, fall back to the pHH3 pixel density ...
prolifdat$meas <- ifelse(is.na(prolifdat$meas),
                         prolifdat$phh3pixdens / 1e3,
                         prolifdat$meas)
## ... and where it is still missing, to the raw pHH3 count.
prolifdat$meas <- ifelse(is.na(prolifdat$meas),
                         prolifdat$phh3count / 10,
                         prolifdat$meas)


## Dump the whole table (up to 120 rows, all columns) for a visual check.
print(prolifdat, n = 120, width = Inf)

## Number of rows recorded for each litter/sample combination.
sqldf("select GT,litter,sample,count() as N from prolifdat group by litter,sample;")

## Raw pHH3/DAPI ratio by genotype.  `width` has to be a single number; the
## original passed c(.1, 0), which was evidently meant as "jitter 0.1
## horizontally, not at all vertically" -- spelled out here with `height = 0`,
## matching the jitter plots further down in this script.
ggplot(prolifdat, aes(GT, phh3count / dapicount)) +
    geom_jitter(width = .1, height = 0)

## DAPI pixel measurement by genotype.
ggplot(prolifdat, aes(GT, dapipix)) + geom_point()

## Compile the Stan model stored in prolif.stan; the compiled model object is
## reused by the sampling() call further down.
smod <- stan_model("prolif.stan")

## Disabled variational-inference (vb) run, kept for reference only.
## NOTE(review): this call passes the single `meas` vector and no `somsize`
## entry, and builds the sample key as sample-litter, whereas the sampling()
## call below passes a measurement matrix and keys samples as litter-sample.
## It would need updating to match before being re-enabled.
## initsamps <- vb(smod,data=list(N = NROW(prolifdat),
##                                  L= length(unique(prolifdat$litter)),
##                                  Nsamp = length(unique(paste(prolifdat$sample,"-",prolifdat$litter))),
##                                  GT = as.numeric(prolifdat$GT),
##                                  litter= as.numeric(as.factor(prolifdat$litter)),
##                                  sample = as.numeric(as.factor(paste(prolifdat$sample,"-",prolifdat$litter))),
##                                  meas = prolifdat$meas))

## One row per observation, one column per proliferation measure:
##   column 1: phh3count / dapicount
##   column 2: phh3pixdens
##   column 3: phh3count / sompix
measments <- matrix(c(prolifdat$phh3count / prolifdat$dapicount,
                      prolifdat$phh3pixdens,
                      prolifdat$phh3count / prolifdat$sompix),
                    nrow = NROW(prolifdat), ncol = 3)
## Size measurements: column 1 is dapipix, column 2 is sompix.
somsizedat <- matrix(c(prolifdat$dapipix, prolifdat$sompix), ncol = 2)

## Divide every column by its median rounded to the nearest integer power of e,
## so that all columns end up on an O(1) scale.
for (i in 1:2) {
    somsizedat[, i] <- somsizedat[, i] /
        exp(round(log(median(somsizedat[, i], na.rm = TRUE))))
}
for (i in 1:3) {
    measments[, i] <- measments[, i] /
        exp(round(log(median(measments[, i], na.rm = TRUE))))
}

## Missing entries are replaced by -1 (presumably the missing-value marker
## expected by prolif.stan -- confirm against the model file).
somsizedat[is.na(somsizedat)] <- -1
measments[is.na(measments)] <- -1

## A sample is identified by the litter/sample pair.
prolifdat$samplefactor <- factor(with(prolifdat, paste(litter, "-", sample)))


## Draw posterior samples from the compiled model.  Genotype, litter and
## sample are handed over as integer codes of their factor levels; `meas` and
## `somsize` are the rescaled matrices built above.
samps <- sampling(
    object = smod,
    data = list(
        N       = NROW(prolifdat),
        L       = length(unique(prolifdat$litter)),
        Nsamp   = NROW(levels(prolifdat$samplefactor)),
        GT      = as.numeric(prolifdat$GT),
        litter  = as.numeric(as.factor(prolifdat$litter)),
        sample  = as.numeric(prolifdat$samplefactor),
        meas    = measments,
        somsize = somsizedat
    ),
    seed = 20170927,
    iter = 2000,
    thin = 2,
    sample_file = "prolifsamp",
    control = list(adapt_delta = .995, max_treedepth = 13, stepsize = 1e-5)
)


## Print the fit summary.
print(samps)

## Posterior draws as a data frame; columns are addressed by parameter name
## (e.g. `avgGall[1]`) throughout the rest of the script.
poster <- as.data.frame(samps)

## Interval plots, one per parameter family selected by name pattern.  The
## last entry collects every column not matched by any of the other patterns
## (and not lp__).
plotlist <- list(
    mcmc_intervals(poster, pars = grep("normscale", names(poster), value = TRUE)) +
        labs(title = "MCMC intervals for normal scale inferences"),
    ## Title corrected: this panel shows the normsize parameters, but carried
    ## a copy of the normscale title.
    mcmc_intervals(poster, pars = grep("normsize", names(poster), value = TRUE)) +
        labs(title = "MCMC intervals for normal size inferences"),
    mcmc_intervals(poster, pars = grep("avgGall", names(poster), value = TRUE)) +
        labs(title = "MCMC intervals for proliferation as fraction of normal"),
    mcmc_intervals(poster, pars = grep("avgGsizeall", names(poster), value = TRUE)) +
        labs(title = "MCMC intervals for somite size as fraction of normal"),
    mcmc_intervals(poster, pars = grep("deltaSamp", names(poster), value = TRUE)),
    mcmc_intervals(poster, pars = grep("(avgGall)|(avgGsizeall)|(deltaSamp)|(normscale)|(normsize)|(lp__)",
                                       names(poster), value = TRUE, invert = TRUE))
)

## Write one page per plot.  The plots are printed explicitly so that the PDF
## is also filled when the script is run through source(), where top-level
## values are not auto-printed.
pdf("ProlifFit.pdf")
invisible(lapply(plotlist, print))
dev.off()



## Trace plots for every parameter whose name contains "avgG".
traceplot(samps, pars = grep("avgG", names(poster), value = TRUE))
## Raw measure by genotype, coloured by litter/sample.
ggplot(prolifdat, aes(x = GT, y = meas, col = paste(litter, " ", sample))) + geom_point()

## Overlaid posterior densities of avgGall[1..3].  The colours follow the
## labels drawn on the plot: blue = Apaf, red = Shh, green = DKO.
## aes() with backtick-quoted column names replaces the deprecated aes_string().
ggplot(poster) +
    geom_density(aes(x = `avgGall[1]`), fill = "blue", alpha = .25) +
    geom_density(aes(x = `avgGall[2]`), fill = "red", alpha = .25) +
    geom_density(aes(x = `avgGall[3]`), fill = "green", alpha = .25) +
    coord_cartesian(xlim = c(0, 1.5)) +
    labs(title = "shh, Apaf, and DKO cell division counts\nrelative to control wild type average",
         x = "Cell Division measure as fraction of wild type in same litter") +
    geom_label(aes(x, y, label = label),
               col = c("green", "blue", "red"),
               data = data.frame(x = c(.25, .75, 1.0),
                                 y = c(3, 3, 3),
                                 label = c("DKO", "Apaf", "Shh")))

## Saves the most recently built plot, i.e. the density plot above.
ggsave("GenotypeProlifEst.pdf")


## Long-format posterior draws: the three avgGall / avgGsizeall columns are
## stacked end to end and tagged with the genotype they belong to
## (index 1 = Apaf, 2 = Shh, 3 = DKO).
boxplotdata <- data.frame(
    prolifvals = unlist(poster[c("avgGall[1]", "avgGall[2]", "avgGall[3]")],
                        use.names = FALSE),
    sizevals = unlist(poster[c("avgGsizeall[1]", "avgGsizeall[2]", "avgGsizeall[3]")],
                      use.names = FALSE),
    genotype = factor(rep(c("Apaf", "Shh", "DKO"), each = NROW(poster)),
                      levels = c("Apaf", "Shh", "DKO"))
)

## Empirical CDF of the posterior proliferation ratio, one curve per genotype.
ggplot(data = boxplotdata) +
    stat_ecdf(mapping = aes(x = prolifvals, colour = genotype)) +
    scale_colour_manual(values = c("blue", "red", "green")) +
    scale_y_continuous(breaks = (0:10) / 10) +
    scale_x_continuous(breaks = seq(0, 1.6, by = .2)) +
    labs(title = "CDF curves for posterior ratios of proliferation by genotype",
         x = "Ratio relative to matched controls",
         y = "Probability")
ggsave("PosteriorProlifEcdf.pdf")

## Same plot for the posterior size ratio.
ggplot(data = boxplotdata) +
    stat_ecdf(mapping = aes(x = sizevals, colour = genotype)) +
    scale_colour_manual(values = c("blue", "red", "green")) +
    scale_y_continuous(breaks = (0:10) / 10) +
    scale_x_continuous(breaks = seq(0, 1.6, by = .2)) +
    labs(title = "CDF curves for posterior ratios of initial Size by genotype",
         x = "Ratio relative to matched controls",
         y = "Probability")
ggsave("PosteriorSizeEcdf.pdf")

## Copy of the genotype column whose level set also contains "Normal", so a
## reference row can be appended for the boxplots further down.
boxplotdata$genotypewnorm <- factor(boxplotdata$genotype,
                                    levels = c("Normal", "Apaf", "Shh", "DKO"))


## Posterior mean with the central 75% interval (12.5% to 87.5% quantiles) per
## genotype.  quantile() is called with its full argument name `probs`; the
## original `prob=` only worked through partial argument matching.
ggplot(boxplotdata %>%
       group_by(genotype) %>%
       summarize(lower = quantile(prolifvals, probs = .125),
                 upper = quantile(prolifvals, probs = 1 - .125),
                 mean = mean(prolifvals))) +
    geom_pointrange(aes(x = genotype, y = mean, ymin = lower, ymax = upper, col = genotype),
                    size = 2) +
    scale_color_manual(values = c("blue", "red", "green")) +
    labs(title = "Proliferation rate relative to litter-matched controls\nExpected value with 75% posterior range",
         x = "Genotype",
         y = "Relative Rate") +
    theme(legend.position = "none")
ggsave("PosteriorProlifPointrange.pdf")


## Boxplot of the posterior proliferation ratio per genotype.  A single
## "Normal" row fixed at 1 is appended so the reference genotype shows up on
## the axis.
ggplot(data = rbind(boxplotdata,
                    data.frame(prolifvals = 1,
                               sizevals = 1,
                               genotype = "Normal",
                               genotypewnorm = "Normal"))) +
    geom_boxplot(mapping = aes(x = genotypewnorm, y = prolifvals),
                 colour = "dodgerblue3") +
    labs(title = "Proliferation Rate relative to controls\nPosterior Samples from Model",
         x = "Genotype",
         y = "Rate / Wild Type Rate") +
    theme(panel.background = element_blank(),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour = "lightgrey"))
ggsave("PosteriorProlifBoxplot.pdf", width = 4, height = 4)
## Open the result in the evince viewer, in the background.
system("evince PosteriorProlifBoxplot.pdf&")


## Same figure for the posterior size ratio.
ggplot(data = rbind(boxplotdata,
                    data.frame(prolifvals = 1,
                               sizevals = 1,
                               genotype = "Normal",
                               genotypewnorm = "Normal"))) +
    geom_boxplot(mapping = aes(x = genotypewnorm, y = sizevals),
                 colour = "dodgerblue3") +
    labs(title = "Initial Size relative to controls\nPosterior Samples from Model",
         x = "Genotype",
         y = "Size / Wild Type Size") +
    theme(panel.background = element_blank(),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(colour = "lightgrey"))
ggsave("PosteriorSizeBoxplot.pdf", width = 4, height = 4)
system("evince PosteriorSizeBoxplot.pdf&")

#ggplot(data.frame(apaf=boxplotdata[boxplotdata$genotype == "Apaf","prolifvals"],
#             shh=boxplotdata[boxplotdata$genotype == "Shh","prolifvals"]))+ geom_density(aes(x=apaf/shh))



## Overlaid posterior densities of avgGall[1..3] (same figure as the one saved
## to GenotypeProlifEst.pdf earlier; it is not saved again here).
## aes() with backtick-quoted column names replaces the deprecated aes_string().
ggplot(poster) +
    geom_density(aes(x = `avgGall[1]`), fill = "blue", alpha = .25) +
    geom_density(aes(x = `avgGall[2]`), fill = "red", alpha = .25) +
    geom_density(aes(x = `avgGall[3]`), fill = "green", alpha = .25) +
    coord_cartesian(xlim = c(0, 1.5)) +
    labs(title = "shh, Apaf, and DKO cell division counts\nrelative to control wild type average",
         x = "Cell Division measure as fraction of wild type in same litter") +
    geom_label(aes(x, y, label = label),
               col = c("green", "blue", "red"),
               data = data.frame(x = c(.25, .75, 1.0),
                                 y = c(3, 3, 3),
                                 label = c("DKO", "Apaf", "Shh")))





## Overlaid posterior densities of avgGsizeall[1..3]; colours as above
## (blue = Apaf, red = Shh, green = DKO), with the label positions adjusted
## for this figure.
ggplot(poster) +
    geom_density(aes(x = `avgGsizeall[1]`), fill = "blue", alpha = .25) +
    geom_density(aes(x = `avgGsizeall[2]`), fill = "red", alpha = .25) +
    geom_density(aes(x = `avgGsizeall[3]`), fill = "green", alpha = .25) +
    coord_cartesian(xlim = c(0, 1.5)) +
    labs(title = "shh, Apaf, and DKO somite size measurements\nrelative to control wild type average",
         x = "Somite Size measure as fraction of wild type in same litter") +
    geom_label(aes(x, y, label = label),
               col = c("green", "blue", "red"),
               data = data.frame(x = c(.25, 1.0, .75),
                                 y = c(3, 3, 3),
                                 label = c("DKO", "Apaf", "Shh")))

## Saves the most recently built plot, i.e. the size density plot above.
ggsave("GenotypeSizeEst.pdf")

## Interval plot over every parameter whose name contains "avgG".  The object
## is only stored here; nothing in this part of the script prints or saves it.
Gtplot <- mcmc_intervals(poster,
                         pars = grep("avgG", names(poster), value = TRUE)) +
    labs(title = "MCMC intervals for proliferation as fraction of normal")

## Posterior quantiles of the three avgGall ratios at a fixed set of
## probabilities, written out as a small table.
avgGvals <- local({
    probs <- c(.025, .25, .5, .75, .975)
    data.frame(p = probs,
               Apaf = quantile(poster$`avgGall[1]`, probs = probs),
               shh = quantile(poster$`avgGall[2]`, probs = probs),
               dko = quantile(poster$`avgGall[3]`, probs = probs))
})
write_csv(avgGvals, "PosteriorIntervals.csv")

## Open the previously written figures in the evince viewer, in the background.
system("evince GenotypeProlifEst.pdf&")
system("evince GenotypeSizeEst.pdf&")
system("evince ProlifFit.pdf&")

## Posterior medians of the proliferation and size ratios per genotype.
boxplotdata %>%
    group_by(genotype) %>%
    summarize(medianprolif = median(prolifvals),
              mediansize = median(sizevals)) %>%
    write_csv("PosteriorParamvals.csv")

## Simulation output with one row per run; column G holds the genotype code.
simdat <- read_csv("GenotypeRuns.csv")
## Map the codes 1..4 to genotype labels.  factor() with levels/labels does
## this in one vectorised step and yields NA for a missing or unknown code,
## where the former per-element if-chain stopped with an error on NA.
simdat$Genotype <- factor(simdat$G,
                          levels = 1:4,
                          labels = c("WT", "Apaf", "Shh", "DKO"))

## Total cell count (NW + NR + NB) per run, jittered horizontally only.
ggplot(simdat) +
    geom_jitter(aes(x = Genotype, y = (NW + NR + NB)),
                width = .1, height = 0, shape = 1, size = 2) +
    labs(title = "Total Cells at End of 10 example\nSimulations",
         y = "Cell Count",
         x = "Genotype (with horizontal jitter)") +
    coord_cartesian(ylim = c(0, 6000))

ggsave("TotalCellsExample.pdf", width = 4, height = 4)
system("evince TotalCellsExample.pdf&")

## Share of NR among the coloured cells (NR + NB) per run.
ggplot(simdat) +
    geom_jitter(aes(x = Genotype, y = NR / (NR + NB)),
                width = .1, height = 0, col = "red", shape = 1, size = 2) +
    labs(title = "Fraction of colored cells that are red",
         y = "Fraction of Colored Cells",
         x = "Genotype (with horizontal jitter)")
ggsave("RednessExample.pdf", width = 4, height = 4)
system("evince RednessExample.pdf&")

## Posterior medians of the size ratios avgGsizeall[1..3].
median(poster[["avgGsizeall[1]"]])
median(poster[["avgGsizeall[2]"]])
median(poster[["avgGsizeall[3]"]])

## Fraction of posterior draws in which each size ratio is at most 1.
ecdf(poster[["avgGsizeall[1]"]])(1)
ecdf(poster[["avgGsizeall[2]"]])(1)
ecdf(poster[["avgGsizeall[3]"]])(1)


## Posterior medians of the proliferation ratios avgGall[1..3].
median(poster[["avgGall[1]"]])
median(poster[["avgGall[2]"]])
median(poster[["avgGall[3]"]])

## Fraction of posterior draws in which each proliferation ratio is at most 1.
ecdf(poster[["avgGall[1]"]])(1)
ecdf(poster[["avgGall[2]"]])(1)
ecdf(poster[["avgGall[3]"]])(1)

