library(DSS)
require(bsseq)
library(data.table)
library(ggplot2)
library(dplyr)
library(tidyr)

# Every relative path used further down (count tables, data-for-fig*.txt,
# raw-for-Fig4F.txt) is resolved against this directory.
setwd("~/lab/zhaozd/WGBS/")

# One count table per sample directory; these are handed to makeBSseqData().
dat.B    <- fread("B/merge.PE+R1/countData.for.DSS", header = TRUE)
dat.B12d <- fread("B12d/merge.PE+R1/countData.for.DSS", header = TRUE)
dat.P    <- fread("P65KO/merge.PE+R1/countData.for.DSS", header = TRUE)
dat.P12d <- fread("P65KO12d/merge.PE+R1/countData.for.DSS", header = TRUE)
dat.T    <- fread("Tet2/merge.PE+R1/countData.for.DSS", header = TRUE)
dat.T12d <- fread("Tet2_12d/merge.PE+R1/countData.for.DSS", header = TRUE)

# One BSseq object per cell line, each holding the base sample and its
# "12d" counterpart.
BSobj   <- makeBSseqData(list(dat.B, dat.B12d), c("B", "B12d"))
BSobj.P <- makeBSseqData(list(dat.P, dat.P12d), c("P", "P12d"))
BSobj.T <- makeBSseqData(list(dat.T, dat.T12d), c("T", "T12d"))

# Parallel back-end shared by all DSS / bsseq calls below.
mParam <- MulticoreParam(workers = 30, progressbar = TRUE)

# The BPPARAM argument only works with newer versions of DSS.
dmlTest.sm <- DMLtest(BSobj, group1 = "B", group2 = "B12d",
                      smoothing = TRUE, BPPARAM = mParam)
dmlTest.P <- DMLtest(BSobj.P, group1 = "P", group2 = "P12d",
                     smoothing = TRUE, BPPARAM = mParam)
dmlTest.T <- DMLtest(BSobj.T, group1 = "T", group2 = "T12d",
                     smoothing = TRUE, BPPARAM = mParam)

# Smoothed versions of the three BSseq objects.
BSobj.sm   <- BSmooth(BSobj, BPPARAM = mParam)
BSobj.P.sm <- BSmooth(BSobj.P, BPPARAM = mParam)
BSobj.T.sm <- BSmooth(BSobj.T, BPPARAM = mParam)

# Gene annotation table; columns 1 and 3 are used later as gene_id and
# gene_biotype when filtering the RNA-seq tables.
ens <- read.table(
  "~/common_dataset/hg38-gene-features/Homo_sapiens.GRCh38.100.gene.filter",
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# Append an EGFP row typed as protein_coding so it passes that biotype filter.
ens <- rbind(ens, c("EGFP", "EGFP", "protein_coding"))

# p65 peak table; only columns 1-3 and 7 get meaningful names.
p65.CpGn <- read.table(
  "~/lab/zhaozd/P65-ChIPseq/macs2/p65-peaks-centered-200bp-CpGn2.txt",
  header = FALSE
)
colnames(p65.CpGn)[c(1, 2, 3, 7)] <- c("chr", "start", "end", "CpGn")

# Gene sets of interest. They are defined BEFORE the target table is flagged:
# the flagging step needs my5G, and running the script top to bottom in a
# fresh session would otherwise fail with "object 'my5G' not found".
my7G <- c("EGFP", "CALCB", "IL32", "PTGES", "LHX2", "TNF", "CCL2")
my6G <- c("EGFP", "CALCB", "PTGES", "LHX2", "TNF", "CCL2")
my5G <- c("CALCB", "PTGES", "LHX2", "TNF", "CCL2")

# Peak-to-target-gene table. Column V8 is compared against gene names and
# V2 is later used as the peak start when merging into p65.methyl.
p65.target.all <- read.table(
  "~/lab/zhaozd/analysis/closest_genes/p65_all.target_genes_TSS",
  header = FALSE
)

# Flag rows whose target gene is in my5G ("mem"); all other rows stay "na".
# Assigning into the column vector (rather than into a row-subset of the data
# frame) also works when no row matches.
p65.target.all$mem_t <- "na"
p65.target.all$mem_t[p65.target.all$V8 %in% my5G] <- "mem"

# NOTE(review): p65.mat is not created in the visible part of this script;
# it must already exist in the session - confirm where it is built.
p65.methyl <- data.frame(p65.mat)

# Add target information (V8, V10, mem_t), matching peaks on their start.
p65.methyl <- merge(
  p65.methyl,
  p65.target.all[, c("V2", "V8", "V10", "mem_t")],
  by.x = "start", by.y = "V2", all.x = TRUE
)
# Add the CpGn column of the peak table, again keyed on start.
p65.methyl <- merge(
  p65.methyl,
  p65.CpGn[, c("start", "CpGn")],
  by = "start", all.x = TRUE
)
# merge() moves the key column to the front; swap the first two columns back.
p65.methyl <- p65.methyl[, c(2, 1, 3:ncol(p65.methyl))]

# Per-peak demethylation summary; the file has no header, so the six columns
# are named here.
my.demethyl <- read.table("~/lab/zhaozd/analysis/calc_demethylation_by_WGBS/demethyl_sum_for_all_p65_peak.result250_", head=F)
colnames(my.demethyl) <- c("chr", "start", "end", "tot.B", "tot.B12d", "myCpGn")
# Inner join on start (no all.x here): peaks without a row in my.demethyl are dropped.
p65.methyl <- merge(p65.methyl, my.demethyl[,c('start','tot.B', 'tot.B12d', 'myCpGn')], by='start')
# totd = (B - B12d) * myCpGn, i.e. the B-to-B12d difference scaled by myCpGn.
p65.methyl <- within(p65.methyl, totd <- (B-B12d)*myCpGn )
# merge() put 'start' first again; swap the first two columns back.
p65.methyl <- p65.methyl[, c(2,1,seq(3,ncol(p65.methyl)))]
# Keep peaks with a defined totd and B above 0.5.
p65.methyl.f <- subset(p65.methyl, !is.na(totd) & B>0.5)
# Quartile index (1-4) of totd within the filtered set.
# NOTE(review): with include.lowest=F the lowest break is exclusive, so the row
# holding the minimum totd gets totd.q = NA - confirm this is intended.
p65.methyl.f <- within(p65.methyl.f, totd.q <- cut(totd, quantile(totd, seq(0,1,0.25),names=FALSE), include.lowest=F, labels=F )   )
# Split at the 75th percentile of totd: at/above it vs. below it.
totd1.4th <- subset(p65.methyl.f, totd>=quantile(p65.methyl.f$totd, 0.75))
totd3.4th <- subset(p65.methyl.f, totd<quantile(p65.methyl.f$totd, 0.75))

#' Scatter plot of log2 fold change against mean log2 expression.
#'
#' Both quantities are computed from `value_1` / `value_2` with a pseudocount
#' of 0.5. A fixed set of genes is drawn in red and labelled; all other points
#' are grey.
#'
#' NOTE: this function is defined a second time, identically, further down in
#' this script; the later definition is the one in effect when the figures
#' are drawn.
#'
#' @param d2a Data frame with columns `gene`, `value_1` and `value_2`.
#' @param ylim Length-2 numeric y-axis limits. Any length-1 value (the default
#'   `0`) means "use the range of the computed fold changes".
#' @return `NULL`, invisibly; the function is called for its plot.
draw_fig2 <- function(d2a, ylim=0){
  # Genes to highlight (kept local so the global gene lists are not shadowed
  # under a misleading name).
  highlight_genes <- c("EGFP", "CALCB", "IL32", "PTGES", "LHX2", "TNF", "CCL2")

  d2a$lfc <- log2((d2a$value_2 + 0.5) / (d2a$value_1 + 0.5))
  d2a$rpkmav <- log2((d2a$value_1 + 0.5) * (d2a$value_2 + 0.5)) / 2
  d2a$selected <- d2a$gene %in% highlight_genes
  d2a$color <- ifelse(d2a$selected, "red", "grey")

  # A length-1 ylim is the "auto" sentinel; ignore NA / infinite fold changes
  # so a single bad row cannot break the axis limits.
  if (length(ylim) == 1) {
    ylim <- range(d2a$lfc, finite = TRUE)
  }
  # with() keeps the axis labels as "rpkmav" / "lfc".
  with(d2a, plot(rpkmav, lfc, cex = 0.5, col = color, ylim = ylim))

  # text() raises "zero-length 'labels' specified" when nothing is selected,
  # so only label when at least one highlighted gene is present.
  if (any(d2a$selected)) {
    hits <- d2a[d2a$selected, , drop = FALSE]
    # pos = 4 places the label to the right of the point (it overrides adj).
    text(hits$rpkmav, hits$lfc, labels = hits$gene, pos = 4, cex = 0.5)
  }
  invisible(NULL)
}

# Differential-expression tables (gene_exp.diff), one per comparison.
# diff34: 12h vs 0h
diff34 <- read.table(
  "~/lab/zhaozd/RNA-seq/allWT-14-RNAseq-BGI-201809/Cleandata/3-4.vs.1-2.diff/gene_exp.diff",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
# diff56: 12d vs 0h
diff56 <- read.table(
  "~/lab/zhaozd/RNA-seq/allWT-14-RNAseq-BGI-201809/Cleandata/5-6.vs.1-2.diff/gene_exp.diff",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
# diff1112: 12d+10d vs 0h
diff1112 <- read.table(
  "~/lab/zhaozd/RNA-seq/allWT-14-RNAseq-BGI-201809/Cleandata/11-12.vs.1-2.diff/gene_exp.diff",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
# diff1314: 12d+10d+12h vs 0h
diff1314 <- read.table(
  "~/lab/zhaozd/RNA-seq/allWT-14-RNAseq-BGI-201809/Cleandata/13-14.vs.1-2.diff/gene_exp.diff",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
#' Scatter plot of log2 fold change against mean log2 expression.
#'
#' Both quantities are computed from `value_1` / `value_2` with a pseudocount
#' of 0.5. A fixed set of genes is drawn in red and labelled; all other points
#' are grey.
#'
#' NOTE: an identical definition also appears earlier in this script; this
#' later one is the definition in effect when the figures are drawn.
#'
#' @param d2a Data frame with columns `gene`, `value_1` and `value_2`.
#' @param ylim Length-2 numeric y-axis limits. Any length-1 value (the default
#'   `0`) means "use the range of the computed fold changes".
#' @return `NULL`, invisibly; the function is called for its plot.
draw_fig2 <- function(d2a, ylim=0){
  # Genes to highlight (kept local so the global gene lists are not shadowed
  # under a misleading name).
  highlight_genes <- c("EGFP", "CALCB", "IL32", "PTGES", "LHX2", "TNF", "CCL2")

  d2a$lfc <- log2((d2a$value_2 + 0.5) / (d2a$value_1 + 0.5))
  d2a$rpkmav <- log2((d2a$value_1 + 0.5) * (d2a$value_2 + 0.5)) / 2
  d2a$selected <- d2a$gene %in% highlight_genes
  d2a$color <- ifelse(d2a$selected, "red", "grey")

  # A length-1 ylim is the "auto" sentinel; ignore NA / infinite fold changes
  # so a single bad row cannot break the axis limits.
  if (length(ylim) == 1) {
    ylim <- range(d2a$lfc, finite = TRUE)
  }
  # with() keeps the axis labels as "rpkmav" / "lfc".
  with(d2a, plot(rpkmav, lfc, cex = 0.5, col = color, ylim = ylim))

  # text() raises "zero-length 'labels' specified" when nothing is selected,
  # so only label when at least one highlighted gene is present.
  if (any(d2a$selected)) {
    hits <- d2a[d2a$selected, , drop = FALSE]
    # pos = 4 places the label to the right of the point (it overrides adj).
    text(hits$rpkmav, hits$lfc, labels = hits$gene, pos = 4, cex = 0.5)
  }
  invisible(NULL)
}
#======================|
#  Fig.2A              |
#======================|
# Split gene_id at the dot into the id proper and a "ver" suffix, attach the
# biotype from the annotation, keep protein-coding genes, then plot.
d2a <- separate(diff34, gene_id, into = c("gene_id", "ver"), sep = "\\.")
d2a <- merge(d2a, ens[, c(1, 3)], by = "gene_id")
d2a <- d2a[d2a$gene_biotype == "protein_coding", ]
draw_fig2(d2a, c(-1, 8))

#======================|
#  Fig.2B              |
#======================|
# Compare value_2 of diff34 (renamed value_1) with value_2 of diff1314
# (renamed value_2), so draw_fig2() contrasts those two conditions directly.
d2b <- merge(
  diff34[, c("gene_id", "gene", "value_2")],
  diff1314[, c("gene_id", "value_2")],
  by = "gene_id"
)
colnames(d2b)[3:4] <- c("value_1", "value_2")
d2b <- separate(d2b, gene_id, into = c("gene_id", "ver"), sep = "\\.")
d2b <- merge(d2b, ens[, c(1, 3)], by = "gene_id")
d2b <- d2b[d2b$gene_biotype == "protein_coding", ]
draw_fig2(d2b)

#======================|
#  Fig.2C              |
#======================|
# Same preparation as Fig.2A, applied to the diff1112 comparison.
d2c <- separate(diff1112, gene_id, into = c("gene_id", "ver"), sep = "\\.")
d2c <- merge(d2c, ens[, c(1, 3)], by = "gene_id")
d2c <- d2c[d2c$gene_biotype == "protein_coding", ]
draw_fig2(d2c, c(-1, 4))

#======================|
#  Fig.2D              |
#======================|
d2d <- read.table("data-for-fig2d.txt", header = TRUE, sep = "\t")
# Keep the samples on the x axis in the order they appear in the file.
d2d$sam <- ordered(d2d$sam, levels = d2d$sam)
# Bars show the mean; the two replicates are overlaid as open circles.
ggplot(d2d, aes(x = sam)) +
  geom_col(aes(y = mean), width = 0.5) +
  geom_point(aes(y = rep1), fill = "white", shape = 21, size = 2) +
  geom_point(aes(y = rep2), fill = "white", shape = 21, size = 2) +
  # y axis from 0 to 800 with a tick every 100
  scale_y_continuous(
    limits = c(0, 800), name = "CALCB FPKM",
    breaks = seq(0, 800, 100), labels = seq(0, 800, 100)
  ) +
  theme_bw()

# Expression table across comparisons: v1/v2 are value_1/value_2 of diff34,
# v3 is value_2 of diff1112 and v4 is value_2 of diff1314.
RNA.lfc <- merge(
  diff34[, c("gene_id", "gene", "value_1", "value_2")],
  diff1112[, c("gene_id", "value_2")],
  by = "gene_id"
)
RNA.lfc <- merge(RNA.lfc, diff1314[, c("gene_id", "value_2")], by = "gene_id")
colnames(RNA.lfc)[3:6] <- c("v1", "v2", "v3", "v4")

# log2 fold change of v2, v3 and v4 against v1 (pseudocount 0.5).
RNA.lfc$lfc1 <- log2((RNA.lfc$v2 + 0.5) / (RNA.lfc$v1 + 0.5))
RNA.lfc$lfc2 <- log2((RNA.lfc$v3 + 0.5) / (RNA.lfc$v1 + 0.5))
RNA.lfc$lfc3 <- log2((RNA.lfc$v4 + 0.5) / (RNA.lfc$v1 + 0.5))

# Split gene_id at the dot, attach the biotype, keep protein-coding genes.
RNA.lfc <- separate(RNA.lfc, gene_id, into = c("gene_id", "ver"), sep = "\\.")
RNA.lfc <- merge(RNA.lfc, ens[, c(1, 3)], by = "gene_id")
RNA.lfc <- RNA.lfc[RNA.lfc$gene_biotype == "protein_coding", ]

#======================|
#  Fig.3H              |
#======================|
# Grouped bar chart of `val` per cell, filled by time point `tp`, one facet
# per gene, with the two individual values (val1, val2) overlaid as points.
d3h<- read.table("data-for-fig3H.txt", head=T, sep = "\t")
# Factor levels are taken from fixed row positions to preserve file order.
# NOTE(review): this assumes rows 1-4 hold the four distinct tp values, rows 1
# and 5 the two cells, and that a new gene starts every 8 rows (7 genes) -
# confirm against data-for-fig3H.txt.
d3h$tp   <- ordered(d3h$tp,   levels=d3h$tp[seq(1,4)])
d3h$cell <- ordered(d3h$cell, levels=d3h$cell[c(1,5)])
d3h$gene <- ordered(d3h$gene, levels=d3h$gene[c(1,9,17,25,33,41,49)])

ggplot(d3h,aes(x=cell, y=val, fill=tp))+
  geom_col(width=0.5, position = "dodge" )+
  #geom_errorbar(aes(x=cell, y=val, ymin=val1,ymax=val2))+
  # Only the first four colours are needed for the four tp levels; the
  # repetition just over-supplies values.
  scale_fill_manual(values= rep(c("green","blue","orange","red"),15))+
  # NOTE(review): no `values` are given and no colour aesthetic is mapped in
  # this plot, so this call looks like a no-op - confirm and consider removing.
  scale_color_manual("black")+
  # Dodge width matches the bar width so points sit on their own bar.
  geom_point(aes(y=val1),position = position_dodge(width = 0.5), shape=21, size=2)+
  geom_point(aes(y=val2),position = position_dodge(width = 0.5), shape=21, size=2)+
  scale_y_continuous(name="FPKM") +      
  # Free scales: each gene facet gets its own axis ranges.
  facet_wrap("gene", scales = "free")+
  theme_bw()

#======================|
#  Fig.4E              |
#======================|
# B vs B12d for all peaks, for peaks with B >= 0.5 and for peaks with B < 0.5.
# Outliers are not drawn.
with(p65.methyl, {
  is_high <- B >= 0.5
  is_low <- B < 0.5
  boxplot(B, B12d,
          B[is_high], B12d[is_high],
          B[is_low], B12d[is_low],
          outline = FALSE)
})

#======================|
#  Fig.4F              |
#======================|
#d4f <- read.table("c:/Users/zhang/Desktop/average_demethyl_around_methylated_p65_motif.result", head=F)
d4f <- read.table("raw-for-Fig4F.txt", header = FALSE)
# Columns V2, V3 and V4 are each divided by V5 before plotting.
d4f$d <- with(d4f, V4 / V5)
d4f$h <- with(d4f, V2 / V5)
d4f$l <- with(d4f, V3 / V5)
library(ggplot2)
# Three series against V1, each as points plus a loess curve:
# h (blue), l (green) and d (orange).
ggplot(d4f, aes(x = V1)) +
  geom_point(aes(y = h), color = "blue", alpha = 0.5) +
  geom_point(aes(y = l), color = "green", alpha = 0.5) +
  geom_point(aes(y = d), color = "orange", alpha = 0.5) +
  geom_smooth(aes(y = h), method = "loess", color = "blue") +
  geom_smooth(aes(y = l), method = "loess", color = "green") +
  geom_smooth(aes(y = d), method = "loess", color = "orange")

#======================|
#  Fig.4G              |
#======================|
# Six sample columns restricted to peaks with B >= 0.5; outliers not drawn.
# Inside with(), `T` is the data column named T, not the logical constant,
# so logical arguments are spelled out in full here.
with(p65.methyl, {
  keep <- B >= 0.5
  boxplot(B[keep], B12d[keep],
          P[keep], P12d[keep],
          T[keep], T12d[keep],
          outline = FALSE)
})
# Export the table behind Fig.4E and Fig.4G.
write.table(p65.methyl, "data-for-fig4eg.txt",
            sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

eRNA.12d <- read.table(
  "~/lab/zhaozd/figures/elife/revise/eRNA-at-peaks-12d.txt",
  header = TRUE
)
# NOTE(review): p65.methylated is not created in the visible part of this
# script - confirm where it is defined.
p65.methylated.eRNA <- merge(
  p65.methylated,
  eRNA.12d[, c("start", "eRNA.rpkm.12d")],
  by = "start"
)

#======================|
# Figure 4H, left part.|
#======================|
# B and B12d for peaks binned by eRNA.rpkm.12d: exactly 0, (0, 10] and > 10.
# which() drops rows where the bin condition is NA, as subset() would.
with(p65.methylated.eRNA, {
  bin_zero <- which(eRNA.rpkm.12d == 0)
  bin_low <- which(eRNA.rpkm.12d > 0 & eRNA.rpkm.12d <= 10)
  bin_high <- which(eRNA.rpkm.12d > 10)
  boxplot(B[bin_zero], B12d[bin_zero],
          B[bin_low], B12d[bin_low],
          B[bin_high], B12d[bin_high],
          ylim = c(0, 1))
})

#=======================|
# Figure 4H, right part.|
#=======================|
# Difference P - B and P12d - B12d for the same three eRNA.rpkm.12d bins as
# the left panel: exactly 0, (0, 10] and > 10.
# The bins are computed once, and the call no longer ends in a trailing comma:
# an empty last argument is passed into boxplot()'s `...` and makes it fail.
with(p65.methylated.eRNA, {
  diff_0 <- P - B
  diff_12d <- P12d - B12d
  # which() drops rows where the bin condition is NA, as subset() would.
  bin_zero <- which(eRNA.rpkm.12d == 0)
  bin_low <- which(eRNA.rpkm.12d > 0 & eRNA.rpkm.12d <= 10)
  bin_high <- which(eRNA.rpkm.12d > 10)
  boxplot(diff_0[bin_zero], diff_12d[bin_zero],
          diff_0[bin_low], diff_12d[bin_low],
          diff_0[bin_high], diff_12d[bin_high])
})

#=========================|
#  Fig.6A, target genes   |
#=========================|
# Long format: one row per peak and sample column (B ... T12d), restricted to
# peaks flagged "mem".
d6a <- gather(subset(p65.methyl, mem_t == "mem"), cell, val, B:T12d)
# Label each sample column by time point.
d6a$tnf <- NA
d6a$tnf[d6a$cell %in% c("B", "P", "T")] <- "0 h"
d6a$tnf[d6a$cell %in% c("B12d", "P12d", "T12d")] <- "12 d"
# Box plot per sample with jittered points; the axis shows val as a percentage.
ggplot(d6a, aes(x = cell, y = val, color = tnf)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(shape = 21) +
  scale_y_continuous(
    limits = c(0, 1), name = "Methylation (%)",
    breaks = seq(0, 100, 20) / 100, labels = seq(0, 100, 20)
  ) +
  theme_bw()

#====================#
#  Fig.6L, eRNA      #
#====================#
eRNA.wt <- read.table(
  "~/lab/zhaozd/figures/eRNA-data-on-p65-peaks.txt",
  sep = "\t", header = TRUE
)
# Joined on the columns the two tables share.
eRNA.wt <- merge(p65.methyl.f, eRNA.wt[, c("start", "lfc1", "lfc3")])
#LTR
#eRNA.wt <- merge(p65.methyl.f.LTR, eRNA.wt[,c("start", "lfc1", "lfc3")])

# One logical mask per totd quartile, stored as q4, q3, q2, q1.
p65.eRNA.tag <- lapply(
  c(q4 = 4, q3 = 3, q2 = 2, q1 = 1),
  function(k) eRNA.wt$totd.q == k
)

# Quartiles 1-3 are pooled as "q1-q3"; quartile 4 is kept separate.
# Rows without a quartile keep q = NA and are dropped below.
eRNA.wt$q <- NA
eRNA.wt$q[with(p65.eRNA.tag, q1 | q2 | q3)] <- "q1-q3"
eRNA.wt$q[p65.eRNA.tag$q4] <- "q4"

# Long format over the fold-change columns lfc1 ... lfc3.
d6i <- gather(eRNA.wt[!is.na(eRNA.wt$q), ], lfc_t, lfc_val, lfc1:lfc3)

ggplot(d6i, aes(x = q, y = lfc_val, color = lfc_t)) +
  #geom_boxplot()+
  geom_boxplot(outlier.shape = NA) +
  #geom_jitter(shape=21)+
  scale_y_continuous(name = "eRNA Log2 FC vs. 0 h ctrl") +
  #coord_cartesian(ylim = c(-1, 1.5))+
  theme_bw()

#=========================|
#  Fig.6M, target genes   |
#=========================|
# Sum totd per target gene (V8); the result has columns Group.1 and x.
totdByGenes <- aggregate(p65.methyl.f[, "totd"], list(p65.methyl.f$V8), sum)
# Quartile index of the per-gene sum (cut() defaults: lowest break exclusive).
totdByGenes$q <- cut(
  totdByGenes$x,
  quantile(totdByGenes$x, seq(0, 1, 0.25), names = FALSE),
  labels = FALSE
)

# One mask over RNA.lfc per quartile: is the gene among that quartile's targets?
p65.target.tag <- lapply(
  c(q4 = 4, q3 = 3, q2 = 2, q1 = 1),
  function(k) {
    is.element(RNA.lfc$gene, totdByGenes$Group.1[which(totdByGenes$q == k)])
  }
)

# Quartiles 1-3 are pooled as "q1-q3"; quartile 4 is kept separate.
RNA.lfc$q <- NA
RNA.lfc$q[with(p65.target.tag, q1 | q2 | q3)] <- "q1-q3"
RNA.lfc$q[p65.target.tag$q4] <- "q4"

# Long format over lfc1 and lfc3 only.
d6J <- gather(RNA.lfc[!is.na(RNA.lfc$q), ], lfc_t, lfc_val, lfc1, lfc3)

ggplot(d6J, aes(x = q, y = lfc_val, color = lfc_t)) +
  #geom_boxplot()+
  geom_boxplot(outlier.shape = NA) +
  #geom_jitter(shape=21)+
  scale_y_continuous(name = "Log2 FC vs. 0 h ctrl") +
  coord_cartesian(ylim = c(-0.5, 1)) +
  theme_bw()

#==============================|
#  Fig.6N, target by distances |
#==============================|
# lfc1 and lfc3 of genes targeted by top-quartile totd peaks, split by
# |V10| > 10000 versus |V10| < 10000. Outliers are not drawn.
local({
  is_far <- is.element(RNA.lfc$gene, subset(totd1.4th, abs(V10) > 10000)$V8)
  is_near <- is.element(RNA.lfc$gene, subset(totd1.4th, abs(V10) < 10000)$V8)
  boxplot(RNA.lfc$lfc1[is_far],
          RNA.lfc$lfc3[is_far],
          RNA.lfc$lfc1[is_near],
          RNA.lfc$lfc3[is_near],
          outline = FALSE)
})

#======================|
#  Fig.7B              |
#======================|
d7b <- read.table("data-for-fig7B.txt", header = TRUE, sep = "\t")
# Bar height is the mean of the two values; both are also drawn as points.
d7b$val <- (d7b$val1 + d7b$val2) / 2
ggplot(d7b, aes(x = cell, y = val, fill = TNF)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = rep(c("green", "blue", "pink"), 12)) +
  # NOTE(review): no `values` given and no colour aesthetic mapped; this call
  # looks like a no-op - confirm.
  scale_color_manual("black") +
  #geom_errorbar(aes(ymin=val1, ymax=val2),position = position_dodge(0.9), width = .2)+
  geom_point(aes(y = val1), position = position_dodge(0.9), shape = 21, size = 2) +
  geom_point(aes(y = val2), position = position_dodge(0.9), shape = 21, size = 2) +
  #scale_fill_manual("white")+
  facet_wrap("gene", scales = "free") +
  theme_bw()
