# Load libraries
library("DESeq2")
library("pheatmap")
library("ggplot2")
library("dplyr")
library("ggrepel")

# Read the raw count table (no header row); the first column carries the
# gene identifiers and the remaining columns are the per-sample counts.
read.counts <- read.table(
  "Drosophila_Esg_Read_Counts.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Promote the identifier column to row names, then drop it from the data.
rownames(read.counts) <- read.counts[[1]]
read.counts <- read.counts[-1]
# Sample labels: replicate number followed by "-" or "+".
colnames(read.counts) <- paste0(rep(1:3, times = 2), rep(c("-", "+"), each = 3))
# Per-sample metadata: replicate id (Dm) and condition, keyed by sample label.
sample.info <- data.frame(
  Dm = rep(c("1", "2", "3"), times = 2),
  condition = rep(c("Negative", "Positive"), each = 3),
  row.names = colnames(read.counts)
)

# Differential Expression analysis
# Build the DESeq2 data set; the design models replicate (Dm) and condition.
DESeq.ds <- DESeqDataSetFromMatrix(
  countData = read.counts,
  colData = sample.info,
  design = ~ Dm + condition
)

# Gene filtering. Both filters subset ROWS, so the trailing comma is written
# explicitly in each (the second filter previously relied on single-index
# subsetting being interpreted as row selection).
# 1) drop genes with zero counts across all samples
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
# 2) keep genes with a non-zero count in at least 3 samples
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds) > 0) >= 3, ]

# Size-factor normalisation and log2(x + 1) transform; log.norm.counts is
# used by the QC plots and heatmaps further down.
DESeq.ds <- estimateSizeFactors(DESeq.ds)
counts.sf_normalized <- counts(DESeq.ds, normalized = TRUE)
log.norm.counts <- log2(counts.sf_normalized + 1)

# Inspect the condition factor, then make "Negative" its reference level.
str(colData(DESeq.ds)$condition)
colData(DESeq.ds)$condition <- relevel(colData(DESeq.ds)$condition, "Negative")

# Run the DESeq2 pipeline with fitType = "local" for the dispersion fit.
DESeq.ds <- DESeq(DESeq.ds, fitType = "local")

# Extract the results table with Benjamini-Hochberg adjusted p-values
DGE.results <- results(DESeq.ds, pAdjustMethod = "BH")
summary(DGE.results)
head(DGE.results)

# Tabulate genes by whether their adjusted p-value is below 0.05
table(DGE.results$padj < 0.05)

# Order the results by adjusted p-value, keep the names of the significant
# genes (padj < 0.05), and export the full sorted table.
DGE.results.sorted <- DGE.results[order(DGE.results$padj), ]
DGEgenes <- rownames(subset(DGE.results.sorted, padj < 0.05))
All_DE_genes <- as.data.frame(DGE.results.sorted)
# NOTE(review): the .rnk file receives the same comma-separated table as the
# .csv file; confirm this is the intended format for downstream tools.
invisible(lapply(
  c("Drosophila_Esg_DE_Gene_List.csv", "Drosophila_Esg_DE_Gene_List.rnk"),
  function(out.path) write.csv(All_DE_genes, out.path)
))

# Quality control of the RNA-seq analysis
# Each plot is written to its own single-page PDF.

# Histogram of the raw p-value distribution
pdf(file = "Drosophila_Esg_Histogram.pdf", onefile = FALSE)
hist(
  DGE.results$pvalue,
  col = "grey",
  border = "white",
  xlab = "",
  ylab = "",
  main = "frequencies of p-values"
)
dev.off()

# Per-sample boxplot of the log2-transformed normalised counts
pdf(file = "Drosophila_Esg_Boxplot.pdf", onefile = FALSE)
boxplot(
  log.norm.counts,
  notch = FALSE,
  main = "log2-transformed read counts",
  ylab = "log2(read counts)"
)
dev.off()

# Sample dendrogram: hierarchical clustering on 1 - Pearson correlation
pdf(file = "Drosophila_Esg_Dendrogram.pdf", onefile = FALSE)
distance.log <- as.dist(1 - cor(log.norm.counts, method = "pearson"))
plot(
  hclust(distance.log),
  labels = colnames(log.norm.counts),
  main = "log2 transformed read counts\ndistance:Pearson correlation"
)
dev.off()

# MA plot of the differential expression results (alpha = 0.05)
pdf(file = "Drosophila_Esg_MAplot.pdf", onefile = FALSE, useDingbats = FALSE)
plotMA(
  DGE.results,
  alpha = 0.05,
  main = "esg- vs esg+",
  ylim = c(-15, 15)
)
dev.off()

# Heatmap plot of Top 50 differentially expressed genes
# DGEgenes is ordered by adjusted p-value. head() returns at most 50 names
# and, unlike DGEgenes[1:50], never pads with NA when fewer than 50 genes are
# significant (NA row names would make the matrix lookup below fail).
DGE_Top <- head(DGEgenes, 50)
# drop = FALSE keeps a matrix even when a single gene is selected.
hm.mat_DGEgenes <- log.norm.counts[DGE_Top, , drop = FALSE]
# The heatmap clusters its rows, so it is only drawn for two or more genes.
if (nrow(hm.mat_DGEgenes) >= 2) {
  pdf(file = "Drosophila_Esg_Top50_DEGenes_Heatmap.pdf", onefile = FALSE)
  pheatmap(hm.mat_DGEgenes, clustering_method = "average", scale = "row", fontsize = 7)
  dev.off()
} else {
  message("Fewer than two significant genes; skipping the top-50 heatmap.")
}

# Heatmap of markers based on Doupe et al., 2018 DamID paper
# Read the marker table once and derive both marker lists from it.
damid.markers <- read.delim("Drosophila_Perrimon_DamID_Markers.csv", header = TRUE, sep = ",")

# Markers with FDR.esg < 0.01 and FDR.myo > 0.01, ordered by FDR.esg
Cell_clusters <- subset(damid.markers, FDR.esg < 0.01 & FDR.myo > 0.01)
Cell_clusters <- Cell_clusters[order(Cell_clusters$FDR.esg), ]
Cell_clusters <- as.character(Cell_clusters$name)

# Markers with FDR.myo < 0.01 and FDR.esg > 0.01, ordered by FDR.myo
Cell_clusters_2 <- subset(damid.markers, FDR.myo < 0.01 & FDR.esg > 0.01)
Cell_clusters_2 <- Cell_clusters_2[order(Cell_clusters_2$FDR.myo), ]
Cell_clusters_2 <- as.character(Cell_clusters_2$name)

# First list followed by the second; this is the row order of the heatmap.
Cell_clusters <- c(Cell_clusters, Cell_clusters_2)

# Keep only the markers that are differentially expressed, preserving order.
DE_markers <- Cell_clusters[Cell_clusters %in% DGEgenes]
# drop = FALSE keeps a matrix even when only one marker remains.
DE_markerheatmap <- log.norm.counts[DE_markers, , drop = FALSE]
if (length(DE_markers) > 0) {
  pdf(file = "Drosophila_Esg_Perrimon_DamID_Markers_all_Heatmap.pdf", onefile = FALSE, useDingbats = FALSE)
  pheatmap(DE_markerheatmap, cluster_rows = FALSE, cluster_cols = FALSE, scale = "row", fontsize = 2)
  dev.off()
} else {
  message("No DamID markers are differentially expressed; skipping the heatmap.")
}

# Heatmap of marker genes based on Hung et al., PNAS single cell RNA-seq paper
# The marker file has no header row; column V4 is taken as the marker names.
Cell_clusters <- read.delim("Gut_marker_scRNA-seq_ ROC_avg_diff_0.75_Major_Celltypes.txt", header = FALSE)
Cell_clusters <- as.character(Cell_clusters$V4)

# Keep only the markers that are differentially expressed, preserving the
# order in which they appear in the marker file.
DE_markers <- Cell_clusters[Cell_clusters %in% DGEgenes]
# drop = FALSE keeps a matrix even when only one marker remains.
DE_markerheatmap <- log.norm.counts[DE_markers, , drop = FALSE]
if (length(DE_markers) > 0) {
  pdf(file = "Drosophila_Esg_Perrimon_scRNAseq_Markers_all_Heatmap.pdf", onefile = FALSE, useDingbats = FALSE)
  pheatmap(DE_markerheatmap, cluster_rows = FALSE, cluster_cols = FALSE, scale = "row", fontsize = 1)
  dev.off()
} else {
  message("No scRNA-seq markers are differentially expressed; skipping the heatmap.")
}

