# Load libraries
library("DESeq2")
library("pheatmap")
library("ggplot2")
library("dplyr")
library("ggrepel")
library('biomaRt')

# Load Data
# Raw count table without a header line: column 1 (V1) holds the Ensembl gene
# IDs, the remaining columns hold the per-sample read counts.
read.counts <- read.table("Chick_Fgf8_Read_Counts.txt", header = FALSE, stringsAsFactors = FALSE)
id <- read.counts$V1

# Look up the gene symbol of every Ensembl ID in the chicken gene annotation.
mart <- useDataset("ggallus_gene_ensembl", useMart("ensembl"))
gene_list <- getBM(filters = "ensembl_gene_id", attributes = c("ensembl_gene_id", "external_gene_name"), values = id, mart = mart)

# Keep at most the first 24356 rows of the count table.
# NOTE(review): presumably this removes trailing non-gene summary rows of the
# count file -- confirm against the input. seq_len(min(...)) avoids creating
# NA rows when the file is shorter than expected.
read.counts <- read.counts[seq_len(min(24356, nrow(read.counts))), ]

# Attach gene symbols by joining on the Ensembl ID. getBM() neither guarantees
# the order of its result nor a hit for every queried ID, so assigning its
# output by position could label count rows with the wrong gene.
row.ids <- read.counts$V1
gene.symbols <- gene_list$external_gene_name[match(row.ids, gene_list$ensembl_gene_id)]
# Genes without a (non-empty) symbol keep their Ensembl ID as the row name.
no.symbol <- is.na(gene.symbols) | !nzchar(gene.symbols)
gene.symbols[no.symbol] <- row.ids[no.symbol]
gene_list <- make.names(gene.symbols, unique = TRUE)

row.names(read.counts) <- gene_list
# Drop the ID column; only the count columns remain.
read.counts <- read.counts[, -1, drop = FALSE]
# The sample names below describe exactly six count columns.
stopifnot(ncol(read.counts) == 6)
names(read.counts) <- c("Fgf8-_1", "Fgf8-_2", "Fgf8-_3", "Fgf8+_1", "Fgf8+_2", "Fgf8+_3")
# Sample sheet: one row per count column, giving the animal (Chick) and the
# Fgf8 status (condition) of each sample.
sample.info <- data.frame(Chick = c("1", "2", "3", "1", "2", "3"), condition = c(rep("Negative", 3), rep("Positive", 3)), row.names = names(read.counts))

# Differential Expression analysis
# Build the DESeq2 data set; the design models the condition effect while
# accounting for the animal (Chick) each sample comes from.
DESeq.ds <- DESeqDataSetFromMatrix(
  countData = read.counts,
  colData = sample.info,
  design = ~ Chick + condition
)

# Discard genes that have no reads in any sample.
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]

# Size-factor normalisation, then log2(x + 1) transformed counts for plotting.
DESeq.ds <- estimateSizeFactors(DESeq.ds)
counts.sf_normalized <- counts(DESeq.ds, normalized = TRUE)
log.norm.counts <- log2(counts.sf_normalized + 1)

# Show the condition factor, make "Negative" its reference level, and fit
# the model.
str(DESeq.ds$condition)
DESeq.ds$condition <- relevel(DESeq.ds$condition, ref = "Negative")
DESeq.ds <- DESeq(DESeq.ds)

# Obtaining Results
# Extract the results table with Benjamini-Hochberg adjusted p-values and
# print an overview of it.
DGE.results <- results(DESeq.ds, pAdjustMethod = "BH")
summary(DGE.results)
head(DGE.results)

# Number of genes with an adjusted p-value below / not below 0.05
table(DGE.results$padj < 0.05)

# Order all genes by adjusted p-value, collect the names of those with
# padj < 0.05, and write the complete sorted table to a csv file
DGE.results.sorted <- DGE.results[order(DGE.results$padj), ]
# which() skips genes whose padj is NA, as subset() would.
DGEgenes <- rownames(DGE.results.sorted)[which(DGE.results.sorted$padj < 0.05)]
All_DE_genes <- as.data.frame(DGE.results.sorted)
write.csv(All_DE_genes, "Chick_Fgf8_DE_Gene_List.csv")

# Heatmap of Top 50 differentially expressed genes
# head() returns at most 50 names; indexing with 1:50 would pad the vector
# with NA (and break the matrix lookup) when fewer genes pass padj < 0.05.
DGE_Top <- head(DGEgenes, 50)
# drop = FALSE keeps a matrix even if only a single gene is selected.
hm.mat_DGEgenes <- log.norm.counts[DGE_Top, , drop = FALSE]
if (length(DGE_Top) > 0) {
  pdf(file = "Chick_Fgf8_Top50_DEGenes_Heatmap.pdf", onefile = FALSE)
  # Rows stay in order of significance; values are scaled per gene.
  pheatmap(hm.mat_DGEgenes, cluster_rows = FALSE, cluster_cols = FALSE, scale = "row")
  dev.off()
} else {
  # Skip before opening the device so no PDF device is left open.
  message("No genes with padj < 0.05; skipping Chick_Fgf8_Top50_DEGenes_Heatmap.pdf")
}

# Heatmap of Top 50 enriched genes in the FGF8+ population
# Genes with a positive log2 fold change, still ordered by adjusted p-value.
# NOTE(review): this list is not restricted to padj < 0.05 -- confirm that
# including non-significant genes is intended.
Fgf8.Pos <- rownames(subset(DGE.results.sorted, log2FoldChange > 0))
DGE_Top <- head(Fgf8.Pos, 50)
hm.mat_DGEgenes <- log.norm.counts[DGE_Top, , drop = FALSE]
if (length(DGE_Top) > 0) {
  pdf(file = "Chick_Fgf8_Top50_Fgf8_Enriched_DEGenes_Heatmap.pdf", onefile = FALSE)
  # Samples (columns) are clustered with average linkage; gene order is kept.
  pheatmap(hm.mat_DGEgenes, clustering_method = "average", scale = "row", cluster_rows = FALSE)
  dev.off()
} else {
  message("No genes with log2FoldChange > 0; skipping Chick_Fgf8_Top50_Fgf8_Enriched_DEGenes_Heatmap.pdf")
}

# DE gene list of Top 500 most enriched genes in the FGF8+ population
# head() avoids NA entries in the file when fewer than 500 genes qualify.
Fgf8.Pos.Top <- head(Fgf8.Pos, 500)
write.csv(Fgf8.Pos.Top, "Chick_Fgf8_Enriched_Genes.csv")

# Quality control of the RNA-seq analysis
# Each plot below is written to its own PDF file.

# Histogram of the raw (unadjusted) p-values
pdf(file = "Chick_Fgf8_Histogram.pdf", onefile = FALSE)
hist(
  DGE.results$pvalue,
  col = "grey",
  border = "white",
  xlab = "",
  ylab = "",
  main = "frequencies of p-values"
)
dev.off()

# Notched boxplot of the log2-transformed normalised counts, one box per sample
pdf(file = "Chick_Fgf8_Boxplot.pdf", onefile = FALSE)
boxplot(
  log.norm.counts,
  notch = TRUE,
  main = "log2-transformed read counts",
  ylab = "log2(read counts)"
)
dev.off()

# Dendrogram of the samples, using 1 - Pearson correlation as the distance
distance.log <- as.dist(1 - cor(log.norm.counts, method = "pearson"))
pdf(file = "Chick_Fgf8_Dendrogram.pdf", onefile = FALSE)
plot(
  hclust(distance.log),
  labels = colnames(log.norm.counts),
  main = "log2 transformed read counts\ndistance:Pearson correlation"
)
dev.off()

# MA plot of the results, highlighting genes at an adjusted p-value of 0.05
pdf(file = "Chick_Fgf8_MAplot.pdf", onefile = FALSE, useDingbats = FALSE)
plotMA(
  DGE.results,
  alpha = 0.05,
  main = "FGF8- vs FGF8+",
  ylim = c(-15, 15)
)
dev.off()