# ---
# title: "MSID_2550"
# author: "Proteomics facility MPI-BP, MPI-BR"
# ---

#### dependencies ------------------------------------------------
library(patchwork)
library(tidyverse)
library(ggrepel)
library(pheatmap)
library(RColorBrewer)

#### data import and tidying data tables ---------------------------
# DIA-NN output matrices (already FDR-filtered by the search engine's command
# line tool). Change the file names to a dedicated path or set the working
# directory beforehand.

# Run column names in file order: A1..E1, A2..E2, A3..E3, A_ctrl..E_ctrl
run_names <- c(paste0(rep(LETTERS[1:5], times = 3), rep(1:3, each = 5)),
               paste0(LETTERS[1:5], "_ctrl"))

# protein group table = pg; rows of MS contaminants (cRAP) are dropped
pg <- read.delim("report.pg_matrix.tsv") %>%
  filter(!grepl("cRAP-", Protein.Group))

# precursor table = prec; rows of MS contaminants (cRAP) are dropped, then
# only proteotypic/unique peptides are kept (rows whose Proteotypic value
# contains "0" are removed)
prec <- read.delim("report.pr_matrix.tsv") %>%
  filter(!grepl("cRAP-", Protein.Names)) %>%
  filter(!grepl("0", Proteotypic))

# rename columns by position: annotation columns first, then the runs
names(pg) <- c("Protein.Group", "Protein.Names", "Genes", "Description",
               run_names)
names(prec) <- c("Protein.Group", "Protein.Ids", "Protein.Names", "Genes",
                 "Description", "Proteotypic", "Stripped.Sequence",
                 "Modified.Sequence", "Charge", "ID",
                 run_names)
rm(run_names)
 
 
 ### OUTLIER REMOVAL(A3-C2, E2-D2) ---------------------
 # Runs previously identified as outliers by PCA analysis are dropped from
 # both the protein group table and the precursor table.
 pg <- pg[, !(colnames(pg) %in% c("A3", "C2", "E2", "D2")), drop = FALSE]
 prec <- prec[, !(colnames(prec) %in% c("A3", "C2", "E2", "D2")), drop = FALSE]
 
 ### Protein ID count --------------------
 # Intensity columns only (the first four columns of pg are annotation),
 # reordered so that runs sharing a letter prefix are adjacent and the
 # *_ctrl runs come last.
 prot <- pg[, 5:ncol(pg)] ### CHANGE FOR GLOBAL OR FILTER
 prot <- prot[, c("A1", "A2", "B1", "B2", "B3", "C1", "C3", "D1", "D3",
                  "E1", "E3",
                  "A_ctrl", "B_ctrl", "C_ctrl", "D_ctrl", "E_ctrl")]

 # Count valid (non-missing) values per column. is.na() is used instead of a
 # comparison against the string "NA", which only worked through implicit
 # coercion and subset() dropping NA rows.
 prot_count <- unname(colSums(!is.na(prot)))

 # define colors for plots: one entry per column of prot, in column order
 cols <- c(rep("#66C2A5", 2), rep("#FDAE61", 3), rep("#8DA0CB", 2),
           rep("#E78AC3", 2), rep("#80B1D3", 2), rep("grey", 5))

 # barplot of IDs
 tmp <- data.frame(name = colnames(prot), value = prot_count)
 A <- ggplot(tmp, aes(x = name, y = value)) +
   geom_bar(stat = "identity", colour = "black", fill = cols) +
   scale_x_discrete(limits = colnames(prot)) +
   theme_classic() +
   theme(legend.position = "none", axis.text = element_text(colour = "black")) +
   ggtitle("") +
   ylab("protein groups") + xlab("") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

 # long format (one row per protein group and run) for the intensity boxplot;
 # the factor levels keep the runs in the column order of prot
 prot_long <- pivot_longer(prot, cols = seq_len(ncol(prot)),
                           names_to = "condition", values_to = "intensity")
 prot_long$condition <- factor(prot_long$condition, levels = colnames(prot))

 # boxplot of log2 protein intensities per run
 B <- ggplot(data = prot_long, aes(y = log2(intensity), x = condition)) +
   geom_boxplot(size = 0.7, fill = cols) +
   ylab("log2 protein intensity") + xlab("") +
   theme_classic() +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
         legend.position = "none", axis.text = element_text(colour = "black"))
 rm(prot_long)


 # dev.new() instead of windows(): windows() only exists on Windows.
 # noRStudioGD = TRUE because the RStudio device ignores width/height.
 dev.new(width = 8, height = 5, noRStudioGD = TRUE)
 (A | B)
 
 
 ### Pearson Correlation ------------------------------------------
 # Pairwise Pearson correlation of log2 intensities between runs (the first
 # four columns of pg are annotation and are skipped).
 tmp.cor <- pg[, 5:ncol(pg)]

 # Colour breaks from 0 to 1 in steps of 0.01.
 # NOTE(review): coefficients below 0 fall outside these breaks - confirm that
 # negative correlations are not expected for this data set.
 breaksList <- seq(0, 1, by = 0.01)
 pheatmap(cor(log2(tmp.cor), use = "pairwise.complete.obs"),
          main = "Protein correlation (Pearson Coefficient)",
          show_colnames = TRUE,
          show_rownames = TRUE,
          cluster_rows = TRUE, cluster_cols = TRUE,
          # pheatmap expects one colour fewer than there are breaks
          color = colorRampPalette(brewer.pal(9, "BuPu"))(length(breaksList) - 1),
          breaks = breaksList,
          cellwidth = 10, cellheight = 10,
          fontsize = 7, fontsize_row = 7, fontsize_col = 7,
          # full argument name; "border_col" only worked via partial matching
          border_color = "black",
          treeheight_row = 35,
          treeheight_col = 35
 )

 rm(tmp.cor)


#Principal component analysis -------------------------------------

# log2 intensity table with protein groups as row names and runs as columns;
# only protein groups with a value in every run are kept
PCA_mat <- pg
rownames(PCA_mat) <- PCA_mat$Protein.Group
PCA_mat$Protein.Group <- NULL
PCA_mat$Genes <- NULL
PCA_mat$Description <- NULL
PCA_mat$Protein.Names <- NULL
PCA_mat <- na.omit(log2(PCA_mat))

# perform principal component analysis on the transposed matrix
# (input of prcomp: sample = rows). Missing values were already removed
# above, so no second na.omit() is needed. "scale." is the full argument
# name; "scale" only worked through partial matching.
pca <- prcomp(t(as.matrix(PCA_mat)), scale. = FALSE)

# variance per component (square of the standard deviation) and its share of
# the total variance in percent
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var / sum(pca.var) * 100, 1)

# generate a dataframe from the info to use for plotting (scree plot)
pca.var.data <- data.frame(Component = seq_along(pca.var.per),
                           Y = seq_along(pca.var.per),
                           X = pca.var.per)

# format for ggplot: scores of the first three components per run
pca.data <- data.frame(Sample = rownames(pca$x),
                       X = pca$x[, 1],
                       Y = pca$x[, 2],
                       Z = pca$x[, 3])

# Assign PCA groups from the run names instead of a positional vector, so the
# grouping stays correct when the set of removed outlier runs changes:
# "*_ctrl" runs -> "ctrl", otherwise the run name without its trailing
# replicate number (e.g. "B3" -> "B").
pca.data$Group <- ifelse(grepl("_ctrl$", pca.data$Sample),
                         "ctrl",
                         sub("[0-9]+$", "", pca.data$Sample))


#barplot(pca.var.per, main = "Scree plot", xlab = "Principal component", ylab = "Percent variation")
A <- ggplot(data = pca.var.data, aes(x = as.factor(Y), y = X)) +
  geom_bar(stat = "identity") +
  ylab("Percentage of variation") +
  xlab("Principal component") +
  #ggtitle("Scree plot: Variance per PC")+
  theme_classic() +
  theme(legend.position = "none", axis.text = element_text(colour = "black"))

# define colors for plots; named so that the group -> colour mapping does not
# depend on the locale-specific sort order of "ctrl" relative to "D"/"E"
cols <- c(A = "#66C2A5", B = "#FDAE61", C = "#8DA0CB", ctrl = "grey",
          D = "#E78AC3", E = "#80B1D3")

B <- ggplot(data = pca.data, aes(x = X, y = Y, label = Sample)) +
  geom_point(aes(col = Group), size = 5) +
  geom_point(shape = 1, size = 5, colour = "black") +
  xlab(paste0("PC1 - ", pca.var.per[1], "%")) +
  ylab(paste0("PC2 - ", pca.var.per[2], "%")) +
  geom_label_repel(size = 3) +
  scale_color_manual(values = cols) +
  #ggtitle("PCA: log2 intensity")+
  theme_classic() +
  theme(legend.position = "none", axis.text = element_text(colour = "black"))


# dev.new() instead of windows(): windows() only exists on Windows.
# noRStudioGD = TRUE because the RStudio device ignores width/height.
dev.new(width = 8, height = 5, noRStudioGD = TRUE)
(B | A)
rm(A, B, PCA_mat, pca.data, pca, pca.var, pca.var.data)



 
### Candidate-boxplots based on PRECURSORS --------------------------------------------------------------------------------------------------
tmp <- prec
## Which protein candidate do you want to plot?
p <- "P04637;P04637-4" # INSERT PROTEIN ID HERE
# make sure this selects one row of the dataframe - won't work if multiple isoforms are selected
test <- filter(tmp, str_detect(Protein.Group, p))

## Check test-table for identified peptide-precursor and decide which sequence and charge state you want to plot?
i <- "" # INSERT PEPTIDE SEQUENCE and charge state
# An empty string means "no precursor filter". It is not passed to
# str_detect() because newer stringr releases reject an empty pattern.
if (nzchar(i)) {
  test <- filter(test, str_detect(ID, i))
}

# The reshaping below assumes exactly one selected precursor row.
if (nrow(test) != 1) {
  warning("Expected exactly one precursor row for '", p, "' / '", i,
          "', found ", nrow(test),
          "; refine the protein ID or the precursor ID.", call. = FALSE)
}

## keep only the intensity columns of the selected precursor
## (columns 1-10 of prec are annotation, 11-26 are the remaining runs)
test <- test[, 11:26]
# create a data frame to plot in ggplot (transpose from wide to long format); name new columns and conditions
test <- as.data.frame(t(test))
colnames(test) <- c("Intensity")

# condition labels in the column order of prec after outlier removal
test$Condition <- as.factor(c("A-D40p53a", "B-D133p53a", "C-D133p53b", "D-TAp53b", "E-TAp53g",
                              "A-D40p53a", "B-D133p53a",
                              "B-D133p53a", "C-D133p53b", "D-TAp53b", "E-TAp53g",
                              "ctrl", "ctrl", "ctrl", "ctrl", "ctrl"))

# plot boxplot of said candidate for all conditions
# dev.new() instead of windows(): windows() only exists on Windows.
# noRStudioGD = TRUE because the RStudio device ignores width/height.
dev.new(width = 3, height = 5, noRStudioGD = TRUE)
ggplot(data = test, aes(y = log2(Intensity), x = Condition, col = Condition)) +
  geom_boxplot(aes(Condition), size = 1, outlier.shape = NA) +
  scale_x_discrete(limits = c("A-D40p53a", "B-D133p53a", "C-D133p53b", "D-TAp53b", "E-TAp53g", "ctrl")) +
  scale_color_manual(values = c(rep(c("#66C2A5", "#FDAE61", "#8DA0CB", "grey", "#E78AC3", "#80B1D3"), 1))) +
  geom_jitter(color = "black", shape = 16, position = position_jitter(0.2), size = 2) +
  xlab("") + ylab("log2 precursor intensity") +
  theme_classic() + theme(axis.text.x = element_text(angle = 90)) +
  ggtitle(paste(p, i)) + ylim(10, 25) +
  theme(legend.position = "none", text = element_text(size = 10), axis.text = element_text(colour = "black"))

### WRITE OUTPUT FILES-------
# The candidate table is written BEFORE the temporary objects are removed.
# Previously rm(i, p, test) ran first, so write.csv(test, ...) stopped with
# "object 'test' not found" and none of the files below was written.
# NOTE(review): `test` holds the plotted Intensity/Condition table at this
# point - confirm that this is the intended content of the TP53 file.
write.csv(test, file="TP53_precursor_unique_contam-filtered.csv")
write.csv(prec, file="precursor_all_unique_contam-filtered.csv")
write.csv(pg, file="protein_all_contam-filtered.csv")
rm(i, p, test)


### Candidate-boxplots based on PROTEINS --------------------------------------------------------------------------------------------------
tmp <- pg
## Which protein candidate do you want to plot?
i <- "P04637;P04637-4" # INSERT PROTEIN ID HERE
# make sure this selects one row of the dataframe - won't work if multiple isoforms are selected
test <- filter(tmp, str_detect(Protein.Group, i))

# The reshaping below assumes exactly one selected protein group row.
if (nrow(test) != 1) {
  warning("Expected exactly one protein group row for '", i,
          "', found ", nrow(test), "; refine the protein ID.", call. = FALSE)
}

## keep only the intensity columns of the protein of interest
## (columns 1-4 of pg are annotation, 5-20 are the remaining runs)
test <- test[, 5:20]

# create a data frame to plot in ggplot (transpose from wide to long format); name new columns and conditions
test <- as.data.frame(t(test))
colnames(test) <- c("Intensity")
# condition labels in the column order of pg after outlier removal
test$Condition <- as.factor(c("A-D40p53a", "B-D133p53a", "C-D133p53b", "D-TAp53b", "E-TAp53g",
                              "A-D40p53a", "B-D133p53a",
                              "B-D133p53a", "C-D133p53b", "D-TAp53b", "E-TAp53g",
                              "ctrl", "ctrl", "ctrl", "ctrl", "ctrl"))
# plot boxplot of said candidate for all conditions
# dev.new() instead of windows(): windows() only exists on Windows.
# noRStudioGD = TRUE because the RStudio device ignores width/height.
dev.new(width = 3, height = 5, noRStudioGD = TRUE)
ggplot(data = test, aes(y = log2(Intensity), x = Condition, col = Condition)) +
  geom_boxplot(aes(Condition), size = 1, outlier.shape = NA) +
  scale_x_discrete(limits = c("A-D40p53a", "B-D133p53a", "C-D133p53b", "D-TAp53b", "E-TAp53g", "ctrl")) +
  scale_color_manual(values = c(rep(c("#66C2A5", "#FDAE61", "#8DA0CB", "grey", "#E78AC3", "#80B1D3"), 1))) +
  geom_jitter(color = "black", shape = 16, position = position_jitter(0.2), size = 2) +
  xlab("") + ylab("log2 protein intensity") +
  theme_classic() + theme(axis.text.x = element_text(angle = 90)) +
  ggtitle(i) + ylim(10, 25) +
  theme(legend.position = "none", text = element_text(size = 10), axis.text = element_text(colour = "black"))
rm(i, test)




