# Relate methylation modules to traits
# model: "y ~ OUT+AGE+SEX+i.REGION+HOURS+EDU+MAR+covariates+sv"

# covariates:
# smoking (continuous): smk_con ; 
# alcohol consumption (continuous): drk_con; 
# diet (0-6 score): diet_6score;
# PA (met continuous): cut met at tertile; 
# BMI (continuous): bmi_calc
####################################
# gen smk_con=0 if smoking_category == 1 | smoking_category == 2
# replace smk_con=0 if smoking_category == 3 & (smoking_stopped_reason > 0 & smoking_stopped_reason <= 4)
# replace smk_con=cig_equiv_day if (smoking_category == 4 | (smoking_category == 3 & smoking_stopped_reason == 0))
# gen drk_con=total_alc_typ_day_g if alcohol_7groups!=3
# replace drk_con=total_alc_typ_day_g/7*1.5 if alcohol_7groups==3 & alc_weekly==0
# replace drk_con=total_alc_typ_day_g/7*4 if alcohol_7groups==3 & alc_weekly==1
####################################
# sample: 980

# 2019-04-07
# SJH


#2019-03-04
# Si JH
# ---- Setup: load the saved WGCNA objects and assemble the covariate table ----
# NOTE(review): rm(list = ls()) and setwd() tie this script to one machine and
# one session; they are kept so existing runs behave as before.
rm(list=ls())
# NOTE(review): this path ends in ".R" and looks like a script file rather than
# a directory -- confirm it really is a directory, otherwise setwd() fails.
setwd("C:/Users/sijia/Desktop/current working dictionary/methylation/code/20190404 WGCNA/03.relateMEtoPhenotype/task43-0405relateMEtoPhenotype.R")
# The code below uses datTraits, pheno, MEs, t_bn_top_20k and moduleColors
# without defining them, so they are presumably restored by this load() --
# TODO confirm against the step that wrote the .RData file.
load("0405_step3_GS_MM_info.RData")
rownames(datTraits) <- rownames(pheno)
dim(datTraits) #phenotype [1] 980  21
dim(MEs) #module [1] 980  6
#case_index
# status <- read.csv("C:/Users/sijia/Desktop/current working dictionary/methylation/sample sheet/status.csv",as.is=T)
# rownames(status) <- status$status_1
# status980 <- status[rownames(datTraits),]
# datTraits <- as.data.frame(datTraits)
# datTraits <- datTraits[order(rownames(datTraits)),]
# status980 <- status980[order(rownames(status980)),]

# Samples must be in the same order in the trait table and the methylation
# matrix. Fail loudly instead of printing a count that nobody reads in a
# non-interactive run.
stopifnot(identical(rownames(datTraits), rownames(t_bn_top_20k)))
# sum(rownames(datTraits)!=rownames(status980)) #0
# datTraits <- cbind(datTraits, status980$case_index)


batch <- read.csv("C:/Users/sijia/Desktop/current working dictionary/methylation/sample sheet/status.csv", as.is = TRUE)
# merge() names the key column "Row.names" and, by default, sorts by it.
datTraits_batch <- as.data.frame(merge(datTraits, batch, by.x = "row.names", by.y = "status_1"))

sva <- read.csv("190403_smartSVA_CHD.csv", as.is = TRUE)
rownames(sva) <- sva$X
# Indexing by the trait row names already puts sva in datTraits order.
# Re-sorting sva afterwards (as this script used to) would break that
# alignment whenever datTraits itself is not sorted by row name.
sva <- sva[rownames(datTraits), ]
# A sample missing from the SVA file produces an "NA" row name and stops here.
stopifnot(identical(rownames(datTraits), rownames(sva)))
# Drop the ID column (X); drop = FALSE keeps a data frame even with a single SV.
sva <- sva[, 2:dim(sva)[2], drop = FALSE]

# Covariate table used by every model: traits followed by surrogate variables.
covs <- data.frame(datTraits, sva)
# Model 1: transformed eigengene (y) on CHD, the covariates, the nine region
# dummies and one term per column of `sva` (sv1, sv2, ...).
Formula <- formula(paste(
  "y ~CHD+Age+Gender+Education+Marital_status+SMK+DRK+PA1+PA2+Diet_score+BMI+Fasting_time+Region1+Region2+Region3+Region4+Region5+Region6+Region7+Region8+Region9+",
  paste0("sv", 1:ncol(sva), collapse = "+")
))

# Fit the current `Formula` for one module eigengene and return the statistics
# of the reported terms (model coefficients 2-22, i.e. everything after the
# intercept up to and including the 21st term; later terms are adjusted for
# but not returned).
#
# Reads the globals `MEs`, `covs` and `Formula`; `covs` is only modified as a
# local copy.
#
# Args:
#   i: column index into `MEs`.
# Returns:
#   A vector of length 63: 21 estimates, then 21 standard errors, then 21
#   p-values. All NA if lm()/summary() signals a warning or an error (the
#   condition message is printed).
fitmodel <- function(i) {
  n_terms <- 21L                 # must match the number of reported terms
  rows <- seq_len(n_terms) + 1L  # coefficient rows 2..22 (row 1 = intercept)
  failed <- rep(NA, 3L * n_terms)

  # Rank-based inverse-normal transform of the eigengene.
  covs$y <- qnorm(rank(MEs[, i]) / (length(MEs[, i]) + 1), mean = 0, sd = 1)

  tryCatch({
    fit <- lm(Formula, data = covs)
    shown <- coef(summary(fit))
    # coef(summary()) omits aliased (non-estimable) terms, which would shift
    # every later row and mislabel the output. Re-expand to one row per model
    # coefficient so positions always line up; aliased terms stay NA.
    full <- matrix(NA_real_, nrow = length(coef(fit)), ncol = ncol(shown),
                   dimnames = list(names(coef(fit)), colnames(shown)))
    full[rownames(shown), ] <- shown
    c(full[rows, 1], full[rows, 2], full[rows, 4])
  }, warning = function(w) {
    print(paste("warning", i, conditionMessage(w), sep = " "))
    failed
  }, error = function(e) {
    print(paste("error", i, conditionMessage(e), sep = " "))
    failed
  })
}

# Run the model on each of the six module eigengenes; one row per module.
result <- as.data.frame(do.call(rbind, lapply(1:6, fitmodel)))
colNames <- c("CHD", "Age", "Gender", "Education", "Marital_status", "SMK", "DRK", "PA1", "PA2", "Diet_score", "BMI", "Fasting_time", "Region1", "Region2", "Region3", "Region4", "Region5", "Region6", "Region7", "Region8", "Region9")
# Column blocks follow fitmodel's output order: estimates (e*), standard
# errors (se*), p-values (p*).
names(result) <- c(paste0("e", colNames), paste0("se", colNames), paste0("p", colNames))

result$ModuleName <- colnames(MEs)[1:6]

write.table(result, "20190408_with_sv.csv", quote = FALSE, sep = ",",
            col.names = TRUE, row.names = FALSE)

# cor() is evaluated here only for its row names; the matrix itself is
# replaced by the regression estimates a few lines below.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
col.name <- colnames(datTraits)
row.name <- rownames(moduleTraitCor)[-6]

# Estimates for the first five modules, one column per reported term.
moduleTraitCor <- as.matrix(result[1:5, paste0("e", colNames)])
dimnames(moduleTraitCor) <- list(row.name, col.name)

# Matching p-values, same layout.
moduleTraitPvalue <- as.matrix(result[1:5, paste0("p", colNames)])
dimnames(moduleTraitPvalue) <- list(row.name, col.name)

library("RColorBrewer")
library("WGCNA")
sizeGrWindow(10, 6)
# Cell text: estimate on the first line, -log10(p) in brackets on the second.
textMatrix <- paste0(signif(moduleTraitCor, 2), "\n(",
                     signif(-log10(moduleTraitPvalue), 2), ")")
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Heatmap coloured by -log10(p), sixth module left out.
labeledHeatmap(
  Matrix = -log10(moduleTraitPvalue),
  xLabels = col.name,
  yLabels = names(MEs)[-6],
  ySymbols = names(MEs)[-6],
  colorLabels = FALSE,
  colors = colorRampPalette(brewer.pal(7, "Blues"), bias = 3)(100),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 0.5,
  cex.lab.x = 1,
  cex.lab.y = 1,
  main = "Module-trait relationships"
)
# dev.off()


# Model 2: the same covariates and surrogate variables, without the CHD term.
Formula <- formula(paste(
  "y ~Age+Gender+Education+Marital_status+SMK+DRK+PA1+PA2+Diet_score+BMI+Fasting_time+Region1+Region2+Region3+Region4+Region5+Region6+Region7+Region8+Region9+",
  paste0("sv", 1:ncol(sva), collapse = "+")
))

# Fit the current `Formula` for one module eigengene and return the statistics
# of the reported terms (model coefficients 2-21, i.e. everything after the
# intercept up to and including the 20th term; later terms are adjusted for
# but not returned).
#
# Reads the globals `MEs`, `covs` and `Formula`; `covs` is only modified as a
# local copy.
#
# Args:
#   i: column index into `MEs`.
# Returns:
#   A vector of length 60: 20 estimates, then 20 standard errors, then 20
#   p-values. All NA if lm()/summary() signals a warning or an error (the
#   condition message is printed).
fitmodel <- function(i) {
  n_terms <- 20L                 # must match the number of reported terms
  rows <- seq_len(n_terms) + 1L  # coefficient rows 2..21 (row 1 = intercept)
  failed <- rep(NA, 3L * n_terms)

  # Rank-based inverse-normal transform of the eigengene.
  covs$y <- qnorm(rank(MEs[, i]) / (length(MEs[, i]) + 1), mean = 0, sd = 1)

  tryCatch({
    fit <- lm(Formula, data = covs)
    shown <- coef(summary(fit))
    # coef(summary()) omits aliased (non-estimable) terms, which would shift
    # every later row and mislabel the output. Re-expand to one row per model
    # coefficient so positions always line up; aliased terms stay NA.
    full <- matrix(NA_real_, nrow = length(coef(fit)), ncol = ncol(shown),
                   dimnames = list(names(coef(fit)), colnames(shown)))
    full[rownames(shown), ] <- shown
    c(full[rows, 1], full[rows, 2], full[rows, 4])
  }, warning = function(w) {
    print(paste("warning", i, conditionMessage(w), sep = " "))
    failed
  }, error = function(e) {
    print(paste("error", i, conditionMessage(e), sep = " "))
    failed
  })
}

# Run the model on each of the six module eigengenes; one row per module.
result <- as.data.frame(do.call(rbind, lapply(1:6, fitmodel)))
colNames <- c("Age", "Gender", "Education", "Marital_status", "SMK", "DRK", "PA1", "PA2", "Diet_score", "BMI", "Fasting_time", "Region1", "Region2", "Region3", "Region4", "Region5", "Region6", "Region7", "Region8", "Region9")
# Column blocks follow fitmodel's output order: estimates (e*), standard
# errors (se*), p-values (p*).
names(result) <- c(paste0("e", colNames), paste0("se", colNames), paste0("p", colNames))

result$ModuleName <- colnames(MEs)[1:6]

write.table(result, "20190408_with_sv_withoutCHD.csv", quote = FALSE, sep = ",",
            col.names = TRUE, row.names = FALSE)

# cor() is evaluated here only for its row names; the matrix itself is
# replaced by the regression estimates a few lines below.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
# Trait columns 2-21 label the 20 reported terms.
col.name <- colnames(datTraits)[2:21]
row.name <- rownames(moduleTraitCor)

# Estimates for all six modules, one column per reported term.
moduleTraitCor <- as.matrix(result[paste0("e", colNames)])
dimnames(moduleTraitCor) <- list(row.name, col.name)

# Matching p-values, same layout.
moduleTraitPvalue <- as.matrix(result[paste0("p", colNames)])
dimnames(moduleTraitPvalue) <- list(row.name, col.name)

sizeGrWindow(10, 6)
# Cell text: estimate on the first line, -log10(p) in brackets on the second.
textMatrix <- paste0(signif(moduleTraitCor, 2), "\n(",
                     signif(-log10(moduleTraitPvalue), 2), ")")
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Heatmap coloured by -log10(p), all modules shown.
labeledHeatmap(
  Matrix = -log10(moduleTraitPvalue),
  xLabels = col.name,
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = colorRampPalette(brewer.pal(7, "Blues"), bias = 3)(100),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 0.5,
  cex.lab.x = 1,
  cex.lab.y = 1,
  main = "Module-trait relationships"
)
# dev.off()

# relate ME to chd, batch (dummy variables), and all covariates
class(datTraits_batch$No_batch)
datTraits_batch$No_batch<-factor(datTraits_batch$No_batch)

# The models are fitted on `covs`, which is built from datTraits and sva only
# and therefore has no No_batch column; without it lm() cannot find the
# variable and every fit fails. Add the batch as a factor, matched to the
# samples by ID so the row order of `covs` is respected.
covs$No_batch <- factor(batch$No_batch[match(rownames(datTraits), batch$status_1)])
# Every sample needs a batch, otherwise lm() would silently drop its row.
stopifnot(!anyNA(covs$No_batch))

# Model 3: model 1 plus the batch factor. No_batch sits directly after CHD, so
# its dummy coefficients come right after CHD in the coefficient table.
Formula <- formula(paste("y ~CHD+No_batch+Age+Gender+Education+Marital_status+SMK+DRK+PA1+PA2+Diet_score+BMI+Fasting_time+Region1+Region2+Region3+Region4+Region5+Region6+Region7+Region8+Region9+",
                         paste(paste("sv", 1:dim(sva)[2], sep=""), collapse="+")))

# Fit the current `Formula` for one module eigengene and return the statistics
# of the reported terms (model coefficients 2-26, i.e. everything after the
# intercept up to and including the 25th term; later terms are adjusted for
# but not returned).
#
# Reads the globals `MEs`, `covs` and `Formula`; `covs` is only modified as a
# local copy.
#
# Args:
#   i: column index into `MEs`.
# Returns:
#   A vector of length 75: 25 estimates, then 25 standard errors, then 25
#   p-values. All NA if lm()/summary() signals a warning or an error (the
#   condition message is printed).
fitmodel <- function(i) {
  n_terms <- 25L                 # must match the number of reported terms
  rows <- seq_len(n_terms) + 1L  # coefficient rows 2..26 (row 1 = intercept)
  failed <- rep(NA, 3L * n_terms)

  # Rank-based inverse-normal transform of the eigengene.
  covs$y <- qnorm(rank(MEs[, i]) / (length(MEs[, i]) + 1), mean = 0, sd = 1)

  tryCatch({
    fit <- lm(Formula, data = covs)
    shown <- coef(summary(fit))
    # coef(summary()) omits aliased (non-estimable) terms, which would shift
    # every later row and mislabel the output. Re-expand to one row per model
    # coefficient so positions always line up; aliased terms stay NA.
    full <- matrix(NA_real_, nrow = length(coef(fit)), ncol = ncol(shown),
                   dimnames = list(names(coef(fit)), colnames(shown)))
    full[rownames(shown), ] <- shown
    c(full[rows, 1], full[rows, 2], full[rows, 4])
  }, warning = function(w) {
    print(paste("warning", i, conditionMessage(w), sep = " "))
    failed
  }, error = function(e) {
    print(paste("error", i, conditionMessage(e), sep = " "))
    failed
  })
}

# Run the batch-adjusted model on each of the six module eigengenes; one row
# per module.
result <- as.data.frame(t(sapply(1:6, fitmodel)))
# Labels for the 25 reported coefficients, in model order: CHD, the four batch
# dummies, then the covariates and region dummies.
# ("Batch_5" was misspelled "Bacth_5", which leaked into the CSV header.)
colNames <- c("CHD", "Batch_2", "Batch_3", "Batch_4", "Batch_5", "Age", "Gender", "Education", "Marital_status", "SMK", "DRK", "PA1", "PA2", "Diet_score", "BMI", "Fasting_time", "Region1", "Region2", "Region3", "Region4", "Region5", "Region6", "Region7", "Region8", "Region9")
# Column blocks follow fitmodel's output order: estimates (e*), standard
# errors (se*), p-values (p*).
names(result) <- c(paste0("e", colNames), paste0("se", colNames), paste0("p", colNames))

result$ModuleName <- colnames(MEs)[1:6]

write.table(result, "20190620_with_batch_sv.csv", quote = FALSE, sep = ",",
            col.names = TRUE, row.names = FALSE)

# cor() is evaluated here only for its row names; the matrix itself is
# replaced by the regression estimates below.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
col.name <- colnames(datTraits)
row.name <- rownames(moduleTraitCor)[-6]

# The heatmap shows CHD and the covariates but not the batch dummies
# (positions 2-5 of colNames), for the first five modules only.
moduleTraitCor <- as.matrix(result[1:5, paste0("e", colNames[-(2:5)])])
colnames(moduleTraitCor) <- col.name
rownames(moduleTraitCor) <- row.name

moduleTraitPvalue <- as.matrix(result[1:5, paste0("p", colNames[-(2:5)])])
colnames(moduleTraitPvalue) <- col.name
rownames(moduleTraitPvalue) <- row.name

library("RColorBrewer")
library("WGCNA")
sizeGrWindow(10, 6)
# Cell text: estimate on the first line, -log10(p) in brackets on the second.
# The separator must be "\n" (newline); it was "/n", which printed a literal
# "/n" inside every cell.
textMatrix <- paste0(signif(moduleTraitCor, 2), "\n(",
                     signif(-log10(moduleTraitPvalue), 2), ")")
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Heatmap coloured by -log10(p), sixth module left out.
labeledHeatmap(Matrix = -log10(moduleTraitPvalue),
               xLabels = col.name,
               yLabels = names(MEs)[-6],
               ySymbols = names(MEs)[-6],
               colorLabels = FALSE,
               colors = colorRampPalette(brewer.pal(7, "Blues"), bias = 3)(100),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               cex.lab.x = 1,
               cex.lab.y = 1,
               main = paste("Module-trait relationships"))
# dev.off()


# generate CpGs list: attach each probe's module colour to its EWAS result row
# NOTE: `result` is reused here and no longer holds the model output.
result <- read.csv("C:/Users/sijia/Desktop/current working dictionary/methylation/code/20190403 CHD_EWAS/allssite_plusanno.csv", as.is = TRUE)
# Keep the 20000 rows with the smallest pOUT, then sort them by probe name.
result <- result[order(result$pOUT),]
result_top20k <- result[1:20000,]
result_top20k <- result_top20k[order(result_top20k$probename),]

# moduleColors is paired with the columns of t_bn_top_20k by position.
moduleColors <- as.data.frame(moduleColors)
moduleColors$probename <- colnames(t_bn_top_20k)

moduleColors <- moduleColors[order(moduleColors$probename),]

# The cbind() below pairs rows by position, so both tables must list exactly
# the same probes in the same order. Stop instead of printing a mismatch count.
stopifnot(
  length(moduleColors$probename) == length(result_top20k$probename),
  all(moduleColors$probename == result_top20k$probename)
)
# The expression is left as is so the added column keeps its existing name in
# the output file.
result_top20k <- cbind(result_top20k, moduleColors$moduleColors)

write.csv(result_top20k, "find_cpg.csv")

