#!/public/software/R/3.4.3/bin/Rscript
#WGCNA
#data_input

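# The phenotype file used for the network analysis was generated with the
# following Stata commands (kept here, commented out, for reference only):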
# import delimited "C:\Users\sijia\Desktop\current working dictionary\methylation\20190331_pheno_982.csv"
# keep studyid region_code is_female hours_last_ate_x10_g1 case_control highest_education met bmi_calc marital_status_g1 age_at_study_date smk_con drk_con met_3g diet_6score
# tab met_3g
# gen pa1=1 if met_3g==1
# replace pa1=0 if pa1==.
# gen pa2=1 if met_3g==2
# replace pa2=0 if pa2==.
# gen region12=1 if region_code==12
# replace region12=0 if region12==.
# gen region16=1 if region_code==16
# replace region16=0 if region16==.
# gen region26=1 if region_code==26
# replace region26=0 if region26==.
# gen region36=1 if region_code==36
# replace region36=0 if region36==.
# gen region46=1 if region_code==46
# replace region46=0 if region46==.
# gen region52=1 if region_code==52
# replace region52=0 if region52==.
# gen region58=1 if region_code==58
# replace region58=0 if region58==.
# gen region68=1 if region_code==68
# replace region68=0 if region68==.
# gen region78=1 if region_code==78
# replace region78=0 if region78==.
# drop met_3g region_code
# export delimited using "C:\Users\sijia\Desktop\current working dictionary\methylation\code\20190404 WGCNA\0404_982_pheno_WGCNA.csv", replace

getwd()
rm(list = ls())   # clear objects left over from a previous session
# Restores the objects stored in the .RData file; the code below relies on
# `bn` and `pheno` being among them.
load("2018-10-31_bn.RData")

# Trait table for WGCNA; as.is = TRUE keeps character columns as character.
PHENO <- read.csv("INPUT/csv/0404_982_pheno_WGCNA.csv", as.is = TRUE)

# Sort both phenotype tables by studyid.
PHENO <- PHENO[order(PHENO$studyid), ]
pheno <- as.data.frame(pheno)
pheno <- pheno[order(pheno$studyid), ]
# `pheno` still has more rows than `PHENO` at this point (998 vs 982 when this
# script was written), so an element-wise `!=` would recycle the shorter
# vector and return a meaningless count. Report instead how many PHENO
# participants have no matching studyid in pheno (expected: 0).
sum(!(PHENO$studyid %in% pheno$studyid))
dim(PHENO)
dim(pheno)
# pheno holds 998 samples while PHENO holds 982: the surplus arrays are removed
# from bn (samples are columns, named by array barcode) one group at a time,
# and dim(bn) is printed after each group.
samples_to_drop <- list(
  # 10 QC samples
  c("202259340031_R08C01", "202259340185_R08C01", "202259490132_R08C01",
    "202259340033_R08C01", "202259340195_R08C01", "202259340179_R08C01",
    "202259340166_R08C01", "202274990080_R08C01", "202410000174_R08C01",
    "202410000012_R08C01"),
  # 2 other QC samples
  c("202410000012_R06C01", "202410000012_R07C01"),
  # 2 samples not in the original list
  # (Stata: drop if studyid=="680319599"; drop if studyid=="680386261")
  c("202259350068_R06C01", "202274990175_R06C01"),
  # 2 samples with wrong sex (case_index 431 and 432):
  #   431: case (CK28190308) had wrong sex in the MDS plot, 202259340113_R03C01
  #   432: control (880171999, CK24795830), 202250800154_R03C01
  c("202259340113_R03C01", "202250800154_R03C01")
)
for (barcodes in samples_to_drop) {
  # Keep every column whose name is not among the barcodes of this group.
  bn <- bn[, is.na(match(colnames(bn), barcodes))]
  print(dim(bn))
}

# 202259340013_R01C01 and 202242420191_R05C01 were already dropped in the QC
# process.

# bn should contain 982 samples at this point.
# Remove probes (rows) containing any NA (presumably the same handling as in
# the SVA analysis -- TODO confirm).
na.bn <- na.omit(bn)
dim(na.bn)
# Transpose so that samples become rows, then sort the samples by barcode.
t_bn <- as.data.frame(t(na.bn))
barcode_order <- order(rownames(t_bn))
t_bn <- t_bn[barcode_order, ]
# Keep only the phenotype rows whose row name (barcode) occurs in t_bn.
pheno <- pheno[rownames(pheno) %in% rownames(t_bn), ]
dim(pheno)
# pheno should have 982 rows here.

# Restrict the methylation data to the top 20k CpGs found in the mixed linear
# regression using smartSVA.
top_20k <- read.csv("INPUT/csv/top_20k_0404.csv", stringsAsFactors = FALSE)
names(top_20k) <- "probename"
CpGs <- top_20k[["probename"]]
CpGs[seq_len(10)]   # print the first ten probe names as a quick sanity check
# Columns of t_bn are probes; keep those listed among the top CpGs.
is_top_cpg <- colnames(t_bn) %in% CpGs
t_bn_top_20k <- t_bn[, is_top_cpg]
dim(t_bn_top_20k)  # samples x probes; expected 982 x 20000

library(WGCNA, lib.loc = "~/Rlib")

# WGCNA quality check of samples and probes; allOK is printed to the log only,
# nothing is removed based on the result.
gsg <- goodSamplesGenes(t_bn_top_20k, verbose = 3)
gsg[["allOK"]]

# Average-linkage hierarchical clustering of the samples on the distances
# between their methylation profiles, used to spot outlying samples.
sampleTree <- hclust(d = dist(t_bn_top_20k), method = "average")
# Write the dendrogram to a 50 x 50 inch PDF; adjust width/height if the
# labels are unreadable.
pdf(file = "~/OUTPUT/0404WGCNA/0404sampleClustering.pdf", width = 50, height = 50)
par(cex = 0.6, mar = c(0, 4, 2, 0))
plot(
  sampleTree,
  main = "Sample clustering to detect outliers",
  sub = "",
  xlab = "",
  cex.lab = 1.5,
  cex.axis = 1.5,
  cex.main = 2
)
dev.off()

# Align the trait table (PHENO) with the methylation phenotype table (pheno)
# by studyid so the array barcodes can be copied across row by row.
PHENO <- PHENO[order(PHENO$studyid), ]
pheno <- pheno[order(pheno$studyid), ]
if (nrow(PHENO) != nrow(pheno)) {
  stop("PHENO has ", nrow(PHENO), " rows but pheno has ", nrow(pheno),
       "; cannot transfer barcodes", call. = FALSE)
}
n_id_mismatch <- sum(pheno$studyid != PHENO$studyid)
n_id_mismatch
# The barcode transfer below is purely positional: any studyid mismatch would
# silently attach traits to the wrong sample, so stop instead of just printing.
if (is.na(n_id_mismatch) || n_id_mismatch != 0) {
  stop("studyid differs between pheno and PHENO in ", n_id_mismatch,
       " row(s); cannot transfer barcodes", call. = FALSE)
}

# Row names of pheno are the array barcodes; use them as row names of PHENO.
PHENO$Barcode <- rownames(pheno)
rownames(PHENO) <- PHENO$Barcode
dim(PHENO)
# Traits = everything except the first column (presumably studyid -- confirm
# against the CSV layout) and the last column (the Barcode just added).
# drop = FALSE keeps a data frame even if a single trait column remains.
datTraits <- PHENO[, -c(1, ncol(PHENO)), drop = FALSE]
# Sort the traits by barcode so that they line up with the rows of t_bn.
datTraits <- datTraits[order(rownames(datTraits)), , drop = FALSE]
if (nrow(t_bn) != nrow(datTraits)) {
  stop("t_bn has ", nrow(t_bn), " samples but datTraits has ",
       nrow(datTraits), call. = FALSE)
}
n_barcode_mismatch <- sum(rownames(t_bn) != rownames(datTraits))
n_barcode_mismatch
# Must be 0: methylation rows and trait rows have to refer to the same samples.
if (is.na(n_barcode_mismatch) || n_barcode_mismatch != 0) {
  stop("sample barcodes differ between t_bn and datTraits in ",
       n_barcode_mismatch, " row(s)", call. = FALSE)
}

collectGarbage()

# Cluster the samples again for the dendrogram drawn above the trait colours.
sampleTree2 <- hclust(d = dist(t_bn_top_20k), method = "average")
# Colour-code the traits: white = low, red = high, grey = missing entry.
traitColors <- numbers2colors(datTraits, signed = FALSE)
# Draw the sample dendrogram with one colour row per trait underneath.
pdf(file = "~/OUTPUT/0404WGCNA/0404_Sample dendrogram and trait heatmap.pdf", width = 50, height = 50)
plotDendroAndColors(
  dendro = sampleTree2,
  colors = traitColors,
  groupLabels = colnames(datTraits),
  main = "Sample dendrogram and trait heatmap"
)
dev.off()


#=====================================================================================
#
#  Save the prepared methylation data, the traits and the sample tree
#  (written to the current working directory)
#
#=====================================================================================


save(list = c("t_bn_top_20k", "datTraits"), file = "0404WGCNA-dataInput-top20k.RData")
save(list = "sampleTree", file = "0404_sample tree.RData")