#!/public/software/R/3.4.3/bin/Rscript

# EWAS for CHD
# covariates
# smoking (continuous): smk_con ; 
# alcohol consumption (continuous): drk_con; 
# diet (0-6 score): diet_6score;
# PA (met continuous): cut met at tertile; 
# BMI (continuous): bmi_calc
####################################
# gen smk_con=0 if smoking_category == 1 | smoking_category == 2
# replace smk_con=0 if smoking_category == 3 & (smoking_stopped_reason > 0 & smoking_stopped_reason <= 4)
# replace smk_con=cig_equiv_day if (smoking_category == 4 | (smoking_category == 3 & smoking_stopped_reason == 0))
# gen drk_con=total_alc_typ_day_g if alcohol_7groups!=3
# replace drk_con=total_alc_typ_day_g/7*1.5 if alcohol_7groups==3 & alc_weekly==0
# replace drk_con=total_alc_typ_day_g/7*4 if alcohol_7groups==3 & alc_weekly==1
####################################
# sample: 982
# original: 988 and 12 duplicated sample
# 2 samples missing rate>=0.01;
# 2 samples predicted wrong sex;
# 2 samples in the 6th batch.
# 2019-04-03
# SJH


getwd()
# Restore the saved workspace; the rest of the script uses the objects `bn`
# and `pheno`, which are not created anywhere else in this file.
# (The former `rm(list = ls())` was removed: it is a no-op under Rscript and
# wipes the caller's workspace if the script is ever source()d.)
load("2018-10-31_bn.RData")

# ---- Chunk selection --------------------------------------------------------
# The first command-line argument is the 1-based chunk index. Each chunk covers
# `chunk_size` consecutive probes; the last chunk stops at `last_probe`.
chunk_size <- 40000
last_chunk <- 19
last_probe <- 747726

args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1L) {
  stop("missing chunk index: expected an integer between 1 and ", last_chunk,
       " as the first argument", call. = FALSE)
}
cpg <- args[1]
# suppressWarnings: a non-numeric argument is reported by the check below
# instead of by the generic coercion warning followed by a cryptic if() error.
cpg_n <- suppressWarnings(as.numeric(cpg))
if (is.na(cpg_n) || cpg_n != round(cpg_n) || cpg_n < 1 || cpg_n > last_chunk) {
  stop("invalid chunk index '", cpg, "': expected an integer between 1 and ",
       last_chunk, call. = FALSE)
}

# Inclusive probe (column) range handled by this run.
start <- (cpg_n - 1) * chunk_size + 1
if (cpg_n == last_chunk) {
  end <- last_probe
} else {
  end <- cpg_n * chunk_size
}

# Bring the saved `pheno` object into data-frame form, ordered by study ID.
pheno <- as.data.frame(pheno)
pheno <- pheno[order(pheno$studyid), ]

# Phenotype table read from CSV, ordered the same way.
PHENO <- read.csv(file = "INPUT/csv/20190331_pheno_982.csv", as.is = TRUE)
PHENO <- PHENO[order(PHENO$studyid), ]

# Diagnostics only: number of positions where the two ID columns differ, then
# the dimensions of both tables. Nothing is enforced at this point.
print(sum(pheno$studyid != PHENO$studyid))
print(dim(PHENO))
print(dim(pheno))
# original note: 998 in pheno, while 596 in PHENO
# NOTE(review): these counts may be stale - confirm against the printed dims.

# Remove arrays (columns of `bn`) that must not enter the analysis. The groups
# are dropped one after another and dim(bn) is printed after each step.
excluded_barcodes <- list(
  # 10 QC samples
  c("202259340031_R08C01", "202259340185_R08C01", "202259490132_R08C01",
    "202259340033_R08C01", "202259340195_R08C01", "202259340179_R08C01",
    "202259340166_R08C01", "202274990080_R08C01", "202410000174_R08C01",
    "202410000012_R08C01"),
  # 2 other QC samples
  c("202410000012_R06C01", "202410000012_R07C01"),
  # 2 samples not in the original list
  # (drop if studyid == "680319599"; drop if studyid == "680386261")
  c("202259350068_R06C01", "202274990175_R06C01"),
  # 2 samples with wrong sex (case_index = 431, 432)
  #   431: case (CK28190308) had wrong sex in MDS plot, 202259340113_R03C01
  #   432: control (880171999, CK24795830), 202250800154_R03C01
  c("202259340113_R03C01", "202250800154_R03C01")
)
for (barcode_group in excluded_barcodes) {
  bn <- bn[, !(colnames(bn) %in% barcode_group)]
  print(dim(bn))
}
rm(excluded_barcodes, barcode_group)
# 982 samples expected after the last step.
# 202259340013_R01C01 and 202242420191_R05C01 were already dropped in the QC
# process.


# ---- Align methylation matrix and phenotype tables --------------------------
# Drop probes (rows of `bn`) containing any NA, same as in the SVA analysis.
na.bn <- na.omit(bn)
dim(na.bn)
# Transpose to samples x probes and sort the samples by barcode.
t_bn <- t(na.bn)
t_bn <- t_bn[order(rownames(t_bn)), ]
# Keep only the rows of `pheno` whose row name (barcode) is in the matrix.
pheno <- pheno[which(rownames(pheno) %in% rownames(t_bn)), ]
dim(pheno)
# should be 982 here

# `PHENO` has no barcode column: barcodes are copied over from `pheno` purely
# by position after sorting both tables by studyid. That is only valid when
# the two tables hold exactly the same IDs in the same order, so enforce it
# instead of just printing the mismatch count.
PHENO <- PHENO[order(PHENO$studyid), ]
pheno <- pheno[order(pheno$studyid), ]
if (nrow(pheno) != nrow(PHENO)) {
  stop("pheno has ", nrow(pheno), " rows but PHENO has ", nrow(PHENO),
       "; cannot pair them by studyid", call. = FALSE)
}
n_id_mismatch <- sum(pheno$studyid != PHENO$studyid)
print(n_id_mismatch)
if (is.na(n_id_mismatch) || n_id_mismatch > 0) {
  stop("studyid differs between pheno and PHENO at ", n_id_mismatch,
       " position(s); barcodes would be attached to the wrong samples",
       call. = FALSE)
}
if (anyDuplicated(PHENO$studyid) > 0) {
  # With tied IDs the positional pairing depends on the input row order.
  warning("duplicated studyid values in PHENO; barcode pairing may be ambiguous",
          call. = FALSE)
}

PHENO$Barcode <- rownames(pheno)   # attach the array barcode to each PHENO row

PHENO$region_code <- factor(PHENO$region_code)   # region as a categorical variable

rownames(PHENO) <- PHENO$Barcode

head(PHENO)
head(rownames(t_bn))
PHENO <- PHENO[order(PHENO$Barcode), ]   # sort PHENO by barcode, like t_bn

# Rows of PHENO and t_bn are combined by position in the models below.
if (nrow(PHENO) != nrow(t_bn)) {
  stop("PHENO has ", nrow(PHENO), " rows but t_bn has ", nrow(t_bn),
       " samples", call. = FALSE)
}
n_barcode_mismatch <- sum(rownames(PHENO) != rownames(t_bn))
print(n_barcode_mismatch)   # 0
if (is.na(n_barcode_mismatch) || n_barcode_mismatch > 0) {
  stop("barcodes differ between PHENO and t_bn at ", n_barcode_mismatch,
       " position(s)", call. = FALSE)
}

dim(t_bn)
dim(PHENO)

# ---- Covariate vectors (original note: "for LME only") ----------------------
# All vectors are columns of PHENO, so they share its row order.

# Basic adjustment set: 1. age, 2. sex, 3. region, 4. hours since last ate
AGE    <- PHENO[, "age_at_study_date"]
SEX    <- PHENO[, "is_female"]
REGION <- factor(PHENO[, "region_code"])
HOURS  <- PHENO[, "hours_last_ate_x10_g1"]

# Socio-demographic factors
EDU <- PHENO[, "highest_education"]
MAR <- PHENO[, "marital_status_g1"]

# Lifestyle covariates; physical activity (met_3g) is treated as categorical
smk  <- PHENO[, "smk_con"]
drk  <- PHENO[, "drk_con"]
pa   <- factor(PHENO[, "met_3g"])
diet <- PHENO[, "diet_6score"]
bmi  <- PHENO[, "bmi_calc"]

# Outcome: case or control
OUT <- PHENO[, "case_control"]

# Disease covariates, currently not used:
# BMI <- PHENO[,"bmi_calc"]
# DIA <- PHENO[,"has_diabetes"]
# HYP <- PHENO[,"has_hypertension"]
# LIP <- PHENO[,"has_lip"]

# NOTE(review): no R.utils function appears to be called in this script -
# confirm before removing.
require(R.utils)

# Put `pheno` in barcode order and print how many row names differ from those
# of t_bn (expected: 0).
pheno <- pheno[order(rownames(pheno)), ]
print(sum(rownames(pheno) != rownames(t_bn)))   # 0


#### model 1 ####
# CpG = OUT + healthy lifestyles (smk, drk, pa, bmi, diet)
#       + AGE + SEX + HOURS + EDU + MAR + REGION + surrogate variables
# problem: region as a dummy variable, need to try
sva <- read.csv("INPUT/csv/190403_smartSVA_CHD.csv", as.is = TRUE)
rownames(sva) <- as.character(sva$X)
# Drop the first (ID) column; drop = FALSE keeps a data frame even when the
# file holds a single surrogate variable.
sva <- sva[, -1, drop = FALSE]
if (ncol(sva) < 1L) {
  stop("no surrogate-variable columns found in the SVA file", call. = FALSE)
}

# The surrogate variables are bound to the covariates and to t_bn by position,
# so the sample order has to match exactly.
if (nrow(sva) != nrow(pheno) || nrow(sva) != nrow(t_bn)) {
  stop("SVA table has ", nrow(sva), " rows; pheno has ", nrow(pheno),
       " and t_bn has ", nrow(t_bn), call. = FALSE)
}
n_sva_mismatch <- sum(rownames(sva) != rownames(pheno))
print(n_sva_mismatch)   # 0
if (is.na(n_sva_mismatch) || n_sva_mismatch > 0 ||
    any(rownames(sva) != rownames(t_bn))) {
  stop("sample order of the SVA table does not match pheno / t_bn",
       call. = FALSE)
}

# The formula refers to the surrogate variables as sv1..svK.
sv_terms <- paste0("sv", seq_len(ncol(sva)))
missing_sv <- setdiff(sv_terms, names(sva))
if (length(missing_sv) > 0) {
  stop("SVA file lacks expected column(s): ",
       paste(missing_sv, collapse = ", "), call. = FALSE)
}

covs <- data.frame(OUT, smk, drk, pa, bmi, diet, AGE, SEX, REGION, HOURS, EDU,
                   MAR, sva)

# fitmodel() reads coefficients 2..13 of each fit and the output labels them
# OUT, smk, drk, pa1, pa2, bmi, diet, AGE, SEX, HOURS, EDU, MAR. That only
# holds when the terms ahead of REGION expand to the intercept plus exactly
# 12 columns (e.g. pa has three levels); otherwise the labels would be wrong.
n_leading <- ncol(model.matrix(
  ~ OUT + smk + drk + pa + bmi + diet + AGE + SEX + HOURS + EDU + MAR,
  data = covs
))
if (n_leading != 13L) {
  stop("leading model terms expand to ", n_leading - 1L,
       " coefficients, but 12 are expected by the result extraction",
       call. = FALSE)
}

Formula <- formula(paste("y ~OUT+smk+drk+pa+bmi+diet+AGE+SEX+HOURS+EDU+MAR+REGION+",
                         paste(sv_terms, collapse = "+")))

# Fit the linear model `Formula` for one probe and return the statistics of
# the 12 coefficients that follow the intercept.
#
# Reads the globals `covs` (covariate data frame), `t_bn` (samples x probes
# matrix) and `Formula`.
#
# Args:
#   i: column index of the probe in `t_bn`.
#
# Returns:
#   Numeric vector of length 36: 12 estimates, then 12 standard errors, then
#   12 p-values, in coefficient order. All 36 values are NA when the fit
#   raises a warning or an error (a line is printed in that case). Individual
#   entries are NA for coefficients that lm() could not estimate (aliased).
fitmodel <- function(i) {
  n_terms <- 12L   # coefficients 2..13 of the fitted model

  covs$y <- as.numeric(t_bn[, i])   # local copy of covs; the global is untouched
  all_na <- rep(NA_real_, 3L * n_terms)

  tryCatch({
    fit <- lm(Formula, data = covs)
    # coef(fit) keeps aliased coefficients as NA, whereas the summary table
    # drops those rows. Look rows up by name so a dropped row yields NA for
    # that coefficient instead of shifting all later ones by one position.
    wanted <- names(coef(fit))[1L + seq_len(n_terms)]
    stats <- coef(summary(fit))   # computed once, reused for all three columns
    rows <- match(wanted, rownames(stats))
    c(stats[rows, 1], stats[rows, 2], stats[rows, 4])
  }, warning = function(w) {
    # A warning aborts the fit for this probe, as before; the message is now
    # included so the cause can be seen in the log.
    print(paste("warning", i, conditionMessage(w), sep = " "))
    all_na
  }, error = function(e) {
    print(paste("error", i, conditionMessage(e), sep = " "))
    all_na
  })
}
# ---- Run the chunk and write the results ------------------------------------
out_file <- paste0("~/OUTPUT/0403CHD_EWAS/190403_982_CHDEWAS", cpg, ".csv")
# Make sure the output directory exists before fitting, so that a missing
# directory cannot discard the results at the very end.
out_dir <- dirname(out_file)
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
}

# One row per probe in start:end, 36 columns (estimate, SE, p-value for each of
# the 12 terms). vapply() guarantees that shape regardless of the chunk size.
result <- as.data.frame(t(vapply(start:end, fitmodel, numeric(36))))
colNames <- c("OUT", "smk", "drk", "pa1", "pa2", "bmi", "diet", "AGE", "SEX",
              "HOURS", "EDU", "MAR")

names(result) <- c(paste0("e", colNames), paste0("se", colNames),
                   paste0("p", colNames))
result$probename <- colnames(t_bn)[start:end]

write.table(result, out_file, quote = FALSE, sep = ",",
            col.names = TRUE, row.names = FALSE)