###############################################################################
#
#     Analysis of InCHIANTI Proteomics
#     Date: 9/25/2020
#     Name: Toshiko Tanaka
#
#     a. Analysis of proteins with age, sex, and interactions sex*age
#     b. Analysis of mortality 
#     c. Analysis of multimorbidity
#     d. DEswan analysis
#     e. Mediation Analysis
#
###############################################################################



#############################################################################################################################
#
#     a. Analysis of proteins with age, sex, and interactions sex*age
#
#############################################################################################################################


home = "~/Somascan/InCHIANTI proteomics/Run (1) -(2)/adat file/"

# read in phenotype file
phen   <- read.table("InCHIANTI_soma_pheno_31May2018.csv",sep=",",header=T)

### reading protein file

all    <- read.table(paste(home,"ALL_hyb.cal_RFU.csv",sep=""),sep=",",header=T)

### Reading in Inchinati keep the sample data only, remove nonspecific probes
remove <- c("SL000445","SL004697","SL004698","SL016148")
s      <- c("Pooled EDTA QC","QC_CHI2")
all2   <- all[!(all$SampleId %in% s) & all$SampleType=="Sample",!(names(all) %in% remove)]

inch   <- merge(phen, all2, by.x="CODE98", by.y="SampleId", all.y=TRUE)

rm(remove, all2, s)


### Linear regression analysis 1) age, 2) sex, 2) sex-interactions

runlin <- function(i){
  inch$logpro <- log(inch[,i])
  inch$testp2 <- inch$logpro
  inch$testp2[abs(inch$logpro-mean(inch$logpro))>=4*sd(inch$logpro)] <- NA
  temp        <- subset(inch,!is.na(inch$testp2))
  inch$age2   <- inch$IXAGE*inch$IXAGE
  
  ### Age analysis ####################################
  
  a<-lm(temp$testp2~as.factor(SEX) + as.factor(SITE) , data=temp,na.action=na.exclude)
  temp$res<-a$residuals
  
  ## all subjects
  reg<-lm(inch$testp2~IXAGE + as.factor(SEX) + as.factor(SITE) , data=inch,na.action=na.omit)
  coefs <- data.frame(coef(summary(reg)))
  
  # variance explained
  rsq<-summary(reg)$adj.r.squared - summary(a)$adj.r.squared

  # adjust for disease
  regb<-lm(inch$testp2~IXAGE + as.factor(SEX) + as.factor(SITE) + INDEX , data=inch,na.action=na.omit)
  coefsb <- data.frame(coef(summary(regb)))
  
  # sample size
  len<-length(reg$residuals)
  

  
  ### intx
  regi   <-lm(testp2~IXAGE*as.factor(SEX) + as.factor(SITE) , data=inch,na.action=na.omit)
  coefsi <- data.frame(coef(summary(regi)))

  results1<-c(len,coefs[2,1],coefs[2,2],coefs[2,4],rsq,
              coefsb[2,1],coefsb[2,2],coefsb[2,4],
              coefs[3,1],coefs[3,2],coefs[3,4],
              coefsi[5,1],coefsi[5,2],coefsi[5,4])
  results1
}

# Run analysis
lenfull        <- length(inch[1,]) 
results        <- data.frame(t(sapply(83:lenfull,runlin))) 
names(results) <- c("N","beta","se","p","varexp","beta_index","se_index","p_index",
                    "beta_sex","se_sex","p_sex","beta_intx","se_intx","p_intx")
results$SomaId <- colnames(inch)[83:lenfull]


# FDR adjust
results$pfdr     <- p.adjust(results$p,      method="fdr", n = length(results$p))
results$pindexfdr<- p.adjust(results$p_index,method="fdr", n = length(results$p_index))
results$psexfdr  <- p.adjust(results$p_sex,  method="fdr", n = length(results$p_sex))
results$pintxfdr <- p.adjust(results$p_intx, method="fdr", n = length(results$p_intx))


#############################################################################################################################
#     b. Analysis of mortality 
#############################################################################################################################

library(survival)

# limit the analysis to age associated proteins
notsig  <- subset(results,results$pfdr>0.05)
inchsig <- inch[,!(names(all) %in% notsig$SomaId)]

# test for proportional hazards
coxph   <- coxph(Surv(fuyears,allcausemort)~ IXAGE + SEX + SITE, data=inchsig, na.action=na.omit)
test.ph <- cox.zph(coxph)


runcox<-function(i){
  inchsig$logpro<-log(inchsig[,i])
  inchsig$testp2<-inchsig$logpro
  inchsig$testp2[abs(inch$logpro-mean(inch$logpro))>=4*sd(inchsig$logpro)]<-NA 
  inch2<-subset(inchsig,!is.na(inchsig$testp2))
  inch2$testp2a<-scale(inch2$testp2,center=TRUE,scale=TRUE)
  
  # protein alone
  coxph1a <- coxph(Surv(fuyears2,allcausemort)~ testp2  ,data=inch2,na.action=na.omit)
  coefs   <- coef(summary(coxph1a))
  
  inch2$lp.pro <- predict(coxph1a, type = "lp")
  a1           <- rcorrcens(Surv(fuyears2,allcausemort) ~ I(-1 * lp.pro), data = inch2)
  lci1         <- a1[1,1]-1.96*(a1[1,4]/sqrt(a1[1,7]))
  uci1         <- a1[1,1]+1.96*(a1[1,4]/sqrt(a1[1,7]))
  
  # protein adjusted for covariates
  
  coxph2        <- coxph(Surv(fuyears2,allcausemort)~ testp2 + IXAGE + as.factor(SEX)+SITE ,data=inch2,na.action=na.omit)
  coefs2        <- coef(summary(coxph2))
  
  inch2$lp.age.pro  <- predict(coxph2, type = "lp")
  a                 <- rcorrcens(Surv(fuyears2,allcausemort) ~ I(-1 * lp.age.pro), data = inch2)
  lci               <- a[1,1]-1.96*(a[1,4]/sqrt(a[1,7]))
  uci               <- a[1,1]+1.96*(a[1,4]/sqrt(a[1,7]))
  
  results1<-c(coefs[1,1],coefs[1,3],coefs[1,5],a1[1,1],lci1,uci1,coefs2[1,1],coefs2[1,3],coefs2[1,5],a[1,1],lci,uci)
  
  results1
}

# Run mortality analysis
lenfull <- length(inchsig[1,]) 
results <- data.frame(t(sapply(82:lenfull,runcox))) 
names(results) <- c("beta_mortpro","se_mortpro","p_mortpro","cindex","lci","uci","beta_mort","se_mort","p_mort","cindex2","lci2","uci2")
results$SomaId <- colnames(inchsig)[82:lenfull]

results$pmortfdr     <- p.adjust(a$p_mortpro,method="fdr", n = length(a$p_mortpro))
results$pmortfullfdr <- p.adjust(a$p_mort,   method="fdr", n = length(a$p_mort))

#############################################################################################################################
#     c. Analysis of multimorbidity
#############################################################################################################################



runmult <- function(i){
  inchsig$logpro<-log(inchsig[,i])
  inchsig$testp2<-inchsig$logpro
  inchsig$testp2[abs(inch$logpro-mean(inch$logpro))>=4*sd(inchsig$logpro)]<-NA 
  inch2<-subset(inchsig,!is.na(inchsig$testp2))
  inch2$testp2a<-scale(inch2$testp2,center=TRUE,scale=TRUE)
  
  ### all trajectory no base multimorbidity
  reg4    <- (lm(sindex_basefree~testp2a ,data=inch2, na.action=na.omit))
  coefs4  <- data.frame(coef(summary(reg4)))
  
  reg4a   <- (lm(sindex_basefree~testp2a + IXAGE + SEX +SITE ,data=inch2, na.action=na.omit))
  coefs4a <- data.frame(coef(summary(reg4a)))
  
  results <- c(coefs4[2,1], coefs4[2,2], coefs4[2,4],summary(reg4)$r.squared, 
               coefs4a[2,1], coefs4a[2,2], coefs4a[2,4],summary(reg4a)$adj.r.squared)
  
}

# Run multimorbidity analysis
lenfull <- length(inchsig[1,]) 
results <- data.frame(t(sapply(82:lenfull,runmult))) 
names(results) <- c("B_sindex_bfree","SE_sindex_bfree","P_sindex_bfree","Rsq_sindex_bfree","B_sindexadj_bfree",
                    "SE_sindexadj_bfree","P_sindexadj_bfree","Rsq_sindexadj_bfree")
results$SomaId <- colnames(inchsig)[82:lenfull]

results$pmultmorbfdr     <- p.adjust(a$P_sindex_bfree,    method="fdr", n = length(a$P_sindex_bfree))
results$pmultmorbfullfdr <- p.adjust(a$P_sindexadj_bfree, method="fdr", n = length(a$P_sindexadj_bfree))

#############################################################################################################################
#     d. DEswan analysis
#############################################################################################################################


library(tidyverse)
library("DEswan")


## read in covariate file
covar   <- read.table(paste(home,"InCHIANTI_soma_pheno_31May2018.csv",sep=""),sep=",",header=T)
covar   <- covar[,c("CODE98","IXAGE","SEX","SITE")]

## read in proteomic data 
inch    <- read.table(paste(home,"ALL_hyb.cal_RFU.csv",sep=""), header=T, sep="\t")
remove  <- c("SL000445","SL004697","SL004698","SL016148")
inch    <- inch[,!(names(inch) %in% remove)]

# Merge with phenotype data
inchall <- merge(covar, inch, by.x="CODE98", by.y="SampleId", all.y = TRUE)

# log/scale protein values
pro     <-inchall[, c(6:1306)]
cov     <-inchall[, c(2:4)]

lpro    <- log(pro)
zpro    <- as.data.frame(scale(lpro,center=TRUE,scale=TRUE))

# remove extra data
rm(covar,remove,inch,pro,lpro)

### window 10
res10 = DEswan(data.df          = zpro,
               qt            = cov[,1],
               window.center = seq(25,90,1),
               buckets.size  = 10,
               covariates    = cov[,c(2:3)])

### window 20
res20 = DEswan(data.df          = zpro,
               qt            = cov[,1],
               window.center = seq(25,90,1),
               buckets.size  = 20,
               covariates    = cov[,c(2:3)])

### window 30
res30 = DEswan(data.df          = zpro,
               qt            = cov[,1],
               window.center = seq(25,90,1),
               buckets.size  = 30,
               covariates    = cov[,c(2:3)])

##### window 10 extract significant proteins at age41, 61, 78

coef41  <- res10$coeff  %>% filter(window.center==41, factor=="qt") %>%  rename(beta_41=coefficient) %>% select(variable,beta_41)
coef61  <- res10$coeff  %>% filter(window.center==61, factor=="qt") %>%  rename(beta_61=coefficient) %>% select(variable,beta_61)
coef78  <- res10$coeff  %>% filter(window.center==78, factor=="qt") %>%  rename(beta_78=coefficient) %>% select(variable,beta_78)

q       <- res10.wide.q[,c("variable","X41","X61","X78")]
names(q)<- c("variable","qvalue_41","qvalue_61","qvalue_78")
p       <- res10.wide.q[,c("variable","X41","X61","X78")]
names(p)<- c("variable","pvalue_41","pvalue_61","pvalue_78")

allres  <- merge(coef41,coef61, by="variable")
allres  <- merge(allres,coef78, by="variable")
allres  <- merge(allres, q, by="variable")
allres  <- merge(allres, p, by="variable")


#############################################################################################################################
#     e. Mediation Analysis
#          This is a template code for mediation analysis in each chromosome
#############################################################################################################################


###	 	Read in file with position of each protein
pror <-  read.table(paste(home,"Protein_info_chrCHRNUM.txt",sep=""), sep="\t",header = T) 		

###		Read list of age-associated CpG
methr     <- read.table(paste(home,"CpG_age_siginfo.txt",sep=""),sep="\t", header = T) 
methr     <- methr[order(methr$CHR,methr$pos),]

###		Read in phenotype file
pheno     <- read.table(paste(home,"InCHIANTI_soma_pheno_31May2018.csv",sep=""),sep=",", header = T)  		

###		Read in raw beta methylation
meth98    <- read.table(paste(home,"chrCHRNUM.csv",sep=""),sep=",", header = T) 


###		Read in raw RFU protein age-associated proteins only
pro         <-  read.table(paste(home,"INCH_RFU_pro_subset.txt",sep=""),sep="\t", header = T) # read in raw protein file

N_TESTS <- length(pror$SomaId)		# getting the number of proteins on the chromsome

for (i in 1:N_TESTS)
{
  ## save SomaId
  somaid<-as.character(pror[i,1])
  
  ## save Uniprot
  uniprot<-as.character(pror[i,3])
  #entrez <- as.character(pror[i,4])
  
  ### get the number getting the CpG within 1MB
  low <- pror[i,6] - 1000000
  high <- pror[i,7] + 1000000
  
  cpg<-subset(methr,methr$CHR=="CHRNUM" & methr$pos > low & methr$pos < high )
  
  # save number of CpG tested
  ncpg<-dim(cpg)[1] 
  
  # MERGE PHENOTYPE, METHYLATION AND PROTEIN DATA
  # protein
  pro1<-pro[,c("CODE98",somaid)]
  names(pro1)<-c("CODE98","pro")
  
  pro1$logpro<-log(pro1$pro)
  pro1$testp2<-pro1$logpro
  pro1$testp2[abs(pro1$logpro-mean(pro1$logpro))>=4*sd(pro1$logpro)]<-NA
  
  # get the CpG for that is significant within 1MB
  meth1<-meth98[,names(meth98) %in% cpg$TargetID]
  meth1a<-cbind(meth98$Sample_code98,meth1)
  
  a <- merge(pro1,pheno,by="CODE98")
  a <- merge(a,meth1a,by.x="CODE98",by.y="meth98$Sample_code98")
  a <- a[!is.na(a$testp2), ]
  
  # remove all temp datasets
  rm(cpg,pro1,meth1,meth1a)
  
  reg         <- lm(testp2 ~ SITE +SEX, na.action=na.exclude, data=a)
  presid  <- residuals(reg)
  
  reg         <-lm(presid ~ IXAGE,data=a)
  coefs1    <- data.frame(coef(summary(reg)))
  
  runmed<-function(j)
  {
    reg2     <- lm(a[,j] ~ SITE + as.factor(Batch)+SEX + X_P_NEU + X_P_LIN + X_P_MON + X_P_EOS,na.action=na.exclude, data=a)
    mresid  <- residuals(reg2)
    
    reg2<-lm(mresid ~ IXAGE,data=a)
    coefs2 <- data.frame(coef(summary(reg2)))
    
    reg3<-lm(presid ~ IXAGE + mresid ,data=a)
    coefs3 <- data.frame(coef(summary(reg3)))
    
    reg4 <-lm(presid ~ mresid, data=a)
    coefs4 <- data.frame(coef(summary(reg4)))
    
    # mediation effect
    medef <- coefs2[2,1] *  coefs3[3,1]
  
    
    # Sobel test
    b2s     <- (coefs3[3,1])^2 * (coefs2[2,2])^2
    a2s     <- (coefs2[2,1])^2 * (coefs3[3,2])^2
    z         <- medef/sqrt( b2s + a2s  )
    pval    <- 2*pnorm(-abs(z))
    
    results1 <-  c(names(a)[j],uniprot,coefs1[2,1], coefs1[2,2], coefs1[2,4], coefs2[2,1], coefs2[2,2], coefs2[2,4], 
                   coefs3[2,1], coefs3[2,2], coefs3[2,4], coefs3[3,1], coefs3[3,2], coefs3[3,4], 
                   coefs4[2,1], coefs4[2,2], coefs4[2,4],medef, pval)
    results1
  }
  
  # Run analysis
  lenfull<-length(a[1,]) 
  results<-data.frame(t(sapply(14:lenfull,runmed))) 
  names(results) <-   c("CpG","uniprot","Age_pro_beta",  "Age_pro_se", "Age_pro_p", "Age_me_beta", "Age_me_se", "Age_me_p",
                        "Age_pro_beta2", "Age_pro_se2", "Age_pro_p2", "Age_me_beta2", "Age_me_se2", "Age_me_p2", 
                        "pro_me_beta","pro_me_se","pro_me_p","Mediationeffect","sobel_p")
  results$cpg<-names(a)[14:lenfull]
  results$SomaId<-somaid
  

  write.table(results, paste("ChrCHRNUM_",somaid,"_",uniprot,".csv",sep = ""), quote = F,row.names = F,sep=",")
  
  
}

