#COVID-19 cytokine and patient data

################################################################################
################################################################################
#
#             Libraries 
#
#
#   Make sure to run this code so R will have all the functions it needs to work
#   If the requires give you an error, try running the installations



require(varrank)
require(DescTools)
require(arules)
require(ggplot2)
require(tidyverse)
require(ggpubr)
require(rstatix)

#install.packages("varrank")
#install.packages("tidyverse")
#install.packages("hrbrthemes")
#install.packages("stringi")
#install.packages("DescTools")
#install.packages("arules")
#install.packages("ggpubr")
#install.packages("rstatix")
#install.packages("caret")

library(arules)
library(hrbrthemes)
library(RColorBrewer)
library(varrank)
library(DescTools)
library(ggplot2)
library(ggpubr)
library(rstatix)
library(caret)

#HINT - sometimes you have to install other required packages to make the above 
#       packages work. For instance, if you run require(tidyverse) and you see 
#       Error: package load failed, and it's because there's a missing sub-package 
#       (it might say: "there is no package called 'stringi'," for example), then 
#       try installing that sub-package
#
#       If you come across the message "Do you want to install from sources 
#       the package which needs compilation? (Yes/no/cancel)" while installing 
#       packages, answer "no"

################################################################################
################################################################################





##### Goal - find most important cytokines at predicting oxygen 
#####        and level-of-care requirements for patients with COVID-19



#########################################################################
#           CHANGE HERE 
#
#       This is where you upload the data table you'll use to make your graphs

#COVID combined patient cytokine data.xlsx


#Overall folder where your data files live. It should also contain your (empty)
#"Graphs" sub-folder, which is where the graphs will be saved.
projectFolder <- "/Users/annalouie/Desktop/El-Deiry Lab/Experiments/004 COVID cytokines"

#Name of spreadsheet CSV file with initial data
#Data should have columns of variables and rows as different observations (samples)
spreadsheetName <- "COVID combined patient cytokine data.csv"
#spreadsheetName<-"COVID combined patient cytokine data - Screen Negatives.csv"

#retrieving spreadsheet data
#Stop with a readable message when the path is wrong, rather than letting
#read.csv() fail with a lower-level "cannot open file" error.
goLocation <- paste0(projectFolder, "/", spreadsheetName)
if (!file.exists(goLocation)) {
  stop("Cannot find the data spreadsheet at: ", goLocation,
       "\nCheck projectFolder and spreadsheetName.", call. = FALSE)
}
#read.csv() already returns a data.frame, so no further conversion is needed
prelimCOVIDData <- read.csv(goLocation)

rm(goLocation)
rm(spreadsheetName)

#column names exactly as read.csv() produced them (before any renaming)
initColumnNames <- colnames(prelimCOVIDData)

#Keep at most the first 72 rows. head() is used instead of [1:72, ] because
#indexing past the last row pads the data frame with all-NA rows when the
#spreadsheet has fewer than 72 rows.
prelimCOVIDData <- head(prelimCOVIDData, 72)

#Lookup of clean column name = column name as it appears in prelimCOVIDData.
#Only the columns listed here are renamed; every other column keeps its name.
columnRenameLookup <- c(
  Age = "X1..Age",
  Sex = "X2..Sex",
  Race = "Race...Ethnicity",
  Health_Insurance = "Health.Insurance",
  Living_situation = "X7..Living.situation",
  Chronic_lung_disease = "X8..Check.all.that.apply..choice.Chronic.lung.disease..such.as.asthma..COPD..or.emphysema..",
  Heart_disease = "X8..Check.all.that.apply..choice.Heart.disease..e.g...previous.heart.attack..afib..history.of.stents..",
  Diabetes = "X8..Check.all.that.apply..choice.Diabetes.",
  HTN = "X8..Check.all.that.apply..choice.High.blood.pressure.",
  Overweight_or_Obese = "Overweight.or.Obese",
  Number_of_Chronic_Conditions = "Number.of.Chronic.Conditions",
  Household_contact_with_coronavirus = "X9..Household.contact.with.positive.coronavirus.test.or.probable.coronavirus.infection.",
  Shortness_of_breath = "Breathing.Difficulty.or.Shortness.of.Breath",
  Change_in_taste_or_smell = "Change.in.taste.or.smell",
  Abnormal_vital_sign.Any = "Abnormal.vital.sign..any.....only.severe.HTN.",
  Abnormal_RR = "Abnormal.RR",
  Abnormal_Temp = "Abnormal.Temp",
  Abnormal_HR = "Abnormal.HR",
  Abnormal_BP = "Abnormal.BP..severe.",
  Number_of_abnormal_vital_signs = "Number.Abnormal.vital.sign",
  Bronchodilator_administered_in_ED = "X16..Medications.in.the.ED..Check.all.that.apply...choice.Bronchodilators..e.g...Albuterol..",
  Steroids_administered_in_ED = "X16..Medications.in.the.ED..Check.all.that.apply...choice.Steroids..e.g...Prednisone..",
  Azithromycin_administered_in_ED = "X16..Medications.in.the.ED..Check.all.that.apply...choice.Azithromycin.",
  Chest_X.ray = "X14..Chest.X.ray",
  COVID_Severity_Score = "COVID.Severity.Score",
  IL.12_p40 = "IL.12..p40.",
  Number_of_Symptoms = "Number.of.Symptoms..range.0.9.",
  COVID.19_PCR = "X13..COVID.19.PCR.Testing"
)

#Rename on a tibble copy, then hand back a plain data.frame for the rest of
#the script. cleanedColNames holds the column names after renaming.
intCOVIDData <- as_tibble(prelimCOVIDData)
cleanedCOVIDData <- as.data.frame(rename(intCOVIDData, all_of(columnRenameLookup)))
cleanedColNames <- colnames(cleanedCOVIDData)
rm(intCOVIDData, columnRenameLookup)






################################################################################
################################################################################
#
#             Subsection 1
#             Statistics - Varrank
#
#
#   Goal - apply varrank to determine most important variables



################################################################################
################################################################################



#columns to include in data set that will be used for picking the most important
#variables. This should include variables you want it to look at for importance
#
#     IMPORTANT - for the function to work, all columns must be either 
#                 numeric or factor class
#
#   For COVID Data, we are including:
#       - all cytokines
#       - age
#       - race
#       - sex
#       - smoking
#       - health insurance
#       - number of abnormal vital signs
#       - number of symptoms
#       - number of chronic conditions
#       - chest x-ray

#Cytokine columns are picked by position: columns 49 through 63 of the cleaned
#data (these must all be numeric - see the check further down).
cytokinesColNameList <- cleanedColNames[49:63]

#Non-cytokine variables to rank alongside the cytokines
nonCytokineColNameList <- c(
  "Age", "Sex", "Race",
  "Health_Insurance", "Smoking",
  "Number_of_Chronic_Conditions",
  "Number_of_Symptoms",
  "Chest_X.ray",
  "Number_of_abnormal_vital_signs"
)

#Outcome variable the other variables are ranked against
outcomeVariableColName <- "COVID_Severity_Score"

#Column holding the sample identifier
recordIDColName <- "Record.ID"

#Every column that goes into varrank: cytokines, non-cytokine variables and
#the outcome (in that order)
allVarRankVariableCols <- c(
  cytokinesColNameList,
  nonCytokineColNameList,
  outcomeVariableColName
)

#Smaller data frame holding the record ID, the COVID PCR result and the
#varrank columns. The cytokine columns are discretized further down, and the
#non-cytokine columns and the outcome are converted to factors.
prelimVarRankData <- cleanedCOVIDData[
  , c(recordIDColName, "COVID.19_PCR", allVarRankVariableCols)
]

#make sure all cytokine columns are numeric:
#isColDataNumeric function takes in a single column names and a data frame and
#returns true if the data in that column is numeric
#The all(sapply line of code below it returns true if all the columns in the 
#cytokinesColNameList are numeric
#
#If running the following code returns false - then one of your columns contains 
#non-numeric data. 
#
#Your cytokine list can ONLY contain columns that have numeric data for varrank 
#to work properly. Non-numeric columns can go in the other list.
#
#HINT - if you run the commented out line of code below, it will tell you which 
#column(s) are not numeric
#sapply(cytokinesColNameList, isColDataNumeric, myDataFrame=prelimVarRankData)

#Returns TRUE if the column named myColName in myDataFrame holds numeric data
#(integer or double), FALSE otherwise.
#
#  myColName   - a single column name (character string)
#  myDataFrame - data frame (or tibble) containing that column
#
#Stops with an error if myDataFrame has no column called myColName.
isColDataNumeric <- function(myColName, myDataFrame) {
  if (!(myColName %in% names(myDataFrame))) {
    stop("Column '", myColName, "' not found in the data frame", call. = FALSE)
  }
  #[[ ]] extracts the column itself for data.frames and tibbles alike;
  #myDataFrame[, myColName] on a tibble returns a one-column tibble, for which
  #is.numeric() is always FALSE
  is.numeric(myDataFrame[[myColName]])
}
#TRUE only when every cytokine column is numeric (result is shown in the console)
all(vapply(cytokinesColNameList, isColDataNumeric, logical(1), myDataFrame = prelimVarRankData))

#Looks at every column in a list of columns (all should be numeric) and 
#uses a discretization function to split each one into a few categories. Those 
#columns in the data frame are then stored as factor data, lumped into a few 
#discrete categories. This is done to avoid over-fitting on small data sets, such 
#as when you have fewer than 10x as many samples as you do variables. 
#
# the second line just prints out TRUE for each column discretized successfully
#
#
#Options for how the program will split data: 
#     cluster - uses kmeans clustering - use this one if you think outliers are 
#               important and you want the program to put them in a different 
#               category 
#     frequency - will divide into groups with equal numbers of data points 
#               (split data by the median) use if you think outliers are going 
#               to throw things off
#
#currently it's set to cluster, but if you want to change it, change the string 
#(stuff in quotes) given as the method argument in the discretize() call below.
#your two choices are "cluster" or "frequency" - don't forget the quotes
#Replace each cytokine column with its discretized version (2 groups, "cluster"
#method), then print whether the stored column is now a factor.
for (cytokineCol in cytokinesColNameList) {
  prelimVarRankData[[cytokineCol]] <- discretize(
    prelimVarRankData[[cytokineCol]],
    breaks = 2,
    method = "cluster"
  )
  print(paste(
    cytokineCol,
    "discretized successfully",
    is.factor(prelimVarRankData[[cytokineCol]])
  ))
}
rm(cytokineCol)



#Keep only the columns varrank should see (drops the record ID and PCR columns)
prelimVarRankData <- prelimVarRankData[, allVarRankVariableCols]

#The non-cytokine variables and the outcome are converted to factors; the
#cytokine columns are left as they are.
factorColNames <- c(nonCytokineColNameList, outcomeVariableColName)
prelimVarRankData[factorColNames] <- lapply(prelimVarRankData[factorColNames], factor)
rm(factorColNames)

#Rank the variables against the outcome.
#Original author's note: "estevez" is best but slow and might overload your
#computer, "peng" is fast.
varTestAL <- varrank(
  data.df = prelimVarRankData,
  method = "peng",
  variable.important = outcomeVariableColName,
  discretization.method = "cencov",
  algo = "forward",
  scheme = "mid",
  verbose = FALSE
)

summary(varTestAL)

#Write the varrank plot to Graphs/Mutual_Information.tiff (10 x 7 in, 300 dpi)
saveLocation <- paste0(projectFolder, "/Graphs/", "Mutual_Information", ".tiff")
tiff(saveLocation, units = "in", width = 10, height = 7, res = 300)

plot(varTestAL, cellnote = FALSE)

dev.off()













#Leave-One-Out cross-validation
#Confirm that the variable ordering does not substantially change when leaving
#out any one sample.
#
#This runs BEFORE the Subsection 1 clean-up below because it still needs
#prelimVarRankData and outcomeVariableColName.
#
#leaveOneOutCVResults: the first column is the ordering from the full data set
#(varTestAL$ordered.var); column newLOOCVrank_<i> is the ordering obtained with
#row i left out.
leaveOneOutCVResults <- as.data.frame(varTestAL$ordered.var)

for (leftOutRow in seq_len(nrow(prelimVarRankData))) {
  print(leftOutRow)  #progress indicator
  varTestLOOCV <- varrank(data.df = prelimVarRankData[-leftOutRow, ],
                          method = "peng",
                          variable.important = outcomeVariableColName,
                          discretization.method = "cencov",
                          algo = "forward", scheme = "mid", verbose = FALSE)
  leaveOneOutCVResults[, paste0("newLOOCVrank_", leftOutRow)] <- varTestLOOCV$ordered.var
}


#removing all lists and objects you created for this subsection so as not to 
#clutter up your global environment (top right panel)
#HINT - broom in top right panel will clear everything from the global environment
#
#intersect(..., ls()) removes only the objects that actually exist, so a name
#that was never created (e.g. nonCytokineColNameList2) does not raise a warning.
rm(list = intersect(c("leftOutRow",
                      "varTestLOOCV",
                      "isColDataNumeric",
                      "recordIDColName",
                      "outcomeVariableColName",
                      "nonCytokineColNameList",
                      "nonCytokineColNameList2",
                      "cytokinesColNameList",
                      "allVarRankVariableCols",
                      "prelimVarRankData"),
                    ls()))







################################################################################
################################################################################
#
#             Subsection 2
#             ROC Curve
#
#
#   Goal - make an ROC curve for model predictive efficacy



################################################################################
################################################################################

#top cytokines: M.CSF, IP.10, IL.18, IL.1RA, MCP.1
#top cytokines kmeans cutoff values
#   - M-CSF 323
#   - IL-1RA 77.279
#   - IL-18 871
#   - IP-10 17953.1
#   - MCP-1 1618
#also age (cutoff age 50)


#myTopCytokineNamesList<-c("M.CSF", "IP.10", "IL.18", "IL.1RA", "MCP.1")
#myCytokineCutoffsList<-c(   323,   17953.1,   871,    77.279,   1618)
#these two lists must be in the same order with the values corresponding to the cytokine at the same position
#second set using age
#Columns to test and their cutoffs; position i of the cutoff list belongs to
#position i of the name list.
myTopCytokineNamesList <- c("M.CSF", "IP.10", "IL.18", "IL.1RA", "Age")
myCytokineCutoffsList <- c(323, 17953.1, 871, 77.279, 50)

#All 32 five-letter High/Low patterns. The order is kept as-is because it
#decides the order of tied rows later on.
topCytColNameList <- c(
  "LLLLL", "HLLLL", "LHLLL", "LLHLL", "LLLHL", "LLLLH",
  "HHLLL", "HLHLL", "HLLHL", "HLLLH", "LHHLL", "LHLHL",
  "LHLLH", "LLHHL", "LLHLH", "LLLHH", "HHHLL", "HHLHL",
  "HHLLH", "HLHHL", "HLHLH", "HLLHH", "LHHHL", "LHHLH",
  "LHLHH", "LLHHH", "HHHHL", "HHHLH", "HHLHH", "HLHHH",
  "LHHHH", "HHHHH"
)

#One row per sample and one (initially NA) column per pattern, plus the
#sample's record ID and severity score.
topCytokineHiLoDF <- data.frame(
  matrix(NA_character_, nrow = nrow(cleanedCOVIDData), ncol = length(topCytColNameList))
)
colnames(topCytokineHiLoDF) <- topCytColNameList
topCytokineHiLoDF[["Record.ID"]] <- cleanedCOVIDData[["Record.ID"]]
topCytokineHiLoDF[["COVID_Severity_Score"]] <- cleanedCOVIDData[["COVID_Severity_Score"]]

#Returns "H" if the value in myDataFrameIn[myRowNumIn, myColName] is strictly
#greater than myCutoffValue, otherwise returns "L" (a value equal to the cutoff
#counts as low). Also prints "high" or "low" to the console.
#
#  myDataFrameIn - data frame holding the measurements
#  myColName     - name of the single column to test
#  myRowNumIn    - row number of the sample to test
#  myCutoffValue - cutoff the value is compared against
#
#Stops with an error naming the row and column when the comparison does not
#give exactly one TRUE/FALSE answer (for example when the value or the cutoff
#is NA), instead of failing with "missing value where TRUE/FALSE needed".
letterHiLoReturn <- function(myDataFrameIn, myColName, myRowNumIn, myCutoffValue) {
  isHigh <- myDataFrameIn[myRowNumIn, myColName] > myCutoffValue
  if (length(isHigh) != 1 || is.na(isHigh)) {
    stop("Cannot classify row ", myRowNumIn, " of column '", myColName,
         "' as high/low: expected exactly one non-missing value and cutoff",
         call. = FALSE)
  }
  if (isHigh) {
    print("high")
    return("H")
  }
  print("low")
  "L"
}


#Builds the High/Low string for one sample (relies on letterHiLoReturn).
#
#  myDataFrame      - data frame passed through to letterHiLoReturn
#  myColsList       - names of the columns to test, in order
#  myRowNum         - row number of the sample, passed through to letterHiLoReturn
#  myCutoffValsList - cutoff values, in the same order as myColsList
#
#Calls letterHiLoReturn once per entry of myColsList (with the cutoff at the
#same position) and returns the letters pasted together in that order.
#Prints a running counter to the console along the way.
fiveColHiLoTest <- function(myDataFrame, myColsList, myRowNum, myCutoffValsList) {
  hiLoLetters <- character(length(myColsList))
  for (colPosition in seq_along(myColsList)) {
    hiLoLetters[colPosition] <- letterHiLoReturn(
      myDataFrame,
      myColsList[colPosition],
      myRowNum,
      myCutoffValsList[colPosition]
    )
    print(colPosition + 1)
  }
  print(length(myColsList) + 1)
  paste0(hiLoLetters, collapse = "")
}



#For every sample, work out its five-letter High/Low string and write that
#string into the column of topCytokineHiLoDF with the same name, on the row(s)
#whose Record.ID matches the sample. All other pattern columns stay NA.
for (sampleRow in seq_len(nrow(cleanedCOVIDData))) {
  print(sampleRow)
  sampleHiLoString <- fiveColHiLoTest(
    cleanedCOVIDData,
    myTopCytokineNamesList,
    sampleRow,
    myCytokineCutoffsList
  )
  isThisSample <- topCytokineHiLoDF$Record.ID == cleanedCOVIDData[sampleRow, "Record.ID"]
  topCytokineHiLoDF[isThisSample, sampleHiLoString] <- sampleHiLoString
}
rm(sampleRow, sampleHiLoString, isThisSample)


#Outcome flags: severity score above 3 (CSS 4-5) and above 2 (CSS 3-5)
topCytokineHiLoDF[["CSS_4to5"]] <- topCytokineHiLoDF[["COVID_Severity_Score"]] > 3
topCytokineHiLoDF[["CSS_3to5"]] <- topCytokineHiLoDF[["COVID_Severity_Score"]] > 2

#rocHiLoDF - one row per High/Low pattern with:
#   Num_HiLoVals - number of samples that have that pattern
#   Num_CSS_3to5 - how many of those samples have CSS_3to5 TRUE
#   Num_CSS_4to5 - how many of those samples have CSS_4to5 TRUE
rocHiLoDF <- data.frame(matrix(NA_integer_, nrow = length(topCytColNameList), ncol = 4))
colnames(rocHiLoDF) <- c("HiLoString", "Num_HiLoVals", "Num_CSS_3to5", "Num_CSS_4to5")
rocHiLoDF$HiLoString <- topCytColNameList

#Row numbers of the samples with each outcome flag set (same for every pattern)
rowsCSS3to5 <- which(topCytokineHiLoDF[, "CSS_3to5"] == "TRUE")
rowsCSS4to5 <- which(topCytokineHiLoDF[, "CSS_4to5"] == "TRUE")

for (patternRow in seq_along(topCytColNameList)) {
  print(patternRow)

  hiLoPattern <- rocHiLoDF[patternRow, "HiLoString"]
  #samples whose cell in this pattern's column holds the pattern itself
  rowsWithPattern <- which(topCytokineHiLoDF[, hiLoPattern] == hiLoPattern)

  rocHiLoDF[patternRow, "Num_HiLoVals"] <- length(rowsWithPattern)
  rocHiLoDF[patternRow, "Num_CSS_3to5"] <- length(intersect(rowsWithPattern, rowsCSS3to5))
  rocHiLoDF[patternRow, "Num_CSS_4to5"] <- length(intersect(rowsWithPattern, rowsCSS4to5))
}

rm(patternRow, hiLoPattern, rowsWithPattern, rowsCSS3to5, rowsCSS4to5)

#Drop the patterns no sample has, then rank the remaining rows in descending
#order of PortionPosVal = Num_CSS_4to5 / Num_HiLoVals
rocHiLoDF <- rocHiLoDF %>%
  filter(Num_HiLoVals > 0) %>%
  mutate(PortionPosVal = Num_CSS_4to5 / Num_HiLoVals) %>%
  arrange(desc(PortionPosVal))


#create graphingROCdf - data frame containing the x (false positive rate,
#column x_FP) and y (true positive rate, column y_TP) columns.
#This will be used to graph the ROC curve and calculate the AUC.
#
#Row k (after the leading 0,0 row) treats the first k rows of rocHiLoDF as
#"predicted positive": y_TP is the share of all CSS 4-5 samples in those rows,
#x_FP is the share of all non-CSS-4-5 samples in those rows.

totRealPos <- sum(rocHiLoDF[, "Num_CSS_4to5"])
totRealNeg <- sum(rocHiLoDF[, "Num_HiLoVals"]) - totRealPos

#With no positives or no negatives the rates below are 0/0 and the curve and
#AUC are not meaningful - say so instead of silently producing NaN values.
if (isTRUE(totRealPos == 0) || isTRUE(totRealNeg == 0)) {
  warning("ROC curve is undefined: the data contain ", totRealPos,
          " CSS 4-5 samples and ", totRealNeg, " other samples",
          call. = FALSE)
}

#running totals over the ranked rows (cumsum instead of re-summing rows 1..k
#on every pass of a loop)
cumTruePos <- cumsum(rocHiLoDF[, "Num_CSS_4to5"])
cumFalsePos <- cumsum(rocHiLoDF[, "Num_HiLoVals"]) - cumTruePos

#the curve starts at the origin (nothing predicted positive)
graphingROCdf <- data.frame(
  x_FP = c(0, cumFalsePos / totRealNeg),
  y_TP = c(0, cumTruePos / totRealPos)
)

rm(cumTruePos, cumFalsePos)
rm(totRealNeg)
rm(totRealPos)

#Draw the ROC curve (with the dashed diagonal as the chance line) and save it
#to Graphs/ROC_Curve.tiff
saveLocation <- paste0(projectFolder, "/Graphs/", "ROC_Curve", ".tiff")
xtitle <- "False Positive Rate"
ytitle <- "True Positive Rate"


#aes() with bare column names replaces the deprecated aes_string()
currPlot <- ggplot(data = graphingROCdf, aes(x = x_FP, y = y_TP)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  geom_abline(intercept = 0, slope = 1, color = "darkred", linetype = "dashed", size = 1) +
  xlab(xtitle) + ylab(ytitle) + theme_classic() +
  xlim(0, 1) + ylim(0, 1)

ggsave(saveLocation, currPlot, height = 4, width = 4)



#Area under the ROC curve. It is printed here because aucVal is removed in the
#clean-up below and would otherwise never be shown.
aucVal <- AUC(graphingROCdf$x_FP,
              graphingROCdf$y_TP)
print(paste("ROC AUC:", aucVal))




#remove the helper functions and objects created for the ROC subsection
rm(
  fiveColHiLoTest,
  letterHiLoReturn,
  aucVal,
  myCytokineCutoffsList,
  myTopCytokineNamesList,
  topCytColNameList,
  topCytokineHiLoDF,
  rocHiLoDF,
  graphingROCdf
)

#find the number of true positives in each HiLo category, 
#sort from most to least true positives,
#make all but the most positive false, see what rate of true/false positives is
#iterate this making one more group positive at a time, 
#plot all the true/false positive rates into ROC curve




################################################################################
################################################################################
#
#             Subsection 3
#             ANOVA
#
#
#   Goal - apply ANOVA to vioplots, add significance



################################################################################
################################################################################

#top cytokines: M.CSF, IP.10, IL.18, IL.1RA, MCP.1


#The severity score is used as the grouping variable below, so store it as a factor
cleanedCOVIDData[["COVID_Severity_Score"]] <- as.factor(cleanedCOVIDData[["COVID_Severity_Score"]])


########################################################
#Various ANOVA test results
#

#ANOVA of each cytokine against COVID severity score
aov_M.CSF  <- cleanedCOVIDData %>% anova_test(M.CSF ~ COVID_Severity_Score)
aov_IP.10  <- cleanedCOVIDData %>% anova_test(IP.10 ~ COVID_Severity_Score)
aov_IL.18  <- cleanedCOVIDData %>% anova_test(IL.18 ~ COVID_Severity_Score)
aov_IL.1RA <- cleanedCOVIDData %>% anova_test(IL.1RA ~ COVID_Severity_Score)
aov_MCP.1  <- cleanedCOVIDData %>% anova_test(MCP.1 ~ COVID_Severity_Score)
aov_IL.6   <- cleanedCOVIDData %>% anova_test(IL.6 ~ COVID_Severity_Score)
aov_TNFa   <- cleanedCOVIDData %>% anova_test(TNFa ~ COVID_Severity_Score)
aov_IL.2   <- cleanedCOVIDData %>% anova_test(IL.2 ~ COVID_Severity_Score)
aov_IFNa2  <- cleanedCOVIDData %>% anova_test(IFNa2 ~ COVID_Severity_Score)
aov_IFNy   <- cleanedCOVIDData %>% anova_test(IFNy ~ COVID_Severity_Score)
aov_IL.7   <- cleanedCOVIDData %>% anova_test(IL.7 ~ COVID_Severity_Score)
aov_IL.12  <- cleanedCOVIDData %>% anova_test(IL.12_p40 ~ COVID_Severity_Score)
aov_IL.1a  <- cleanedCOVIDData %>% anova_test(IL.1a ~ COVID_Severity_Score)
aov_G.CSF  <- cleanedCOVIDData %>% anova_test(G.CSF ~ COVID_Severity_Score)
aov_MIP.1a <- cleanedCOVIDData %>% anova_test(MIP.1a ~ COVID_Severity_Score)

#Tukey HSD pairwise comparisons, with x/y positions added for drawing the
#significance labels on plots grouped by COVID_Severity_Score.
#
#Original author's note on which ANOVAs came out significant:
#   significant     - IP.10, M.CSF, IL.18, IL.1RA, MCP.1, IL.6, TNFa, IFNy, IL.12
#   not significant - IL.2, IL.7, IL.1a, G.CSF, MIP.1a, IFNa2
#Tukey is only run for the first group.
pwc_M.CSF <- cleanedCOVIDData %>%
  tukey_hsd(M.CSF ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_IP.10 <- cleanedCOVIDData %>%
  tukey_hsd(IP.10 ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_IL.18 <- cleanedCOVIDData %>%
  tukey_hsd(IL.18 ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_IL.1RA <- cleanedCOVIDData %>%
  tukey_hsd(IL.1RA ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_MCP.1 <- cleanedCOVIDData %>%
  tukey_hsd(MCP.1 ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_IL.6 <- cleanedCOVIDData %>%
  tukey_hsd(IL.6 ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_TNFa <- cleanedCOVIDData %>%
  tukey_hsd(TNFa ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_IFNy <- cleanedCOVIDData %>%
  tukey_hsd(IFNy ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")
pwc_IL.12 <- cleanedCOVIDData %>%
  tukey_hsd(IL.12_p40 ~ COVID_Severity_Score) %>%
  add_xy_position(x = "COVID_Severity_Score")

