#### Code used for analysis of NMR metabolomics data in:
#### Manuscript: Early life infection and proinflammatory, atherogenic metabolomic and lipidomic profiles at 12 months of age: a population-based cohort study
#### Authors: Toby Mansell, Richard Saffery, Satvika Burugupalli, Anne-Louise Ponsonby, Mimi LK Tang, Martin O'Hely, Siroon Bekkering, Adam AT Smith, Rebecca Rowland, Sarath Ranganathan, Peter D Sly, Peter Vuillermin, Fiona Collier, Peter J Meikle, David P Burgner, on behalf of the Barwon Infant Study Investigator Group

#### Code based on: Vignettes for the ggforestplot package authored by Scheinin et al., provided by Nightingale Health
#### Code adapted and modified by: Toby Mansell
#### Last edited: Mar 31st, 2022

#### Generates: Figure 3 and associated figure supplements and source data, Figure 4 and associated figure supplements and source data, Supp. Files 1A-C and 3A and 3C, and metabolomics data for Figure 7 and its associated figure supplement and source data

#### Required for code: df_NG_biomarker_metadata.rda, provided as part of the ggforestplot package. 
#### For data access: Data is available upon reasonable request by bona fide researchers, with approval from BIS data custodians. Please contact the corresponding author for more details.

#### Note on computation: Bootstrap estimates are computationally intensive. Consider skipping these steps if computing cluster is not available.


library(tidyverse)
library(devtools)
library(ggforestplot)

# Seed the random number generator so that random operations later in the
# script (e.g. the bootstrap resampling) are reproducible.
set.seed(12345)

## Set working directory as appropriate
# NOTE(review): the empty string below is a placeholder for the project
# directory; the relative paths used later ("Data/...", "Figures/...") are
# resolved against it. Replace it before running — TODO confirm that
# setwd("") is not expected to succeed as written.
setwd("")

############# Reading in data #####################


# Import the biomarker concentration file (adjust the path to your own copy).
# Besides "NA", the strings "TAG", ".d" and ".t" are read as missing values.
df_nmr_results <- readr::read_csv(
  "Data/BIS_infection_dataset_NMR.csv",
  na = c("NA", "TAG", ".d", ".t")
)

# Rows whose `time` column equals "12m"
df_nmr_results_12m <- df_nmr_results[df_nmr_results[["time"]] == "12m", ]

# Working copy whose columns are renamed to machine-readable names below
df_nmr_results_alt_names <- df_nmr_results_12m

# Current column names of the working copy; these are the "old" names in the
# rename step at the end of this section.
alt_names <- colnames(df_nmr_results_alt_names)

# Translate every column name to the machine_readable_name stored in
# df_NG_biomarker_metadata. A name is translated only when exactly one
# metadata entry lists it among its alternative_names; in every other case
# (no hit, or more than one hit) a warning is raised and the name is passed
# through unchanged.
new_names <- purrr::map_chr(alt_names, function(id) {
  is_match <- purrr::map_lgl(
    df_NG_biomarker_metadata$alternative_names,
    function(candidates) id %in% candidates
  )

  if (sum(is_match) != 1L) {
    warning("Biomarker not found: ", id, call. = FALSE)
    return(id)
  }

  df_NG_biomarker_metadata$machine_readable_name[is_match]
})

# Turn the old names into a new-name -> old-name lookup vector
names(alt_names) <- new_names

# Apply the lookup so the data frame carries machine_readable_names
df_nmr_results_alt_names <- dplyr::rename(df_nmr_results_alt_names, !!alt_names)

# Names of the 12-month NMR biomarker columns: every column from XXL_VLDL_P
# through hsCRP, in data-frame order
nmr_biomarkers <- df_nmr_results_alt_names %>%
  select(XXL_VLDL_P:hsCRP) %>%
  names()

# Names of the 6-month NMR biomarker columns: Total_C_6m through hsCRP_6m
nmr_biomarkers_6m <- df_nmr_results_alt_names %>%
  select(Total_C_6m:hsCRP_6m) %>%
  names()

### Each metabolomic measure is shifted by its own smallest non-zero value so
### that measures which include zero values can be log-transformed later on.

# 12-month measures. NMR.minimumvalues collects one (metabolite, minimum) row
# per biomarker; holder is the one-row buffer appended on every iteration.
NMR.minimumvalues <- data.frame(
  metabolite = character(),
  minimum = double(),
  stringsAsFactors = FALSE
)
holder <- NMR.minimumvalues

for (i in nmr_biomarkers) {
  measured <- df_nmr_results_alt_names[[i]]

  holder[1, 1] <- paste(i)
  # Smallest value that is neither zero nor missing
  holder[1, 2] <- min(measured[measured != 0], na.rm = TRUE)

  NMR.minimumvalues <- rbind(NMR.minimumvalues, holder)
  df_nmr_results_alt_names[[i]] <- measured + holder[1, 2]
}

# 6-month measures. The table is re-initialised here, so once this loop has
# finished NMR.minimumvalues holds the 6-month minima only.
NMR.minimumvalues <- data.frame(
  metabolite = character(),
  minimum = double(),
  stringsAsFactors = FALSE
)
holder <- NMR.minimumvalues

for (i in nmr_biomarkers_6m) {
  measured <- df_nmr_results_alt_names[[i]]

  holder[1, 1] <- paste(i)
  holder[1, 2] <- min(measured[measured != 0], na.rm = TRUE)

  NMR.minimumvalues <- rbind(NMR.minimumvalues, holder)
  df_nmr_results_alt_names[[i]] <- measured + holder[1, 2]
}


# Load the biomarker metadata file named in the header of this script
# (presumably it provides df_NG_biomarker_metadata — TODO confirm)
load("df_NG_biomarker_metadata.rda")

#################### Regression models for primary models ##########

### Frame for calculating the correlation of estimated effects for number of
### infections, GlycA, and hsCRP - Figure 3C, 4B and 4C.
### Pre-allocated with 229 rows of empty strings / zeros; the metabolite and
### infection.beta columns are filled from the primary regression results.
beta.comparison.NMR <- data.frame(
  metabolite = rep("", 229),
  infection.beta = rep(0, 229),
  glyca.beta = rep(0, 229),
  hscrp.beta = rep(0, 229),
  stringsAsFactors = FALSE
)

############# No. of reported infections as exposure - Figure 3A, Source Data 1 #####################

# Build the long-format modelling data for the primary model: keep the
# biomarkers, the exposure (anyinfection4wk12mo) and the covariates,
# log-transform and standardise every biomarker, then reshape to one row per
# participant-biomarker pair.
# The child-sex covariate is `gender`, the column name used by the bootstrap
# of this model and by every other model in this script.
df_long <-
  df_nmr_results_alt_names %>%
  # Select only model variables
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, gestage, hhincome, bfany, mth12collectiontostorage,
    anympsmoke
  ) %>%
  # log-transform biomarkers
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  # Scale biomarkers (mean 0, SD 1)
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop every row with any missing model variable
df_long <- na.omit(df_long)

# Fit one linear model per biomarker and keep the coefficient of the number
# of infections, adjusted for the covariates listed in the formula
df_assoc_per_biomarker <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + gestage + hhincome + bfany +
          mth12collectiontostorage + anympsmoke
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Join this dataset with the metadata in order to obtain the display name
  # of each biomarker
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker$pvalue, method = "BH")

write.csv(df_assoc_per_biomarker, 'BIS_eLife_Figure3_SourceData1_a.csv')

# Store the infection estimates for the later comparison of effect sizes
beta.comparison.NMR$metabolite <- df_assoc_per_biomarker$biomarkerid
beta.comparison.NMR$infection.beta <- df_assoc_per_biomarker$estimate

# Print the distinct biomarker groups available in the metadata
unique(dplyr::pull(df_NG_biomarker_metadata, group))


# Groups to plot; the order of this vector defines the order in which group
# categories are arranged in df_with_groups
group_order <- c(
  "Branched-chain amino acids", "Aromatic amino acids", "Amino acids",
  "Fluid balance", "Fatty acids", "Apolipoproteins",
  "Glycerides and phospholipids", "Cholesterol",
  "Glycolysis related metabolites", "Ketone bodies",
  "Lipoprotein subclasses", "Lipoprotein particle sizes",
  "Relative lipoprotein lipid concentrations", "Inflammation"
)

# Name/group lookup restricted to the wanted groups and sorted by group_order;
# it is right-joined to the results later to set the biomarker order
df_with_groups <-
  df_NG_biomarker_metadata %>%
  dplyr::select(name, group) %>%
  dplyr::filter(group %in% group_order) %>%
  dplyr::arrange(factor(group, levels = group_order))


# Attach the group of every biomarker. df_with_groups is on the right of the
# right_join so that its biomarker order is preserved; rows with any missing
# value (e.g. metadata entries without a result) are then dropped.
df_to_plot <-
  df_assoc_per_biomarker %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand with its own label and group (presumably because
# it has no entry in the metadata — TODO confirm). The row is looked up by
# biomarker id rather than by a fixed row position, so it stays correct if the
# number or order of biomarkers changes.
hscrp <- df_assoc_per_biomarker[df_assoc_per_biomarker$biomarkerid == "hsCRP", ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# Record the plotting order; rownumber is used to re-sort the rows after the
# tables of several models have been stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

# Figure 3A: one forest plot per biomarker group, combined with patchwork.
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  mutate(
    # Apply forestplot to each group, using the group name as the title.
    # Points are marked significant on the BH-adjusted p-value at 0.05.
    # (No trailing comma after the last argument: an empty argument would be
    # forwarded through `...` of forestplot().)
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        xlab = "Difference in 12-month biomarker (SD units) for each infection from birth to 12 months",
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.1, 0.1)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        )
    ),
    # Remove x-axis and legend for all plots except the last one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm"),
                       legend.position = "none"
                     )),
      no = gg_groups
    ),
    # Number of biomarkers in each group
    rel_heights = purrr::map_int(data, nrow)
  )

# Lay the group plots out in two columns with fixed relative row heights
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

ggsave("Figures/BIS_eLife_Figure3A.png", width = 6, height = 6)

# The primary-model table is stored twice under two labels: once as the
# reference for the sensitivity analyses and once as the household-income
# entry of the SEP comparison.
figure_infection_base <- dplyr::mutate(df_linear, model = "Primary model")

df_linear <- dplyr::mutate(df_linear, model = "Household income")
figure_infection_hhincome <- df_linear

###### Bootstrapping - NOTE: computationally heavy
# Restrict to participants with a non-missing number of infections
df_nmr_results_alt_names_infection <-
  df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo), ]

# Biomarkers to bootstrap: the ids fitted in the primary regression above.
# (df_assoc_per_biomarker_bmi is only created by the SEIFA analysis further
# down, so it cannot be used at this point of a fresh run.)
n <- df_assoc_per_biomarker$biomarkerid

library(boot)

set.seed(12345)

# Result table (one row per biomarker) and the one-row buffer appended to it
bootstrap_bca_cis <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  l.ci = double(),
  u.ci = double(),
  stringsAsFactors = FALSE
)
regression <- bootstrap_bca_cis

for (i in n) {

  # Log-transform and standardise the current biomarker in place
  df_nmr_results_alt_names_infection[[i]] <-
    as.numeric(scale(log(df_nmr_results_alt_names_infection[[i]])))

  model_formula <- as.formula(paste(i, "~ anyinfection4wk12mo + age_interview_m12 + gender + gestage + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke"))

  # Statistic for boot(): refit the linear model on the resampled rows and
  # return its coefficient vector. lm() has no iteration limit, so no `maxit`
  # is passed (lm() would only disregard it with a warning on every refit).
  boot.huber <- function(data, indices) {
    mod <- lm(model_formula, data = data[indices, ])
    coefficients(mod)
  }

  # 1000 bootstrap replicates
  duncan.boot <- boot(df_nmr_results_alt_names_infection, boot.huber, 1000)

  # BCa interval for coefficient 2 (the first term after the intercept,
  # anyinfection4wk12mo)
  ci_boot_bca <- boot.ci(duncan.boot, index = 2, type = "bca")

  regression[1, 1] <- i
  regression[1, 2] <- mean(duncan.boot$t[, 2])  # mean bootstrap estimate
  regression[1, 3] <- ci_boot_bca$bca[4]        # lower limit
  regression[1, 4] <- ci_boot_bca$bca[5]        # upper limit

  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

bootstrap_infection_metab_bfany <- bootstrap_bca_cis
write.csv(bootstrap_infection_metab_bfany, "BIS_eLife_Figure3_SourceData1A_bootstrap.csv")


######## Several secondary analyses (comparing SEP measures and various sensitivity analyses) - Supp. File 1A #######
#### SEIFA
# Socioeconomic covariate in this model: seifa_disad.
# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, seifa_disad, bfany, mth12collectiontostorage, anympsmoke,
    gestage
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + seifa_disad + bfany + mth12collectiontostorage +
          anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "SEIFA (area disadvantage)"
figure_infection_seifa <- df_linear

#### Mat. highest education
# Socioeconomic covariate in this model: meducation_t1t2_d.
# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, meducation_t1t2_d, bfany, mth12collectiontostorage,
    anympsmoke, gestage
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + meducation_t1t2_d + bfany +
          mth12collectiontostorage + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "Maternal education"
figure_infection_meducation <- df_linear



### 2ndary exposure analysis
# Household-income model with three additional covariates:
# smokeinroom_bto12m, GDMnew and preeclampsia_yn.
# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke,
    gestage, smokeinroom_bto12m, GDMnew, preeclampsia_yn
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + hhincome + bfany + mth12collectiontostorage +
          anympsmoke + gestage + smokeinroom_bto12m + GDMnew + preeclampsia_yn
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "Primary model + additonal exposures"
figure_infection_secondary <- df_linear



### Breastfeeding duration

# Household-income model with bftomax52wks as the breastfeeding covariate
# (the other models in this script adjust for bfany).
# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, hhincome, bftomax52wks, mth12collectiontostorage,
    anympsmoke, gestage
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + hhincome + bftomax52wks +
          mth12collectiontostorage + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "Primary model with breastfeeding duration adjustment"
figure_infection_BFduration <- df_linear


### Excluding active infection CRP>5

# Keep rows with hsCRP below 5. Rows with a missing hsCRP become all-NA rows
# here and are removed by na.omit() below.
# NOTE(review): hsCRP is one of nmr_biomarkers, so at this point it has already
# been shifted by its minimum non-zero value, and a value of exactly 5 is
# excluded as well — confirm this is the intended cut-off.
df_nmr_results_alt_names_hsCRPlessthan5 <- df_nmr_results_alt_names[(df_nmr_results_alt_names$hsCRP<5),]

# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names_hsCRPlessthan5 %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke,
    gestage
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + hhincome + bfany + mth12collectiontostorage +
          anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "Primary model excluding hsCRP>5"
figure_infection_CRPexcl <- df_linear

### Excluding twins

# Drop every child whose idchild contains "-1" or "-2".
# NOTE(review): presumably twins carry a "-1"/"-2" suffix in idchild; the
# pattern matches anywhere in the id — confirm against the id format.
df_nmr_results_alt_names_notwins <-
  df_nmr_results_alt_names[!grepl("-[12]", df_nmr_results_alt_names$idchild), ]

# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names_notwins %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke,
    gestage
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + hhincome + bfany + mth12collectiontostorage +
          anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "Primary model excluding twins"
figure_infection_twinexcl <- df_linear


############# Excluding Processing time of >4hr

# Keep samples whose collection-to-storage time is at most 0.16666.
# NOTE(review): 0.16666 is presumably 4 hours expressed in days
# (4 / 24 = 0.1666...); a value of exactly 4/24 would fall just above this
# threshold — confirm units and the intended boundary.
df_nmr_results_alt_names_4hr <- df_nmr_results_alt_names[(df_nmr_results_alt_names$mth12collectiontostorage<=0.16666),]

# Keep the model variables, log-transform and standardise every biomarker,
# reshape to long format and drop rows with any missing value.
df_long <-
  df_nmr_results_alt_names_4hr %>%
  dplyr::select(
    nmr_biomarkers, anyinfection4wk12mo, age_interview_m12, gender,
    z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke,
    gestage
  ) %>%
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = log) %>%
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers) %>%
  na.omit()

# One linear model per biomarker; keep the coefficient of the number of
# infections, adjusted for the covariates in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk12mo + age_interview_m12 + gender +
          z_scores_birth + hhincome + bfany + mth12collectiontostorage +
          anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk12mo
  ) %>%
  # Add the display name of each biomarker from the metadata
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# df_with_groups is on the right of the join to preserve its biomarker order
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  na.omit()

# hsCRP is appended by hand; the row is looked up by biomarker id instead of
# a fixed row position so it stays correct if the biomarker set changes
hscrp <- df_assoc_per_biomarker_bmi[
  df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP",
]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups for plotting; groups not listed here sort last
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Drop lipoprotein subclasses, glucose, lactate and the "_pct" measures
df_linear <-
  df_to_plot %>%
  dplyr::filter(
    group != "Lipoprotein subclasses",
    !name %in% c("Glucose", "Lactate"),
    !grepl("_pct", biomarkerid)
  )

# rownumber records the plotting order for re-sorting after models are stacked
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear <- dplyr::arrange(df_linear, rownumber)

df_linear$model <- "Primary model excluding >4hr blood processing time"
figure_infection_4hrprocessing <- df_linear

##### Combination figure - sensitivity - Supp File 1A (a)

# Stack the primary model and the five sensitivity models into one table
df_linear <- rbind(
  figure_infection_base,
  figure_infection_secondary,
  figure_infection_CRPexcl,
  figure_infection_twinexcl,
  figure_infection_4hrprocessing,
  figure_infection_BFduration
)

# Turn the model label into a factor, then re-level it with the display order
# picked by the index vector (indices refer to the sorted default levels)
df_linear$model <- as.factor(df_linear$model)
df_linear$model <- factor(
  df_linear$model,
  levels = levels(df_linear$model)[c(6, 3, 4, 5, 2, 1)]
)

# Bring the rows of each biomarker together again, in plotting order
df_linear <- dplyr::arrange(df_linear, rownumber)

# Supp File 1A (a): one forest plot per biomarker group, coloured by model,
# combined with patchwork.
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  mutate(
    # Apply forestplot to each group, using the group name as the title.
    # Points are marked significant on the BH-adjusted p-value at 0.05.
    # (No trailing comma after the last argument: an empty argument would be
    # forwarded through `...` of forestplot().)
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        colour = model,
        xlab = "Difference in 12-month biomarker (SD units) for each infection from birth to 12 months",
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.1, 0.1)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        )
    ),
    # Remove x-axis and legend for all plots except the last one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm"),
                       legend.position = "none"
                     )),
      no = gg_groups
    ),
    # Number of rows (biomarker-model combinations) in each group
    rel_heights = purrr::map_int(data, nrow)
  )

# Lay the group plots out in two columns with fixed relative row heights
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

ggsave("Figures/BIS_eLife_SuppFile1A_a.png", width = 10, height = 18)


##### Combination figure - SEP measures - Supp File 1A (b)

# Stack the result tables of the three socioeconomic-position models; each
# table already carries its own `model` label and `rownumber` display order.
df_linear <- rbind(
  figure_infection_hhincome,
  figure_infection_seifa,
  figure_infection_meducation
)

# Convert the label to a factor, then reverse the alphabetically sorted levels
# so the models appear in the wanted order in the plot. The indices are
# positional: revisit them if any label changes.
df_linear$model <- as.factor(df_linear$model)
df_linear$model <- factor(
  df_linear$model,
  levels = levels(df_linear$model)[c(3, 2, 1)]
)

# Restore the shared biomarker display order after stacking
df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Text sizes applied to every panel
panel_text_theme <- ggplot2::theme(
  plot.title = ggplot2::element_text(size = 9),
  axis.title.x = ggplot2::element_text(size = 8),
  axis.text.x = ggplot2::element_text(size = 8),
  axis.text.y = ggplot2::element_text(size = 8),
  legend.text = ggplot2::element_text(size = 8)
)

# Applied to every panel except the last one: hide the x-axis and the legend
bare_x_axis_theme <- ggplot2::theme(
  axis.text.x = ggplot2::element_blank(),
  axis.title.x = ggplot2::element_blank(),
  axis.ticks.x = ggplot2::element_blank(),
  plot.margin = ggplot2::unit(c(1, 2, 1, 2), "mm"),
  legend.position = "none"
)

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in the `data` column
  tidyr::nest(data = -group) %>%
  dplyr::mutate(
    # One forest plot per group, coloured by model
    gg_groups = purrr::map2(
      data, group,
      function(group_data, group_title) {
        forestplot(
          df = group_data,
          pvalue = adj_pvalue,
          psignif = 0.05,
          colour = model,
          xlab = "Difference in 12-month biomarker (SD units) for each infection from birth to 12 months",
          title = group_title
        ) +
          ggplot2::coord_cartesian(xlim = c(-0.1, 0.1)) +
          ggplot2::scale_color_brewer(palette = "Dark2") +
          panel_text_theme
      }
    ),
    # Only the final panel keeps its x-axis and legend
    gg_groups = purrr::map_if(
      gg_groups,
      row_number() != n(),
      function(panel) panel + bare_x_axis_theme
    ),
    # Number of biomarkers per group (kept for reference; the layout below
    # uses hand-set heights instead)
    rel_heights = purrr::map_int(data, nrow)
  )

# Two-column layout; `heights` gives one relative height per layout row
# (presumably tuned by hand for the groups in this figure).
fig_supp_file_1a_b <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
print(fig_supp_file_1a_b)

# Pass the plot explicitly: ggsave() otherwise saves last_plot(), which is only
# updated when a plot is printed and so is stale when this script is source()d.
ggsave(
  "Figures/BIS_eLife_SuppFile1A_b.png",
  plot = fig_supp_file_1a_b,
  width = 8,
  height = 12
)



###### 6m to 12m infections, adjusted for 6m infections and metabs - Figure 3-figure supplement 2a, Source Data 3 ######

df_nmr_results_alt_names_loop <- df_nmr_results_alt_names

# Biomarkers analysed here: every NMR biomarker except four that are dropped.
# NOTE(review): presumably these four have no 6-month ("_6m") column - confirm.
nmr_biomarkers_6mto12mNMR <- nmr_biomarkers[
  !(nmr_biomarkers %in% c("HDL2_C", "HDL3_C", "Total_CE", "Total_FC"))
]

# Table used to compare effect sizes across exposures. It is sized from the
# biomarker list (the original hard-coded 225 rows) so the assignments at the
# end of this section cannot fail on a length mismatch.
n_biomarkers_6mto12m <- length(nmr_biomarkers_6mto12mNMR)
beta.comparison.MS_6mto12m <- data.frame(
  metabolite = character(n_biomarkers_6mto12m),
  infection.beta = double(n_biomarkers_6mto12m),
  glyca.beta = double(n_biomarkers_6mto12m),
  hscrp.beta = double(n_biomarkers_6mto12m),
  stringsAsFactors = FALSE
)

# Accumulator for one result row per biomarker, plus a one-row scratch copy
results_6mto12m <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  se = double(),
  pvalue = double(),
  stringsAsFactors = FALSE
)
regression <- results_6mto12m

for (i in nmr_biomarkers_6mto12mNMR) {

  # Name of the matching 6-month measurement of biomarker `i`
  j <- paste0(i, "_6m")

  # Keep rows with a 12-month value, then log-transform and z-score it
  df_6mto12m_loop <- df_nmr_results_alt_names_loop[!is.na(df_nmr_results_alt_names_loop[[i]]), ]
  df_6mto12m_loop[[i]] <- as.numeric(scale(log(df_6mto12m_loop[[i]])))

  # Of those, keep rows with a 6-month value and transform it the same way.
  # The 12-month z-scores above were computed before this second filter.
  df_6mto12m_loop <- df_6mto12m_loop[!is.na(df_6mto12m_loop[[j]]), ]
  df_6mto12m_loop[[j]] <- as.numeric(scale(log(df_6mto12m_loop[[j]])))

  # Both biomarker names are backtick-quoted so non-syntactic column names
  # still parse. The exposure of interest is deliberately the first term.
  model_formula <- as.formula(paste0(
    "`", i, "` ~ anyinfection9mo12mo + anyinfection4wk6mo + `", j, "` + ",
    "age_interview_m12 + age_interview_m6 + gender + z_scores_birth + ",
    "hhincome + bfany + mth12collectiontostorage + ",
    "mth6CollectiontoStorage_hr + anympsmoke + gestage"
  ))
  expData <- lm(model_formula, data = df_6mto12m_loop)

  # Row 2 of the coefficient table is the first model term
  # (anyinfection9mo12mo); columns 1, 2 and 4 are the estimate, its standard
  # error and the p-value.
  coef_table <- summary(expData)$coefficients
  regression[1, 1] <- i
  regression[1, 2] <- coef_table[2, 1]
  regression[1, 3] <- coef_table[2, 2]
  regression[1, 4] <- coef_table[2, 4]

  results_6mto12m <- rbind(results_6mto12m, regression)
}

# Attach the display name of each biomarker from the ggforestplot metadata
df_assoc_per_biomarker_bmi <- results_6mto12m %>%
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

write.csv(df_assoc_per_biomarker_bmi, 'BIS_eLife_Fig3_SourceData3A.csv')

beta.comparison.MS_6mto12m$metabolite <- df_assoc_per_biomarker_bmi$biomarkerid
beta.comparison.MS_6mto12m$infection.beta <- df_assoc_per_biomarker_bmi$estimate


# Attach the biomarker groups. right_join with df_with_groups on the right
# preserves the biomarker order that table specifies; rows with any missing
# value (e.g. biomarkers with no regression result) are then dropped.
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Add the row at position 225 of the results back by hand under the
# "Inflammation" group; it has no display name from the metadata join and is
# therefore lost in the steps above.
# NOTE(review): assumes row 225 is hsCRP, i.e. it depends on the order of the
# biomarker list - confirm if that list changes.
hscrp <- df_assoc_per_biomarker_bmi[225, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups as they should appear in the figure
group_display_order <- c(
  "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
  "Glycerides and phospholipids", "Cholesterol",
  "Glycolysis related metabolites", "Lipoprotein particle sizes",
  "Fluid balance", "Inflammation"
)
df_to_plot <- df_to_plot %>%
  arrange(match(group, group_display_order))

# Drop the "Lipoprotein subclasses" group, glucose, lactate and every
# biomarker whose id contains "_pct". `%in%` (rather than `!=`) means a
# missing value can never become an all-NA phantom row.
keep_row <-
  !(df_to_plot$group %in% "Lipoprotein subclasses") &
  !(df_to_plot$name %in% c("Glucose", "Lactate")) &
  !grepl("_pct", df_to_plot$biomarkerid)
df_linear <- df_to_plot[keep_row, ]

# Record the display order; seq_len() stays valid even for an empty table
df_linear$rownumber <- seq_len(nrow(df_linear))

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Text sizes applied to every panel
panel_text_theme <- ggplot2::theme(
  plot.title = ggplot2::element_text(size = 9),
  axis.title.x = ggplot2::element_text(size = 8),
  axis.text.x = ggplot2::element_text(size = 8),
  axis.text.y = ggplot2::element_text(size = 8),
  legend.text = ggplot2::element_text(size = 8)
)

# Applied to every panel except the last one: hide the x-axis and the legend
bare_x_axis_theme <- ggplot2::theme(
  axis.text.x = ggplot2::element_blank(),
  axis.title.x = ggplot2::element_blank(),
  axis.ticks.x = ggplot2::element_blank(),
  plot.margin = ggplot2::unit(c(1, 2, 1, 2), "mm"),
  legend.position = "none"
)

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in the `data` column
  tidyr::nest(data = -group) %>%
  dplyr::mutate(
    # One forest plot per group
    gg_groups = purrr::map2(
      data, group,
      function(group_data, group_title) {
        forestplot(
          df = group_data,
          pvalue = adj_pvalue,
          psignif = 0.05,
          xlab = "Difference in 12-month biomarker (SD units) for each infection from 6 to 12 months,\nadjusted for infections from birth to 6 months and 6-month metabolomic measures",
          title = group_title
        ) +
          ggplot2::coord_cartesian(xlim = c(-0.15, 0.15)) +
          ggplot2::scale_color_brewer(palette = "Dark2") +
          panel_text_theme
      }
    ),
    # Only the final panel keeps its x-axis and legend
    gg_groups = purrr::map_if(
      gg_groups,
      row_number() != n(),
      function(panel) panel + bare_x_axis_theme
    ),
    # Number of biomarkers per group (kept for reference; the layout below
    # uses hand-set heights instead)
    rel_heights = purrr::map_int(data, nrow)
  )

# Two-column layout; `heights` gives one relative height per layout row
# (presumably tuned by hand for the groups in this figure).
fig3_figsupp_2a <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
print(fig3_figsupp_2a)

# Pass the plot explicitly: ggsave() otherwise saves last_plot(), which is only
# updated when a plot is printed and so is stale when this script is source()d.
ggsave(
  "Figures/BIS_eLife_Fig3_FigSupp2A.png",
  plot = fig3_figsupp_2a,
  width = 6,
  height = 6
)


############# Bootstrapping ###########

library(boot)

set.seed(12345)

# Accumulator for one bootstrap summary row per biomarker, plus a one-row
# scratch copy that is refilled on every iteration
bootstrap_bca_cis <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  l.ci = double(),
  u.ci = double(),
  stringsAsFactors = FALSE
)
regression <- bootstrap_bca_cis

for (i in nmr_biomarkers_6mto12mNMR) {

  # Name of the matching 6-month measurement of biomarker `i`
  j <- paste0(i, "_6m")

  # Same preparation as the main 6m-to-12m regressions: keep rows with a
  # 12-month value, log + z-score it, then do the same for the 6-month value
  df_6mto12m_loop <- df_nmr_results_alt_names_loop[!is.na(df_nmr_results_alt_names_loop[[i]]), ]
  df_6mto12m_loop[[i]] <- as.numeric(scale(log(df_6mto12m_loop[[i]])))
  df_6mto12m_loop <- df_6mto12m_loop[!is.na(df_6mto12m_loop[[j]]), ]
  df_6mto12m_loop[[j]] <- as.numeric(scale(log(df_6mto12m_loop[[j]])))

  # Bootstrap statistic: refit the linear model on the resampled rows and
  # return its coefficient vector. `maxit` is accepted only so existing calls
  # that pass it keep working; it is no longer forwarded to lm(), which has no
  # such argument and made lm.fit() warn about it on every replicate.
  boot.huber <- function(data, indices, maxit = 20) {
    resampled <- data[indices, ] # select obs. in bootstrap sample
    mod <- lm(
      as.formula(paste0(
        "`", i, "` ~ anyinfection9mo12mo + anyinfection4wk6mo + `", j, "` + ",
        "age_interview_m12 + age_interview_m6 + gender + z_scores_birth + ",
        "hhincome + bfany + mth12collectiontostorage + ",
        "mth6CollectiontoStorage_hr + anympsmoke + gestage"
      )),
      data = resampled
    )
    coefficients(mod) # return coefficient vector
  }

  # 1000 bootstrap replicates
  duncan.boot <- boot(df_6mto12m_loop, boot.huber, R = 1000)

  # BCa interval for coefficient 2, the first model term (anyinfection9mo12mo)
  ci_boot_bca <- boot.ci(duncan.boot, index = 2, type = "bca")

  regression[1, 1] <- i
  # Point estimate reported is the mean of the bootstrap replicates
  regression[1, 2] <- mean(duncan.boot$t[, 2])
  # Elements 4 and 5 of the bca row are the lower and upper interval limits
  regression[1, 3] <- ci_boot_bca$bca[4]
  regression[1, 4] <- ci_boot_bca$bca[5]

  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

BIS_bootstrap_6mto12minfection_12mmetab_R1 <- bootstrap_bca_cis
write.csv(BIS_bootstrap_6mto12minfection_12mmetab_R1, "BIS_eLife_Figure3_SourceData3A_bootstrap.csv")



############# GlycA as exposure - Figure 3B, Source Data 1B #####################

# Exposure: log-transformed, z-scored GlycA
df_nmr_results_alt_names$gp_exposure <- as.numeric(scale(log(df_nmr_results_alt_names$GlycA)))

# Restrict to participants with infection data (anyinfection4wk12mo present)
df_nmr_results_alt_names_trimmed <- df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo), ]


# Select only variables to be used for the model and collapse to a long data
# format.
# NOTE(review): this model uses `sex`, whereas the bootstrap of the same model
# and every secondary model in this script use `gender` - confirm both columns
# exist and are coded identically.
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  # Select only model variables; all_of() makes explicit that nmr_biomarkers
  # is an external character vector, not a column
  dplyr::select(
    dplyr::all_of(nmr_biomarkers),
    gp_exposure, age_interview_m12, sex, z_scores_birth, gestage, hhincome,
    bfany, mth12collectiontostorage, anympsmoke
  ) %>%
  # Log-transform, then z-score, every biomarker
  dplyr::mutate(
    dplyr::across(dplyr::all_of(nmr_biomarkers), ~ as.numeric(scale(log(.x))))
  ) %>%
  # Collapse to a long data format. gather() is kept on purpose: it stacks the
  # biomarkers in column order, which the positional row lookups applied to
  # the regression results further down rely on.
  tidyr::gather(
    key = biomarkerid,
    value = biomarkervalue,
    dplyr::all_of(nmr_biomarkers)
  )

df_long <- na.omit(df_long)

# Estimate the covariate-adjusted association of GlycA with each biomarker
df_assoc_per_biomarker <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      biomarkervalue ~ gp_exposure + age_interview_m12 + sex + z_scores_birth +
        gestage + hhincome + bfany + mth12collectiontostorage + anympsmoke,
    key = biomarkerid,
    predictor = gp_exposure
  ) %>%
  # Join this dataset with the grouping data in order to choose a different
  # biomarker naming option
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker$adj_pvalue <- p.adjust(df_assoc_per_biomarker$pvalue, method = "BH")

# Written as Source Data 1B, matching this section's header and the bootstrap
# file for the same model ("..._SourceData1B_bootstrap.csv"); the previous
# file name said 1A.
write.csv(df_assoc_per_biomarker, 'BIS_eLife_Figure3_SourceData1B.csv')

beta.comparison.NMR$glyca.beta <- df_assoc_per_biomarker$estimate

# Attach the biomarker groups. right_join with df_with_groups on the right
# preserves the biomarker order that table specifies; rows with any missing
# value (e.g. biomarkers with no regression result) are then dropped.
df_to_plot <-
  df_assoc_per_biomarker %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# The row at position 229 of the results is added back by hand under the
# "Inflammation" group; it has no display name from the metadata join and is
# therefore lost in the steps above.
# NOTE(review): assumes row 229 is hsCRP, i.e. it depends on the order of
# nmr_biomarkers - confirm if that list changes.
hscrp <- df_assoc_per_biomarker[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

# GlycA is the exposure here, so its own row is not plotted as an outcome
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "GlycA", ]

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the groups as they should appear in the figure
group_display_order <- c(
  "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
  "Glycerides and phospholipids", "Cholesterol",
  "Glycolysis related metabolites", "Lipoprotein particle sizes",
  "Fluid balance", "Inflammation"
)
df_to_plot <- df_to_plot %>%
  arrange(match(group, group_display_order))

# Drop the "Lipoprotein subclasses" group, glucose, lactate and every
# biomarker whose id contains "_pct". `%in%` (rather than `!=`) means a
# missing value can never become an all-NA phantom row.
keep_row <-
  !(df_to_plot$group %in% "Lipoprotein subclasses") &
  !(df_to_plot$name %in% c("Glucose", "Lactate")) &
  !grepl("_pct", df_to_plot$biomarkerid)
df_linear <- df_to_plot[keep_row, ]

# Record the display order (seq_len() stays valid even for an empty table)
# and tag the exposure, which the figure maps to the point shape
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "GlycA"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Text sizes applied to every panel; no panel of this figure shows a legend
panel_text_theme <- ggplot2::theme(
  plot.title = ggplot2::element_text(size = 9),
  axis.title.x = ggplot2::element_text(size = 8),
  axis.text.x = ggplot2::element_text(size = 8),
  axis.text.y = ggplot2::element_text(size = 8),
  legend.text = ggplot2::element_text(size = 8),
  legend.position = "none"
)

# Applied to every panel except the last one: hide the x-axis
bare_x_axis_theme <- ggplot2::theme(
  axis.text.x = ggplot2::element_blank(),
  axis.title.x = ggplot2::element_blank(),
  axis.ticks.x = ggplot2::element_blank(),
  plot.margin = ggplot2::unit(c(1, 2, 1, 2), "mm"),
  legend.position = "none"
)

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in the `data` column
  tidyr::nest(data = -group) %>%
  dplyr::mutate(
    # One forest plot per group; the exposure is mapped to the point shape
    gg_groups = purrr::map2(
      data, group,
      function(group_data, group_title) {
        forestplot(
          df = group_data,
          pvalue = adj_pvalue,
          psignif = 0.05,
          shape = exposure,
          xlab = "Difference in 12-month biomarker (SD units) per 1 SD higher 12-month log GlycA",
          title = group_title
        ) +
          ggplot2::coord_cartesian(xlim = c(-0.75, 0.75)) +
          # Shape 22 for the single exposure level
          ggplot2::scale_shape_manual(
            values = c(22L),
            labels = c("GlycA")
          ) +
          ggplot2::scale_color_brewer(palette = "Dark2") +
          panel_text_theme
      }
    ),
    # Only the final panel keeps its x-axis
    gg_groups = purrr::map_if(
      gg_groups,
      row_number() != n(),
      function(panel) panel + bare_x_axis_theme
    ),
    # Number of biomarkers per group (kept for reference; the layout below
    # uses hand-set heights instead)
    rel_heights = purrr::map_int(data, nrow)
  )

# Two-column layout; `heights` gives one relative height per layout row
# (presumably tuned by hand for the groups in this figure).
figure_3b <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
print(figure_3b)

# Pass the plot explicitly: ggsave() otherwise saves last_plot(), which is only
# updated when a plot is printed and so is stale when this script is source()d.
ggsave(
  "Figures/BIS_eLife_Figure3B.png",
  plot = figure_3b,
  width = 6,
  height = 6
)



# The primary GlycA model already adjusts for household income, so the same
# estimates are stored twice under two different labels (presumably one for
# the sensitivity comparison and one for the SEP comparison).
figure_GlycA_base <- df_linear
figure_GlycA_base$model <- "Primary model"

figure_GlycA_hhincome <- df_linear
figure_GlycA_hhincome$model <- "Household income"

# df_linear itself ends up carrying the second label
df_linear$model <- "Household income"

###### Bootstrapping ############

# Restrict to participants with infection data (anyinfection4wk12mo present)
df_nmr_results_alt_names_infection <- df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo), ]


library(boot)

set.seed(12345)

# Accumulator for one bootstrap summary row per biomarker, plus a one-row
# scratch copy that is refilled on every iteration
bootstrap_bca_cis <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  l.ci = double(),
  u.ci = double(),
  stringsAsFactors = FALSE
)
regression <- bootstrap_bca_cis

# NOTE(review): `n` is not defined in this part of the script. It is presumably
# the biomarker name vector, with element 228 (GlycA, the exposure) dropped -
# confirm it is defined earlier, otherwise `n` resolves to dplyr::n.
for (i in n[-228]) {

  # Log-transform and z-score the outcome in place (each column is visited
  # once, so it is transformed once)
  df_nmr_results_alt_names_infection[[i]] <- as.numeric(scale(log(df_nmr_results_alt_names_infection[[i]])))

  # Bootstrap statistic: refit the linear model on the resampled rows and
  # return its coefficient vector. `maxit` is accepted only so existing calls
  # that pass it keep working; it is no longer forwarded to lm(), which has no
  # such argument and made lm.fit() warn about it on every replicate.
  # The outcome name is backtick-quoted so non-syntactic column names parse.
  boot.huber <- function(data, indices, maxit = 20) {
    resampled <- data[indices, ] # select obs. in bootstrap sample
    mod <- lm(
      as.formula(paste0(
        "`", i, "` ~ gp_exposure + age_interview_m12 + gender + ",
        "z_scores_birth + gestage + hhincome + bfany + ",
        "mth12collectiontostorage + anympsmoke"
      )),
      data = resampled
    )
    coefficients(mod) # return coefficient vector
  }

  # 1000 bootstrap replicates
  duncan.boot <- boot(df_nmr_results_alt_names_infection, boot.huber, R = 1000)

  # BCa interval for coefficient 2, the first model term (gp_exposure)
  ci_boot_bca <- boot.ci(duncan.boot, index = 2, type = "bca")

  regression[1, 1] <- i
  # Point estimate reported is the mean of the bootstrap replicates
  regression[1, 2] <- mean(duncan.boot$t[, 2])
  # Elements 4 and 5 of the bca row are the lower and upper interval limits
  regression[1, 3] <- ci_boot_bca$bca[4]
  regression[1, 4] <- ci_boot_bca$bca[5]

  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

bootstrap_GlycA_metab_bfany_infectionsampleonly <- bootstrap_bca_cis
write.csv(bootstrap_GlycA_metab_bfany_infectionsampleonly, "BIS_eLife_Figure3_SourceData1B_bootstrap.csv")



######## Several secondary analyses (comparing SEP measures and various sensitivity analyses) - Supp. File 1B #######

# Fit one GlycA-exposure model across all NMR biomarkers and build the table
# used for the combined forest plots. The six analyses below differ only in
# their input rows, covariate set and label, so they share this helper.
#
# Arguments:
#   data        - participant-level data containing the nmr_biomarkers
#                 columns, `gp_exposure` and every column named in `covariates`
#   covariates  - character vector of adjustment variables, in model order
#   model_label - label stored in the `model` column of the plot table
#   hscrp_row   - row of the regression results that is re-added by hand under
#                 the "Inflammation" group.
#                 NOTE(review): 229 assumes hsCRP is the 229th biomarker in
#                 nmr_biomarkers - confirm if that list changes.
#
# Returns a list with every intermediate of the original inline code:
#   long (model input), assoc (regression results), to_plot, hscrp, and
#   linear (the final plot table).
#
# Reads the script-level objects nmr_biomarkers, df_NG_biomarker_metadata and
# df_with_groups.
fit_glyca_model <- function(data, covariates, model_label, hscrp_row = 229) {
  # Select only the model variables, log-transform and z-score every
  # biomarker, then collapse to a long format. gather() is kept on purpose: it
  # stacks the biomarkers in column order, which `hscrp_row` relies on.
  long <-
    data %>%
    dplyr::select(
      dplyr::all_of(nmr_biomarkers),
      dplyr::all_of(c("gp_exposure", covariates))
    ) %>%
    dplyr::mutate(
      dplyr::across(
        dplyr::all_of(nmr_biomarkers),
        ~ as.numeric(scale(log(.x)))
      )
    ) %>%
    tidyr::gather(
      key = biomarkerid,
      value = biomarkervalue,
      dplyr::all_of(nmr_biomarkers)
    )
  long <- na.omit(long)

  # Covariate-adjusted association of GlycA with each biomarker, joined with
  # the metadata to obtain the biomarker display names
  assoc <-
    ggforestplot::discovery_regression(
      df_long = long,
      model = "lm",
      formula = stats::reformulate(
        c("gp_exposure", covariates),
        response = "biomarkervalue"
      ),
      key = biomarkerid,
      predictor = gp_exposure
    ) %>%
    dplyr::left_join(
      dplyr::select(
        df_NG_biomarker_metadata,
        name,
        biomarkerid = machine_readable_name
      ),
      by = "biomarkerid"
    )

  # Benjamini-Hochberg adjustment across all biomarkers
  assoc$adj_pvalue <- p.adjust(assoc$pvalue, method = "BH")

  # right_join with df_with_groups on the right preserves the biomarker order
  # it specifies; rows with any missing value are then dropped
  to_plot <- na.omit(dplyr::right_join(assoc, df_with_groups, by = "name"))

  # This row has no display name from the metadata join, so it is re-added by
  # hand
  hscrp <- assoc[hscrp_row, ]
  hscrp$name <- "C-reactive protein*"
  hscrp$group <- "Inflammation"

  # GlycA is the exposure, so its own row is not plotted as an outcome
  to_plot <- rbind(to_plot[to_plot$biomarkerid != "GlycA", ], hscrp)

  # Order the groups as they should appear in the figure
  group_display_order <- c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )
  to_plot <- dplyr::arrange(to_plot, match(group, group_display_order))

  # Drop the "Lipoprotein subclasses" group, glucose, lactate and every
  # biomarker whose id contains "_pct"; `%in%` never yields NA, so no all-NA
  # phantom rows can appear
  keep_row <-
    !(to_plot$group %in% "Lipoprotein subclasses") &
    !(to_plot$name %in% c("Glucose", "Lactate")) &
    !grepl("_pct", to_plot$biomarkerid)
  linear <- to_plot[keep_row, ]

  # Display order (seq_len() is safe for an empty table), exposure tag and
  # model label
  linear$rownumber <- seq_len(nrow(linear))
  linear$exposure <- "GlycA"
  linear <- dplyr::arrange(linear, rownumber)
  linear$model <- model_label

  list(long = long, assoc = assoc, to_plot = to_plot, hscrp = hscrp, linear = linear)
}

# Covariates of the primary model, in the order used throughout this section
glyca_primary_covariates <- c(
  "age_interview_m12", "gender", "z_scores_birth", "hhincome", "bfany",
  "mth12collectiontostorage", "anympsmoke", "gestage"
)

##### SEIFA
# Primary model with seifa_disad in place of household income
figure_GlycA_SEIFA <- fit_glyca_model(
  data = df_nmr_results_alt_names_trimmed,
  covariates = c(
    "age_interview_m12", "gender", "z_scores_birth", "seifa_disad", "bfany",
    "mth12collectiontostorage", "anympsmoke", "gestage"
  ),
  model_label = "SEIFA (area disadvantage)"
)$linear


##### Maternal education
# Primary model with meducation_t1t2_d in place of household income
figure_GlycA_meducation <- fit_glyca_model(
  data = df_nmr_results_alt_names_trimmed,
  covariates = c(
    "age_interview_m12", "gender", "z_scores_birth", "meducation_t1t2_d",
    "bfany", "mth12collectiontostorage", "anympsmoke", "gestage"
  ),
  model_label = "Maternal education"
)$linear


### Breastfeeding duration
# bftomax52wks replaces bfany.
# NOTE(review): this model also adjusts for the three additional exposures of
# the secondary-covariate model (smokeinroom_bto12m, GDMnew, preeclampsia_yn)
# although its label only mentions breastfeeding duration - confirm intended.
figure_GlycA_BFduration <- fit_glyca_model(
  data = df_nmr_results_alt_names_trimmed,
  covariates = c(
    "age_interview_m12", "gender", "z_scores_birth", "hhincome",
    "bftomax52wks", "mth12collectiontostorage", "anympsmoke", "gestage",
    "smokeinroom_bto12m", "GDMnew", "preeclampsia_yn"
  ),
  model_label = "Primary model with breastfeeding duration adjustment"
)$linear


### 2ndary covariate models
# Primary model plus three additional exposures.
# NOTE(review): the label below is misspelled ("additonal"); it is kept
# byte-for-byte because other parts of the script may depend on it - confirm
# before correcting.
figure_GlycA_secondary <- fit_glyca_model(
  data = df_nmr_results_alt_names_trimmed,
  covariates = c(
    glyca_primary_covariates,
    "smokeinroom_bto12m", "GDMnew", "preeclampsia_yn"
  ),
  model_label = "Primary model + additonal exposures"
)$linear

### Excluding active infection CRP>5

# which() drops participants with missing hsCRP instead of turning them into
# all-NA rows; those rows never reached the models (na.omit removed them), so
# the estimates are unchanged.
df_nmr_results_alt_names_hsCRPlessthan5 <- df_nmr_results_alt_names_trimmed[
  which(df_nmr_results_alt_names_trimmed$hsCRP < 5),
]

figure_GlycA_CRPexcl <- fit_glyca_model(
  data = df_nmr_results_alt_names_hsCRPlessthan5,
  covariates = glyca_primary_covariates,
  model_label = "Primary model excluding hsCRP>5"
)$linear

### Excluding twins

# Drop every child whose id contains "-1" or "-2"
# NOTE(review): presumably these suffixes mark twins - confirm the id format,
# since the pattern matches anywhere in the id.
df_nmr_results_alt_names_notwins <- df_nmr_results_alt_names_trimmed[
  !grepl("-1|-2", df_nmr_results_alt_names_trimmed$idchild),
]

fit_glyca_twinexcl <- fit_glyca_model(
  data = df_nmr_results_alt_names_notwins,
  covariates = glyca_primary_covariates,
  model_label = "Primary model excluding twins"
)
figure_GlycA_twinexcl <- fit_glyca_twinexcl$linear

# Leave the script-level intermediates holding the last (twin-exclusion) fit,
# exactly as the original inline code did, in case later code reads them.
df_long <- fit_glyca_twinexcl$long
df_assoc_per_biomarker_bmi <- fit_glyca_twinexcl$assoc
df_to_plot <- fit_glyca_twinexcl$to_plot
hscrp <- fit_glyca_twinexcl$hscrp
df_linear <- fit_glyca_twinexcl$linear


############# Processing time of <=4hr

# Sensitivity analysis: refit the primary GlycA model using only samples with
# a collection-to-storage value of at most 0.16666.
# NOTE(review): presumably `mth12collectiontostorage` is in days, so the
# cut-off approximates 4 h; 0.16666 is slightly below 4/24, so a value stored
# as exactly 4/24 would be excluded - confirm this is intended.
df_nmr_results_alt_names_4hr <-
  df_nmr_results_alt_names_trimmed[
    (df_nmr_results_alt_names_trimmed$mth12collectiontostorage <= 0.16666),
  ]

# Keep only the model variables, transform the biomarkers and collapse to a
# long format (one row per participant x biomarker).
df_long <-
  df_nmr_results_alt_names_4hr %>%
  dplyr::select(
    nmr_biomarkers, gp_exposure, age_interview_m12, gender, z_scores_birth,
    hhincome, bfany, mth12collectiontostorage, anympsmoke, gestage
  ) %>%
  # Log-transform, then standardise, every biomarker column. Plain functions
  # are used instead of the deprecated dplyr::funs() wrapper.
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = log) %>%
  dplyr::mutate_at(
    .vars = c(nmr_biomarkers),
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop any row with a missing value.
df_long <- na.omit(df_long)

# Estimate, per biomarker, the covariate-adjusted association with the
# exposure column `gp_exposure` (labelled "GlycA" below).
# (The `_bmi` suffix of the result name is a leftover from the package
# vignette; no BMI variable is involved.)
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ gp_exposure + age_interview_m12 + gender +
          z_scores_birth + hhincome + bfany + mth12collectiontostorage +
          anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = gp_exposure
  ) %>%
  # Attach the display name of each biomarker from the package metadata.
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers of this model.
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 229 of the regression output is relabelled as C-reactive protein and
# placed in the "Inflammation" group.
# NOTE(review): position-based lookup - presumably row 229 is the hsCRP
# result; confirm whenever the biomarker list changes.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

# GlycA is dropped from the outcomes before the extra row is appended.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "GlycA", ]

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "GlycA"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Label the rows with the model they came from and keep them for the
# combined sensitivity figure.
df_linear$model <- "Primary model excluding >4hr blood processing time"
figure_GlycA_4hrprocessing <- df_linear




##### Combination figure - sensitivity - Supp File 1B (a)

# Stack the result tables of the six GlycA models so they can be drawn in a
# single forest plot, distinguished by the `model` column.
df_linear <- rbind(
  figure_GlycA_base, figure_GlycA_secondary, figure_GlycA_CRPexcl,
  figure_GlycA_twinexcl, figure_GlycA_4hrprocessing, figure_GlycA_BFduration
)

# Convert to factor for ordering
df_linear$model <- as.factor(df_linear$model)

# Re-order the factor levels by position so the models appear in the intended
# order in the plot (as.factor() sorts the labels alphabetically).
# NOTE(review): the index vector assumes exactly six distinct model labels.
df_linear$model <- factor(
  df_linear$model,
  levels(df_linear$model)[c(6, 3, 4, 5, 2, 1)]
)

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in list-column `data`.
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        colour = model,
        xlab = "Difference in 12-month biomarker (SD units) per 1 SD higher 12-month log GlycA",
        title = .y,
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.75, 0.75)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        )
    ),
    # Remove x-axis and legend from every panel except the last one.
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Rows per group; computed for reference only, the layout below uses a
    # fixed `heights` vector instead.
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Arrange the per-group panels in two columns with fixed row heights.
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

# Saved as panel (a) of Supp File 1B, matching the section header and the
# "_b" file written for panel (b); the previous "1A_a" name did not match.
ggsave("Figures/BIS_eLife_SuppFile1B_a.png", width = 10, height = 18)


##### Combination figure - SEP measures - Supp File 1B (b)

# Compares the GlycA results obtained with three alternative model labels
# (household income, SEIFA, maternal education) in a single forest plot.

# Stack the three result tables; the `model` column tells them apart.
df_linear <- rbind(figure_GlycA_hhincome, figure_GlycA_SEIFA, figure_GlycA_meducation)


# Convert to factor for ordering
df_linear$model <- as.factor(df_linear$model)

# Reverse the factor levels so the models appear in the intended order in the plot
# NOTE(review): the index vector assumes exactly three distinct model labels.
df_linear$model = factor(df_linear$model,levels(df_linear$model)[c(3,2,1)])


df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in list-column `data`
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        colour = model,
        xlab = "Difference in 12-month biomarker (SD units) per 1 SD higher 12-month log GlycA",
        title = .y,
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.75, 0.75)) + ggplot2::scale_color_brewer(palette="Dark2") + ggplot2::theme(plot.title = element_text(size=9), axis.title.x=element_text(size=8), axis.text.x=element_text(size=8), axis.text.y=element_text(size=8),  legend.text=element_text(size=8))
    ),
    # Remove x-axis and legend from every panel except the last one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Rows per group; computed for reference only, the layout below uses a
    # fixed `heights` vector instead
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Arrange the per-group panels in two columns with fixed row heights
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

# No `plot` argument is given, so ggsave() writes the most recent plot
ggsave("Figures/BIS_eLife_SuppFile1B_b.png", width=8, height=12)



###### 12m GlycA, adjusted for 6m GlycA and metabs ######

# For every biomarker in `nmr_biomarkers_6mto12mNMR`, regress the biomarker
# on `gp_exposure`, adjusting for `GlycA_6m`, the biomarker's own "_6m"
# column and the covariates in the formula below.

df_nmr_results_alt_names_loop <- df_nmr_results_alt_names_trimmed

# Empty, typed template: fixes the column names/types of the result table.
results_6mto12m <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  se = double(),
  pvalue = double(),
  stringsAsFactors = FALSE
)

regression <- results_6mto12m

# One result row per biomarker is collected in a pre-allocated list and
# bound once after the loop, instead of growing a data frame with rbind()
# on every iteration.
fit_rows <- vector("list", length(nmr_biomarkers_6mto12mNMR))

for (k in seq_along(nmr_biomarkers_6mto12mNMR)) {
  i <- nmr_biomarkers_6mto12mNMR[[k]]
  j <- paste0(i, "_6m")

  # Keep rows with the biomarker observed, then log-transform and
  # standardise it. This happens before rows lacking the "_6m" value are
  # dropped, so the scaling uses all rows with a non-missing biomarker.
  df_6mto12m_loop <-
    df_nmr_results_alt_names_loop[
      !is.na(df_nmr_results_alt_names_loop[[i]]),
    ]
  df_6mto12m_loop[[i]] <- as.numeric(scale(log(df_6mto12m_loop[[i]])))

  # Same treatment for the "_6m" column.
  df_6mto12m_loop <- df_6mto12m_loop[!is.na(df_6mto12m_loop[[j]]), ]
  df_6mto12m_loop[[j]] <- as.numeric(scale(log(df_6mto12m_loop[[j]])))

  expData <- lm(
    as.formula(paste0("`",i,"` ~ gp_exposure + GlycA_6m + ",j," + age_interview_m12 + age_interview_m6 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + mth6CollectiontoStorage_hr + anympsmoke + gestage")),
    data = df_6mto12m_loop
  )

  # summary() is computed once per model. Coefficient row 2 is the first
  # term after the intercept (gp_exposure); columns 1, 2 and 4 hold the
  # estimate, standard error and p-value.
  coef_table <- summary(expData)$coefficients

  regression <- data.frame(
    biomarkerid = paste(i),
    estimate = coef_table[2, 1],
    se = coef_table[2, 2],
    pvalue = coef_table[2, 4],
    stringsAsFactors = FALSE
  )

  fit_rows[[k]] <- regression
}

# Starting from the empty template keeps the result well-formed even when
# there are no biomarkers to process.
results_6mto12m <- do.call(rbind, c(list(results_6mto12m), fit_rows))

# (The `_bmi` suffix is a leftover from the package vignette; no BMI
# variable is involved.)
df_assoc_per_biomarker_bmi <- results_6mto12m

# Attach the display name of each biomarker from the package metadata.
df_assoc_per_biomarker_bmi <- df_assoc_per_biomarker_bmi %>% left_join(
  select(
    df_NG_biomarker_metadata,
    name,
    biomarkerid = machine_readable_name
  ),
  by = "biomarkerid"
)

# Benjamini-Hochberg adjustment across all biomarkers.
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

write.csv(df_assoc_per_biomarker_bmi, 'BIS_eLife_Fig3_SourceData3B.csv')

# Store the estimates in a comparison table created elsewhere in the script.
beta.comparison.MS_6mto12m$glyca.beta <- df_assoc_per_biomarker_bmi$estimate


# Prepare and draw the forest plot of the 6-month-adjusted GlycA
# associations held in df_assoc_per_biomarker_bmi.

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 225 of the results is relabelled as C-reactive protein and placed in
# the "Inflammation" group.
# NOTE(review): position-based lookup - presumably row 225 is the hsCRP
# result in this biomarker list; confirm whenever the list changes.
hscrp <- df_assoc_per_biomarker_bmi[225, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

# GlycA is dropped from the outcomes before the extra row is appended.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "GlycA", ]

df_to_plot <- rbind(df_to_plot, hscrp)

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Constant column mapped to point shape in the plot.
df_linear$exposure <- "GlycA"

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in list-column `data`.
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        shape = exposure,
        xlab = "Difference in 12-month biomarker (SD unit) per 1 SD higher 12-month log GlycA,\nadjusted for 6-month GlycA and metabolomic measures",
        title = .y,
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.7, 0.7)) +
        ggplot2::scale_shape_manual(
          values = c(22L),
          labels = c("GlycA")
        ) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        ) +
        ggplot2::theme(legend.position = "none")
    ),
    # Remove the x-axis from every panel except the last one.
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Rows per group; computed for reference only, the layout below uses a
    # fixed `heights` vector instead.
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Arrange the per-group panels in two columns with fixed row heights.
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

ggsave("Figures/BIS_eLife_Fig3_FigSupp2A.png", width = 6, height = 6)

############# Bootstrapping ###########

# Bootstrap (1000 resamples) the 6-month-adjusted model for each biomarker
# and record the mean bootstrap estimate of the `gp_exposure` coefficient
# together with its BCa confidence limits.

library(boot)

set.seed(12345)

# Empty, typed template for the result table.
bootstrap_bca_cis <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  l.ci = double(),
  u.ci = double(),
  stringsAsFactors = FALSE
)
regression <- bootstrap_bca_cis

# Element 224 of the biomarker list is skipped.
# NOTE(review): position-based exclusion - confirm which biomarker this is
# whenever the list changes.
for (i in nmr_biomarkers_6mto12mNMR[-224]) {

  j <- paste0(i, "_6m")

  # Keep rows with the biomarker observed, then log-transform and
  # standardise it; afterwards do the same for its "_6m" column.
  df_6mto12m_loop <-
    df_nmr_results_alt_names_loop[
      !is.na(df_nmr_results_alt_names_loop[[i]]),
    ]
  df_6mto12m_loop[[i]] <- as.numeric(scale(log(df_6mto12m_loop[[i]])))
  df_6mto12m_loop <- df_6mto12m_loop[!is.na(df_6mto12m_loop[[j]]), ]
  df_6mto12m_loop[[j]] <- as.numeric(scale(log(df_6mto12m_loop[[j]])))

  # Bootstrap statistic: refit the linear model on the resampled rows and
  # return its coefficient vector. It reads `i` and `j` from the enclosing
  # loop iteration.
  # `maxit` is accepted for backward compatibility but ignored: lm() has no
  # iteration limit, and forwarding the argument only made lm.fit() warn
  # about a disregarded extra argument on every resample.
  boot.huber <- function(data, indices, maxit = 20) {
    data <- data[indices, ] # select obs. in bootstrap sample
    mod <- lm(
      as.formula(paste0("`",i,"` ~ gp_exposure + GlycA_6m + ",j," + age_interview_m12 + age_interview_m6 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + mth6CollectiontoStorage_hr + anympsmoke + gestage")),
      data = data
    )
    coefficients(mod) # return coefficient vector
  }

  duncan.boot <- boot(df_6mto12m_loop, boot.huber, 1000)

  # index = 2 selects the second coefficient, i.e. gp_exposure.
  ci_boot_bca <- boot.ci(duncan.boot, index = 2, type = "bca")

  regression[1, 1] <- paste(i)
  regression[1, 2] <- mean(duncan.boot$t[, 2])
  # Elements 4 and 5 of the `bca` row are the lower and upper limits.
  regression[1, 3] <- ci_boot_bca$bca[4]
  regression[1, 4] <- ci_boot_bca$bca[5]

  # Appended per iteration so partial results survive an interrupted run.
  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

BIS_bootstrap_6mto12mGlycA_12mmetab_R1 <- bootstrap_bca_cis
write.csv(BIS_bootstrap_6mto12mGlycA_12mmetab_R1, "BIS_eLife_Figure3_SourceData3B_bootstrap.csv")



############# hsCRP as exposure - Figure 4A, Source File 1 #####################

# Exposure: log-transformed hsCRP, standardised to mean 0 and SD 1.
df_nmr_results_alt_names$hscrp_exposure <-
  log(df_nmr_results_alt_names$hsCRP)
df_nmr_results_alt_names$hscrp_exposure <-
  as.numeric(scale(df_nmr_results_alt_names$hscrp_exposure))

# Re-derive the analysis sample (rows with non-missing
# `anyinfection4wk12mo`) so that it contains the new exposure column.
df_nmr_results_alt_names_trimmed <-
  df_nmr_results_alt_names[
    !is.na(df_nmr_results_alt_names$anyinfection4wk12mo),
  ]

# Keep only the model variables, transform the biomarkers and collapse to a
# long format (one row per participant x biomarker).
# NOTE(review): this model adjusts for `sex`, whereas every other model in
# this part of the script (including the bootstrap of this same model) uses
# `gender` - confirm the two columns are equivalent.
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  dplyr::select(
    nmr_biomarkers, hscrp_exposure, age_interview_m12, sex, z_scores_birth,
    gestage, hhincome, bfany, mth12collectiontostorage, anympsmoke
  ) %>%
  # Log-transform, then standardise, every biomarker column. Plain functions
  # are used instead of the deprecated dplyr::funs() wrapper.
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = log) %>%
  dplyr::mutate_at(
    .vars = c(nmr_biomarkers),
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop any row with a missing value.
df_long <- na.omit(df_long)

# Estimate, per biomarker, the covariate-adjusted association with
# `hscrp_exposure`.
df_assoc_per_biomarker <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + sex +
          z_scores_birth + gestage + hhincome + bfany +
          mth12collectiontostorage + anympsmoke
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Attach the display name of each biomarker from the package metadata.
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers of this model.
df_assoc_per_biomarker$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker$pvalue, method = "BH")

write.csv(df_assoc_per_biomarker, 'BIS_eLife_Figure4_SourceData1.csv')

# Store the estimates in a comparison table created elsewhere in the script.
beta.comparison.NMR$hscrp.beta <- df_assoc_per_biomarker$estimate

# Prepare and draw the forest plot of the hsCRP associations held in
# df_assoc_per_biomarker.

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 229 of the regression output is relabelled as C-reactive protein and
# appended to the "Inflammation" group.
# NOTE(review): position-based lookup. If row 229 is the hsCRP result, the
# filter on `biomarkerid` just below removes the appended row again -
# confirm which biomarker row 229 refers to.
hscrp <- df_assoc_per_biomarker[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# hsCRP is the exposure here, so it is dropped from the outcomes.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))
# Constant column mapped to point shape in the plot.
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

ggplot_multi <-
  df_linear %>%
  # One row per biomarker group, with that group's rows in list-column `data`.
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        shape = exposure,
        xlab = "Difference in 12-month biomarker (SD units) per 1 SD higher 12-month log hsCRP",
        title = .y,
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.6, 0.6)) +
        ggplot2::scale_shape_manual(
          values = c(23L),
          labels = c("hsCRP")
        ) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        ) +
        ggplot2::theme(legend.position = "none")
    ),
    # Remove the x-axis from every panel except the last one.
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Rows per group; computed for reference only, the layout below uses a
    # fixed `heights` vector instead.
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Arrange the per-group panels in two columns with fixed row heights.
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)


ggsave("Figures/BIS_eLife_Figure4A.png", width = 6, height = 6)

###### Bootstrapping ############

# Bootstrap (1000 resamples) the hsCRP model for each biomarker and record
# the mean bootstrap estimate of the `hscrp_exposure` coefficient together
# with its BCa confidence limits.

# Analysis sample: rows with non-missing `anyinfection4wk12mo`.
df_nmr_results_alt_names_infection <-
  df_nmr_results_alt_names[
    !is.na(df_nmr_results_alt_names$anyinfection4wk12mo),
  ]


library(boot)

set.seed(12345)

# Empty, typed template for the result table.
bootstrap_bca_cis <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  l.ci = double(),
  u.ci = double(),
  stringsAsFactors = FALSE
)
regression <- bootstrap_bca_cis

# NOTE(review): `n` is not defined in this section - presumably a character
# vector of biomarker column names created earlier in the script; element
# 229 is skipped by position. Confirm both.
for (i in n[-229]) {

  # Log-transform and standardise the biomarker column in place.
  df_nmr_results_alt_names_infection[[i]] <-
    as.numeric(scale(log(df_nmr_results_alt_names_infection[[i]])))

  # Bootstrap statistic: refit the linear model on the resampled rows and
  # return its coefficient vector. It reads `i` from the enclosing loop
  # iteration. The response name is back-ticked, as in the 6-month-adjusted
  # bootstrap, so non-syntactic column names also work.
  # `maxit` is accepted for backward compatibility but ignored: lm() has no
  # iteration limit, and forwarding the argument only made lm.fit() warn
  # about a disregarded extra argument on every resample.
  boot.huber <- function(data, indices, maxit = 20) {
    data <- data[indices, ] # select obs. in bootstrap sample
    mod <- lm(
      as.formula(paste0("`", i, "` ~ hscrp_exposure + age_interview_m12 + gender + z_scores_birth + gestage + hhincome + bfany + mth12collectiontostorage + anympsmoke")),
      data = data
    )
    coefficients(mod) # return coefficient vector
  }


  duncan.boot <- boot(df_nmr_results_alt_names_infection, boot.huber, 1000)

  # index = 2 selects the second coefficient, i.e. hscrp_exposure.
  ci_boot_bca <- boot.ci(duncan.boot, index = 2, type = "bca")

  regression[1, 1] <- paste(i)
  regression[1, 2] <- mean(duncan.boot$t[, 2])
  # Elements 4 and 5 of the `bca` row are the lower and upper limits.
  regression[1, 3] <- ci_boot_bca$bca[4]
  regression[1, 4] <- ci_boot_bca$bca[5]

  # Appended per iteration so partial results survive an interrupted run.
  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

bootstrap_hsCRP_metab_bfany_infectionsampleonly <- bootstrap_bca_cis
write.csv(bootstrap_hsCRP_metab_bfany_infectionsampleonly, "BIS_eLife_Figure4_SourceData1_bootstrap.csv")

######## Several secondary analyses (comparing SEP measures and various sensitivity analyses) - Supp. File 1C #######
#### SEIFA

# hsCRP model with `seifa_disad` in place of household income.

# Keep only the model variables, transform the biomarkers and collapse to a
# long format (one row per participant x biomarker).
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  dplyr::select(
    nmr_biomarkers, hscrp_exposure, age_interview_m12, gender, z_scores_birth,
    seifa_disad, bfany, mth12collectiontostorage, anympsmoke, gestage
  ) %>%
  # Log-transform, then standardise, every biomarker column. Plain functions
  # are used instead of the deprecated dplyr::funs() wrapper.
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = log) %>%
  dplyr::mutate_at(
    .vars = c(nmr_biomarkers),
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop any row with a missing value.
df_long <- na.omit(df_long)

# Estimate, per biomarker, the covariate-adjusted association with
# `hscrp_exposure`.
# (The `_bmi` suffix of the result name is a leftover from the package
# vignette; no BMI variable is involved.)
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender +
          z_scores_birth + seifa_disad + bfany + mth12collectiontostorage +
          anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Attach the display name of each biomarker from the package metadata.
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers of this model.
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 229 of the regression output is relabelled as C-reactive protein and
# appended to the "Inflammation" group.
# NOTE(review): position-based lookup. If row 229 is the hsCRP result, the
# filter on `biomarkerid` just below removes the appended row again -
# confirm which biomarker row 229 refers to.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# hsCRP is the exposure here, so it is dropped from the outcomes.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Label the rows with the model they came from and keep them for a combined
# figure.
df_linear$model <- "SEIFA (area disadvantage)"
figure_hsCRP_SEIFA <- df_linear



#### Maternal education

# hsCRP model with `meducation_t1t2_d` in place of household income.

# Keep only the model variables, transform the biomarkers and collapse to a
# long format (one row per participant x biomarker).
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  dplyr::select(
    nmr_biomarkers, hscrp_exposure, age_interview_m12, gender, z_scores_birth,
    meducation_t1t2_d, bfany, mth12collectiontostorage, anympsmoke, gestage
  ) %>%
  # Log-transform, then standardise, every biomarker column. Plain functions
  # are used instead of the deprecated dplyr::funs() wrapper.
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = log) %>%
  dplyr::mutate_at(
    .vars = c(nmr_biomarkers),
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop any row with a missing value.
df_long <- na.omit(df_long)

# Estimate, per biomarker, the covariate-adjusted association with
# `hscrp_exposure`.
# (The `_bmi` suffix of the result name is a leftover from the package
# vignette; no BMI variable is involved.)
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender +
          z_scores_birth + meducation_t1t2_d + bfany +
          mth12collectiontostorage + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Attach the display name of each biomarker from the package metadata.
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers of this model.
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 229 of the regression output is relabelled as C-reactive protein and
# appended to the "Inflammation" group.
# NOTE(review): position-based lookup. If row 229 is the hsCRP result, the
# filter on `biomarkerid` just below removes the appended row again -
# confirm which biomarker row 229 refers to.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# hsCRP is the exposure here, so it is dropped from the outcomes.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Label the rows with the model they came from and keep them for a combined
# figure.
df_linear$model <- "Maternal education"
figure_hsCRP_meducation <- df_linear


#### 2ndary covariate model

# hsCRP model with three covariates added to the primary set:
# `smokeinroom_bto12m`, `GDMnew` and `preeclampsia_yn`.

# Keep only the model variables, transform the biomarkers and collapse to a
# long format (one row per participant x biomarker).
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  dplyr::select(
    nmr_biomarkers, hscrp_exposure, age_interview_m12, gender, z_scores_birth,
    hhincome, bfany, mth12collectiontostorage, anympsmoke, gestage,
    smokeinroom_bto12m, GDMnew, preeclampsia_yn
  ) %>%
  # Log-transform, then standardise, every biomarker column. Plain functions
  # are used instead of the deprecated dplyr::funs() wrapper.
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = log) %>%
  dplyr::mutate_at(
    .vars = c(nmr_biomarkers),
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop any row with a missing value.
df_long <- na.omit(df_long)

# Estimate, per biomarker, the covariate-adjusted association with
# `hscrp_exposure`.
# (The `_bmi` suffix of the result name is a leftover from the package
# vignette; no BMI variable is involved.)
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender +
          z_scores_birth + hhincome + bfany + mth12collectiontostorage +
          anympsmoke + gestage + smokeinroom_bto12m + GDMnew + preeclampsia_yn
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Attach the display name of each biomarker from the package metadata.
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers of this model.
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 229 of the regression output is relabelled as C-reactive protein and
# appended to the "Inflammation" group.
# NOTE(review): position-based lookup. If row 229 is the hsCRP result, the
# filter on `biomarkerid` just below removes the appended row again -
# confirm which biomarker row 229 refers to.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# hsCRP is the exposure here, so it is dropped from the outcomes.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Label the rows with the model they came from and keep them for a combined
# figure.
# NOTE(review): the label spells "additonal"; it is left unchanged because
# it becomes a factor level / legend entry that other code may match on.
df_linear$model <- "Primary model + additonal exposures"
figure_hsCRP_secondary <- df_linear



#### BF duration

# hsCRP model with `bftomax52wks` in place of `bfany`.
# NOTE(review): besides that swap, this model also carries
# `smokeinroom_bto12m`, `GDMnew` and `preeclampsia_yn`, i.e. the covariates
# of the secondary model, although it is labelled "Primary model with
# breastfeeding duration adjustment" - confirm this is intended.

# Keep only the model variables, transform the biomarkers and collapse to a
# long format (one row per participant x biomarker).
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  dplyr::select(
    nmr_biomarkers, hscrp_exposure, age_interview_m12, gender, z_scores_birth,
    hhincome, bftomax52wks, mth12collectiontostorage, anympsmoke, gestage,
    smokeinroom_bto12m, GDMnew, preeclampsia_yn
  ) %>%
  # Log-transform, then standardise, every biomarker column. Plain functions
  # are used instead of the deprecated dplyr::funs() wrapper.
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = log) %>%
  dplyr::mutate_at(
    .vars = c(nmr_biomarkers),
    .funs = function(x) as.numeric(scale(x))
  ) %>%
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop any row with a missing value.
df_long <- na.omit(df_long)

# Estimate, per biomarker, the covariate-adjusted association with
# `hscrp_exposure`.
# (The `_bmi` suffix of the result name is a leftover from the package
# vignette; no BMI variable is involved.)
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender +
          z_scores_birth + hhincome + bftomax52wks +
          mth12collectiontostorage + anympsmoke + gestage +
          smokeinroom_bto12m + GDMnew + preeclampsia_yn
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Attach the display name of each biomarker from the package metadata.
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers of this model.
df_assoc_per_biomarker_bmi$adj_pvalue <-
  p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# right_join with df_with_groups on the right keeps the biomarker order that
# df_with_groups specifies; rows without a match are dropped by na.omit().
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# Row 229 of the regression output is relabelled as C-reactive protein and
# appended to the "Inflammation" group.
# NOTE(review): position-based lookup. If row 229 is the hsCRP result, the
# filter on `biomarkerid` just below removes the appended row again -
# confirm which biomarker row 229 refers to.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# hsCRP is the exposure here, so it is dropped from the outcomes.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group in the sequence used for the figure.
df_to_plot <- df_to_plot %>%
  arrange(match(group, c(
    "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
    "Glycerides and phospholipids", "Cholesterol",
    "Glycolysis related metabolites", "Lipoprotein particle sizes",
    "Fluid balance", "Inflammation"
  )))

# Remove the measures that are not shown in this figure.
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() stays valid (empty) if no rows remain, unlike 1:nrow().
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Label the rows with the model they came from and keep them for a combined
# figure.
df_linear$model <- "Primary model with breastfeeding duration adjustment"
figure_hsCRP_BFduration <- df_linear

### Excluding active infection CRP>5

# NOTE(review): the filter keeps hsCRP strictly below 5, so values equal to 5
# are excluded as well.
df_nmr_results_alt_names_hsCRPlessthan5 <- df_nmr_results_alt_names_trimmed[(df_nmr_results_alt_names_trimmed$hsCRP<5),]

# Select only variables to be used for the model and collapse to a long data
# format
df_long <-
  df_nmr_results_alt_names_hsCRPlessthan5 %>%
  # Select only model variables (all_of() makes explicit that nmr_biomarkers is
  # an external character vector, not a column)
  dplyr::select(dplyr::all_of(nmr_biomarkers), hscrp_exposure, age_interview_m12, gender, z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke, gestage) %>%
  # log-transform biomarkers (lambda instead of the deprecated dplyr::funs())
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = ~ log(.)) %>%
  # Scale biomarkers
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = ~ as.numeric(scale(.))) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, dplyr::all_of(nmr_biomarkers))

df_long <- na.omit(df_long)

# Estimate the association of hscrp_exposure with each biomarker, adjusted for
# the covariates listed in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Join this dataset with the grouping data in order to choose a different
  # biomarker naming option
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid")

# Benjamini-Hochberg adjustment of the per-biomarker p-values
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# Attach the biomarker grouping. df_with_groups is the right-hand table of the
# right_join so that the biomarker order it specifies is preserved.
df_to_plot <-
  dplyr::right_join(df_assoc_per_biomarker_bmi, df_with_groups, by = "name")

# Drop every row that contains an NA
df_to_plot <- na.omit(df_to_plot)

# Label one result row by hand and append it to the plotting table.
# NOTE(review): the row is picked by position (229) and is presumably the
# hsCRP result - confirm; selecting it by biomarkerid would be more robust.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Remove rows whose biomarkerid is "hsCRP" (hsCRP is the exposure here)
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group, in the order the groups are plotted
df_to_plot <- df_to_plot %>%
  arrange(match(group, c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")))

# Keep only the measures shown in the forest plot: no lipoprotein subclasses,
# no glucose or lactate, and no "_pct" measures
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() (rather than 1:nrow()) stays valid if the table is empty
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Tag the rows with the model label used in the combined sensitivity figure
df_linear$model <- "Primary model excluding hsCRP>5"
figure_hsCRP_CRPexcl <- df_linear

### Excluding twins

# Drop every participant whose idchild contains "-1" or "-2".
# NOTE(review): presumably these suffixes mark twins; the pattern matches
# anywhere in the id, not only at the end - confirm against the id format.
df_nmr_results_alt_names_notwins <- df_nmr_results_alt_names_trimmed[!grepl('-1',df_nmr_results_alt_names_trimmed$idchild),]
df_nmr_results_alt_names_notwins <- df_nmr_results_alt_names_notwins[!grepl('-2',df_nmr_results_alt_names_notwins$idchild),]

# Select only variables to be used for the model and collapse to a long data
# format
df_long <-
  df_nmr_results_alt_names_notwins %>%
  # Select only model variables (all_of() makes explicit that nmr_biomarkers is
  # an external character vector, not a column)
  dplyr::select(dplyr::all_of(nmr_biomarkers), hscrp_exposure, age_interview_m12, gender, z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke, gestage) %>%
  # log-transform biomarkers (lambda instead of the deprecated dplyr::funs())
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = ~ log(.)) %>%
  # Scale biomarkers
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = ~ as.numeric(scale(.))) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, dplyr::all_of(nmr_biomarkers))

df_long <- na.omit(df_long)

# Estimate the association of hscrp_exposure with each biomarker, adjusted for
# the covariates listed in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Join this dataset with the grouping data in order to choose a different
  # biomarker naming option
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid")

# Benjamini-Hochberg adjustment of the per-biomarker p-values
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# Attach the biomarker grouping. df_with_groups is the right-hand table of the
# right_join so that the biomarker order it specifies is preserved.
df_to_plot <-
  dplyr::right_join(df_assoc_per_biomarker_bmi, df_with_groups, by = "name")

# Drop every row that contains an NA
df_to_plot <- na.omit(df_to_plot)

# Label one result row by hand and append it to the plotting table.
# NOTE(review): the row is picked by position (229) and is presumably the
# hsCRP result - confirm; selecting it by biomarkerid would be more robust.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Remove rows whose biomarkerid is "hsCRP" (hsCRP is the exposure here)
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group, in the order the groups are plotted
df_to_plot <- df_to_plot %>%
  arrange(match(group, c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")))

# Keep only the measures shown in the forest plot: no lipoprotein subclasses,
# no glucose or lactate, and no "_pct" measures
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() (rather than 1:nrow()) stays valid if the table is empty
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)


# Tag the rows with the model label used in the combined sensitivity figure
df_linear$model <- "Primary model excluding twins"
figure_hsCRP_twinexcl <- df_linear


############# Processing time of <=4hr

# Keep samples processed within 4 hours. The previous cut-off, 0.16666, is a
# truncated 4/24 and therefore dropped samples processed at exactly 4 hours.
# NOTE(review): assumes mth12collectiontostorage is recorded in days - confirm.
max_processing_time <- 4 / 24
df_nmr_results_alt_names_4hr <- df_nmr_results_alt_names_trimmed[(df_nmr_results_alt_names_trimmed$mth12collectiontostorage <= max_processing_time), ]


# Select only variables to be used for the model and collapse to a long data
# format
df_long <-
  df_nmr_results_alt_names_4hr %>%
  # Select only model variables (all_of() makes explicit that nmr_biomarkers is
  # an external character vector, not a column)
  dplyr::select(dplyr::all_of(nmr_biomarkers), hscrp_exposure, age_interview_m12, gender, z_scores_birth, hhincome, bfany, mth12collectiontostorage, anympsmoke, gestage) %>%
  # log-transform biomarkers (lambda instead of the deprecated dplyr::funs())
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = ~ log(.)) %>%
  # Scale biomarkers
  dplyr::mutate_at(.vars = nmr_biomarkers, .funs = ~ as.numeric(scale(.))) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, dplyr::all_of(nmr_biomarkers))

df_long <- na.omit(df_long)

# Estimate the association of hscrp_exposure with each biomarker, adjusted for
# the covariates listed in the formula
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ hscrp_exposure + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = hscrp_exposure
  ) %>%
  # Join this dataset with the grouping data in order to choose a different
  # biomarker naming option
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid")

# Benjamini-Hochberg adjustment of the per-biomarker p-values
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

# Attach the biomarker grouping. df_with_groups is the right-hand table of the
# right_join so that the biomarker order it specifies is preserved.
df_to_plot <-
  dplyr::right_join(df_assoc_per_biomarker_bmi, df_with_groups, by = "name")

# Drop every row that contains an NA
df_to_plot <- na.omit(df_to_plot)

# Label one result row by hand and append it to the plotting table.
# NOTE(review): the row is picked by position (229) and is presumably the
# hsCRP result - confirm; selecting it by biomarkerid would be more robust.
hscrp <- df_assoc_per_biomarker_bmi[229, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Remove rows whose biomarkerid is "hsCRP" (hsCRP is the exposure here)
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Order the rows by biomarker group, in the order the groups are plotted
df_to_plot <- df_to_plot %>%
  arrange(match(group, c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")))

# Keep only the measures shown in the forest plot: no lipoprotein subclasses,
# no glucose or lactate, and no "_pct" measures
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() (rather than 1:nrow()) stays valid if the table is empty
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)


# Tag the rows with the model label used in the combined sensitivity figure
df_linear$model <- "Primary model excluding >4hr blood processing time"
figure_hsCRP_4hrprocessing <- df_linear


##### Combination figure - sensitivity analyses - Supp File 1C (a)

# Stack the result tables of all sensitivity models into one long table
sensitivity_tables <- list(
  figure_hsCRP_base,
  figure_hsCRP_secondary,
  figure_hsCRP_CRPexcl,
  figure_hsCRP_twinexcl,
  figure_hsCRP_4hrprocessing,
  figure_hsCRP_BFduration
)
df_linear <- do.call(rbind, sensitivity_tables)

# Turn the model label into a factor. The levels are first taken in their
# default order and then permuted by position, so that the models appear in
# the intended order in the plot.
model_levels <- levels(as.factor(df_linear$model))
df_linear$model <- factor(df_linear$model, model_levels[c(6, 3, 4, 5, 2, 1)])

df_linear <- dplyr::arrange(df_linear, rownumber)

# Layers shared by every per-group forest plot
forest_panel_style <- list(
  ggplot2::coord_cartesian(xlim = c(-0.75, 0.75)),
  ggplot2::scale_color_brewer(palette = "Dark2"),
  ggplot2::theme(
    plot.title = element_text(size = 9),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    legend.text = element_text(size = 8)
  )
)

# Theme that hides the x-axis and the legend
hide_x_axis_and_legend <-
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(1, 2, 1, 2), "mm")
  ) +
  ggplot2::theme(legend.position = "none")

ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  mutate(
    # One forest plot per biomarker group, coloured by model
    gg_groups = purrr::map2(
      data, group,
      function(group_data, group_title) {
        forestplot(
          df = group_data,
          pvalue = adj_pvalue,
          psignif = 0.05,
          colour = model,
          xlab = "Difference in 12-month biomarker (SD units) per 1 SD higher 12-month log hsCRP",
          title = group_title
        ) +
          forest_panel_style
      }
    ),
    # Optional: remove x-axis and legend for all plots except the last one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes = purrr::map(gg_groups, function(panel) panel + hide_x_axis_and_legend),
      no = gg_groups
    ),
    # Number of rows per group
    rel_heights = unlist(purrr::map(data, nrow))
  )

# Arrange the group plots in two columns with fixed relative row heights
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

ggsave("Figures/BIS_eLife_SuppFile1C_a.png", width = 10, height = 18)



##### Combination figure - SEP measures - Supp File 1C (b)

# Stack the result tables of the three SEP-measure models into one long table
sep_tables <- list(figure_hsCRP_hhincome, figure_hsCRP_SEIFA, figure_hsCRP_meducation)
df_linear <- do.call(rbind, sep_tables)

# Turn the model label into a factor with its default level order reversed,
# so that the models appear in the intended order in the plot.
model_levels <- levels(as.factor(df_linear$model))
df_linear$model <- factor(df_linear$model, model_levels[c(3, 2, 1)])

df_linear <- dplyr::arrange(df_linear, rownumber)

# Layers shared by every per-group forest plot
forest_panel_style <- list(
  ggplot2::coord_cartesian(xlim = c(-0.75, 0.75)),
  ggplot2::scale_color_brewer(palette = "Dark2"),
  ggplot2::theme(
    plot.title = element_text(size = 9),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    legend.text = element_text(size = 8)
  )
)

# Theme that hides the x-axis and the legend
hide_x_axis_and_legend <-
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin = unit(c(1, 2, 1, 2), "mm")
  ) +
  ggplot2::theme(legend.position = "none")

ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  mutate(
    # One forest plot per biomarker group, coloured by model
    gg_groups = purrr::map2(
      data, group,
      function(group_data, group_title) {
        forestplot(
          df = group_data,
          pvalue = adj_pvalue,
          psignif = 0.05,
          colour = model,
          xlab = "Difference in 12-month biomarker (SD units) per 1 SD higher 12-month log hsCRP",
          title = group_title
        ) +
          forest_panel_style
      }
    ),
    # Optional: remove x-axis and legend for all plots except the last one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes = purrr::map(gg_groups, function(panel) panel + hide_x_axis_and_legend),
      no = gg_groups
    ),
    # Number of rows per group
    rel_heights = unlist(purrr::map(data, nrow))
  )

# Arrange the group plots in two columns with fixed relative row heights
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

ggsave("Figures/BIS_eLife_SuppFile1C_b.png", width = 8, height = 12)


###### 12m hsCRP, adjusted for 6m hsCRP and metabs ######

df_nmr_results_alt_names_loop <- df_nmr_results_alt_names_trimmed

# Empty results table; the loop below appends one row per biomarker
results_6mto12m <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  se = double(),
  pvalue = double(),
  stringsAsFactors = FALSE
)

# Single-row scratch table, overwritten on every iteration
regression <- results_6mto12m


for (i in nmr_biomarkers_6mto12mNMR) {

  # Column holding the 6-month measurement of biomarker i
  j <- paste0(i, "_6m")

  # 12-month biomarker: keep non-missing rows, then log-transform and standardise
  has_12m_value <- !is.na(df_nmr_results_alt_names_loop[[i]])
  df_6mto12m_loop <- df_nmr_results_alt_names_loop[has_12m_value, ]
  df_6mto12m_loop[[i]] <- as.numeric(scale(log(df_6mto12m_loop[[i]])))

  # 6-month biomarker: same treatment, applied to the remaining rows
  has_6m_value <- !is.na(df_6mto12m_loop[[j]])
  df_6mto12m_loop <- df_6mto12m_loop[has_6m_value, ]
  df_6mto12m_loop[[j]] <- as.numeric(scale(log(df_6mto12m_loop[[j]])))

  expData <- lm(paste0("`",i,"` ~ hscrp_exposure + hsCRP_6m + ",j," + age_interview_m12 + age_interview_m6 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + mth6CollectiontoStorage_hr + anympsmoke + gestage"), data = df_6mto12m_loop)

  # Second coefficient row = hscrp_exposure (the first term after the intercept);
  # columns 1, 2 and 4 are the estimate, standard error and p-value
  coef_table <- summary(expData)$coefficients

  regression[1, "biomarkerid"] <- paste(i)
  regression[1, "estimate"] <- coef_table[2, 1]
  regression[1, "se"] <- coef_table[2, 2]
  regression[1, "pvalue"] <- coef_table[2, 4]

  results_6mto12m <- rbind(results_6mto12m, regression)
}

# Add the display name of each biomarker from the Nightingale metadata
biomarker_names <- select(
  df_NG_biomarker_metadata,
  name,
  biomarkerid = machine_readable_name
)
df_assoc_per_biomarker_bmi <- left_join(results_6mto12m, biomarker_names, by = "biomarkerid")

# Benjamini-Hochberg adjustment of the per-biomarker p-values
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

write.csv(df_assoc_per_biomarker_bmi, 'BIS_eLife_Fig4_SourceData3.csv')

# Store the hsCRP estimates for the beta-comparison scatterplots
beta.comparison.MS_6mto12m$hscrp.beta <- df_assoc_per_biomarker_bmi$estimate


# Attach the biomarker grouping. df_with_groups is the right-hand table of the
# right_join so that the biomarker order it specifies is preserved.
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(., df_with_groups, by = "name")

# Drop every row that contains an NA
df_to_plot <- na.omit(df_to_plot)


# Order the rows by biomarker group, in the order the groups are plotted
df_to_plot <- df_to_plot %>%
  arrange(match(group, c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")))

# Keep only the measures shown in the forest plot: no lipoprotein subclasses,
# no glucose or lactate, and no "_pct" measures
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() (rather than 1:nrow()) stays valid if the table is empty
df_linear$rownumber <- seq_len(nrow(df_linear))

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)


df_linear$exposure <- "hsCRP"

ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        shape = exposure,
        # The exposure in these models is hsCRP (adjusted for 6-month hsCRP);
        # the label previously named GlycA.
        xlab = "Difference in 12-month biomarker (SD unit) per 1 SD higher 12-month log hsCRP,\nadjusted for 6-month hsCRP and metabolomic measures",
        title = .y,
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.55, 0.55)) +
        ggplot2::scale_shape_manual(
          values = c(23L),
          labels = c("hsCRP")) + ggplot2::scale_color_brewer(palette="Dark2") + ggplot2::theme(plot.title = element_text(size=9), axis.title.x=element_text(size=8), axis.text.x=element_text(size=8), axis.text.y=element_text(size=8),  legend.text=element_text(size=8)) +
        ggplot2::theme(legend.position = "none")
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Arrange the group plots in two columns with fixed relative row heights
patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)

# This is the hsCRP (Figure 4) analysis; the file was previously written as
# "Fig3_FigSupp2A", the name of the corresponding GlycA (Figure 3) figure.
# NOTE(review): confirm the intended figure-supplement number.
ggsave("Figures/BIS_eLife_Fig4_FigSupp2A.png", width=6, height=6)

############# Bootstrapping ###########

library(boot)

set.seed(12345)


# Empty results table; the loop below appends one row per biomarker
bootstrap_bca_cis <-  data.frame(biomarkerid=character(),
                                 estimate=double(),
                                 l.ci=double(),
                                 u.ci=double(),
                                 stringsAsFactors=FALSE)
# Single-row scratch table, overwritten on every iteration
regression <- bootstrap_bca_cis

# NOTE(review): element 225 of the biomarker vector is skipped by position;
# which biomarker that is cannot be seen from here - confirm.
for(i in nmr_biomarkers_6mto12mNMR[-225]){

  # Column holding the 6-month measurement of biomarker i
  j <- paste0(i,"_6m")

  # 12-month biomarker: keep non-missing rows, then log-transform and standardise
  df_6mto12m_loop <- df_nmr_results_alt_names_loop[!is.na(df_nmr_results_alt_names_loop[[i]]),]
  df_6mto12m_loop[[i]] <- log(df_6mto12m_loop[[i]])
  df_6mto12m_loop[[i]] <- as.numeric(scale(df_6mto12m_loop[[i]]))

  # 6-month biomarker: same treatment, applied to the remaining rows
  df_6mto12m_loop <- df_6mto12m_loop[!is.na(df_6mto12m_loop[[j]]),]
  df_6mto12m_loop[[j]] <- log(df_6mto12m_loop[[j]])
  df_6mto12m_loop[[j]] <- as.numeric(scale(df_6mto12m_loop[[j]]))

  # Build the model formula once per biomarker rather than once per replicate
  boot_formula <- as.formula(paste0("`",i,"` ~ hscrp_exposure + hsCRP_6m + ",j," + age_interview_m12 + age_interview_m6 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + mth6CollectiontoStorage_hr + anympsmoke + gestage"))

  # Bootstrap statistic: refit the linear model on the resampled rows and
  # return its coefficient vector. lm() has no `maxit` argument (it was
  # disregarded with a warning on every replicate), so it is no longer passed.
  boot.huber <- function(data, indices){
    data <- data[indices,] # select obs. in bootstrap sample
    mod <- lm(boot_formula, data = data)
    coefficients(mod) # return coefficient vector
  }


  # 1000 bootstrap replicates
  duncan.boot <- boot(df_6mto12m_loop, boot.huber, 1000)

  # BCa interval for the second coefficient (hscrp_exposure)
  ci_boot_bca <- boot.ci(duncan.boot, index=2, type="bca")

  regression[1,1] <- paste(i)
  # Mean of the bootstrap replicates of the hscrp_exposure coefficient
  regression[1,2] <- mean(duncan.boot$t[,2])
  # Elements 4 and 5 of the bca row are the lower and upper interval limits
  regression[1,3] <- ci_boot_bca$bca[4]
  regression[1,4] <- ci_boot_bca$bca[5]

  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

BIS_bootstrap_6mto12mhsCRP_12mmetab_R1 <- bootstrap_bca_cis
write.csv(BIS_bootstrap_6mto12mhsCRP_12mmetab_R1, "BIS_eLife_Figure4_SourceData3_bootstrap.csv")



###### Scatterplots of metabolomic difference correlations - Figure 3C, 4B, 4C #####

# Keep the first 227 rows of the beta comparison table.
# NOTE(review): presumably these are the non-inflammation measures (per the
# "_noinflam" suffix) - confirm, as the rows are selected by position.
beta.comparison.NMR_noinflam <- beta.comparison.NMR[1:227,]

# Layers shared by all three scatterplots: a linear fit drawn without its
# confidence band, on the classic theme
beta_scatter_layers <- list(
  geom_smooth(method = lm, se = FALSE, color = "black"),
  theme_classic()
)

# Infection betas vs GlycA betas
q <- ggplot(beta.comparison.NMR_noinflam, aes(x = infection.beta, y = glyca.beta)) +
  geom_point(shape = 15, color = "darkgrey") +
  beta_scatter_layers +
  labs(x = "Infection Beta (per 1 infection)", y = "GlycA Beta (per SD log GlycA)")

ggsave(filename = "Figures/BIS_eLife_Figure3C.png", plot = q, width = 3, height = 3, dpi = 600)

cor.test(beta.comparison.NMR_noinflam$infection.beta, beta.comparison.NMR_noinflam$glyca.beta)

# Infection betas vs hsCRP betas
q <- ggplot(beta.comparison.NMR_noinflam, aes(x = infection.beta, y = hscrp.beta)) +
  geom_point(shape = 18, color = "darkgrey") +
  beta_scatter_layers +
  labs(x = "Infection Beta (per 1 infection)", y = "hsCRP Beta (per SD log hsCRP)")

ggsave(filename = "Figures/BIS_eLife_Figure4B.png", plot = q, width = 3, height = 3, dpi = 600)

cor.test(beta.comparison.NMR_noinflam$infection.beta, beta.comparison.NMR_noinflam$hscrp.beta)


# GlycA betas vs hsCRP betas
q <- ggplot(beta.comparison.NMR_noinflam, aes(x = glyca.beta, y = hscrp.beta)) +
  geom_point(shape = 18, color = "darkgrey") +
  beta_scatter_layers +
  labs(x = "GlycA Beta (per SD log GlycA)", y = "hsCRP Beta (per SD log hsCRP)")

ggsave(filename = "Figures/BIS_eLife_Figure4C.png", plot = q, width = 3, height = 3, dpi = 600)

cor.test(beta.comparison.NMR_noinflam$glyca.beta, beta.comparison.NMR_noinflam$hscrp.beta)


###### Scatterplot of metabolomic difference correlation - 6m to 12m- Figure 3-fig supp 2C, 4-fig supp 2B, 4-fig supp 2C #####


# Keep the first 223 rows of the 6m-to-12m beta comparison table.
# NOTE(review): presumably these are the non-inflammation measures (per the
# "_noinflam" suffix) - confirm, as the rows are selected by position.
beta.comparison.NMR_noinflam <- beta.comparison.MS_6mto12m[1:223,]


# Infection betas vs GlycA betas.
# Add the regression line without the confidence interval
q <- ggplot(beta.comparison.NMR_noinflam, aes(x=infection.beta, y=glyca.beta)) +
  geom_point(shape=15, color="darkgrey")+
  geom_smooth(method=lm, se=FALSE, color="black")+
  theme_classic()+
  labs(x="Infection Beta (per 1 infection)", y = "GlycA Beta (per SD log GlycA)")

ggsave(plot = q, width = 3, height = 3, dpi = 600, filename = "Figures/BIS_eLife_Figure3_FigSupp2C.png")

cor.test(beta.comparison.NMR_noinflam$infection.beta, beta.comparison.NMR_noinflam$glyca.beta)

# Infection betas vs hsCRP betas
q <- ggplot(beta.comparison.NMR_noinflam, aes(x=infection.beta, y=hscrp.beta)) +
  geom_point(shape=18, color="darkgrey")+
  geom_smooth(method=lm, se=FALSE, color="black")+
  theme_classic()+
  labs(x="Infection Beta (per 1 infection)", y = "hsCRP Beta (per SD log hsCRP)")

ggsave(plot = q, width = 3, height = 3, dpi = 600, filename = "Figures/BIS_eLife_Figure4_FigSupp2B.png")

cor.test(beta.comparison.NMR_noinflam$infection.beta, beta.comparison.NMR_noinflam$hscrp.beta)


# GlycA betas vs hsCRP betas
q <- ggplot(beta.comparison.NMR_noinflam, aes(x=glyca.beta, y=hscrp.beta)) +
  geom_point(shape=18, color="darkgrey")+
  geom_smooth(method=lm, se=FALSE, color="black")+
  theme_classic()+
  labs(x="GlycA Beta (per SD log GlycA)", y = "hsCRP Beta (per SD log hsCRP)")

# Figure 4-figure supplement 2C, as listed in the section header (the file
# name previously read "FigSupp4C")
ggsave(plot = q, width = 3, height = 3, dpi = 600, filename = "Figures/BIS_eLife_Figure4_FigSupp2C.png")

cor.test(beta.comparison.NMR_noinflam$glyca.beta, beta.comparison.NMR_noinflam$hscrp.beta)



######### Metabolomics mediation - Fig 7 and associated figure supplement and Source Data ###############

library(medflex)

set.seed(12345)

# Empty results table; the loop below appends one row per metabolite
mediation_effects <- data.frame(
  mediator = character(),
  biomarkerid = character(),
  total_estimate = double(),
  total_se = double(),
  total_p = double(),
  direct_estimate = double(),
  direct_se = double(),
  direct_p = double(),
  indirect_estimate = double(),
  indirect_se = double(),
  indirect_p = double(),
  mediation = double(),
  model = character(),
  time = character(),
  stringsAsFactors = FALSE
)
# Single-row scratch table, overwritten on every iteration
regression <- mediation_effects

# Complete-case data: the mediator (gp_exposure), the exposure and every
# covariate must be non-missing
mediation_vars <- c(
  "gp_exposure", "anyinfection4wk12mo", "age_interview_m12", "gender",
  "z_scores_birth", "hhincome", "bfany", "mth12collectiontostorage",
  "anympsmoke", "gestage"
)
df_mediation_data <- df_nmr_results_alt_names[complete.cases(df_nmr_results_alt_names[, mediation_vars]), ]

metabs <- c("Phe", "HDL_C", "HDL2_C", "ApoA1", "HDL3_C", "Citrate", "HDL_size", "TG_by_PG")

for (i in metabs) {

  # Outcome: keep non-missing rows, then log-transform and standardise
  df_mediation_loop <- df_mediation_data[!is.na(df_mediation_data[[i]]), ]
  df_mediation_loop[[i]] <- as.numeric(scale(log(df_mediation_loop[[i]])))

  # Imputation-based natural effect model
  expData <- neImpute(paste0(i," ~ anyinfection4wk12mo + gp_exposure + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"), family = gaussian(link = "identity"), data = df_mediation_loop, nRep = 5)
  neMod1 <- neModel(paste0(i," ~ anyinfection4wk12mo0 + anyinfection4wk12mo1 + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"), family = gaussian(link = "identity"), expData = expData, se = "robust")

  effdecomp <- neEffdecomp(neMod1)

  # Test table of the effect decomposition; elements 1, 2 and 3 are stored as
  # the direct, indirect and total effect respectively
  effect_tests <- summary(effdecomp)[9]$test

  regression[1, "mediator"] <- "GlycA"
  regression[1, "biomarkerid"] <- paste(i)
  regression[1, "direct_estimate"] <- effect_tests$coefficients[1]
  regression[1, "indirect_estimate"] <- effect_tests$coefficients[2]
  regression[1, "total_estimate"] <- effect_tests$coefficients[3]
  regression[1, "direct_se"] <- effect_tests$sigma[1]
  regression[1, "indirect_se"] <- effect_tests$sigma[2]
  regression[1, "total_se"] <- effect_tests$sigma[3]
  regression[1, "direct_p"] <- effect_tests$pvalues[1]
  regression[1, "indirect_p"] <- effect_tests$pvalues[2]
  regression[1, "total_p"] <- effect_tests$pvalues[3]
  # Proportion mediated = indirect effect / total effect
  regression[1, "mediation"] <- regression[1, "indirect_estimate"] / regression[1, "total_estimate"]
  regression[1, "model"] <- "Primary"
  regression[1, "time"] <- "Birth to 12m"

  mediation_effects <- rbind(mediation_effects, regression)
}

mediation_effects_primary_bto12m_GlycA <- mediation_effects


######### Metabolomics mediation - 6m to 12m - Figure 7-figure supplement 1, Source Data 2


# Empty results table; the loop below appends one row per metabolite
mediation_effects <- data.frame(
  mediator = character(),
  biomarkerid = character(),
  total_estimate = double(),
  total_se = double(),
  total_p = double(),
  direct_estimate = double(),
  direct_se = double(),
  direct_p = double(),
  indirect_estimate = double(),
  indirect_se = double(),
  indirect_p = double(),
  mediation = double(),
  model = character(),
  time = character(),
  stringsAsFactors = FALSE
)
# Single-row scratch table, overwritten on every iteration
regression <- mediation_effects

# Complete-case data: the mediator (gp_exposure), the infection variables,
# 6-month GlycA and every covariate must be non-missing
mediation_vars_6mto12m <- c(
  "gp_exposure", "anyinfection4wk12mo", "age_interview_m12", "gender",
  "z_scores_birth", "hhincome", "bfany", "mth12collectiontostorage",
  "anympsmoke", "gestage", "anyinfection4wk6mo", "anyinfection9mo12mo",
  "GlycA_6m"
)
df_mediation_data_6mto12m <- df_nmr_results_alt_names[complete.cases(df_nmr_results_alt_names[, mediation_vars_6mto12m]), ]


metabs <- c("Phe", "HDL_C", "ApoA1", "Citrate", "HDL_size", "TG_by_PG")

for (i in metabs) {

  # Column holding the 6-month measurement of metabolite i
  j <- paste0(i, "_6m")

  # 12-month outcome: keep non-missing rows, then log-transform and standardise
  df_mediation_loop <- df_mediation_data_6mto12m[!is.na(df_mediation_data_6mto12m[[i]]), ]
  df_mediation_loop[[i]] <- as.numeric(scale(log(df_mediation_loop[[i]])))

  # 6-month measurement: same treatment, applied to the remaining rows
  df_mediation_loop <- df_mediation_loop[!is.na(df_mediation_loop[[j]]), ]
  df_mediation_loop[[j]] <- as.numeric(scale(log(df_mediation_loop[[j]])))

  # Imputation-based natural effect model
  expData <- neImpute(paste0(i," ~ anyinfection9mo12mo + gp_exposure + anyinfection4wk6mo + GlycA_6m + ",j," + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"), family = gaussian(link = "identity"), data = df_mediation_loop, nRep = 5)
  neMod1 <- neModel(paste0(i," ~ anyinfection9mo12mo0 + anyinfection9mo12mo1 + anyinfection4wk6mo + GlycA_6m + ",j," + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"), family = gaussian(link = "identity"), expData = expData, se = "robust")

  effdecomp <- neEffdecomp(neMod1)

  # Test table of the effect decomposition; elements 1, 2 and 3 are stored as
  # the direct, indirect and total effect respectively
  effect_tests <- summary(effdecomp)[9]$test

  regression[1, "mediator"] <- "GlycA"
  regression[1, "biomarkerid"] <- paste(i)
  regression[1, "direct_estimate"] <- effect_tests$coefficients[1]
  regression[1, "indirect_estimate"] <- effect_tests$coefficients[2]
  regression[1, "total_estimate"] <- effect_tests$coefficients[3]
  regression[1, "direct_se"] <- effect_tests$sigma[1]
  regression[1, "indirect_se"] <- effect_tests$sigma[2]
  regression[1, "total_se"] <- effect_tests$sigma[3]
  regression[1, "direct_p"] <- effect_tests$pvalues[1]
  regression[1, "indirect_p"] <- effect_tests$pvalues[2]
  regression[1, "total_p"] <- effect_tests$pvalues[3]
  # Proportion mediated = indirect effect / total effect
  regression[1, "mediation"] <- regression[1, "indirect_estimate"] / regression[1, "total_estimate"]
  regression[1, "model"] <- "Primary"
  regression[1, "time"] <- "6m to 12m"

  mediation_effects <- rbind(mediation_effects, regression)
}

mediation_effects_primary_6mto12m_GlycA <- mediation_effects


###### hsCRP ######

# Empty results table; the loop below appends one row per metabolite
mediation_effects <- data.frame(
  mediator = character(),
  biomarkerid = character(),
  total_estimate = double(),
  total_se = double(),
  total_p = double(),
  direct_estimate = double(),
  direct_se = double(),
  direct_p = double(),
  indirect_estimate = double(),
  indirect_se = double(),
  indirect_p = double(),
  mediation = double(),
  model = character(),
  time = character(),
  stringsAsFactors = FALSE
)
# Single-row scratch table, overwritten on every iteration
regression <- mediation_effects

# Complete-case data: the mediator (hscrp_exposure), the exposure and every
# covariate must be non-missing
mediation_vars <- c(
  "hscrp_exposure", "anyinfection4wk12mo", "age_interview_m12", "gender",
  "z_scores_birth", "hhincome", "bfany", "mth12collectiontostorage",
  "anympsmoke", "gestage"
)
df_mediation_data <- df_nmr_results_alt_names[complete.cases(df_nmr_results_alt_names[, mediation_vars]), ]

metabs <- c("Phe", "HDL_C", "HDL2_C", "ApoA1", "HDL3_C", "Citrate", "HDL_size", "TG_by_PG")

for (i in metabs) {

  # Outcome: keep non-missing rows, then log-transform and standardise
  df_mediation_loop <- df_mediation_data[!is.na(df_mediation_data[[i]]), ]
  df_mediation_loop[[i]] <- as.numeric(scale(log(df_mediation_loop[[i]])))

  # Imputation-based natural effect model
  expData <- neImpute(paste0(i," ~ anyinfection4wk12mo + hscrp_exposure + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"), family = gaussian(link = "identity"), data = df_mediation_loop, nRep = 5)
  neMod1 <- neModel(paste0(i," ~ anyinfection4wk12mo0 + anyinfection4wk12mo1 + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"), family = gaussian(link = "identity"), expData = expData, se = "robust")

  effdecomp <- neEffdecomp(neMod1)

  # Test table of the effect decomposition; elements 1, 2 and 3 are stored as
  # the direct, indirect and total effect respectively
  effect_tests <- summary(effdecomp)[9]$test

  regression[1, "mediator"] <- "hsCRP"
  regression[1, "biomarkerid"] <- paste(i)
  regression[1, "direct_estimate"] <- effect_tests$coefficients[1]
  regression[1, "indirect_estimate"] <- effect_tests$coefficients[2]
  regression[1, "total_estimate"] <- effect_tests$coefficients[3]
  regression[1, "direct_se"] <- effect_tests$sigma[1]
  regression[1, "indirect_se"] <- effect_tests$sigma[2]
  regression[1, "total_se"] <- effect_tests$sigma[3]
  regression[1, "direct_p"] <- effect_tests$pvalues[1]
  regression[1, "indirect_p"] <- effect_tests$pvalues[2]
  regression[1, "total_p"] <- effect_tests$pvalues[3]
  # Proportion mediated = indirect effect / total effect
  regression[1, "mediation"] <- regression[1, "indirect_estimate"] / regression[1, "total_estimate"]
  regression[1, "model"] <- "Primary"
  regression[1, "time"] <- "Birth to 12m"

  mediation_effects <- rbind(mediation_effects, regression)
}

mediation_effects_primary_bto12m_hsCRP <- mediation_effects


######### Metabolomics mediation - 6m to 12m


# Empty results template: one row per metabolite is appended in the loop below
mediation_effects <- data.frame(
  mediator = character(),
  biomarkerid = character(),
  total_estimate = double(),
  total_se = double(),
  total_p = double(),
  direct_estimate = double(),
  direct_se = double(),
  direct_p = double(),
  indirect_estimate = double(),
  indirect_se = double(),
  indirect_p = double(),
  mediation = double(),
  model = character(),
  time = character(),
  stringsAsFactors = FALSE
)
regression <- mediation_effects

# Keep only rows where the mediator, both infection windows, the 6-month hsCRP
# and every covariate are observed. Filters are applied one variable at a time,
# in the same order as before.
hscrp_6mto12m_vars <- c(
  "hscrp_exposure", "anyinfection4wk12mo", "age_interview_m12", "gender",
  "z_scores_birth", "hhincome", "bfany", "mth12collectiontostorage",
  "anympsmoke", "gestage", "anyinfection4wk6mo", "anyinfection9mo12mo",
  "hsCRP_6m"
)
df_mediation_data_6mto12m <- df_nmr_results_alt_names
for (model_var in hscrp_6mto12m_vars) {
  df_mediation_data_6mto12m <-
    df_mediation_data_6mto12m[!is.na(df_mediation_data_6mto12m[[model_var]]), ]
}


metabs <- c("Phe", "HDL_C", "ApoA1", "Citrate", "HDL_size", "TG_by_PG")

# For the k-th row of the effect decomposition: the result columns that receive
# its estimate, standard error and p-value
# (row 1 -> direct_*, row 2 -> indirect_*, row 3 -> total_*).
effect_columns <- list(
  c("direct_estimate", "direct_se", "direct_p"),
  c("indirect_estimate", "indirect_se", "indirect_p"),
  c("total_estimate", "total_se", "total_p")
)

for (i in metabs) {

  # Name of the 6-month measurement of the same metabolite
  j <- paste0(i, "_6m")

  # The 12-month outcome is standardised BEFORE rows lacking the 6-month value
  # are dropped; this order is kept as is so the estimates do not change.
  df_mediation_loop <- df_mediation_data_6mto12m[!is.na(df_mediation_data_6mto12m[[i]]), ]
  df_mediation_loop[[i]] <- as.numeric(scale(log(df_mediation_loop[[i]])))
  df_mediation_loop <- df_mediation_loop[!is.na(df_mediation_loop[[j]]), ]
  df_mediation_loop[[j]] <- as.numeric(scale(log(df_mediation_loop[[j]])))

  # Adjustment set shared by the imputation and natural-effects formulas
  adjustment_terms <- paste0(
    "anyinfection4wk6mo + hsCRP_6m + ", j,
    " + age_interview_m12 + gender + z_scores_birth + hhincome + bfany + mth12collectiontostorage + anympsmoke + gestage"
  )

  expData <- neImpute(
    paste0(i, " ~ anyinfection9mo12mo + hscrp_exposure + ", adjustment_terms),
    family = gaussian(link = "identity"), data = df_mediation_loop, nRep = 5
  )
  neMod1 <- neModel(
    paste0(i, " ~ anyinfection9mo12mo0 + anyinfection9mo12mo1 + ", adjustment_terms),
    family = gaussian(link = "identity"), expData = expData, se = "robust"
  )

  effdecomp <- neEffdecomp(neMod1)

  # Summarise once per metabolite instead of once per extracted value
  effect_tests <- summary(effdecomp)[9]$test

  regression[1, "mediator"] <- "hsCRP"
  regression[1, "biomarkerid"] <- paste(i)
  for (k in seq_along(effect_columns)) {
    target <- effect_columns[[k]]
    regression[1, target[1]] <- effect_tests$coefficients[k]
    regression[1, target[2]] <- effect_tests$sigma[k]
    regression[1, target[3]] <- effect_tests$pvalues[k]
  }
  # Indirect estimate as a fraction of the total estimate
  regression[1, "mediation"] <-
    regression[1, "indirect_estimate"] / regression[1, "total_estimate"]
  regression[1, "model"] <- "Primary"
  regression[1, "time"] <- "6m to 12m"

  mediation_effects <- rbind(mediation_effects, regression)
}

mediation_effects_primary_6mto12m_hsCRP <- mediation_effects


# Stack the GlycA and hsCRP mediation results for both time windows and export
mediation_results <- rbind(
  mediation_effects_primary_bto12m_GlycA,
  mediation_effects_primary_bto12m_hsCRP,
  mediation_effects_primary_6mto12m_GlycA,
  mediation_effects_primary_6mto12m_hsCRP
)

write.csv(mediation_results, "BIS_eLife_Fig7_SourceData_1_2_NMR.csv")


#### Mediation plot - Figure 7 and figure supplement 1 ####

# Same stacked table as exported above
mediation_results_plot <- mediation_results

# The previous right_join() with df_with_groups was removed: its result was
# overwritten by the very next assignment and never used.
df_to_plot <- na.omit(mediation_results_plot)

df_to_plot_base <- df_to_plot[df_to_plot$time == "Birth to 12m", ]

# Long-format table for the forest plot: three rows (indirect, direct, total
# effect) per row of df_to_plot_base
mediation_plot <- data.frame(
  name = character(),
  estimate = double(),
  se = double(),
  pvalue = double(),
  mediator = character(),
  effect = character(),
  group = character(),
  stringsAsFactors = FALSE
)

regression <- mediation_plot

# Source columns of df_to_plot_base, listed in the order indirect, direct, total
effect_labels <- c("Indirect effect", "Direct effect", "Total effect")
estimate_cols <- c(9, 6, 3)
se_cols <- c(10, 7, 4)
pvalue_cols <- c(11, 8, 5)

# seq_len() so that an empty input yields an empty table instead of NA rows
for (i in seq_len(nrow(df_to_plot_base))) {

  regression <- data.frame(
    name = df_to_plot_base[[2]][i],
    estimate = unlist(df_to_plot_base[i, estimate_cols], use.names = FALSE),
    se = unlist(df_to_plot_base[i, se_cols], use.names = FALSE),
    pvalue = unlist(df_to_plot_base[i, pvalue_cols], use.names = FALSE),
    mediator = df_to_plot_base[[1]][i],
    effect = effect_labels,
    group = "NMR metabolomics",
    stringsAsFactors = FALSE
  )

  mediation_plot <- rbind(mediation_plot, regression)
}


# Drop the direct effects and the total-effect rows stored under hsCRP, so the
# total effect is not plotted twice
is_direct <- mediation_plot$effect == "Direct effect"
is_hscrp_total <- mediation_plot$mediator == "hsCRP" &
  mediation_plot$effect == "Total effect"
mediation_plot_trim <- mediation_plot[!(is_direct | is_hscrp_total), ]

# Legend labels: remaining total-effect rows get their own label, the rest are
# labelled as the indirect effect of their mediator
total_rows <- mediation_plot_trim$effect == "Total effect"
mediation_plot_trim$mediator[total_rows] <- "Total effect"
for (mediator_name in c("GlycA", "hsCRP")) {
  mediation_plot_trim$mediator[mediation_plot_trim$mediator == mediator_name] <-
    paste(mediator_name, "indirect effect")
}

# Explicit level order. The previous levels(...)[c(2, 1, 3)] picked levels by
# position after an alphabetical sort, whose result depends on the session
# locale. NOTE(review): this is the order that code gives under a
# case-insensitive collation such as en_US/en_AU - confirm against the
# published figure.
mediation_plot_trim$mediator <- factor(
  mediation_plot_trim$mediator,
  levels = c("hsCRP indirect effect", "GlycA indirect effect", "Total effect")
)

# Display names for the plotted metabolites
display_names <- c(
  Phe = "Phenylalanine",
  HDL_C = "HDL-C",
  HDL2_C = "HDL2-C",
  HDL3_C = "HDL3-C",
  HDL_size = "HDL particle size",
  TG_by_PG = "TG/PG"
)
needs_label <- mediation_plot_trim$name %in% names(display_names)
mediation_plot_trim$name[needs_label] <-
  unname(display_names[mediation_plot_trim$name[needs_label]])

# One forest plot per biomarker group, stacked into a single figure
ggplot_multi <-
  mediation_plot_trim %>%
  nest(-group, .key = "data") %>%
  mutate(
    # Apply forestplot to each group
    gg_groups = purrr::map2(
      data, group,
      ~ forestplot(
        df = .x,
        pvalue = pvalue,
        psignif = 0.05,
        shape = mediator,
        colour = mediator,
        xlab = "Total effect of infections from birth to 12 months of age on 12-month metabolomic measures\nand indirect effects mediated by 12-month GlycA or hsCRP",
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.1, 0.1)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        ) +
        ggplot2::scale_shape_manual(values = c(23L, 22L, 21L))
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(
          gg_groups,
          ~ . +
            theme(
              axis.text.x = element_blank(),
              axis.title.x = element_blank(),
              axis.ticks.x = element_blank(),
              plot.margin = unit(c(1, 2, 1, 2), "mm")
            ) +
            ggplot2::theme(legend.position = "none")
        ),
      no = gg_groups
    ),
    # Panel heights proportional to the number of rows plotted in each group
    rel_heights = purrr::map_int(data, nrow)
  )

fig7_nmr <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 1,
  heights = ggplot_multi$rel_heights
)
fig7_nmr

# Pass the composite explicitly: ggsave() otherwise saves last_plot(), which is
# only the composite if the figure above was actually printed (it is not when
# the script is run through source() without echo).
ggsave("Figures/BIS_eLife_Fig7_NMR.png", plot = fig7_nmr, width = 7, height = 4)

##### 6m to 12m plot

df_to_plot_base <- df_to_plot[df_to_plot$time == "6m to 12m", ]

# Long-format table for the forest plot: three rows (indirect, direct, total
# effect) per row of df_to_plot_base
mediation_plot <- data.frame(
  name = character(),
  estimate = double(),
  se = double(),
  pvalue = double(),
  mediator = character(),
  effect = character(),
  group = character(),
  stringsAsFactors = FALSE
)

regression <- mediation_plot

# Source columns of df_to_plot_base, listed in the order indirect, direct, total
effect_labels <- c("Indirect effect", "Direct effect", "Total effect")
estimate_cols <- c(9, 6, 3)
se_cols <- c(10, 7, 4)
pvalue_cols <- c(11, 8, 5)

# seq_len() so that an empty input yields an empty table instead of NA rows
for (i in seq_len(nrow(df_to_plot_base))) {

  regression <- data.frame(
    name = df_to_plot_base[[2]][i],
    estimate = unlist(df_to_plot_base[i, estimate_cols], use.names = FALSE),
    se = unlist(df_to_plot_base[i, se_cols], use.names = FALSE),
    pvalue = unlist(df_to_plot_base[i, pvalue_cols], use.names = FALSE),
    mediator = df_to_plot_base[[1]][i],
    effect = effect_labels,
    group = "NMR metabolomics",
    stringsAsFactors = FALSE
  )

  mediation_plot <- rbind(mediation_plot, regression)
}


# Drop the direct effects and the total-effect rows stored under hsCRP, so the
# total effect is not plotted twice
is_direct <- mediation_plot$effect == "Direct effect"
is_hscrp_total <- mediation_plot$mediator == "hsCRP" &
  mediation_plot$effect == "Total effect"
mediation_plot_trim <- mediation_plot[!(is_direct | is_hscrp_total), ]

# Legend labels: remaining total-effect rows get their own label, the rest are
# labelled as the indirect effect of their mediator
total_rows <- mediation_plot_trim$effect == "Total effect"
mediation_plot_trim$mediator[total_rows] <- "Total effect"
for (mediator_name in c("GlycA", "hsCRP")) {
  mediation_plot_trim$mediator[mediation_plot_trim$mediator == mediator_name] <-
    paste(mediator_name, "indirect effect")
}

# Explicit level order. The previous levels(...)[c(2, 1, 3)] picked levels by
# position after an alphabetical sort, whose result depends on the session
# locale. NOTE(review): this is the order that code gives under a
# case-insensitive collation such as en_US/en_AU - confirm against the
# published figure.
mediation_plot_trim$mediator <- factor(
  mediation_plot_trim$mediator,
  levels = c("hsCRP indirect effect", "GlycA indirect effect", "Total effect")
)

# Display names for the plotted metabolites
display_names <- c(
  Phe = "Phenylalanine",
  HDL_C = "HDL-C",
  HDL_size = "HDL particle size",
  TG_by_PG = "TG/PG"
)
needs_label <- mediation_plot_trim$name %in% names(display_names)
mediation_plot_trim$name[needs_label] <-
  unname(display_names[mediation_plot_trim$name[needs_label]])

# One forest plot per biomarker group, stacked into a single figure
ggplot_multi <-
  mediation_plot_trim %>%
  nest(-group, .key = "data") %>%
  mutate(
    # Apply forestplot to each group
    gg_groups = purrr::map2(
      data, group,
      ~ forestplot(
        df = .x,
        pvalue = pvalue,
        psignif = 0.05,
        shape = mediator,
        colour = mediator,
        xlab = "Total effect of infections from 6 to 12 months of age on 12-month metabolomic measures\nand indirect effects mediated by 12-month GlycA or hsCRP\nadjusted for infections birth to 6-month, 6-month metabolomic measures, and 6-month GlycA or hsCRP",
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.12, 0.12)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        ) +
        ggplot2::scale_shape_manual(values = c(23L, 22L, 21L))
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(
          gg_groups,
          ~ . +
            theme(
              axis.text.x = element_blank(),
              axis.title.x = element_blank(),
              axis.ticks.x = element_blank(),
              plot.margin = unit(c(1, 2, 1, 2), "mm")
            ) +
            ggplot2::theme(legend.position = "none")
        ),
      no = gg_groups
    ),
    # Panel heights proportional to the number of rows plotted in each group
    rel_heights = purrr::map_int(data, nrow)
  )

fig7_figsupp1_nmr <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 1,
  heights = ggplot_multi$rel_heights
)
fig7_figsupp1_nmr

# Pass the composite explicitly: ggsave() otherwise saves last_plot(), which is
# only the composite if the figure above was actually printed (it is not when
# the script is run through source() without echo).
ggsave("Figures/BIS_eLife_Fig7_FigSupp1_NMR.png", plot = fig7_figsupp1_nmr, width = 7, height = 3.5)



#### Reverse causality model - Birth metabolomics predicting birth to 6m infections - Supp. File 3A ####

# Read the biomarker concentration file
df_nmr_results <- readr::read_csv(
  # Enter the correct location for your file below
  file = "Data/BIS_infection_dataset_NMR.csv",
  # Treat the TAG string and the ".d"/".t" codes as <NA>, not only "NA".
  # ".t" was added so that this read uses the same missing-value codes as the
  # first read of this file at the top of the script.
  na = c("NA", "TAG", ".d", ".t")
)


# Birth samples only
df_nmr_results_birth <- df_nmr_results[(df_nmr_results$time == "Birth"), ]

df_nmr_results_alt_names <- df_nmr_results_birth

alt_names <- names(df_nmr_results_alt_names)

# Map every column name to its machine-readable biomarker name. A column that
# does not match exactly one metadata entry keeps its name and raises a warning.
new_names <- purrr::map_chr(alt_names, function(id) {
  # Look through the alternative names of every biomarker
  hits <- purrr::map_lgl(
    df_NG_biomarker_metadata$alternative_names,
    function(candidates) id %in% candidates
  )

  # Zero or several hits: warn and pass the input through unchanged
  if (sum(hits) != 1L) {
    warning("Biomarker not found: ", id, call. = FALSE)
    return(id)
  }

  # One unambiguous hit
  df_NG_biomarker_metadata$machine_readable_name[hits]
})


# Name the vector with the new names
names(alt_names) <- new_names

# Rename the result data frame with machine_readable_names
df_nmr_results_alt_names <-
  df_nmr_results_alt_names %>%
  rename(!!alt_names)

# Extract names of NMR biomarkers (all columns from XXL_VLDL_P to hsCRP)
nmr_biomarkers <-
  df_nmr_results_alt_names %>%
  select(XXL_VLDL_P:hsCRP) %>%
  names()

load("df_NG_biomarker_metadata.rda")


# Table of the smallest non-zero value observed for each biomarker
NMR.minimumvalues <- data.frame(
  metabolite = character(),
  minimum = double(),
  stringsAsFactors = FALSE
)
holder <- NMR.minimumvalues


for (i in nmr_biomarkers) {

  biomarker_values <- df_nmr_results_alt_names[[i]]

  holder[1, "metabolite"] <- paste(i)
  # Smallest non-zero value of this biomarker, ignoring missing values
  holder[1, "minimum"] <- min(biomarker_values[biomarker_values != 0], na.rm = TRUE)

  NMR.minimumvalues <- rbind(NMR.minimumvalues, holder)

  # Shift every value of the biomarker upwards by that minimum
  df_nmr_results_alt_names[[i]] <- biomarker_values + holder[1, "minimum"]

}

# Keep participants with infection data available
df_nmr_results_alt_names_infectiondataonly <-
  df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo), ]

# Encoding mode of delivery: modebirth codes 1, 2 and 3 are merged into
# category 1, code 4 becomes category 2 and code 5 becomes category 3
modebirth_category <- c(1, 1, 1, 2, 3)
for (modebirth_code in seq_along(modebirth_category)) {
  df_nmr_results_alt_names_infectiondataonly$modebirth_3[
    df_nmr_results_alt_names_infectiondataonly$modebirth == modebirth_code
  ] <- modebirth_category[modebirth_code]
}



# Results template: one row per birth biomarker is appended in the loop below
birthmetab_infections <- data.frame(
  estimate = double(),
  se = double(),
  p.value = double(),
  adj.p.value = double(),
  biomarkerid = character(),
  stringsAsFactors = FALSE
)
regression <- birthmetab_infections

for (i in nmr_biomarkers) {
  # Log-transform and standardise the biomarker in place
  df_nmr_results_alt_names_infectiondataonly[[i]] <-
    as.numeric(scale(log(df_nmr_results_alt_names_infectiondataonly[[i]])))

  # Quasi-Poisson model of the birth to 6 month infection count on the biomarker
  df_regression <- glm(
    as.formula(paste0(
      "anyinfection4wk6mo ~ ", i,
      "+ age_interview_m6 + gender + z_scores_birth + hhincome + bfany + gestage + factor(modebirth_3)"
    )),
    family = quasipoisson(link = "log"),
    data = df_nmr_results_alt_names_infectiondataonly
  )

  # Second row of the coefficient table = the biomarker term; the table is
  # computed once instead of once per extracted value
  biomarker_term <- coef(summary(df_regression))[2, ]

  regression[1, "estimate"] <- biomarker_term[1]
  regression[1, "se"] <- biomarker_term[2]
  # The raw p-value is stored in both columns; adj.p.value is adjusted below
  regression[1, c("p.value", "adj.p.value")] <- biomarker_term[4]
  regression[1, "biomarkerid"] <- paste(i)

  birthmetab_infections <- rbind(birthmetab_infections, regression)
}

# Benjamini-Hochberg adjustment across all biomarkers
birthmetab_infections$adj.p.value <- p.adjust(birthmetab_infections$adj.p.value, method = "BH")

birthmetab_infections_names <- left_join(
  birthmetab_infections,
  select(
    df_NG_biomarker_metadata,
    name,
    biomarkerid = machine_readable_name
  ),
  by = "biomarkerid"
)

# Label the hsCRP row by its identifier rather than by a hard-coded row/column
# position ([89, 6]), which silently breaks if the biomarker list changes
birthmetab_infections_names$name[
  which(birthmetab_infections_names$biomarkerid == "hsCRP")
] <- "C-reactive protein*"


# Display blood biomarker groups
df_NG_biomarker_metadata %>%
  pull(group) %>%
  unique()


# Choose the groups you want to plot and define the order with which group
# categories will appear in the plot
group_order <- c(
  "Branched-chain amino acids",
  "Aromatic amino acids",
  "Amino acids",
  "Fluid balance",
  "Fatty acids",
  "Apolipoproteins",
  "Glycerides and phospholipids",
  "Cholesterol",
  "Glycolysis related metabolites",
  "Ketone bodies",
  "Lipoprotein subclasses",
  "Lipoprotein particle sizes",
  "Relative lipoprotein lipid concentrations",
  "Inflammation"
)

# Extract a subset of the df_NG_biomarker_metadata, with the desired group
# order, to set the order of biomarkers in the forestplot later on
df_with_groups <-
  df_NG_biomarker_metadata %>%
  select(name, group) %>%
  filter(group %in% group_order) %>%
  arrange(factor(group, levels = group_order))


# right_join, with the grouping table on the right, preserves the biomarker
# order it specifies; rows with any missing value are then dropped
df_to_plot <-
  birthmetab_infections_names %>%
  dplyr::right_join(df_with_groups, by = "name")

df_to_plot <- na.omit(df_to_plot)

# hsCRP is appended by hand with its own label and group. The row is looked up
# by identifier instead of the hard-coded position 89.
hscrp <- birthmetab_infections[which(birthmetab_infections$biomarkerid == "hsCRP"), ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# Panel order of the figure; groups not listed here sort last
panel_order <- c(
  "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
  "Glycerides and phospholipids", "Cholesterol",
  "Glycolysis related metabolites", "Lipoprotein particle sizes",
  "Fluid balance", "Inflammation"
)
df_to_plot <- df_to_plot %>%
  arrange(match(group, panel_order))

# Exclude lipoprotein subclasses, glucose, lactate and percentage ("_pct")
# measures from the plotted set
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() instead of 1:nrow() so an empty table does not raise an error
df_linear$rownumber <- seq_len(nrow(df_linear))

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# One forest plot per biomarker group, arranged in two columns
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  mutate(
    # Apply forestplot to each group
    gg_groups = purrr::map2(
      data, group,
      ~ forestplot(
        df = .x,
        pvalue = adj.p.value,
        psignif = 0.05,
        xlab = "Estimated difference in number of infections from birth to 6 months\nper 1 SD of log metabolite at birth",
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.25, 0.25)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        )
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(
          gg_groups,
          ~ . +
            theme(
              axis.text.x = element_blank(),
              axis.title.x = element_blank(),
              axis.ticks.x = element_blank(),
              plot.margin = unit(c(1, 2, 1, 2), "mm")
            ) +
            ggplot2::theme(legend.position = "none")
        ),
      no = gg_groups
    ),
    # Number of rows plotted in each group
    rel_heights = purrr::map_int(data, nrow)
  )

supp_file_3a <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
supp_file_3a

# Pass the composite explicitly: ggsave() otherwise saves last_plot(), which is
# only the composite if the figure above was actually printed (it is not when
# the script is run through source() without echo).
ggsave("Figures/BIS_eLife_SuppFile3A.png", plot = supp_file_3a, width = 6, height = 6)

########## Models with 6-month metabolomics as outcome - Figure 3-figure supplement 1, and Figure 4-figure supplement 1 ########


# 6-month samples, restricted to participants with infection data available
is_6m_sample <- df_nmr_results$time == "6m"
df_nmr_results_6m <- df_nmr_results[is_6m_sample, ]

has_infection_data <- !is.na(df_nmr_results_6m$anyinfection4wk12mo)
df_nmr_results_6m_completecase <- df_nmr_results_6m[has_infection_data, ]

df_nmr_results_alt_names <- df_nmr_results_6m_completecase

alt_names <- names(df_nmr_results_alt_names)

# Map every column name to its machine-readable biomarker name. A column that
# does not match exactly one metadata entry keeps its name and raises a warning.
new_names <- purrr::map_chr(alt_names, function(id) {
  # Look through the alternative names of every biomarker
  hits <- purrr::map_lgl(
    df_NG_biomarker_metadata$alternative_names,
    function(candidates) id %in% candidates
  )

  # Zero or several hits: warn and pass the input through unchanged
  if (sum(hits) != 1L) {
    warning("Biomarker not found: ", id, call. = FALSE)
    return(id)
  }

  # One unambiguous hit
  df_NG_biomarker_metadata$machine_readable_name[hits]
})


# Name the vector with the new names
names(alt_names) <- new_names

# Rename the result data frame with machine_readable_names
df_nmr_results_alt_names <-
  df_nmr_results_alt_names %>%
  rename(!!alt_names)

load("Data/df_NG_biomarker_metadata.rda")

# Names of the NMR biomarkers (all columns from Total_C to hsCRP)
nmr_biomarkers <-
  df_nmr_results_alt_names %>%
  select(Total_C:hsCRP) %>%
  names()

# Table of the smallest non-zero value observed for each biomarker
NMR.minimumvalues <- data.frame(
  metabolite = character(),
  minimum = double(),
  stringsAsFactors = FALSE
)
holder <- NMR.minimumvalues


for (i in nmr_biomarkers) {

  biomarker_values <- df_nmr_results_alt_names[[i]]

  holder[1, "metabolite"] <- paste(i)
  # Smallest non-zero value of this biomarker, ignoring missing values
  holder[1, "minimum"] <- min(biomarker_values[biomarker_values != 0], na.rm = TRUE)

  NMR.minimumvalues <- rbind(NMR.minimumvalues, holder)

  # Shift every value of the biomarker upwards by that minimum
  df_nmr_results_alt_names[[i]] <- biomarker_values + holder[1, "minimum"]

}


# Pre-sized table (251 rows) that later collects, per metabolite, the estimates
# for the infection, GlycA and hsCRP exposures
n_comparison_rows <- 251
beta.comparison.NMR <- data.frame(
  metabolite = character(n_comparison_rows),
  infection.beta = double(n_comparison_rows),
  glyca.beta = double(n_comparison_rows),
  hscrp.beta = double(n_comparison_rows),
  stringsAsFactors = FALSE
)

############# Infections - Figure 3-figure supplement 1A #####################

# Infection count from birth to 6 months: sum of the three reporting windows
df_nmr_results_alt_names$anyinfection4wk6mo <-
  df_nmr_results_alt_names$anyinfection4wk +
  df_nmr_results_alt_names$anyinfection3mo +
  df_nmr_results_alt_names$anyinfection6mo

# Select only variables to be used for the model and collapse to a long data
# format
df_long <-
  df_nmr_results_alt_names %>%
  # Select only model variables (`!!` marks nmr_biomarkers as an external vector
  # of column names rather than a column called nmr_biomarkers)
  dplyr::select(
    !!nmr_biomarkers, anyinfection4wk6mo, age_interview_m6, sex, z_scores_birth,
    hhincome, bfany, mth6CollectiontoStorage_hours, anympsmoke, gestage
  ) %>%
  # Log-transform, then standardise, every biomarker. A plain function replaces
  # dplyr::funs(), which is deprecated.
  dplyr::mutate_at(
    .vars = nmr_biomarkers,
    .funs = function(x) as.numeric(scale(log(x)))
  ) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, !!nmr_biomarkers)

df_long <- na.omit(df_long)

# Estimate the covariate-adjusted association of the birth to 6 month infection
# count with each 6-month biomarker (the "_bmi" suffix of the object name is
# inherited from the ggforestplot vignette; no BMI is modelled here)
df_assoc_per_biomarker_bmi <-
  ggforestplot::discovery_regression(
    df_long = df_long,
    model = "lm",
    formula =
      formula(
        biomarkervalue ~ anyinfection4wk6mo + age_interview_m6 + sex + z_scores_birth + hhincome + bfany + mth6CollectiontoStorage_hours + anympsmoke + gestage
      ),
    key = biomarkerid,
    predictor = anyinfection4wk6mo
  ) %>%
  # Join this dataset with the grouping data in order to choose a different
  # biomarker naming option
  left_join(
    select(
      df_NG_biomarker_metadata,
      name,
      biomarkerid = machine_readable_name
    ),
    by = "biomarkerid"
  )

# Benjamini-Hochberg adjustment across all biomarkers
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(df_assoc_per_biomarker_bmi$pvalue, method = "BH")

write.csv(df_assoc_per_biomarker_bmi, "BIS_eLife_Fig3_SourceData2A.csv")

beta.comparison.NMR$metabolite <- df_assoc_per_biomarker_bmi$biomarkerid
beta.comparison.NMR$infection.beta <- df_assoc_per_biomarker_bmi$estimate

# Display blood biomarker groups
df_NG_biomarker_metadata %>%
  pull(group) %>%
  unique()

# Groups to plot, in the order in which they are arranged below
group_order <- c(
  "Branched-chain amino acids",
  "Aromatic amino acids",
  "Amino acids",
  "Fluid balance",
  "Fatty acids",
  "Apolipoproteins",
  "Glycerides and phospholipids",
  "Cholesterol",
  "Glycolysis related metabolites",
  "Ketone bodies",
  "Lipoprotein subclasses",
  "Lipoprotein particle sizes",
  "Relative lipoprotein lipid concentrations",
  "Inflammation"
)

# Extract a subset of the df_NG_biomarker_metadata, with the desired group
# order, to set the order of biomarkers in the forestplot later on
df_with_groups <-
  df_NG_biomarker_metadata %>%
  select(name, group) %>%
  filter(group %in% group_order) %>%
  arrange(factor(group, levels = group_order))


# right_join, with the grouping table on the right, preserves the biomarker
# order it specifies; rows with any missing value are then dropped
df_to_plot <-
  df_assoc_per_biomarker_bmi %>%
  dplyr::right_join(df_with_groups, by = "name") %>%
  arrange(factor(group, levels = group_order))

df_to_plot <- na.omit(df_to_plot)

# hsCRP is appended by hand with its own label and group. The row is looked up
# by identifier instead of the hard-coded position 251, which depends on the
# number and order of rows returned by the regression step.
hscrp <- df_assoc_per_biomarker_bmi[which(df_assoc_per_biomarker_bmi$biomarkerid == "hsCRP"), ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)


# Panel order of the figure; groups not listed here sort last
panel_order <- c(
  "Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins",
  "Glycerides and phospholipids", "Cholesterol",
  "Glycolysis related metabolites", "Lipoprotein particle sizes",
  "Fluid balance", "Inflammation"
)
df_to_plot <- df_to_plot %>%
  arrange(match(group, panel_order))


# Exclude lipoprotein subclasses, glucose, lactate and percentage ("_pct")
# measures from the plotted set
df_linear <- df_to_plot[(df_to_plot$group != "Lipoprotein subclasses"), ]
df_linear <- df_linear[(df_linear$name != "Glucose"), ]
df_linear <- df_linear[(df_linear$name != "Lactate"), ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() instead of 1:nrow() so an empty table does not raise an error
df_linear$rownumber <- seq_len(nrow(df_linear))


df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# One forest plot per biomarker group, arranged in two columns
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  mutate(
    # Apply forestplot to each group
    gg_groups = purrr::map2(
      data, group,
      ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        xlab = "Difference in 6-month biomarker (SD units) for each infection from birth to 6 months",
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.15, 0.15)) +
        ggplot2::scale_color_brewer(palette = "Dark2") +
        ggplot2::theme(
          plot.title = element_text(size = 9),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 8),
          axis.text.y = element_text(size = 8),
          legend.text = element_text(size = 8)
        )
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(
          gg_groups,
          ~ . +
            theme(
              axis.text.x = element_blank(),
              axis.title.x = element_blank(),
              axis.ticks.x = element_blank(),
              plot.margin = unit(c(1, 2, 1, 2), "mm")
            ) +
            ggplot2::theme(legend.position = "none")
        ),
      no = gg_groups
    ),
    # Number of rows plotted in each group
    rel_heights = purrr::map_int(data, nrow)
  )

fig3_figsupp1a <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
fig3_figsupp1a

# Pass the composite explicitly: ggsave() otherwise saves last_plot(), which is
# only the composite if the figure above was actually printed (it is not when
# the script is run through source() without echo).
ggsave("Figures/BIS_eLife_Fig3_FigSupp1A.png", plot = fig3_figsupp1a, width = 6, height = 6)



###### Bootstrapping ############
# Bootstrap input: participants with infection data; one BCa interval is
# computed for every biomarker of the preceding regression table
df_nmr_results_alt_names_infection <-
  df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo), ]
n <- df_assoc_per_biomarker_bmi$biomarkerid

library(boot)

set.seed(12345)


bootstrap_bca_cis <- data.frame(
  biomarkerid = character(),
  estimate = double(),
  l.ci = double(),
  u.ci = double(),
  stringsAsFactors = FALSE
)
regression <- bootstrap_bca_cis

for (i in n) {

  # Log-transform and standardise the biomarker in place
  df_nmr_results_alt_names_infection[[i]] <-
    as.numeric(scale(log(df_nmr_results_alt_names_infection[[i]])))

  # Built once per biomarker rather than once per bootstrap replicate
  boot_formula <- as.formula(paste(i, "~ anyinfection4wk6mo + age_interview_m6 + sex + z_scores_birth + hhincome + bfany + mth6CollectiontoStorage_hours + anympsmoke + gestage"))

  # Statistic for boot(): refit the linear model on the resampled rows and
  # return its coefficient vector. `maxit` is accepted because boot() below
  # passes it, but it is no longer forwarded to lm(): lm() has no such
  # argument, so it was disregarded with a warning on every replicate.
  boot.huber <- function(data, indices, maxit = 20) {
    data <- data[indices, ] # select obs. in bootstrap sample
    mod <- lm(boot_formula, data = data)
    coefficients(mod) # return coefficient vector
  }


  duncan.boot <- boot(df_nmr_results_alt_names_infection, boot.huber, 1000, maxit = 100)

  # BCa interval for the second coefficient (the infection term)
  ci_boot_bca <- boot.ci(duncan.boot, index = 2, type = "bca")

  regression[1, "biomarkerid"] <- paste(i)
  # Mean of the bootstrap replicates of the infection coefficient
  regression[1, "estimate"] <- mean(duncan.boot$t[, 2])
  regression[1, "l.ci"] <- ci_boot_bca$bca[4]
  regression[1, "u.ci"] <- ci_boot_bca$bca[5]

  bootstrap_bca_cis <- rbind(bootstrap_bca_cis, regression)

}

bootstrap_infection_metab_bfany <- bootstrap_bca_cis
write.csv(bootstrap_infection_metab_bfany, "BIS_eLife_Figure3_SourceData2A_bootstrap.csv")

############# GlycA - Figure 3-figure supplement 1B #####################

df_nmr_results_alt_names$gp_exposure <- log(df_nmr_results_alt_names$GlycA)
df_nmr_results_alt_names$gp_exposure <- as.numeric(scale(df_nmr_results_alt_names$gp_exposure))



df_nmr_results_alt_names_trimmed <- df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk6mo), ]

# Select only variables to be used for the model and collapse to a long data
# format
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  # Select only model variables
  dplyr::select(nmr_biomarkers, gp_exposure, age_interview_m6, sex, z_scores_birth, hhincome, bfany, mth6CollectiontoStorage_hours, anympsmoke, gestage) %>%
  # log-tranform biomarkers
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = dplyr::funs(log(.))) %>%
  # Scale biomarkers
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = dplyr::funs(as.numeric(scale(.)))) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

df_long <- na.omit(df_long)

# Fit one linear model per biomarker: standardised log biomarker regressed on
# standardised log GlycA (gp_exposure) plus the covariates in the formula.
# (The "_bmi" suffix of the result name is kept as-is; the predictor reported
# here is gp_exposure.)
df_assoc_per_biomarker_bmi <- ggforestplot::discovery_regression(
  df_long = df_long,
  model = "lm",
  formula =
    formula(
      biomarkervalue ~ gp_exposure + age_interview_m6 + sex + z_scores_birth + hhincome + bfany + mth6CollectiontoStorage_hours + anympsmoke + gestage
    ),
  key = biomarkerid,
  predictor = gp_exposure
)

# Lookup from machine-readable biomarker id to display name, used to attach a
# different biomarker naming option to the results.
biomarker_display_names <- dplyr::select(
  df_NG_biomarker_metadata,
  name,
  biomarkerid = machine_readable_name
)
df_assoc_per_biomarker_bmi <- dplyr::left_join(
  df_assoc_per_biomarker_bmi,
  biomarker_display_names,
  by = "biomarkerid"
)

# Benjamini-Hochberg adjustment across all fitted biomarkers.
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(
  df_assoc_per_biomarker_bmi$pvalue,
  method = "BH"
)

write.csv(df_assoc_per_biomarker_bmi, "BIS_eLife_Fig3_SourceData2B.csv")

# Stored positionally: assumes the rows are in the same order as
# beta.comparison.NMR -- TODO confirm.
beta.comparison.NMR$glyca.beta <- df_assoc_per_biomarker_bmi$estimate

# Attach group labels. df_with_groups sits on the right of the right_join so
# that the biomarker order it specifies is preserved.
df_to_plot <- dplyr::right_join(
  df_assoc_per_biomarker_bmi,
  df_with_groups,
  by = "name"
)
df_to_plot <- na.omit(df_to_plot)

# Row 251 of the association table is relabelled as C-reactive protein and
# placed in the "Inflammation" group (presumably the hsCRP row -- verify the
# index if the biomarker list changes).
hscrp <- df_assoc_per_biomarker_bmi[251, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

# GlycA is the exposure in this section, so drop it as an outcome and append
# the relabelled row instead.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "GlycA", ]
df_to_plot <- rbind(df_to_plot, hscrp)


# Full list of biomarker group labels. It is not used by the ordering in this
# block, which relies on its own shorter list below.
group_order <- c(
  "Branched-chain amino acids",
  "Aromatic amino acids",
  "Amino acids",
  "Fluid balance",
  "Fatty acids",
  "Apolipoproteins",
  "Glycerides and phospholipids",
  "Cholesterol",
  "Glycolysis related metabolites",
  "Ketone bodies",
  "Lipoprotein subclasses",
  "Lipoprotein particle sizes",
  "Relative lipoprotein lipid concentrations",
  "Inflammation"
)

# Order rows by the position of their group in this list; groups that are not
# listed get NA from match() and are sorted last by arrange().
panel_group_order <- c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")
df_to_plot <- df_to_plot %>%
  arrange(match(group, panel_group_order))

# Rows kept for the forest plot: no lipoprotein subclasses, no glucose or
# lactate, and no biomarker ids containing "_pct".
df_linear <- df_to_plot[df_to_plot$group != "Lipoprotein subclasses", ]
df_linear <- df_linear[df_linear$name != "Glucose", ]
df_linear <- df_linear[df_linear$name != "Lactate", ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() instead of 1:nrow() so an empty table gets an empty index rather
# than c(1, 0).
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "GlycA"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Build one forest plot per biomarker group, then combine the panels.
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        shape = exposure,
        xlab = "Difference in 6-month biomarker (SD units) per SD increase in 6-month log GlycA",
        # No trailing comma after the last argument: it would pass an extra
        # empty argument to forestplot().
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.65, 0.65)) +
        ggplot2::scale_shape_manual(
          values = c(22L),
          labels = c("GlycA")) + ggplot2::scale_color_brewer(palette="Dark2") + ggplot2::theme(plot.title = element_text(size=9), axis.title.x=element_text(size=8), axis.text.x=element_text(size=8), axis.text.y=element_text(size=8),  legend.text=element_text(size=8)) +
        ggplot2::theme(legend.position = "none")
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Number of biomarkers per panel (computed for reference; the layout below
    # uses fixed heights).
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Combine the panels in two columns with fixed relative row heights.
glyca_forest_figure <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
# Display the combined figure when the script is run interactively.
glyca_forest_figure

# Pass the figure explicitly so the saved file does not depend on whatever
# ggplot2::last_plot() happens to hold.
ggsave("Figures/BIS_eLife_Fig3_FigSupp1B.png", plot = glyca_forest_figure, width=6.5, height=6)



###### Bootstrapping ############
# NOTE(review): this subset uses anyinfection4wk12mo, whereas the regression
# earlier in this section subsets on anyinfection4wk6mo -- confirm this is the
# intended sample.
df_nmr_results_alt_names_infection <- df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo),]


library(boot)

set.seed(12345)


# Zero-row template that fixes the column names and types of the result table.
bootstrap_bca_cis <-  data.frame(biomarkerid=character(),
                                 estimate=double(),
                                 l.ci=double(),
                                 u.ci=double(),
                                 stringsAsFactors=FALSE)
regression <- bootstrap_bca_cis

# One-row results are collected here and bound once after the loop instead of
# growing the data frame with rbind() on every iteration.
bootstrap_rows <- list()

# Element 82 of n is skipped (presumably GlycA itself, which is the exposure
# here -- verify the index if the biomarker list changes).
for(i in n[-82]){
  
  # Log-transform and standardise the outcome column in place.
  df_nmr_results_alt_names_infection[[i]] <- log(df_nmr_results_alt_names_infection[[i]])
  df_nmr_results_alt_names_infection[[i]] <- as.numeric(scale(df_nmr_results_alt_names_infection[[i]]))
  
  # Statistic for boot(): refit the adjusted linear model on the resampled
  # rows and return the coefficient vector.
  # `maxit` stays in the signature because boot() below passes maxit = 100,
  # but it is no longer forwarded to lm(): lm() has no such argument and only
  # emitted an "extra argument will be disregarded" warning on every refit.
  boot.huber <- function(data, indices, maxit=20){
    data <- data[indices,] # select obs. in bootstrap sample
    mod <- lm(as.formula(paste(i,"~ gp_exposure + age_interview_m6 + sex + z_scores_birth + hhincome + bfany + mth6CollectiontoStorage_hours + anympsmoke + gestage")), data=data)
    coefficients(mod) # return coefficient vector
  }
  
  
  # 1000 bootstrap replicates of the full coefficient vector.
  duncan.boot <- boot(df_nmr_results_alt_names_infection, boot.huber, 1000, maxit=100)

  # BCa interval for coefficient 2, i.e. the gp_exposure term (first term after
  # the intercept).
  ci_boot_bca <- boot.ci(duncan.boot, index=2, type="bca")
  
  regression[1,1] <- paste(i)
  # Point estimate reported is the mean of the bootstrap replicates.
  regression[1,2] <- mean(duncan.boot$t[,2])
  # Elements 4 and 5 of the bca vector are the lower and upper limits.
  regression[1,3] <- ci_boot_bca$bca[4]
  regression[1,4] <- ci_boot_bca$bca[5]
  
  bootstrap_rows[[length(bootstrap_rows) + 1L]] <- regression
  
}

# Bind the template and all collected rows in a single call.
bootstrap_bca_cis <- do.call(rbind, c(list(bootstrap_bca_cis), bootstrap_rows))

bootstrap_GlycA_metab_bfany_infectionsampleonly <- bootstrap_bca_cis
write.csv(bootstrap_GlycA_metab_bfany_infectionsampleonly, "BIS_eLife_Figure3_SourceData2B_bootstrap.csv")


############# hsCRP - Figure 4-figure supplement 1A #####################


# Exposure for this section: log-transformed hsCRP, standardised to SD units.
df_nmr_results_alt_names$hscrp_exposure <- log(df_nmr_results_alt_names$hsCRP)
df_nmr_results_alt_names$hscrp_exposure <- as.numeric(scale(df_nmr_results_alt_names$hscrp_exposure))


# Keep only rows where anyinfection4wk6mo is not missing.
df_nmr_results_alt_names_trimmed <- df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk6mo), ]


# Select only variables to be used for the model and collapse to a long data
# format (one row per participant x biomarker).
df_long <-
  df_nmr_results_alt_names_trimmed %>%
  # Select only model variables
  dplyr::select(nmr_biomarkers, hscrp_exposure, age_interview_m6, sex, z_scores_birth, hhincome, bfany, mth6CollectiontoStorage_hours, anympsmoke, gestage) %>%
  # Log-transform biomarkers (formula lambda replaces the deprecated
  # dplyr::funs(); the columns are still overwritten in place)
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = ~ log(.)) %>%
  # Scale biomarkers to mean 0 / SD 1
  dplyr::mutate_at(.vars = c(nmr_biomarkers), .funs = ~ as.numeric(scale(.))) %>%
  # Collapse to a long data format
  tidyr::gather(key = biomarkerid, value = biomarkervalue, nmr_biomarkers)

# Complete-case analysis: drop every long-format row with any missing value.
df_long <- na.omit(df_long)

# Fit one linear model per biomarker: standardised log biomarker regressed on
# standardised log hsCRP (hscrp_exposure) plus the covariates in the formula.
# (The "_bmi" suffix of the result name is kept as-is; the predictor reported
# here is hscrp_exposure.)
df_assoc_per_biomarker_bmi <- ggforestplot::discovery_regression(
  df_long = df_long,
  model = "lm",
  formula =
    formula(
      biomarkervalue ~ hscrp_exposure + age_interview_m6 + sex + z_scores_birth + hhincome + bfany + mth6CollectiontoStorage_hours + anympsmoke + gestage
    ),
  key = biomarkerid,
  predictor = hscrp_exposure
)

# Lookup from machine-readable biomarker id to display name, used to attach a
# different biomarker naming option to the results.
biomarker_display_names <- dplyr::select(
  df_NG_biomarker_metadata,
  name,
  biomarkerid = machine_readable_name
)
df_assoc_per_biomarker_bmi <- dplyr::left_join(
  df_assoc_per_biomarker_bmi,
  biomarker_display_names,
  by = "biomarkerid"
)

# Benjamini-Hochberg adjustment across all fitted biomarkers.
df_assoc_per_biomarker_bmi$adj_pvalue <- p.adjust(
  df_assoc_per_biomarker_bmi$pvalue,
  method = "BH"
)

write.csv(df_assoc_per_biomarker_bmi, "BIS_eLife_Fig4_SourceData2.csv")

# Stored positionally: assumes the rows are in the same order as
# beta.comparison.NMR -- TODO confirm.
beta.comparison.NMR$hscrp.beta <- df_assoc_per_biomarker_bmi$estimate

# Attach group labels. df_with_groups sits on the right of the right_join so
# that the biomarker order it specifies is preserved.
df_to_plot <- dplyr::right_join(
  df_assoc_per_biomarker_bmi,
  df_with_groups,
  by = "name"
)
df_to_plot <- na.omit(df_to_plot)

# Row 251 of the association table is relabelled as C-reactive protein and
# placed in the "Inflammation" group.
hscrp <- df_assoc_per_biomarker_bmi[251, ]
hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)

# hsCRP is the exposure in this section, so it is dropped as an outcome.
# NOTE(review): if row 251 above is the hsCRP row, the row that was just
# appended is removed again here -- confirm this is intended.
df_to_plot <- df_to_plot[df_to_plot$biomarkerid != "hsCRP", ]

# Full list of biomarker group labels. It is not used by the ordering in this
# block, which relies on its own shorter list below.
group_order <- c(
  "Branched-chain amino acids",
  "Aromatic amino acids",
  "Amino acids",
  "Fluid balance",
  "Fatty acids",
  "Apolipoproteins",
  "Glycerides and phospholipids",
  "Cholesterol",
  "Glycolysis related metabolites",
  "Ketone bodies",
  "Lipoprotein subclasses",
  "Lipoprotein particle sizes",
  "Relative lipoprotein lipid concentrations",
  "Inflammation"
)

# Order rows by the position of their group in this list; groups that are not
# listed get NA from match() and are sorted last by arrange().
panel_group_order <- c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")
df_to_plot <- df_to_plot %>%
  arrange(match(group, panel_group_order))

# Rows kept for the forest plot: no lipoprotein subclasses, no glucose or
# lactate, and no biomarker ids containing "_pct".
df_linear <- df_to_plot[df_to_plot$group != "Lipoprotein subclasses", ]
df_linear <- df_linear[df_linear$name != "Glucose", ]
df_linear <- df_linear[df_linear$name != "Lactate", ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() instead of 1:nrow() so an empty table gets an empty index rather
# than c(1, 0).
df_linear$rownumber <- seq_len(nrow(df_linear))
df_linear$exposure <- "hsCRP"

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Build one forest plot per biomarker group, then combine the panels.
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj_pvalue,
        psignif = 0.05,
        shape = exposure,
        xlab = "Difference in 6-month biomarker (SD units) per SD increase in 6-month log hsCRP",
        # No trailing comma after the last argument: it would pass an extra
        # empty argument to forestplot().
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.65, 0.65)) +
        ggplot2::scale_shape_manual(
          values = c(23L),
          labels = c("hsCRP")) + ggplot2::scale_color_brewer(palette="Dark2") + ggplot2::theme(plot.title = element_text(size=9), axis.title.x=element_text(size=8), axis.text.x=element_text(size=8), axis.text.y=element_text(size=8),  legend.text=element_text(size=8)) +
        ggplot2::theme(legend.position = "none")
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Number of biomarkers per panel (computed for reference; the layout below
    # uses fixed heights).
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Combine the panels in two columns with fixed relative row heights.
hscrp_forest_figure <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
# Display the combined figure when the script is run interactively.
hscrp_forest_figure

# Pass the figure explicitly so the saved file does not depend on whatever
# ggplot2::last_plot() happens to hold.
ggsave("Figures/BIS_eLife_Fig4_FigSupp1A.png", plot = hscrp_forest_figure, width=6.5, height=6)

###### Bootstrapping ############
# NOTE(review): this subset uses anyinfection4wk12mo, whereas the regression
# earlier in this section subsets on anyinfection4wk6mo -- confirm this is the
# intended sample.
df_nmr_results_alt_names_infection <- df_nmr_results_alt_names[!is.na(df_nmr_results_alt_names$anyinfection4wk12mo),]


library(boot)

set.seed(12345)


# Zero-row template that fixes the column names and types of the result table.
bootstrap_bca_cis <-  data.frame(biomarkerid=character(),
                                 estimate=double(),
                                 l.ci=double(),
                                 u.ci=double(),
                                 stringsAsFactors=FALSE)
regression <- bootstrap_bca_cis

# One-row results are collected here and bound once after the loop instead of
# growing the data frame with rbind() on every iteration.
bootstrap_rows <- list()

# Element 251 of n is skipped (presumably hsCRP itself, which is the exposure
# here -- verify the index if the biomarker list changes).
for(i in n[-251]){
  
  # Log-transform and standardise the outcome column in place.
  df_nmr_results_alt_names_infection[[i]] <- log(df_nmr_results_alt_names_infection[[i]])
  df_nmr_results_alt_names_infection[[i]] <- as.numeric(scale(df_nmr_results_alt_names_infection[[i]]))
  
  # Statistic for boot(): refit the adjusted linear model on the resampled
  # rows and return the coefficient vector.
  # `maxit` stays in the signature because boot() below passes maxit = 100,
  # but it is no longer forwarded to lm(): lm() has no such argument and only
  # emitted an "extra argument will be disregarded" warning on every refit.
  boot.huber <- function(data, indices, maxit=20){
    data <- data[indices,] # select obs. in bootstrap sample
    mod <- lm(as.formula(paste(i,"~ hscrp_exposure + age_interview_m6 + sex + z_scores_birth + hhincome + bfany + mth6CollectiontoStorage_hours + anympsmoke + gestage")), data=data)
    coefficients(mod) # return coefficient vector
  }
  
  
  # 1000 bootstrap replicates of the full coefficient vector.
  duncan.boot <- boot(df_nmr_results_alt_names_infection, boot.huber, 1000, maxit=100)

  # BCa interval for coefficient 2, i.e. the hscrp_exposure term (first term
  # after the intercept).
  ci_boot_bca <- boot.ci(duncan.boot, index=2, type="bca")
  
  regression[1,1] <- paste(i)
  # Point estimate reported is the mean of the bootstrap replicates.
  regression[1,2] <- mean(duncan.boot$t[,2])
  # Elements 4 and 5 of the bca vector are the lower and upper limits.
  regression[1,3] <- ci_boot_bca$bca[4]
  regression[1,4] <- ci_boot_bca$bca[5]
  
  bootstrap_rows[[length(bootstrap_rows) + 1L]] <- regression
  
}

# Bind the template and all collected rows in a single call.
bootstrap_bca_cis <- do.call(rbind, c(list(bootstrap_bca_cis), bootstrap_rows))

bootstrap_hsCRP_metab_bfany_infectionsampleonly <- bootstrap_bca_cis
write.csv(bootstrap_hsCRP_metab_bfany_infectionsampleonly, "BIS_eLife_Figure4_SourceData2_bootstrap.csv")


###### Scatterplot of 6-month metabolomic difference correlation - Figure 3-fig supp 1C, 4-fig supp 1B, 4-fig supp 1C #####


# Keep rows 1-81 and 83-250 of the beta comparison table, i.e. skip row 82 and
# everything from row 251 on (presumably the GlycA and hsCRP rows -- verify if
# the biomarker list changes).
beta.comparison.NMR_noinflam <- beta.comparison.NMR[(1:250)[-82], ]


# Infection beta vs GlycA beta.
# Add the regression line without the confidence interval
q <- ggplot(
  data = beta.comparison.NMR_noinflam,
  mapping = aes(x = infection.beta, y = glyca.beta)
) +
  geom_point(shape = 15, color = "darkgrey") +
  geom_smooth(method = lm, se = FALSE, color = "black") +
  theme_classic() +
  labs(
    x = "Infection Beta (per 1 infection)",
    y = "GlycA Beta (per SD log GlycA)"
  )

ggsave(
  filename = "Figures/BIS_eLife_Figure3_FigSupp1C.png",
  plot = q,
  width = 3,
  height = 3,
  dpi = 600
)

cor.test(beta.comparison.NMR_noinflam$infection.beta, beta.comparison.NMR_noinflam$glyca.beta)

# Infection beta vs hsCRP beta.
q <- ggplot(
  data = beta.comparison.NMR_noinflam,
  mapping = aes(x = infection.beta, y = hscrp.beta)
) +
  geom_point(shape = 18, color = "darkgrey") +
  geom_smooth(method = lm, se = FALSE, color = "black") +
  theme_classic() +
  labs(
    x = "Infection Beta (per 1 infection)",
    y = "hsCRP Beta (per SD log hsCRP)"
  )

ggsave(
  filename = "Figures/BIS_eLife_Figure4_FigSupp1B.png",
  plot = q,
  width = 3,
  height = 3,
  dpi = 600
)

cor.test(beta.comparison.NMR_noinflam$infection.beta, beta.comparison.NMR_noinflam$hscrp.beta)

# GlycA beta vs hsCRP beta.
q <- ggplot(
  data = beta.comparison.NMR_noinflam,
  mapping = aes(x = glyca.beta, y = hscrp.beta)
) +
  geom_point(shape = 18, color = "darkgrey") +
  geom_smooth(method = lm, se = FALSE, color = "black") +
  theme_classic() +
  labs(
    x = "GlycA Beta (per SD log GlycA)",
    y = "hsCRP Beta (per SD log hsCRP)"
  )

ggsave(
  filename = "Figures/BIS_eLife_Figure4_FigSupp1C.png",
  plot = q,
  width = 3,
  height = 3,
  dpi = 600
)

cor.test(beta.comparison.NMR_noinflam$glyca.beta, beta.comparison.NMR_noinflam$hscrp.beta)




#### Reverse causality model - 6m metabs predicting infections from 6m to 12m - Supp File 3C ####


# Outcome: sum of the 9-month and 12-month infection variables.
df_nmr_results_alt_names$anyinfection9mo12mo <- df_nmr_results_alt_names$anyinfection9mo + df_nmr_results_alt_names$anyinfection12mo
# Adjustment variable: sum of the 4-week, 3-month and 6-month infection
# variables.
df_nmr_results_alt_names$anyinfection4wk6mo <- df_nmr_results_alt_names$anyinfection4wk + df_nmr_results_alt_names$anyinfection3mo + df_nmr_results_alt_names$anyinfection6mo

# Working copy whose biomarker columns are transformed in place by the loop.
df_nmr_results_alt_names_reverse <- df_nmr_results_alt_names

# Zero-row template that fixes the column names and types of the result table.
birthmetab_infections_infadj <- data.frame(estimate=double(),
                                           se=double(),
                                           p.value=double(),
                                           adj.p.value=double(),
                                           biomarkerid=character(),
                                           stringsAsFactors=FALSE)
regression <- birthmetab_infections_infadj

# One-row results are collected here and bound once after the loop instead of
# growing the data frame with rbind() on every iteration.
reverse_rows <- list()

for(i in nmr_biomarkers){
  # Log-transform and standardise the biomarker column in place.
  df_nmr_results_alt_names_reverse[[i]] <- log(df_nmr_results_alt_names_reverse[[i]])
  df_nmr_results_alt_names_reverse[[i]] <- as.numeric(scale(df_nmr_results_alt_names_reverse[[i]]))

  # Quasi-Poisson model of the 9-12 month infection sum on the biomarker,
  # adjusted for the earlier infection sum and the listed covariates.
  df_regression <- glm(as.formula(paste0("anyinfection9mo12mo ~ ",i,"+ anyinfection4wk6mo + age_interview_m6 + age_interview_m12 + sex + z_scores_birth + hhincome + bfany + gestage + anympsmoke + mth6CollectiontoStorage_hours")), family = quasipoisson(link = "log"), data=df_nmr_results_alt_names_reverse)

  # Compute the coefficient table once; row 2 is the biomarker term (first
  # term after the intercept).
  coef_table <- coef(summary(df_regression))

  regression[1,1] <- coef_table[2, 1]
  regression[1,2] <- coef_table[2, 2]
  # Column 4 is the p-value; adj.p.value gets the raw p-value as a placeholder
  # and is overwritten by p.adjust() after the loop.
  regression[1,3:4] <- coef_table[2, 4]
  regression[1,5] <- paste(i)

  reverse_rows[[length(reverse_rows) + 1L]] <- regression
}

# Bind the template and all collected rows in a single call.
birthmetab_infections_infadj <- do.call(rbind, c(list(birthmetab_infections_infadj), reverse_rows))

# Benjamini-Hochberg adjustment across all biomarkers.
birthmetab_infections_infadj$adj.p.value <- p.adjust(birthmetab_infections_infadj$p.value, method = "BH")


# Attach display names from the Nightingale biomarker metadata.
metadata_display_names <- dplyr::select(
  df_NG_biomarker_metadata,
  name,
  biomarkerid = machine_readable_name
)
birthmetab_infections_names <- dplyr::left_join(
  birthmetab_infections_infadj,
  metadata_display_names,
  by = "biomarkerid"
)

# Row 251, column 6 (the joined `name` column) is labelled as C-reactive
# protein (presumably the hsCRP row -- verify the index if the biomarker list
# changes).
birthmetab_infections_names[251,6] <- "C-reactive protein*"

# Attach group labels. df_with_groups sits on the right of the right_join so
# that the biomarker order it specifies is preserved. The rows are then sorted
# by group_order, which at this point holds the value assigned earlier in the
# script.
df_to_plot <- dplyr::right_join(
  birthmetab_infections_names,
  df_with_groups,
  by = "name"
)
df_to_plot <- arrange(df_to_plot, factor(group, levels = group_order))

df_to_plot <- na.omit(df_to_plot)


# Append row 251 of the results as a separately labelled "Inflammation" entry.
hscrp <- birthmetab_infections_infadj[251, ]

hscrp$name <- "C-reactive protein*"
hscrp$group <- "Inflammation"

df_to_plot <- rbind(df_to_plot, hscrp)



# Full list of biomarker group labels. It is not used by the ordering in this
# block, which relies on its own shorter list below.
group_order <- c(
  "Branched-chain amino acids",
  "Aromatic amino acids",
  "Amino acids",
  "Fluid balance",
  "Fatty acids",
  "Apolipoproteins",
  "Glycerides and phospholipids",
  "Cholesterol",
  "Glycolysis related metabolites",
  "Ketone bodies",
  "Lipoprotein subclasses",
  "Lipoprotein particle sizes",
  "Relative lipoprotein lipid concentrations",
  "Inflammation"
)

# Order rows by the position of their group in this list; groups that are not
# listed get NA from match() and are sorted last by arrange().
panel_group_order <- c("Amino acids", "Fatty acids", "Ketone bodies", "Apolipoproteins", "Glycerides and phospholipids", "Cholesterol", "Glycolysis related metabolites", "Lipoprotein particle sizes", "Fluid balance", "Inflammation")
df_to_plot <- df_to_plot %>%
  arrange(match(group, panel_group_order))

# Rows kept for the forest plot: no lipoprotein subclasses, no glucose or
# lactate, and no biomarker ids containing "_pct".
df_linear <- df_to_plot[df_to_plot$group != "Lipoprotein subclasses", ]
df_linear <- df_linear[df_linear$name != "Glucose", ]
df_linear <- df_linear[df_linear$name != "Lactate", ]
df_linear <- df_linear[!grepl("_pct", df_linear$biomarkerid), ]

# seq_len() instead of 1:nrow() so an empty table gets an empty index rather
# than c(1, 0).
df_linear$rownumber <- seq_len(nrow(df_linear))

df_linear <-
  df_linear %>%
  dplyr::arrange(rownumber)

# Build one forest plot per biomarker group, then combine the panels.
ggplot_multi <-
  df_linear %>%
  nest(-group, .key = "data") %>%
  # Apply forestplot to each group
  mutate(
    gg_groups = purrr::map2(
      data, group, ~ forestplot(
        df = .x,
        pvalue = adj.p.value,
        psignif = 0.05,
        xlab = "Estimated difference in number of infections from 6m to 12 months\nper 1 SD of log metabolite at 6m",
        # No trailing comma after the last argument: it would pass an extra
        # empty argument to forestplot().
        title = .y
      ) +
        ggplot2::coord_cartesian(xlim = c(-0.17, 0.17)) + ggplot2::scale_color_brewer(palette="Dark2") + ggplot2::theme(plot.title = element_text(size=9), axis.title.x=element_text(size=8), axis.text.x=element_text(size=8), axis.text.y=element_text(size=8),  legend.text=element_text(size=8))
    ),
    # Optional: remove x-axis and legend for all plots except the bottom one
    gg_groups = ifelse(
      test = row_number() != n(),
      yes =
        purrr::map(gg_groups, ~ . +
                     theme(
                       axis.text.x = element_blank(),
                       axis.title.x = element_blank(),
                       axis.ticks.x = element_blank(),
                       plot.margin = unit(c(1, 2, 1, 2), "mm")
                     ) +
                     ggplot2::theme(legend.position = "none")),
      no = gg_groups
    ),
    # Number of biomarkers per panel (computed for reference; the layout below
    # uses fixed heights).
    rel_heights = purrr::map(
      data,  ~ nrow(.)
    ) %>% unlist()
  )

# Combine the panels in two columns with fixed relative row heights.
reverse_forest_figure <- patchwork::wrap_plots(
  ggplot_multi$gg_groups,
  ncol = 2,
  heights = c(9, 3, 9, 3, 2)
)
# Display the combined figure when the script is run interactively.
reverse_forest_figure


# Pass the figure explicitly so the saved file does not depend on whatever
# ggplot2::last_plot() happens to hold.
ggsave("Figures/BIS_eLife_SuppFile3C.png", plot = reverse_forest_figure, width=6, height=8)
