################################################################################
# This code fits a liner regression model to estimate the relationship between #
# probit-transformed prevalence of underweight, obesity and severe obesity and #
# mean BMI, separately for each of these prevalences. The prevalence of        #
# underweight, obesity or severe obesity depends on population mean BMI as     #
# well as age group in 10-year bands, region, and the year when the data were  #
# collected. All analyses were done separately for men and women. Regressions  #
# also included interactions between mean BMI and age group, mean BMI and      #
# region, age group and region, age group and year, and year and region. The   #
# mean BMI values were from a recent comprehensive analysis of worldwide       #
# trends in mean BMI published in "Worldwide trends in body-mass index,        #
# underweight, overweight, and obesity from 1975 to 2016: a pooled analysis of #
# 2416 population-based measurement studies in 128.9 million children,         #
# adolescents, and adults.", Lancet 2017 (available for download at            #
# www.ncdrisc.org).
################################################################################


################################################################################
# We have chosen to use a line width of more than 80 characters for some of    #
# the code, so that the whole of particular sections of the code can be seen   #
# at the same time. On our laptops and desktops the code displays most clearly #
# in Windows using Programmer's Notepad, and in Linux using gedit.             #
################################################################################

##### FUNCTIONS ################################################################

generate_model <-                                                                     
  # the generate_model function generates a model for each prevalence
  
  function(df, sex_sub, tail_BMI) {    
# Inputs are as follows:                                                       #
# - df = the dataframe of input, it can be either data_underweight,            #
#   data_obesity or data_severeObesity;                                        #
# - sex_sub = to choose whih sex to subset to run the analysis, either "Male"  #
#   or "Female";                                                               #
# - tail_BMI = to choose which prevalence to run the analsyis, it can be       #
#   "underweight", "obesity" or "severeObesity"                                #
    
    if (!tail_BMI %in% c("underweight", "obesity", "severeObesity")) {                # avoid spelling mistakes for tail_BMI
      stop(
        "your tail_BMI must be one of the following options: 
        underweight, obesity, severeObesity"
      )
    }
    if (!sex_sub %in% c("Male", "Female")) {                                          # avoid spelling mistakes for sex_sub
      stop("make sure your gender is either Male or Female")
    }
    
    if (sex_sub == "Male") {                                                          # subset by sex for the analsys
      df <- subset(df, sex == "Male")
    } else {
      df <- subset(df, sex == "Female")
    }
    
    if (tail_BMI == "underweight") {                                                  # select the prevalence "underweight" to regress
      
      hist(df$prev_bmi_185less)                                                       # plot distribution of "prev_bmi_185less" (prevalence of individuals with BMI > 18.5)
      df$underweight <- qnorm(df$prev_bmi_185less)                                    # normalise the distribution and assign it to "underweight"
      hist(df$underweight)                                                            # plot normalised distribution of underweight
      df <- df[!is.na(df$prev_bmi_185less) & !is.na(df$mean_bmi), ]                   # drop any NA values in either prevalence or mean
      df$tail_BMI <- df$underweight                                                   # assign "underweight" to target variable "tail_BMI"
      
    } else if (tail_BMI == "obesity") {                                               # select the prevalence "obesity" to regress
      
      hist(df$prev_bmi_30plus)                                                        # plot distribution of "prev_bmi_30plus" (prevalence of individuals with BMI > 30.0)
      df$obesity <- qnorm(df$prev_bmi_30plus)                                         # normalise the distribution and assign it to "obesity"
      hist(df$obesity)                                                                # plot normalised distribution of obesity
      df <- df[!is.na(df$prev_bmi_30plus) & !is.na(df$mean_bmi), ]                    # drop any NA values in either prevalence or mean
      df$tail_BMI <- df$obesity                                                       # assign "obesity" to target variable "tail_BMI"
      
    } else if (tail_BMI == "severeObesity") {                                         # select the prevalence "severeObesity" to regress
      
      hist(df$prev_bmi_35plus)                                                        # plot distribution of "prev_bmi_35plus" (prevalence of individuals with BMI > 35.0)
      df$severeObesity <- qnorm(df$prev_bmi_35plus)                                   # normalise the distribution and assign it to "severeObesity"
      hist(df$severeObesity)                                                          # plot normalised distribution of severeObesity
      df <- df[!is.na(df$prev_bmi_35plus) & !is.na(df$mean_bmi), ]                    # drop any NA values in either prevalence or mean
      df$tail_BMI <- df$severeObesity                                                 # assign "severeObesity" to target variable "tail_BMI"
      
    }
    
    model_final <-
      lm(
        tail_BMI ~ mean_bmi_c + age_group + mid_year_c + Superregion +                # target variable of the model is "tail_BMI",
          # which has been previously assigned to either
          # "underweight", "obesity" or "severeObesity".
          # This prevalence depends on mean BMI (mean_bmi_c),
          # age group (age_group), year (mid_year_c) and 
          # region (Superregion), as well as interations between:
          mean_bmi_c * Superregion +                                                  # mean BMI and region
          age_group * Superregion +                                                   # age group and region
          mean_bmi_c * age_group +                                                    # mean BMI and age group
          mid_year_c * age_group +                                                    # year and age group
          mid_year_c * Superregion,                                                   # year and region
        data = df
      )
    print(summary(model_final))
    
    return(model_final)
  }

create_6models <-                                                                     
# the create_6models function takes the generate_model function above and      #
# generates a model for each prevalence/sex combination as specified in each   #
# sex_sub and tail_BMI argument.                                               #
  
  function (data_underweight, data_obesity, data_severeObesity) {
# the inputs are the 3 databases for each of the prevalences as specified in   #
# their names. The numerical parts of the variables names below respectively   #
# stand for 1 = "Male" and 2 = "Female".                                       #
    underweight_model_1   <- generate_model(data_underweight, 
                                            "Male", "underweight")
    underweight_model_2   <- generate_model(data_underweight, 
                                            "Female", "underweight")
    obesity_model_1       <- generate_model(data_obesity, 
                                            "Male", "obesity")
    obesity_model_2       <- generate_model(data_obesity, 
                                            "Female", "obesity")
    severeObesity_model_1 <- generate_model(data_severeObesity, 
                                            "Male", "severeObesity")
    severeObesity_model_2 <- generate_model(data_severeObesity, 
                                            "Female", "severeObesity")
    
    # collate all models in a single list
    list_models = list(                                                               
      underweight1 = underweight_model_1,
      underweight2 = underweight_model_2,
      obesity1 = obesity_model_1,
      obesity2 = obesity_model_2,
      severeObesity1 = severeObesity_model_1,
      severeObesity2 = severeObesity_model_2
    )
    return(list_models)
  }


generate_table <-  function(list_models, tail_BMI, data) {                            
# The generate_table function predicts the prevalence of anything specified in #
# "tail_BMI" and calculates the contribution of mean BMI. The inputs are:      #
# - list_models = a list of models to use, it must be the exact outcome of the #
#   create_6models function;                                                   #
# - tail_BMI = the target prevalence, "underweight", "obesity" or              #
#   "severeObesity";                                                           #
# - data = the data for the target prevalence, "data_underweight",             #
#   "data_obesity" or "data_severeObesity"                                     #
# The dataset below has estimates coming from a previous NCD-RisC publication  #
# and they can be requested directly to NCD-RisC. These are regional           #
# age-specific estimates by 10-year age bands.                                 #
  mean <-
    read.csv(
      "NCD_RisC_Lancet_2017_mean_bmi_age-specific_Superregion&Global.csv"
    )
  
  data_model <- mean[which(mean$Superregion %in% data$Superregion), ]
  
  # get regional mean bmi levels, by sex, age_group and year
  data_means <- data_model[data_model$year %in% c(1985:2016) &
                             data_model$Superregion != "Global" &
                             !data_model$age_band %in% c("18-19", "80+"),
                           c("Superregion", "year", "age_band",
                             "weight_mean", "sex")]
  # assign new names
  setnames(
    data_means,
    old = c("weight_mean", "age_band", "year"),
    new = c("weight_mean_bmi", "age_group", "mid_year")
  )
  
  # recode sex
  data_means$sex <- ifelse(data_means$sex == "m", "Male", "Female")
  
  # center mean BMI
  scaled_data           <- scale(data$mean_bmi)
  scaled_center         <- attributes(scaled_data)$`scaled:center`
  data_means$mean_bmi_c <- data_means$weight_mean_bmi - scaled_center
  
  # center time
  data_means$mid_year_c <- data_means$mid_year - 2000
  
  # pick model as specified in the prevalences of "tail_BMI"
  if (tail_BMI == "underweight") {
    model_m <- list_models$underweight1
    model_f <- list_models$underweight2
  } else if (tail_BMI == "obesity") {
    model_m <- list_models$obesity1
    model_f <- list_models$obesity2
  } else if (tail_BMI == "severeObesity") {
    model_m <- list_models$severeObesity1
    model_f <- list_models$severeObesity2
  }
  
  ### predict ###
  # predict prevalences
  data_means$prev <- NA
  data_means$prev[data_means$sex == "Female"] <-
    pnorm(predict(model_f, newdata = data_means[data_means$sex == "Female", ], 
                  allow.new.levels = FALSE))
  data_means$prev[data_means$sex == "Male"] <-
    pnorm(predict(model_m, newdata = data_means[data_means$sex == "Male", ], 
                  allow.new.levels = FALSE))
  
  # predicting prevalences using 1985 curves (thus fixing time)
  # we allow mean BMI for each age group and region to change, 
  # while keeping year fixed at 1985
  data_pred <- data_means
  data_pred$mid_year_c <- 1985 - 2000
  data_pred$prev <- NA
  data_pred$prev[data_pred$sex == "Female"] <-
    pnorm(predict(model_f, newdata = data_pred[data_pred$sex == "Female", ], 
                  allow.new.levels = FALSE))
  data_pred$prev[data_pred$sex == "Male"] <-
    pnorm(predict(model_m, newdata = data_pred[data_pred$sex == "Male", ], 
                  allow.new.levels = FALSE))
  data_means$prev_fixed_curve <- data_pred$prev
  
  # calculate change since 1985
  data_plot <- data_means
  data_plot_1985 <-
    data_plot[data_plot$mid_year == 1985, c(
      "Superregion", "age_group", "sex",
      "prev", "prev_fixed_curve")]
  data_plot <-
    merge(
      data_plot,
      data_plot_1985,
      by = c("Superregion", "age_group", "sex"),
      suffixes = c("", "_1985")
    )
  data_plot$prev_dif <- data_plot$prev - data_plot$prev_1985                          # predicted difference
  data_plot$prev_fixed_curve_dif <- data_plot$prev_fixed_curve -                      # fixed curves difference
    data_plot$prev_fixed_curve_1985
  
  return(data_plot)
}

##### Runs #####################################################################
################################################################################
# The original dataset for running the regression (of which a publicly         #
# available subset is provided in Source Data 1) contains the following        #
# variables:                                                                   #
# - id_study = string, a unique study identification;                          #
# - sex = factor, either "Male" or "Female";                                   #
# - age_group = factor (20-29 30-39 40-49 50-59 60-69 70-79);                  #
# - mean_bmi_c = numerical, centered mean BMI;                                 #
# - mid_year_c = integer, centered year;                                       #
# - Superregion = factor, regions of the world ("Central and Eastern Europe",  #
#   "Central Asia and North Africa-Middle East", "East and South East Asia",   #
#   "High-income Asia Pacific", "High-income Western countries", "Latin        #
#   America and Caribbean", "Oceania", "South Asia", "Sub-Saharan Africa")     #
#                                                                              #
# For the purposes of our analysis, we applied the following inclusion and     #
# exclusion criteria as described in the Methods section on each study-age-sex #
# stratum:                                                                     #
# - data that did not cover the complete 10-year age groups, e.g. 25-29 or     #
# 60-64 years, are excluded;                                                   #
# - data with a prevalence of interest exactly as 0 or 1                       #
#   to allow probit transformation;                                            #
# - data that had fewer than 25 participants.                                  #
#                                                                              #
# The original dataset is then split into 3 separate datasets for running the  #
# regression, one for each prevalence. The inputs are: data_underweight,       #
# data_obesity and data_severeObesity. Each dataset will contain a different   #
# variable according to the prevalence specified in their name:                #
# - data_underweight will have "prev_bmi_185less" (prevalence of individuals   #
#   with BMI < 18.5);                                                          #
# - data_obesity will have "prev_bmi_30plus" (prevalence of individuals with   #
#   BMI >= 30.0);                                                              #
# - data_severeObesity will have "prev_bmi_35plus (prevalence of individuals   #
#   with BMI >= 35.0)                                                          #
################################################################################

# run the regression models for each prevalence 
models_final <- create_6models(data_underweight, data_obesity, data_severeObesity)

# use the models to predict prevalences from mean BMI
#outcome of underweight
tables_u  <- generate_table(models_final, "underweight", data_underweight)
#outcome for obesity
tables_o  <- generate_table(models_final, "obesity", data_obesity)
#outcome for severe obesity
tables_so <- generate_table(models_final, "severeObesity", data_severeObesity)