1. Introduction

The analysis reported here is part of the manuscript Romand et al. 2021. The script performs statistical analysis on data grouped by Treatment and Plant Line and plots a graph. Different approaches are used depending on the normality of the data as determined by the Shapiro test. If data follow a normal distribution, then the script performs Anova and post-hoc Tukey tests and draws a plot with confidence interval centered on mean. If data do not follow a normal distribution then a Kruskal-Wallis and a pot-hoc Dunn test are performed. The same plot is drawn using the medians and boostrap calculated 95% confidence intervals. The plots are also saved as pdf and vector (SVG) files.

2. Data import

The script will analyse a data file placed in the working directory with the following layout.

A 3 column data set in xlsx. Column 1, Line; column 2, Treatment and column 3 the measured quantity, in this case Quantum Yield (AB). The 2 first columns are factors that are used to group the data.

In this example we have 2 levels “N” and “NoN” for Treatment and 3 levels “WT2”, “QM”, “rsh1-1” for Line.

This script can be simply modified to analyse different parameters or different treatments. It can also be modified to analyse data grouped by additional factors such as Time.

3. Data analysis

3.1. Code

Set variables for personalisation

#1. Data
#------------------------------------------------------------------------------

# Put your input file in variable DATA 
DATA <- "AB.xlsx"


#2. Parameters for plot
#------------------------------------------------------------------------------
# Points color
COLOUR <- "#00B050"

# Select a palette for CI color
PALETTE <- "PuRd"



Setup R packages

if (!require(gridExtra)) { install.packages("gridExtra", repos = "http://cran.us.r-project.org") }
if (!require(ggbeeswarm)) { install.packages("ggbeeswarm", repos = "http://cran.us.r-project.org") }
if (!require(Rmisc)) { install.packages("Rmisc", repos = "http://cran.us.r-project.org") }
if (!require(ggplot2)) { install.packages("ggplot2", repos = "http://cran.us.r-project.org") }
if (!require(RColorBrewer)) { install.packages("RColorBrewer", repos = "http://cran.us.r-project.org") }
if (!require(dplyr)) { install.packages("dplyr", repos = "http://cran.us.r-project.org") }
if (!require(rstatix)) { install.packages("rstatix", repos = "https://cloud.r-project.org") }
if (!require(purrr)) { install.packages("purrr", repos = "http://cran.us.r-project.org") }
if (!require(tidyr)) { install.packages("tidyr", repos = "http://cran.us.r-project.org") }
if (!require(magrittr)) { install.packages("magrittr", repos = "http://cran.us.r-project.org") }
if (!require(boot)) { install.packages("boot", repos = "http://cran.us.r-project.org") }
if (!require(devtools)) { install.packages("devtools", repos = "http://cran.us.r-project.org") }
if (!require(knitr)) { install.packages("knitr", repos = "http://cran.us.r-project.org") }
if (!require(svglite)) { install.packages("svglite", repos = "http://cran.us.r-project.org") }

library(gridExtra)
library(ggplot2)
library(ggbeeswarm)
library(Rmisc)
library(readxl)
library(RColorBrewer)
library(rstatix)
library(gridExtra)
library(boot)
library(magrittr)
library(dplyr)
library(purrr)
library(tidyr)
library(rstatix)
library(knitr)
library(svglite)

Functions used in the script

#===============================================================================
# Draw the plot if data follow a normal distribution
#===============================================================================

plot_normal <- function(df, my_colours, my_summary) {
    p<-ggplot(data = df, aes(x=Line, y=AB)) +
        geom_quasirandom(dodge.width=0.8,alpha = 0.6, colour = COLOUR)+
        geom_pointrange(data = my_summary, aes(ymin=AB-ci, ymax=AB+ci, color=Line), position=position_dodge(width=0.8))+
        scale_colour_manual(values=my_colours)+
        facet_wrap(~Treatment, strip.position = "bottom", scales = "free_x") +
        theme_classic() + 
        theme(panel.spacing = unit(0, "lines"), 
              strip.background = element_blank(),
              strip.placement = "outside")+
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
    print(p)
}

#===============================================================================
# Draw the plot if data do not follow a normal distribution
#===============================================================================

plot_not_normal <- function(df, my_colours, conf_int) {
    p<-ggplot(data=df, aes(x=Line, y=AB)) +
        geom_quasirandom(dodge.width=0.8,alpha = 0.6, colour = COLOUR)+
        geom_linerange(data = booted_summary, aes(ymin=lower_ci_perc, ymax=upper_ci_perc, color=Line), position = position_dodge(width=0.8))+
        geom_point(data=booted_summary, aes(y=AB, color=Line), size = 2, 
                   position=position_dodge(width=0.8))+
        scale_colour_manual(values=my_colours)+
        facet_wrap(~Treatment, strip.position = "bottom", scales = "free_x") +
        theme_classic() + 
        theme(panel.spacing = unit(0, "lines"), 
              strip.background = element_blank(),
              strip.placement = "outside")+
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
        print(p)
       }

#===============================================================================
# Check normality of the data. Exit the loop if one set of data doesn't follow a 
# normal distribution.
# Return TRUE if all data follow a normal distribution
#===============================================================================

check_normality <- function(shapiro_df) {
    # Data are supposed to follow normal distribution
    flag_normal <- TRUE
    
    for (i in 1 : nrow(shapiro_df)) {
        if(shapiro_df[i, 4] > 0.05) {
            
        } else {
            
            flag_normal <- FALSE
            break
        }
    }
    return(flag_normal)
}

#===============================================================================
#Function to calculate the median (for bootstrapping)
#===============================================================================

samplemedian <- function(d, i) {
    median(d[i])
}

#===============================================================================
#Function for Dunn test
#===============================================================================
test_dunn <- function() {
    pval <- as.data.frame(df %>% group_by(Treatment) %>% dunn_test(AB ~ Line, p.adjust.method = "BY"))
    print(df %>% group_by(Treatment) %>% dunn_test(AB ~ Line, p.adjust.method = "BY"))
    return(pval)
}

Main script

# Import Data
df <- read_excel(DATA)

# Define color
my_colours = brewer.pal(n = 9, PALETTE)[9:3]

# Determine data normality using the Shapiro test
shapiro_df <- df %>%group_by(Treatment, Line)%>%
    summarise(statistic = shapiro.test(AB)$statistic, 
              p.value = shapiro.test(AB)$p.value)

flag_normal <- check_normality(shapiro_df)


# Data treatement according to normality status
if(flag_normal == TRUE) {
    #print("Data follow a normal distribution")
    
    # Summary
    my_summary <- summarySE(df, measurevar="AB", groupvars=c("Line", "Treatment"))
    
    # Plot
    plot_normal(df, my_colours, my_summary)
    
    # Stats
    anova_results <- df %>% group_by(Treatment) %>%  anova_test(AB ~ Line)
    
    tukey_results <- df %>% group_by(Treatment) %>%  tukey_hsd(AB ~ Line)
    
    # Save plot
    ggsave("my_ggplot1.pdf", width=4, height=5)
    ggsave("my_ggplot1.svg", width=4, height=5)
    
} else {
    #print("Data do not follow a normal distribution")
    
    # Calculate Median and 95%CI by bootstrap (Largely based on code by Peter Kamerman: 
    #https://www.painblogr.org/2017-10-18-purrring-through-bootstraps.html)
    
    #1. Set the seed
    set.seed(12345)
    
    #2. Create nested datafile to allow boostrapping on each subgroup.
    my_data_nested <- df %>% group_by(Line,Treatment) %>% nest()
    
    #3. Add boostrapped median values to my_nested_data
    my_data_nested %<>% dplyr::mutate(booted = purrr::map(.x = data, ~ boot::boot(data = .x$AB,statistic = samplemedian, R = 10000, stype = "i")))
    
    #4. Plot each <S3: boot> object to check boostrapping worked!
    ## Note: Saved to an object (plots) to stop the summary being printed
    plots<-purrr::map(.x = my_data_nested$booted, ~ plot(.x))
    print(plots)
    
    #5. Add boostrapped 95%CI around median values to my_nested_data, 
    #95%CI determined based on "perc" & "bca"
    my_data_nested %<>% mutate(booted_ci = purrr::map(.x = booted, 
                                                      ~ boot::boot.ci(.x,conf = 0.95, 
                                                                      type = c("perc", "bca"))))
    
    #6. Table of each <S3: bootci> object to check
    ci_table <-purrr::map(.x = my_data_nested$booted_ci, ~ print(.x))
    
    #7. Get the stats (median and 95% CI) from the nested dataframe into a regular dataframe 
    #(my_data_booted_summary) with 95%CI determined by "bca" or "perc" method
    booted_summary <- my_data_nested %>% mutate(AB= purrr::map(.x = booted_ci, ~ .x$t0),
                                                lower_ci_bca = purrr::map(.x = booted_ci,~ .x$bca[[4]]),    
                                                upper_ci_bca = purrr::map(.x = booted_ci,~ .x$bca[[5]]),
                                                lower_ci_perc = purrr::map(.x = booted_ci,~ .x$perc[[4]]),
                                                upper_ci_perc = purrr::map(.x = booted_ci,~ .x$perc[[5]])) %>%  
        dplyr::select(-data, -booted, -booted_ci) %>% tidyr::unnest(cols = c(AB, lower_ci_bca, upper_ci_bca, lower_ci_perc, upper_ci_perc))
    
    # Plot
    plot_not_normal(df, my_colours, booted_summary)
    
    # Stats
    
    kruskal_pval <- (df %>% group_by(Treatment)%>%kruskal_test(AB ~ Line)) %>% select(Treatment, p)
    pval_dunn <- test_dunn()
    
       # Save plot
    ggsave("my_ggplot1.pdf", width=4, height=5)
    ggsave("my_ggplot1.svg", width=4, height=5)
}

3.2. Results

a. Normality test and summary statistics

Table 1 : Shapiro test results.
Treatment Line statistic p.value
no_N 1_WT1 0.9935041 0.9746658
no_N 2_OXRSH1 0.9115141 0.4904610
no_N 3_rsh1-1 0.8568301 0.2490562
no_N 4_QM 0.8756175 0.3202561

Data follow a normal distribution. Now calculating means and confidence intervals for each line-treatment combination.



b. Final data plot

Fig 1. : Confidence interval plot


c. Statistical analysis

Summary statistics
Table 2 : Statistical summary.
Line Treatment N AB sd se ci
1_WT1 no_N 4 2.466 0.109 0.055 0.174
2_OXRSH1 no_N 4 3.207 0.055 0.027 0.087
3_rsh1-1 no_N 4 1.957 0.051 0.025 0.081
4_QM no_N 4 3.101 0.133 0.066 0.211


Statistical tests within each treatment condition.
Table 3 : Results of Anova test.
Treatment Effect DFn DFd F p p<.05 ges
no_N Line 3 12 155.189 7.3e-10 * 0.975


Table 4 : Results of Tukey test.
Treatment group1 group2 p.adj p.adj.signif
no_N 1_WT1 2_OXRSH1 5.60e-07 ****
no_N 1 3_rsh1 2.96e-05 ****
no_N 1_WT1 4_QM 2.99e-06 ****
no_N 1 3_rsh1 1.66e-09 ****
no_N 2_OXRSH1 4_QM 4.13e-01 ns
no_N 3_rsh1 4_QM 4.36e-09 ****


4. R session information

InfoSession <- devtools::session_info()
print(InfoSession)
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       macOS  10.16                
##  system   x86_64, darwin17.0          
##  ui       X11                         
##  language (EN)                        
##  collate  fr_FR.UTF-8                 
##  ctype    fr_FR.UTF-8                 
##  tz       Europe/Paris                
##  date     2021-07-22                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version date       lib source        
##  abind          1.4-5   2016-07-21 [1] CRAN (R 4.0.2)
##  backports      1.2.0   2020-11-02 [1] CRAN (R 4.0.2)
##  beeswarm       0.2.3   2016-04-25 [1] CRAN (R 4.0.2)
##  boot         * 1.3-25  2020-04-26 [1] CRAN (R 4.0.2)
##  broom          0.7.5   2021-02-19 [1] CRAN (R 4.0.2)
##  cachem         1.0.5   2021-05-15 [1] CRAN (R 4.0.2)
##  callr          3.7.0   2021-04-20 [1] CRAN (R 4.0.2)
##  car            3.0-10  2020-09-29 [1] CRAN (R 4.0.2)
##  carData        3.0-4   2020-05-22 [1] CRAN (R 4.0.2)
##  cellranger     1.1.0   2016-07-27 [1] CRAN (R 4.0.2)
##  cli            2.5.0   2021-04-26 [1] CRAN (R 4.0.2)
##  colorspace     2.0-0   2020-11-11 [1] CRAN (R 4.0.2)
##  crayon         1.3.4   2017-09-16 [1] CRAN (R 4.0.2)
##  curl           4.3     2019-12-02 [1] CRAN (R 4.0.1)
##  data.table     1.13.2  2020-10-19 [1] CRAN (R 4.0.2)
##  desc           1.3.0   2021-03-05 [1] CRAN (R 4.0.2)
##  devtools     * 2.4.2   2021-06-07 [1] CRAN (R 4.0.2)
##  digest         0.6.25  2020-02-23 [1] CRAN (R 4.0.2)
##  dplyr        * 1.0.2   2020-08-18 [1] CRAN (R 4.0.2)
##  ellipsis       0.3.1   2020-05-15 [1] CRAN (R 4.0.2)
##  evaluate       0.14    2019-05-28 [1] CRAN (R 4.0.1)
##  farver         2.0.3   2020-01-16 [1] CRAN (R 4.0.2)
##  fastmap        1.1.0   2021-01-25 [1] CRAN (R 4.0.2)
##  forcats        0.5.1   2021-01-27 [1] CRAN (R 4.0.2)
##  foreign        0.8-80  2020-05-24 [1] CRAN (R 4.0.2)
##  fs             1.5.0   2020-07-31 [1] CRAN (R 4.0.2)
##  generics       0.1.0   2020-10-31 [1] CRAN (R 4.0.2)
##  ggbeeswarm   * 0.6.0   2017-08-07 [1] CRAN (R 4.0.2)
##  ggplot2      * 3.3.5   2021-06-25 [1] CRAN (R 4.0.2)
##  glue           1.4.1   2020-05-13 [1] CRAN (R 4.0.2)
##  gridExtra    * 2.3     2017-09-09 [1] CRAN (R 4.0.2)
##  gtable         0.3.0   2019-03-25 [1] CRAN (R 4.0.2)
##  haven          2.3.1   2020-06-01 [1] CRAN (R 4.0.2)
##  highr          0.8     2019-03-20 [1] CRAN (R 4.0.2)
##  hms            0.5.3   2020-01-08 [1] CRAN (R 4.0.2)
##  htmltools      0.5.0   2020-06-16 [1] CRAN (R 4.0.2)
##  knitr        * 1.31    2021-01-27 [1] CRAN (R 4.0.2)
##  labeling       0.4.2   2020-10-20 [1] CRAN (R 4.0.2)
##  lattice      * 0.20-41 2020-04-02 [1] CRAN (R 4.0.2)
##  lifecycle      1.0.0   2021-02-15 [1] CRAN (R 4.0.2)
##  magrittr     * 1.5     2014-11-22 [1] CRAN (R 4.0.2)
##  memoise        2.0.0   2021-01-26 [1] CRAN (R 4.0.2)
##  munsell        0.5.0   2018-06-12 [1] CRAN (R 4.0.2)
##  openxlsx       4.2.3   2020-10-27 [1] CRAN (R 4.0.2)
##  pillar         1.4.6   2020-07-10 [1] CRAN (R 4.0.2)
##  pkgbuild       1.2.0   2020-12-15 [1] CRAN (R 4.0.2)
##  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.0.2)
##  pkgload        1.2.1   2021-04-06 [1] CRAN (R 4.0.2)
##  plyr         * 1.8.6   2020-03-03 [1] CRAN (R 4.0.2)
##  prettyunits    1.1.1   2020-01-24 [1] CRAN (R 4.0.2)
##  processx       3.5.2   2021-04-30 [1] CRAN (R 4.0.2)
##  ps             1.4.0   2020-10-07 [1] CRAN (R 4.0.2)
##  purrr        * 0.3.4   2020-04-17 [1] CRAN (R 4.0.2)
##  R6             2.4.1   2019-11-12 [1] CRAN (R 4.0.2)
##  RColorBrewer * 1.1-2   2014-12-07 [1] CRAN (R 4.0.2)
##  Rcpp           1.0.5   2020-07-06 [1] CRAN (R 4.0.2)
##  readxl       * 1.3.1   2019-03-13 [1] CRAN (R 4.0.2)
##  remotes        2.4.0   2021-06-02 [1] CRAN (R 4.0.2)
##  rio            0.5.16  2018-11-26 [1] CRAN (R 4.0.2)
##  rlang          0.4.11  2021-04-30 [1] CRAN (R 4.0.2)
##  rmarkdown      2.8     2021-05-07 [1] CRAN (R 4.0.2)
##  Rmisc        * 1.5     2013-10-22 [1] CRAN (R 4.0.2)
##  rprojroot      1.3-2   2018-01-03 [1] CRAN (R 4.0.2)
##  rstatix      * 0.7.0   2021-02-13 [1] CRAN (R 4.0.2)
##  rstudioapi     0.13    2020-11-12 [1] CRAN (R 4.0.2)
##  scales         1.1.1   2020-05-11 [1] CRAN (R 4.0.2)
##  sessioninfo    1.1.1   2018-11-05 [1] CRAN (R 4.0.2)
##  stringi        1.5.3   2020-09-09 [1] CRAN (R 4.0.2)
##  stringr        1.4.0   2019-02-10 [1] CRAN (R 4.0.2)
##  svglite      * 2.0.0   2021-02-20 [1] CRAN (R 4.0.2)
##  systemfonts    1.0.1   2021-02-09 [1] CRAN (R 4.0.2)
##  testthat       3.0.2   2021-02-14 [1] CRAN (R 4.0.2)
##  tibble         3.0.3   2020-07-10 [1] CRAN (R 4.0.2)
##  tidyr        * 1.1.2   2020-08-27 [1] CRAN (R 4.0.2)
##  tidyselect     1.1.0   2020-05-11 [1] CRAN (R 4.0.2)
##  usethis      * 2.0.1   2021-02-10 [1] CRAN (R 4.0.2)
##  vctrs          0.3.2   2020-07-15 [1] CRAN (R 4.0.2)
##  vipor          0.4.5   2017-03-22 [1] CRAN (R 4.0.2)
##  withr          2.4.2   2021-04-18 [1] CRAN (R 4.0.2)
##  xfun           0.23    2021-05-15 [1] CRAN (R 4.0.2)
##  yaml           2.2.1   2020-02-01 [1] CRAN (R 4.0.2)
##  zip            2.1.1   2020-08-27 [1] CRAN (R 4.0.2)
## 
## [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library

5. Citations

  1. Bache, Stefan Milton and Wickham, Hadley (2020). magrittr: A Forward-Pipe Operator for R. R package version 2.0.1. https://CRAN.R-project.org/package=magrittr

  2. Canty, Angelo and Ripley, Brian (2021). boot: Bootstrap R (S-Plus) Functions. R package version 1.3-28.

  3. Clarke, Erik, and Scott Sherrill-Mix. 2017. Ggbeeswarm: Categorical Scatter (Violin Point) Plots. https://CRAN.R-project.org/package=ggbeeswarm.

  4. Henry, Lionel and Wickham, Hadley (2020). purrr: Functional Programming Tools. R package version 0.3.4. https://CRAN.R-project.org/package=purrr

  5. Hope, Ryan M. 2013. Rmisc: Rmisc: Ryan Miscellaneous. https://CRAN.R-project.org/package=Rmisc.

  6. Kassambara, Alboukadel. 2021. Rstatix: Pipe-Friendly Framework for Basic Statistical Tests. https://CRAN.R-project.org/package=rstatix.

  7. Neuwirth, Erich. 2014. RColorBrewer: ColorBrewer Palettes. https://CRAN.R-project.org/package=RColorBrewer.

  8. R Core Team. 2020. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.

  9. Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. https://ggplot2.tidyverse.org.

  10. Wickham, Hadley, and Jennifer Bryan. 2019. Readxl: Read Excel Files. https://CRAN.R-project.org/package=readxl.

  11. Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2021. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.

  12. Wickham, Hadley (2021). tidyr: Tidy Messy Data. R package version 1.1.3. https://CRAN.R-project.org/package=tidyr

  13. Wickham, Hadley, Hester, Jim and Chang, Winston (2021). devtools: Tools to Make Developing R Packages Easier. R package version 2.4.2. https://CRAN.R-project.org/package=devtools