### DSB_simulation.R
### R code for simulating probabilistic distributions of numbers of symmetric DSBs in mouse crosses
### Vaclav Gergelits 2017
### If you use this code, please cite Gregorova et al. eLife 2018
### This is a free code shared in the hope it may be useful. It is shared without any given or implied warranty
###############################

library(readr); library(dplyr); library(ggplot2)

### Data preparation
###############################
# Per-chromosome reference table for chr1-chr19, chrX and chrY.
# tibble() replaces the deprecated data_frame() constructor.
dChrs <- tibble(
  chr = paste0("chr", c(1:19, "X", "Y")),
  # data from: library(BSgenome.Mmusculus.UCSC.mm10)
  chr_length = c(
    195471971, 182113224, 160039680, 156508116, 151834684, 149736546, 145441459,
    129401213, 124595110, 130694993, 122082543, 120129022, 120421639, 124902244,
    104043685,  98207768,  94987271,  90702639,  61431566, 171031299,  88110223
  ),
  # data from: Lange, Cell 2016
  spo11_oligo = c(
    69606, 73115, 53878, 60332, 62217, 52717, 52687, 50769, 50618, 49930,
    55509, 45270, 46151, 44188, 40639, 37055, 39641, 34544, 25629, 36779, 9345
  )
)

# spo11_ratio is each chromosome's share of the SPO11-oligo total over ALL 21
# chromosomes; the sex chromosomes are dropped only afterwards, so the ratios
# of the 19 remaining autosomes do not sum to 1.
dChrs <- dChrs %>%
  mutate(spo11_ratio = spo11_oligo / sum(spo11_oligo)) %>%
  filter(chr %in% paste0("chr", 1:19))

# F1 cross: one row per chromosome. p_asyn is the mean, over that chromosome's
# rows in the CSV, of the per-row ratio n_asyn / n_cells; the remaining columns
# are taken from the chromosome's first row and from dChrs.
DSBs_model_F1 <-
  read_csv("crosses_F1_2chr_4chr.csv") %>%
  filter(cross == "F1") %>%
  group_by(chr) %>%
  mutate(p_asyn = mean(n_asyn / n_cells)) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  left_join(dChrs, by = "chr") %>%
  dplyr::select(chr, cross, p_asyn, PP_length_mid, chr_length, spo11_ratio)

# 2-chr and 4-chr crosses: every CSV row is kept, with its own
# p_asyn = n_asyn / n_cells, and annotated with the per-chromosome data in dChrs.
DSBs_model_2chr_4chr <-
  read_csv("crosses_F1_2chr_4chr.csv") %>%
  filter(cross %in% c("2cross", "4cross")) %>%
  mutate(p_asyn = n_asyn / n_cells) %>%
  left_join(dChrs, by = "chr") %>%
  dplyr::select(chr, cross, p_asyn, PP_length_mid, chr_length, spo11_ratio)

### Definition of simulating function
###############################
# Simulate the number of symmetric DSBs on one chromosome in N_SIM cells and
# summarise the resulting distribution.
#
# For every simulated cell:
#   1. a cell-wide DSB count is drawn from N(MEAN_DSBs, SD_DSBs); the
#      chromosome's share (Spo11ratio) of it is rounded and clamped at 0;
#   2. each DSB gets a position drawn uniformly from [1, Pos_max];
#   3. every DSB positioned below PP_length_mid_sum counts as symmetric, and
#      each remaining DSB is symmetric with probability PROB_SYM_DSB.
#
# Arguments:
#   PP_length_mid_sum  position threshold, in the same unit as Pos_max
#   Pos_max            upper bound of the simulated DSB positions
#   Spo11ratio         fraction of the cell's DSBs assigned to this chromosome
#   PROB_SYM_DSB       per-DSB probability of being symmetric at or above the
#                      threshold
#   MEAN_DSBs, SD_DSBs mean and sd of the per-cell DSB count
#   N_SIM              number of simulated cells
#
# Returns a named numeric vector of length 7:
#   Prob_less_than_0.9 / _1.9 / _2.9  fraction of cells with fewer than 1 / 2 / 3
#                                     symmetric DSBs (the counts are integers)
#   mean_sym_DSBs, sd_sym_DSBs        mean and sd of the symmetric DSB count
#   mean_all_DSBs_per_chr, sd_all_DSBs_per_chr
#                                     mean and sd of the total DSB count
#
# The random draws happen in the same order as in the original implementation,
# so results under a fixed set.seed() are unchanged.
sim_n_DSBs_on_chr_var_per_cell <- function(PP_length_mid_sum = 47, Pos_max = 104, Spo11ratio = 0.043, PROB_SYM_DSB = 0.28,
                                           MEAN_DSBs = 250, SD_DSBs = 20, N_SIM = 1000) {
  # Preallocate: appending with c() inside the loop is quadratic in N_SIM.
  sym_DSBs_per_cell   <- numeric(N_SIM)
  total_DSBs_per_cell <- numeric(N_SIM)

  # seq_len() gives zero iterations for N_SIM = 0 (1:0 would give two).
  for (i in seq_len(N_SIM)) {
    n_DSBs_on_chr <- max(0, round(Spo11ratio * rnorm(n = 1, mean = MEAN_DSBs, sd = SD_DSBs)))
    DSB_positions <- runif(n = n_DSBs_on_chr, min = 1, max = Pos_max)

    n_below_threshold <- sum(DSB_positions < PP_length_mid_sum)
    n_above_threshold <- n_DSBs_on_chr - n_below_threshold
    n_sym_above       <- rbinom(n = 1, size = n_above_threshold, prob = PROB_SYM_DSB)

    sym_DSBs_per_cell[i]   <- n_below_threshold + n_sym_above
    total_DSBs_per_cell[i] <- n_DSBs_on_chr
  }

  c(
    "Prob_less_than_0.9"    = mean(sym_DSBs_per_cell < 1),
    "Prob_less_than_1.9"    = mean(sym_DSBs_per_cell < 2),
    "Prob_less_than_2.9"    = mean(sym_DSBs_per_cell < 3),
    "mean_sym_DSBs"         = mean(sym_DSBs_per_cell),
    "sd_sym_DSBs"           = sd(sym_DSBs_per_cell),
    "mean_all_DSBs_per_chr" = mean(total_DSBs_per_cell),
    "sd_all_DSBs_per_chr"   = sd(total_DSBs_per_cell)
  )
}

### Parameters based on previous studies:
###############################
# Passed as PROB_SYM_DSB to the simulation: the per-DSB probability used in its binomial draw of symmetric DSBs.
PROB_SYM_DSB_ALL <- 0.28 # Davies, Nature 2016; Smagulova, Genes & Development 2016
# Passed as MEAN_DSBs / SD_DSBs to the simulation: mean and sd of the normal draw of the per-cell DSB count.
MEAN_DSBs_ALL    <- 250  # Bhattacharyya, PNAS 2013
SD_DSBs_ALL      <- 20   # Bhattacharyya, PNAS 2013

### Simulation
###############################
# Number of simulated cells per chromosome.
N_SIM_ALL <- 100000

# Create every result column before the loop. mean_sym_DSBs was previously
# written by index without being created first, which made tibble warn about an
# unknown/uninitialised column and relied on length-1 recycling.
DSBs_model_F1$Prob_less_than_0.9_sym_DSBs <- NA_real_
DSBs_model_F1$Prob_less_than_1.9_sym_DSBs <- NA_real_
DSBs_model_F1$Prob_less_than_2.9_sym_DSBs <- NA_real_
DSBs_model_F1$mean_sym_DSBs               <- NA_real_

# One simulation per row (chromosome); seq_len() keeps the loop empty when the
# table has no rows, where 1:nrow() would iterate over c(1, 0).
for (I in seq_len(nrow(DSBs_model_F1))) {
  sim_results <-
    sim_n_DSBs_on_chr_var_per_cell(PP_length_mid_sum = DSBs_model_F1$PP_length_mid[I],
                                   Pos_max           = DSBs_model_F1$chr_length[I] * 1e-6,
                                   Spo11ratio        = DSBs_model_F1$spo11_ratio[I],
                                   PROB_SYM_DSB      = PROB_SYM_DSB_ALL,
                                   MEAN_DSBs         = MEAN_DSBs_ALL,
                                   SD_DSBs           = SD_DSBs_ALL,
                                   N_SIM             = N_SIM_ALL)
  # Read results by name so a reordering of the returned vector cannot
  # silently mix up the columns.
  DSBs_model_F1$Prob_less_than_0.9_sym_DSBs[I] <- sim_results[["Prob_less_than_0.9"]]
  DSBs_model_F1$Prob_less_than_1.9_sym_DSBs[I] <- sim_results[["Prob_less_than_1.9"]]
  DSBs_model_F1$Prob_less_than_2.9_sym_DSBs[I] <- sim_results[["Prob_less_than_2.9"]]
  DSBs_model_F1$mean_sym_DSBs[I]               <- sim_results[["mean_sym_DSBs"]]
}

# Create the result columns (same column order as before: 0.9, 1.9, 2.9).
DSBs_model_2chr_4chr$Prob_less_than_0.9_sym_DSBs <- NA_real_
DSBs_model_2chr_4chr$Prob_less_than_1.9_sym_DSBs <- NA_real_
DSBs_model_2chr_4chr$Prob_less_than_2.9_sym_DSBs <- NA_real_

# One simulation per row of the 2-chr / 4-chr table; seq_len() keeps the loop
# empty when the table has no rows, where 1:nrow() would iterate over c(1, 0).
for (I in seq_len(nrow(DSBs_model_2chr_4chr))) {
  sim_results <-
    sim_n_DSBs_on_chr_var_per_cell(PP_length_mid_sum = DSBs_model_2chr_4chr$PP_length_mid[I],
                                   Pos_max           = DSBs_model_2chr_4chr$chr_length[I] * 1e-6,
                                   Spo11ratio        = DSBs_model_2chr_4chr$spo11_ratio[I],
                                   PROB_SYM_DSB      = PROB_SYM_DSB_ALL,
                                   MEAN_DSBs         = MEAN_DSBs_ALL,
                                   SD_DSBs           = SD_DSBs_ALL,
                                   N_SIM             = N_SIM_ALL)

  # Read results by name rather than by position.
  DSBs_model_2chr_4chr$Prob_less_than_0.9_sym_DSBs[I] <- sim_results[["Prob_less_than_0.9"]]
  DSBs_model_2chr_4chr$Prob_less_than_1.9_sym_DSBs[I] <- sim_results[["Prob_less_than_1.9"]]
  DSBs_model_2chr_4chr$Prob_less_than_2.9_sym_DSBs[I] <- sim_results[["Prob_less_than_2.9"]]
}


### Plot the results of the simulations:
###############################

## F1 cross
# Observed asynapsis rate (y) against the simulated probability of fewer than
# x symmetric DSBs (x axis), one text label per chromosome and threshold.
# gather() is a tidyr function and tidyr is not attached by this script, so it
# is called with its namespace.
DSBs_model_F1 %>%
  tidyr::gather(key = "less_than", value = "Cum_Prob", starts_with("Prob_less_than_")) %>%
  # Characters 16-18 of e.g. "Prob_less_than_0.9_sym_DSBs" are the threshold tag "0.9".
  mutate(less_than = factor(substr(less_than, 16, 18),
                            levels = c("0.9", "1.9", "2.9"),
                            labels = c("x = 1", "x = 2", "x = 3"))) %>%
  ggplot(aes(x = Cum_Prob, y = p_asyn, color = less_than)) +
  geom_text(aes(label = chr), size = 6) +
  geom_abline() +
  geom_smooth(method = "lm") +
  xlim(NA, 0.5) + ylim(NA, 0.5) +
  guides(colour = guide_legend(title = "Less than x\nsymmetric DSBs")) +
  theme_grey(base_size = 23) +
  theme(legend.position = c(1,0), legend.justification = c(1,0),
        legend.background = element_rect(fill="gray92", size=.5, linetype="dotted")) +
  labs(x = "Probability of less than x symmetric DSBs per chr", y = "Probability of asynapsis")

## 4-chr cross
# 4-chr cross: observed asynapsis rate against the simulated probability of
# fewer than x symmetric DSBs, one facet per chromosome.
# gather() is a tidyr function and tidyr is not attached by this script, so it
# is called with its namespace.
DSBs_model_2chr_4chr %>%
  filter(cross == "4cross") %>%
  mutate(chr = factor(chr,
                      levels = c("chr15", "chr16", "chr18", "chr19"),
                      labels = c("Chr 15", "Chr 16", "Chr 18", "Chr 19"))) %>%
  tidyr::gather(key = "less_than", value = "Cum_Prob", starts_with("Prob_less_than_")) %>%
  # Characters 16-18 of e.g. "Prob_less_than_0.9_sym_DSBs" are the threshold tag "0.9".
  mutate(less_than = factor(substr(less_than, 16, 18),
                            levels = c("0.9", "1.9", "2.9"),
                            labels = c("x = 1", "x = 2", "x = 3"))) %>%
  ggplot(aes(x = Cum_Prob, y = p_asyn, color = less_than)) +
  geom_abline() +
  geom_smooth(method = "lm") +
  geom_point() +
  xlim(NA, 0.75) + ylim(NA, 0.75) +
  guides(colour = guide_legend(title = "Less than x\nsymmetric DSBs")) +
  facet_wrap(~ chr) +
  labs(x = "Probability of less than x symmetric DSBs per chr", y = "Probability of asynapsis")

## 2-chr cross
# 2-chr cross: observed asynapsis rate against the simulated probability of
# fewer than x symmetric DSBs, one facet per chromosome (three facet rows).
# gather() is a tidyr function and tidyr is not attached by this script, so it
# is called with its namespace.
DSBs_model_2chr_4chr %>%
  filter(cross == "2cross") %>%
  mutate(chr = factor(chr,
                      levels = c("chr5", "chr12", "chr7", "chr15", "chr17", "chr18"),
                      labels = c("Chr 5", "Chr 12", "Chr 7", "Chr 15", "Chr 17", "Chr 18"))) %>%
  tidyr::gather(key = "less_than", value = "Cum_Prob", starts_with("Prob_less_than_")) %>%
  # Characters 16-18 of e.g. "Prob_less_than_0.9_sym_DSBs" are the threshold tag "0.9".
  mutate(less_than = factor(substr(less_than, 16, 18),
                            levels = c("0.9", "1.9", "2.9"),
                            labels = c("x = 1", "x = 2", "x = 3"))) %>%
  ggplot(aes(x = Cum_Prob, y = p_asyn, color = less_than)) +
  geom_abline() +
  geom_smooth(method = "lm") +
  geom_point() +
  xlim(NA, 0.75) + ylim(NA, 0.75) +
  guides(colour = guide_legend(title = "Less than x\nsymmetric DSBs")) +
  facet_wrap(~ chr, nrow = 3) +
  labs(x = "Probability of less than x symmetric DSBs per chr", y = "Probability of asynapsis")
