resolution = 72 * 1.5
labcex = 1.8
smlabcex = 1.4
legendcex = 1.3
ymax = 0.9
themar = c(4.7,4.7,2.1,2.1) 
theblue = rgb(0,71,255, maxColorValue=255)

calc_ci = function(mean, se) {
  return(c(mean - 1.96 * se, mean + 1.96 * se))
}

#### read data on identified NCOs
full_table = read.table("williams_nco_file1.tsv", header=T)

#### test for enrichment of homozygous/heterozygous calls in the recipient of
#### the NCOs. Excludes sites that are not ascertained in the
#### standard way and are homozygous by detection.
binom.test(sum(full_table$recipient_het[full_table$rate_count != 0] == "Y"),
	   sum(!is.na(full_table$recipient_het[full_table$rate_count != 0])),
	   .5, alternative="t")

#### test for bias due to chip in any differences in number of detected events
#### by sex
mf_matrix = matrix(c(sum(full_table$sex[full_table$SNP_density == 'D'] == 'M'),
		     sum(full_table$sex[full_table$SNP_density == 'L'] == 'M'),
		     sum(full_table$sex[full_table$SNP_density == 'D'] == 'F'),
		     sum(full_table$sex[full_table$SNP_density == 'L'] == 'F')),
		   nrow=2, byrow=T)
chisq.test(mf_matrix)
mf_matrix

#### GC-bias tests, rate estimate, plot

# For GC bias, don't include ambiguous NCO sites, nor sites with a SNP in the
# PRDM9 motif
gctab = subset(full_table, (rate_count==1 | rate_count==0) &
							  SNP_PRDM9_motif=="N")

# test for differences in GC transmissions across SNP densities:
chisq.test(matrix(c(sum(gctab$GC_trans[gctab$SNP_density == 'D']=='Y',na.rm=T),
		    sum(gctab$GC_trans[gctab$SNP_density == 'L']=='Y',na.rm=T),
		    sum(gctab$GC_trans[gctab$SNP_density == 'D']=='N',na.rm=T),
		    sum(gctab$GC_trans[gctab$SNP_density == 'L']=='N',na.rm=T)),
		  nrow=2))
# test for differences between male and female GC transmission rates:
chisq.test(matrix(c(sum(gctab$GC_trans[gctab$sex == 'M'] == 'Y', na.rm=T),
		    sum(gctab$GC_trans[gctab$sex == 'F'] == 'Y', na.rm=T),
		    sum(gctab$GC_trans[gctab$sex == 'M'] == 'N', na.rm=T),
		    sum(gctab$GC_trans[gctab$sex == 'F'] == 'N', na.rm=T)),
		  nrow=2))

# number GC transmissions and number possible transmissions:
n_gc_trans = sum(gctab$GC_trans == "Y", na.rm = T)
denom_gc_trans = sum(!is.na(gctab$GC_trans))

# test for significant gBGC overall; gives P-value, transmission rate, 95% CI
binom.test(n_gc_trans, denom_gc_trans, .5, alternative="t")

# GC transmissions at CpG sites:
n_cpg_gc_trans = sum(gctab$GC_trans[gctab$CpG_site == 'Y'] == "Y")
denom_cpg_gc_trans = sum(!is.na(gctab$GC_trans[gctab$CpG_site == 'Y']))
cpg_gc_trans_rate = n_cpg_gc_trans / denom_cpg_gc_trans
n_cpg_gc_trans
denom_cpg_gc_trans
cpg_gc_trans_rate

# test for difference between CpG and non-CpG transmission rate
chisq.test(matrix(c(sum(gctab$GC_trans[gctab$CpG_site == 'Y'] == 'Y', na.rm=T),
		    sum(gctab$GC_trans[gctab$CpG_site == 'N'] == 'Y', na.rm=T),
		    sum(gctab$GC_trans[gctab$CpG_site == 'Y'] == 'N', na.rm=T),
		    sum(gctab$GC_trans[gctab$CpG_site == 'N'] == 'N', na.rm=T)),
		  nrow=2))

# gBGC sites stratified by recombination rate:
gc_trans_bins=c(sum(gctab$GC_trans[gctab$HapMap_rate < 1.2 ] == "Y", na.rm = T),
		sum(gctab$GC_trans[gctab$HapMap_rate >= 1.2 &
				   gctab$HapMap_rate < 2.5   ] =="Y",na.rm = T),
		sum(gctab$GC_trans[gctab$HapMap_rate >= 2.5 &
				   gctab$HapMap_rate < 5   ] == "Y", na.rm = T),
		sum(gctab$GC_trans[gctab$HapMap_rate >= 5 &
				   gctab$HapMap_rate < 7.5 ] == "Y", na.rm = T),
		sum(gctab$GC_trans[gctab$HapMap_rate >= 7.5 &
				   gctab$HapMap_rate < 10  ] == "Y", na.rm = T),
		sum(gctab$GC_trans[gctab$HapMap_rate >= 10 ] == "Y", na.rm = T))
denom_gc_trans_bins = c(sum(!is.na(gctab$GC_trans[gctab$HapMap_rate < 1.2])),
			sum(!is.na(gctab$GC_trans[gctab$HapMap_rate >= 1.2 &
						  gctab$HapMap_rate < 2.5  ])),
			sum(!is.na(gctab$GC_trans[gctab$HapMap_rate >= 2.5 &
						  gctab$HapMap_rate < 5  ])),
			sum(!is.na(gctab$GC_trans[gctab$HapMap_rate >= 5 &
						  gctab$HapMap_rate < 7.5])),
			sum(!is.na(gctab$GC_trans[gctab$HapMap_rate >= 7.5 &
						  gctab$HapMap_rate < 10 ])),
			sum(!is.na(gctab$GC_trans[gctab$HapMap_rate >= 10])))

# test for difference in GC transmission rate across bins
# The following gives a warning that the chi-squared approximation may be
# incorrect; however, calling this with simulate.p.value=TRUE gives a slightly
# less significant P-value; will use this:
chisq.test(rbind(gc_trans_bins,                        # row 1: GC transmission
		 denom_gc_trans_bins - gc_trans_bins)) # row 2: AT transmission

gc_trans_mat = rbind(gc_trans_bins / denom_gc_trans_bins)
gc_trans_mat_se = sqrt(gc_trans_mat * (1 - gc_trans_mat) / denom_gc_trans_bins)

# plot gBGC rates stratified by recombination rate bin
png("gc-trans.png", width=540 * resolution / 72, height=480 * resolution / 72,
    res = resolution)
par(mar=themar)
bp = barplot(gc_trans_mat, col="red",
	     names = c("r<1.2", "1.2\u2264r<2.5", "2.5\u2264r<5","5\u2264r<7.5",
		       "7.5\u2264r<10", "r\u226510"),
	     xlab="Recombination rate (cM/Mb)",
	     ylab="Proportion of NCO sites transmitting GC", ylim=c(0,1),
	     cex.lab=labcex, cex.names=smlabcex, border=NA,
	     width=2, space=1, xlim=c(1.5,24.5),
	     cex.axis=labcex, yaxt='n')
axis(1, at=bp, tick=F,
     labels=c("", "1.2\u2264r<2.5", "", "5\u2264r<7.5", "", "r\u226510"),
     cex.axis=smlabcex)
axis(2, at=seq(0,1,.1), cex.axis=labcex)
seg_lwd = 2
# plot SE bars:
nbins=length(bp)
segments(0, .5, bp[nbins]+2, .5, lwd=seg_lwd, lty=2)
for(i in 1:nbins) {
  segments(bp[i], gc_trans_mat[i] - gc_trans_mat_se[i],
	   bp[i], gc_trans_mat[i] + gc_trans_mat_se[i], lwd=seg_lwd)
  epsilon = 0.12
  segments(bp[i]-epsilon, gc_trans_mat[i] - gc_trans_mat_se[i],
	   bp[i]+epsilon, gc_trans_mat[i] - gc_trans_mat_se[i],
	   lwd=seg_lwd)
  segments(bp[i]-epsilon, gc_trans_mat[i] + gc_trans_mat_se[i],
	   bp[i]+epsilon, gc_trans_mat[i] + gc_trans_mat_se[i],
	   lwd=seg_lwd)
}
dev.off()

#### LD-based recombination rate numbers, plot:

# Number of sites in each of 6 bins from --
# Autosomal genome
autosome_bins = c(2164960226, 264823044, 173321346, 73972855, 43844054,
		  70526264)

# Among all informative sites
ninformative_bins = c(8064596, 1589634, 1083202, 484682, 301916, 594944)
# The NCO sites:
# Note: we don't include sites with 0 count since we compare this to the
#    informative sites and 0 count sites are not technically informative so not
#    comparable.
recomb_nco_bins = c(sum(full_table$rate_count[full_table$HapMap_rate < 1.2]),
		    sum(full_table$rate_count[full_table$HapMap_rate >= 1.2 &
					      full_table$HapMap_rate < 2.5]),
		    sum(full_table$rate_count[full_table$HapMap_rate >= 2.5 &
					      full_table$HapMap_rate < 5]),
		    sum(full_table$rate_count[full_table$HapMap_rate >= 5 &
					      full_table$HapMap_rate < 7.5]),
		    sum(full_table$rate_count[full_table$HapMap_rate >= 7.5 &
					      full_table$HapMap_rate < 10]),
		    sum(full_table$rate_count[full_table$HapMap_rate >= 10])   )
ld_recomb_mat = rbind(autosome_bins / sum(autosome_bins),
		      ninformative_bins / sum(ninformative_bins),
		      recomb_nco_bins / sum(recomb_nco_bins) )

n_nco = sum(recomb_nco_bins)

png("ld-recomb-rate.png", width=540 * resolution / 72,
    height = 480 * resolution / 72, res = resolution)
par(mar=themar)
colors = c("black", theblue, "red")
bp = barplot(ld_recomb_mat, beside=T,
	names = c("r<1.2", "1.2\u2264r<2.5", "2.5\u2264r<5", "5\u2264r<7.5",
		  "7.5\u2264r<10", "r\u226510"),
	xlab="Recombination rate (cM/Mb)", ylab="Proportion of sites",
	ylim=c(0,ymax), col = colors, border=NA,
	cex.lab=labcex, cex.names=smlabcex, cex.axis=labcex)
axis(1, at=bp[2,], tick=F,
     labels=c("", "1.2\u2264r<2.5", "", "5\u2264r<7.5", "", ""),
     cex.axis=smlabcex)
legend("topright", c("Total autosomal genome",
		     "Informative sites",
		     "NCO SNP sites"),
       fill = colors, border=NA, cex=legendcex)
dev.off()


#### Crossover rate (deCODE) numbers, plot:

# Number of sites in each of 6 bins from --
# Autosomal genome
decode_autosome_bins = c(1407257267, 574059034, 389937381, 134862677, 62185419,
			 96944375)
# Among informative sites
decode_inform_bins = c(7745467, 1440310, 1124122, 476343, 250442, 485877)
# The NCO sites: (As above, we don't include 0 count sites)
decode_nco_bins = c(sum(full_table$rate_count[full_table$deCODE_rate < 1.2],
								      na.rm=T),
		    sum(full_table$rate_count[full_table$deCODE_rate >= 1.2 &
					full_table$deCODE_rate < 2.5], na.rm=T),
		    sum(full_table$rate_count[full_table$deCODE_rate >= 2.5 &
					full_table$deCODE_rate < 5], na.rm=T),
		    sum(full_table$rate_count[full_table$deCODE_rate >= 5 &
					full_table$deCODE_rate < 7.5], na.rm=T),
		    sum(full_table$rate_count[full_table$deCODE_rate >= 7.5 &
					full_table$deCODE_rate < 10], na.rm=T),
		    sum(full_table$rate_count[full_table$deCODE_rate >= 10],
								    na.rm=T)  )

xover_mat = rbind(decode_autosome_bins / sum(decode_autosome_bins),
		  decode_inform_bins / sum(decode_inform_bins),
		  decode_nco_bins / sum(decode_nco_bins) )

n_nco_xover_map = sum(decode_nco_bins)
n_nco_xover_map

png("xover-rate.png", width=540 * resolution / 72,
    height = 480 * resolution / 72, res = resolution)
par(mar=themar)
bp = barplot(xover_mat, beside=T,
	names = c("r<1.2", "1.2\u2264r<2.5", "2.5\u2264r<5","5\u2264r<7.5",
		  "7.5\u2264r<10", "r\u226510"),
	xlab="Crossover rate (cM/Mb)", ylab="Proportion of sites",
	ylim=c(0,ymax), col = colors, border=NA,
	cex.lab=labcex, cex.names=smlabcex, cex.axis=labcex)
axis(1, at=bp[2,], tick=F,
     labels=c("", "1.2\u2264r<2.5", "", "5\u2264r<7.5", "", ""),
     cex.axis=smlabcex)
legend("topright", c("Autosomal genome",
		     "Informative sites",
		     "NCO SNP sites"),
       fill = colors, border=NA, cex=legendcex)
dev.off()

#### Recombination hotspots:

# Examine hotspot enrichment first using deCODE map:

# Note that the rate_count == 0 sites should count as full (and unambiguous)
# conversions, so we explicitly count them:
n_xover_hotspots = sum(full_table$hotspot_count[full_table$deCODE_rate >= 10],
									na.rm=T)
n_xover_tested = sum(full_table$hotspot_count[!is.na(full_table$deCODE_rate)])
last_bin = length(decode_inform_bins)
# null expected proportion:
exp_xover_hotspot = decode_inform_bins[ last_bin ] / sum(decode_inform_bins)
n_xover_hotspots
n_xover_tested
exp_xover_hotspot
binom.test(n_xover_hotspots, n_xover_tested, exp_xover_hotspot, alternative="g")

# Now using HapMap LD-based map:

n_ld_hotspots = sum(full_table$hotspot_count[full_table$HapMap_rate >= 10])
n_ld_tested = sum(full_table$hotspot_count[!is.na(full_table$HapMap_rate)])
last_bin = length(decode_inform_bins)
exp_ld_prop_hotspot = ninformative_bins[ last_bin ] / sum(ninformative_bins)
exp_ld_prop_hotspot
n_ld_hotspots
n_ld_tested
exp_ld_prop_hotspot
binom.test(n_ld_hotspots, n_ld_tested, exp_ld_prop_hotspot, alternative="g")
# this inverts the counts, giving the probability of "failures" for the
# "failure" probability of 1-p --
pbinom(n_ld_tested-n_ld_hotspots, n_ld_tested, 1-exp_ld_prop_hotspot)

#### NCO rate with bias correction

# Approach: get rates in each of the 6 different recombination rate bins,
# then multiply them by the proportion of sites that fall in those bins
# genome-wide

sum(ninformative_bins)
n_nco_bins = c(sum(full_table$rate_count[full_table$HapMap_rate < 1.2]),
	      sum(full_table$rate_count[full_table$HapMap_rate >= 1.2 &
					full_table$HapMap_rate < 2.5]),
	      sum(full_table$rate_count[full_table$HapMap_rate >= 2.5 &
					full_table$HapMap_rate < 5]),
	      sum(full_table$rate_count[full_table$HapMap_rate >= 5 &
					full_table$HapMap_rate < 7.5]),
	      sum(full_table$rate_count[full_table$HapMap_rate >= 7.5 &
					full_table$HapMap_rate < 10]),
	      sum(full_table$rate_count[full_table$HapMap_rate >= 10]))
rate_bins = n_nco_bins / ninformative_bins
rate_corr = sum(rate_bins * ld_recomb_mat[1,])
rate_corr
# confidence interval estimated by bootstrap

#### Overlap with DSB and LD-defined hotspots

# DSB results are for unambiguous events (rate_count == 1 or 0) and individuals
# that do not carry the PRDM9 C allele
dsb_tab = subset(full_table, (rate_count==1 | rate_count==0) & rs6889665 != 1)

## NOTE: these results count each site individually; the paper reports overlap
## by event, with overlap counted if any site within an event overlaps. This
## only matters for the events that contain multiple SNPs.

# overlap for both sexes
dsb_num = sum(dsb_tab$DSB_prdm9A_hotspot == 1)
dsb_denom = sum(!is.na(dsb_tab$DSB_prdm9A_hotspot))
dsb_num
dsb_denom
dsb_num / dsb_denom

dsb_num_male = sum(dsb_tab$DSB_prdm9A_hotspot[dsb_tab$sex == 'M'] == 1)
dsb_denom_male = sum(!is.na(dsb_tab$DSB_prdm9A[ dsb_tab$sex == 'M' ]))
dsb_num_male
dsb_denom_male
dsb_num_male / dsb_denom_male

# LD results are for unambiguous events (rate_count == 1 or 0) in males
hotspot_tab = subset(full_table, (rate_count==1 | rate_count==0) & sex == 'M')

hotspot_num = sum(hotspot_tab$HapMap_hotspot == 1)
hotspot_denom = sum(!is.na(hotspot_tab$HapMap_hotspot))
hotspot_num
hotspot_denom
hotspot_num / hotspot_denom

# Jeffreys and May based NCO rate:
crossover_rate = 1.2e-6 # mean cM/Mb rate genome-wide
# lower and upper bounds from the paper; divide by 100 to get Morgans not cM
sperm_based_rate = c(crossover_rate / 100 * 4 * 55,
		     crossover_rate / 100 * 15 * 290)
sperm_based_rate


# Probability of two independent NCOs within the four intervals with clustering:
# rescale the cM distance from the HapMap2 map
cM_rate = c(0.034266, 0.277534)
mean_tract_length = 75  # assuming this: shorter means more tracts- conservative
p_tract = cM_rate / crossover_rate * rate_corr / 100
p_tract * p_tract       # probability of two independent events
