# Load the list of cTag-PAPERCLIP clusters present in granule cell precursors
# and adult granule cells.
# See Methods (section "Analysis of cTag-PAPERCLIP data") for details about how
# this file was obtained from raw sequencing data.
clusters <- read.table(
  "Math1_clusters_filtered_mm10_cutoff6_10percentcutoff_counted.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Keep input columns 1, 2 and 14 plus the six columns 15-20. The rest of the
# script addresses this subset both by name (name2, name_cluster, p0_rep*,
# p21_rep*) and by position (columns 4:9 are treated as the count columns).
# NOTE(review): the column layout of the input file is not visible here -
# confirm these positions against the file header before changing them.
clusters <- clusters[, c(1, 2, 14, 15:20)]

# Working copy without the first three (non-count) columns of the subset.
data <- clusters[, -c(1, 2, 3)]

# Sum up counts from all clusters (per gene). The result has one row per gene,
# with the gene name in column Group.1 followed by the six summed columns.
aggregated <- aggregate(clusters[, 4:9], by = list(clusters$name2), FUN = sum)

# Sum up counts from all replicates.
# all_p0 and all_p21 correspond to the sum of all counts in all clusters of a
# gene in all replicates.
aggregated$all_p0 <- aggregated$p0_rep1 + aggregated$p0_rep2 +
  aggregated$p0_rep4
aggregated$all_p21 <- aggregated$p21_rep1 + aggregated$p21_rep2 +
  aggregated$p21_rep4

# Sum up counts - per cluster - from all replicates.
data$p0_sum <- data$p0_rep1 + data$p0_rep2 + data$p0_rep4
data$p21_sum <- data$p21_rep1 + data$p21_rep2 + data$p21_rep4

# Add the per-gene totals to each cluster of that gene. The gene row is looked
# up once and the totals are taken by column name rather than by position, so
# the lookup does not depend on how many columns `aggregated` has.
gene_row <- match(clusters$name2, aggregated$Group.1)
clusters$all_p0 <- aggregated$all_p0[gene_row]
clusters$all_p21 <- aggregated$all_p21[gene_row]

# For each cluster, calculate the sum of all counts per gene minus the sum of
# counts in that particular cluster.
data$all_p0_minus_p0_sum <- clusters$all_p0 - data$p0_sum
data$all_p21_minus_p21_sum <- clusters$all_p21 - data$p21_sum

# Keep only the four cells of the 2x2 contingency table. The columns are
# selected by name because the Fisher's exact test step fills its matrix from
# each row in exactly this order.
data <- data[, c("p0_sum", "p21_sum",
                 "all_p0_minus_p0_sum", "all_p21_minus_p21_sum")]

# Contingency table for Fisher's exact test (one table per cluster A):
#
#   sum of counts in cluster A     sum of counts in all other clusters
#   in cell type 1                 in the gene in cell type 1
#
#   sum of counts in cluster A     sum of counts in all other clusters
#   in cell type 2                 in the gene in cell type 2
#
# Each row of `data` holds the four cells in column-major order, i.e.
# (p0_sum, p21_sum, all_p0_minus_p0_sum, all_p21_minus_p21_sum), so filling a
# two-row matrix from it puts the p0 values in row 1 and the p21 values in
# row 2.

# Fisher's exact test, one per cluster.
n <- nrow(data)
p <- numeric(n)         # p-value per cluster
oddratio <- numeric(n)  # odds-ratio estimate per cluster

# seq_len() yields an empty sequence when there are no clusters, whereas 1:n
# would iterate over c(1, 0) and fail inside fisher.test().
for (i in seq_len(n)) {
  # Progress indicator: print the row index every 1000 clusters.
  if (i %% 1000 == 0) {
    cat(sprintf("%d\n", i))
  }
  dat <- matrix(data = as.numeric(data[i, ]), nrow = 2)
  out <- fisher.test(dat)
  p[i] <- out$p.value
  oddratio[i] <- out$estimate
}

# Collect odds ratios (column 1) and p-values (column 2) side by side.
out <- cbind(oddratio, p)
clusters$pvalue <- out[, 2]
# Correction for multiple hypothesis testing (Benjamini-Hochberg).
clusters$fdr <- p.adjust(clusters$pvalue, method = "BH")
sig <- clusters[clusters$fdr < 0.05, ]

# NOTE(review): `clusters` was reduced to a 9-column subset at the top of the
# script, and columns 4:9 of it are the replicate count columns (the same range
# aggregated below). Columns 4 and 5 therefore look like counts, not genomic
# coordinates, so `start` and `end` are probably not what their names say, and
# `chr` receives whatever the third retained input column is. The lookup is
# kept as written because the intended source columns are not visible here -
# TODO confirm against the input file and fix the indices.
sig_row <- match(sig$name_cluster, clusters$name_cluster)
sig$chr <- clusters[sig_row, 3]
sig$start <- clusters[sig_row, 4]
sig$end <- clusters[sig_row, 5]

# These genes were manually removed because they were found to be incorrectly
# assigned as undergoing APA, for example due to alternative promoters.
# %in% never returns NA, so it cannot introduce all-NA rows into `sig`.
excluded_genes <- c("Camta1", "Pnkd", "Gabbr1", "Capn15",
                    "Chd9", "Mir3098", "Pip5k1b")
sig <- sig[!(sig$name2 %in% excluded_genes), ]
# The table above ("sig") corresponds to Supplementary Table 4 (Sheet 2).

# Per-gene sums of the replicate counts over the significant clusters.
# Original annotation on this line: 737 (presumably the resulting gene count).
sig_genes <- aggregate(sig[, 4:9], by = list(sig$name2), FUN = sum)
# The table above ("sig_genes") corresponds to Supplementary Table 4 (Sheet 1 -
# list of genes that contain cTag-PAPERCLIP clusters that change significantly
# between granule cell precursors and adult granule cells).
rownames(sig_genes) <- sig_genes$Group.1
