######################################################################################################################################################
## FASTQ ALIGNMENTS
#   Index genome using bwa
#   (lines starting with === are one-off commands to be run by hand, without the marker)
    ===   bwa index -p sacCer3 -a bwtsw sacCer3.fa
    ===   samtools faidx sacCer3.fa
#   Submit one alignment job per sample. A sample is every file in the current
#   directory named <sample><fastq_extension_1>.gz; the sample name is the file
#   name with that suffix removed.
    export fastq_dir="$(pwd)"
    export fastq_extension_1=_1.fq.clean
    export fastq_extension_2=_2.fq.clean
    export genome_name=sacCer3
    export genome_source=~
    export min_mapping_score=30
    export user=btaylor
    # Glob instead of parsing `ls`, and strip the suffix with parameter expansion:
    # the old sed pattern treated "." as "any character" and was not anchored.
    for fastq in *"${fastq_extension_1}.gz"; do
        [[ -e "$fastq" ]] || continue    # glob matched nothing
        sample=${fastq%"${fastq_extension_1}.gz"}
        # this script file requires modification to the users own file system
        qsub -cwd -j y -pe smp 1 -b y \
                yeast.GATK.alignment.pipeline.sh "$fastq_dir" "$sample" "$fastq_extension_1" "$fastq_extension_2" "$genome_name" "$genome_source" "$min_mapping_score" "$user"
    done

######################################################################################################################################################
## READ DEPTH ACROSS SAMPLES
    # Build the sequence dictionary for the reference, then compute depth of
    # coverage for input.bam with GATK DepthOfCoverage.
    # picard_directory and gatk_directory are not set anywhere in this file;
    # they must be defined in the environment before running these commands.
    java -Xmx4g -jar "${picard_directory}/CreateSequenceDictionary.jar" R=sacCer3.fa O=sacCer3.dict
    java -Xmx4g -jar "${gatk_directory}/GenomeAnalysisTK.jar" \
                                     -T DepthOfCoverage \
                                     -R sacCer3.fa \
                                     -o output.coverage.txt \
                                     -I input.bam

######################################################################################################################################################
## SOMATIC SNIPER
## This calls SNVs on each sample against a single reference bam file. It outputs a vcf file for filtering
   # Submit one SomaticSniper job per *.bam file found in $fastq_dir.
   export genome_dir=genomes
   export genome_fa=sacCer3.fa
   # placeholder: replace with the name of the reference bam file.
   # Quoted so the asterisks are never glob-expanded.
   export ref_bam='*****'
   export ploidy=2
   if cd "$fastq_dir"; then
      # Glob instead of parsing `ls`; "${bam%.bam}" removes only the trailing
      # ".bam" (the old sed "s/.bam//g" removed any character followed by "bam",
      # anywhere in the name).
      for bam in *.bam; do
         [[ -e "$bam" ]] || continue    # glob matched nothing
         sample=${bam%.bam}
         # this script file requires modification to the users own file system
         qsub -cwd -j y -pe smp 1 -b y \
                 yeast_somatic_sniper.sh "$fastq_dir" "$sample" "$ref_bam" "$genome_dir" "$genome_fa" "$user" "$ploidy"
      done
   else
      printf 'cannot cd to fastq_dir (%s); no SomaticSniper jobs submitted\n' "$fastq_dir" >&2
   fi

######################################################################################################################################################
## FILTER
## somatic score > 50, allelic freq > 0.3, read count >4, avg_pos_as_fraction (REF OR SAMPLE) < 0.1, avg_distance_to_effective_3p_end < 0.1, ave base quality < 20,ave read length > 50 bp
    # Index the reference bam, then submit one filtering job per vcf file in
    # $fastq_dir. Relies on ref_bam, fastq_dir and user having been exported
    # by the earlier sections.
    export genome=sacCer3.fa
    export reference_bam="$ref_bam"
    if cd "$fastq_dir"; then
        ~btaylor/car_group/samtools-0.1.19/samtools index "$ref_bam"
        # Glob instead of parsing `ls`, so names with spaces survive intact.
        for sample_vcf in *vcf; do
            [[ -e "$sample_vcf" ]] || continue    # glob matched nothing
            # this script file requires modification to the users own file system
            qsub -cwd -j y -pe smp 1 -b y \
                yeast_filtering.sh "$fastq_dir" "$sample_vcf" "$reference_bam" "$genome" "$user"
        done
    else
        printf 'cannot cd to fastq_dir (%s); no filtering jobs submitted\n' "$fastq_dir" >&2
    fi

#####################################################################################################################################################
## COMBINE FILTERED FILES FOR R/BIOCONDUCTOR ANALYSIS
    #######################################
    # Merge every *somatic-sniper.filtered.txt table in the current directory
    # into combined_variants.txt, appending a tab-separated "sample" column.
    # The sample name is the file name with the
    # ".q30.dedup.RR.somatic-sniper.filtered.txt" suffix removed.
    # Intermediate files (<file>.tmpR, head.tmpR.head, head.tmpR2.head) are
    # still written, as before.
    # Outputs:   combined_variants.txt (header taken from the last input file)
    # Returns:   0 on success, 1 if no input file exists or a step fails
    #######################################
    combine_filtered_variants() {
        local suffix='.q30.dedup.RR.somatic-sniper.filtered.txt'
        local line nom
        local -a parts=()
        for line in *somatic-sniper.filtered.txt; do
            [[ -e "$line" ]] || continue    # glob matched nothing
            nom=${line%"$suffix"}
            # drop the header line and append the sample name as a last column
            awk -v nom="$nom" 'BEGIN { OFS = "\t" } NR > 1 { print $0, nom }' \
                < "$line" > "$line.tmpR" || return 1
            head -n 1 < "$line" > head.tmpR.head || return 1
            parts+=("$line.tmpR")
        done
        if (( ${#parts[@]} == 0 )); then
            printf 'no *somatic-sniper.filtered.txt files found in %s\n' "$PWD" >&2
            return 1
        fi
        # Exactly one tab before "sample": the previous paste + echo -e "\t"sample
        # produced two, leaving the header one field longer than the data rows.
        awk 'BEGIN { OFS = "\t" } { print $0, "sample" }' head.tmpR.head > head.tmpR2.head || return 1
        # Concatenate only the files written above, not every stale *tmpR.
        cat head.tmpR2.head "${parts[@]}" > combined_variants.txt
    }
    combine_filtered_variants

######################################################################################################################################################
## R/BIOCONDUCTOR ANALYSIS
## UNIX COMMANDS INDICATED BY LINE STARTING ===
## All other analysis in R/Bioconductor

library(GenomicFeatures)
library(rtracklayer)
library(IRanges)
library(seqLogo)
library(Biostrings)
library(BSgenome)
library(BSgenome.Scerevisiae.UCSC.sacCer3)
library(ggplot2)	
library(lattice)
library(gridExtra)
library(latticeExtra)
library(reshape)
library(data.table)
library(zoo)
library(Vennerable)

#### GENERATE INITIAL DATABASE
# Read the mutation table, orient every mutation to its pyrimidine strand and
# return the C/G reference-base mutations as a GRanges object.
a<-local({
    tab<-read.delim('Supplementary-file-1.txt', header=T, sep='\t')

    # reverse complement of a vector of bases, returned as character
    revcomp<-function(bases) as.character(reverseComplement(DNAStringSet(bases)))

    tab$ref<-tab$ref_base
    tab$mut<-tab$mut_base
    # mutations whose reference base is a purine are reported on the other strand
    purine<-tab$ref %in% c('A', 'G')

    tab$ref_pyr<-tab$ref
    tab$ref_pyr[purine]<-revcomp(tab$ref[purine])
    tab$mut_pyr<-tab$mut
    tab$mut_pyr[purine]<-revcomp(tab$mut[purine])
    tab$ft_pyr<-paste(tab$ref_pyr, tab$mut_pyr, sep='>')

    tab$strand<-factor('+', levels=c('+', '-'))
    tab$strand[purine]<-'-'

    tab$seq_pyr<-tab$flanking_sequence_pyrimidine_context
    tab$mut_name<-paste(tab$chr, tab$pos, tab$ref, sep='_')
    tab$treat<-tab$mutator
    tab$sample<-tab$clone

    ### convert to GRanges object
    gr<-GRanges(seqnames = tab$chr, ranges =IRanges(start=tab$pos, end=tab$pos), strand=tab$strand,
                group=tab$treat, clone=tab$clone, sample=tab$sample, seqinfo=seqinfo(Scerevisiae), genotype='BY4743_UNG-/-',
                ref=tab$ref, mut=tab$mut, ref_pyr=tab$ref_pyr, mut_pyr=tab$mut_pyr, ft_pyr=tab$ft_pyr, seq_pyr=tab$seq_pyr, allelic_freq=tab$allelic_freq,
                allele_1_phase=tab$allele_1_phase, allele_2_phase=tab$allele_2_phase)
    # keep only mutations at C or G reference bases
    gr[gr$ref %in% c('C','G'),]
})
# Generate random datasets
    #source('~btaylor/rscripts/montecarlo.sh')

    ## locate every WRC, YCC and C motif occurrence on both strands of the genome;
    ## `matches` is a list with one GRanges per motif, in the order of dict0
        dict0 <- list(DNAString('WRC'), DNAString('YCC'), DNAString('C'))
        matches <- mclapply(dict0, mc.cores=3, function(motif) {
            per.chr <- lapply(as.list(seqnames(Scerevisiae)), function(chr) {
                chr.seq <- Scerevisiae[[chr]]
                # IUPAC ambiguity codes in the motif are honoured (fixed=FALSE)
                fwd.hits <- matchPattern(motif, chr.seq, fixed=FALSE)
                rc.hits <- matchPattern(reverseComplement(motif), chr.seq, fixed=FALSE)
                c(GRanges(seqnames=chr, ranges=IRanges(ranges(fwd.hits)), strand='+', seqinfo=seqinfo(Scerevisiae)),
                  GRanges(seqnames=chr, ranges=IRanges(ranges(rc.hits)), strand='-', seqinfo=seqinfo(Scerevisiae)))
            })
            hits <- unlist(GRangesList(per.chr))
            hits$group <- as.character(motif)
            hits
        })
    ### generate random dataset at equivalent mutation rates
    # For every mutator group, simulate as many clones as the group really has.
    # Each simulated clone receives floor(mean mutations per real clone) sites,
    # drawn uniformly with replacement from the motif pool of that mutator
    # (matches[[1]] for AID*, [[2]] for sA3G*, [[3]] for EMS).
    # NOTE(review): the draws happen in forked workers; for reproducible output
    # a parallel-safe RNG seed would have to be set before this call - confirm
    # against the parallel package documentation.
        a.s <- split(a, a$group)
        random<-unlist(GRangesList(mclapply(a.s, mc.cores=3, function(x) {
            tmp<-as.data.frame(x)
            # mean number of mutations per clone (variable name kept for history)
            med.x<-mean(aggregate(tmp$sample, by=list(tmp$sample), length)$x)
            nsamples<-length(unique(tmp$sample))

            # fail with a clear message instead of indexing a data.frame by accident
            grp<-as.character(x$group[1])
            pool.index<-switch(grp, 'AID*'=1, 'sA3G*'=2, 'EMS'=3,
                               stop("no motif pool defined for group '", grp, "'"))
            pool<-matches[[pool.index]]
            n.draw<-floor(med.x)

            final<-GRanges()
            for(i in seq_len(nsamples)) {
                # sample.int draws integer indices over 1..length(pool); the previous
                # runif(min=1, max=length) indices were truncated, so the last site
                # could never be selected
                tmp<-pool[sample.int(length(pool), size=n.draw, replace=TRUE)]
                tmp$group<-paste(tmp$group, 'med', sep='_')
                tmp$clone<-paste('sim', 'med', i, sep='_')

                final<-c(final, tmp)
            }
            #reduce down to single point mutation: last base of a + strand match,
            #first base of a - strand match
            on.plus<-as.character(strand(final))=='+'
            start(final)[on.plus]<-end(final)[on.plus]
            on.minus<-as.character(strand(final))=='-'
            end(final)[on.minus]<-start(final)[on.minus]
            final
            })
            ))

    # Annotate the simulated mutations with the same metadata columns as the
    # real data, then append them to `a`.
    random$sample<-random$clone
    random$genotype<-'DIP_UNG-/-'
    # reference base at each simulated site, always read from the + strand
    random$ref<-getSeq(Scerevisiae, seqnames(random), start(random), end(random), strand='+', as.character=T)
        base<-c('A', 'C', 'G', 'T')
        # draw a mutant base for every site, then redraw until none equals its reference base
        random$mut<-sample(base, length(random), replace=TRUE)
        while(length(random$mut[random$mut==random$ref])>0){
            random$mut[random$mut==random$ref]<-sample(base, length(random$mut[random$mut==random$ref]), replace=TRUE)
            # progress output: number of sites still equal to their reference base
            print(length(random$mut[random$mut==random$ref]))
            }
        # remove sites 3 bases or less from the end of the chromosome
        tmp1<-as.data.frame(seqlengths(random))
        clength<-tmp1[,1][match(as.character(seqnames(random)), rownames(tmp1))]
        tmp2<-clength-start(random)
        random<-random[tmp2>3,]
        # remove sites too close to the beginning of the chromosome
            random<- random[start(random)>3,]
        # now can getSeq without problems
    # strand-aware sequence at the site itself
    random$ref_pyr<-getSeq(Scerevisiae, random, as.character=TRUE)
    # mutant base, reverse-complemented for sites on the - strand
    random$mut_pyr<-random$mut
    random$mut_pyr[as.character(strand(random))=='-']<-as.character(reverseComplement(DNAStringSet(random$mut[as.character(strand(random))=='-'])))
    random$ft_pyr<-paste(random$ref_pyr, random$mut_pyr, sep='>')
    # 5-base window (site +/- 2) read on the strand of the site
    random$seq_pyr<-getSeq(Scerevisiae, seqnames(random), (start(random)-2), (end(random)+2), strand=strand(random), as.character=T)
    # fixed placeholder values for columns that only exist for real mutations
    random$allelic_freq<-0.5
    random$allele_1_phase<-NA 
    random$allele_2_phase<-NA 
    # combine mutation and random datasets
        # coerce to character so the metadata columns of both objects are combined as character
        a$group<-as.character(a$group)
        a$clone<-as.character(a$clone)
        a$sample<-as.character(a$sample)       
    a<-c(a, random)
    names(a)<-1:length(a)

## END NEW RANDOM DATABASE


# FEATURES DATABASE
# Feature table downloaded from Yeastmine
    # Build the genomic feature annotation (`tran`) from the Yeastmine export:
    # chromosome from V8, coordinates from V5/V6, strand code from V7,
    # name from V1 (falling back to V4 when empty) and feature type from V4.
    tran<-local({
        feat<-read.table(file='~btaylor/yeastmine2.tsv', header=F)
        chrom<-gsub('chrmt', 'chrM', feat$V8)

        # recode the numeric strand codes into GRanges strand symbols
        strand.code<-feat$V7
        strand.code[strand.code=='0']<-'*'
        strand.code[strand.code=='-1']<-'-'
        strand.code[strand.code=='1']<-'+'

        feat.name<-as.character(feat$V1)
        unnamed<-feat.name==''
        feat.name[unnamed]<-as.character(feat$V4[unnamed])
#    tran$V2<-as.character(tran$V2)
#    tran$V2[tran$V2=='']<-tran$V4[tran$V2=='']
        gr<-GRanges(seqnames=chrom, ranges=IRanges(start=feat$V5, end=feat$V6), strand=as.character(strand.code), name=feat.name, type=feat$V4, seqinfo=seqinfo(Scerevisiae))
        # drop feature types listed here
        gr<-gr[!gr$type %in% c('not_physically_mapped', 'not_in_systematic_sequence_of_S288C', 'insertion'),]
        gr$type<-as.character(gr$type)
        gr
    })


## Transcription Start site database from Xu2009
## The === lines below are unix commands (run them in a shell without the === marker).
## They download the four Xu 2009 gff3 tracks and flatten each into a .gff4 table:
## ';' becomes a tab, the attribute keys ("Name=", "ID=", ...) are deleted so only
## the values remain, and comment lines (starting with '#') and empty lines are removed.
## The ORF-Ts and other_transcripts pipelines also replace spaces with '_';
## the SUTs and CUTs pipelines do not.
===     # download and modify files in unix
===     wget http://downloads.yeastgenome.org/published_datasets/Xu_2009_PMID_19169243/track_files/Xu_2009_ORF-Ts_V64.gff3        
===     wget http://downloads.yeastgenome.org/published_datasets/Xu_2009_PMID_19169243/track_files/Xu_2009_SUTs_V64.gff3
===     wget http://downloads.yeastgenome.org/published_datasets/Xu_2009_PMID_19169243/track_files/Xu_2009_other_transcripts_V64.gff3
===     wget http://downloads.yeastgenome.org/published_datasets/Xu_2009_PMID_19169243/track_files/Xu_2009_CUTs_V64.gff3
===     cat Xu_2009_ORF-Ts_V64.gff3 | tr ';' '\t' | sed 's/Name=//g' | sed 's/ /_/g' | sed 's/source=//g' | sed 's/ID=//g' | sed 's/end_confidence=//g' |\
===             sed 's/shares_5NFR_in_tandem_with=//g' | sed 's/shares_5NFR_in_divergent_configuration_with=//g' | sed 's/initiates_from_3NFR_of_and_opposite_strand_to=//g' | sed 's/initiates_from_3NFR_of_and_same_strand_to=//g' |\
===              grep -v "^#" | grep -v "^$" > Xu_2009_ORF-Ts_V64.gff4
===     cat Xu_2009_SUTs_V64.gff3 | tr ';' '\t' | sed 's/Name=//g' | sed 's/source=//g' | sed 's/ID=//g' | sed 's/end_confidence=//g' |\
===             sed 's/shares_5NFR_in_divergent_configuration_with=//g' | sed 's/shares_5NFR_in_tandem_with=//g' | sed 's/initiates_from_3NFR_of_and_opposite_strand_to=//g' | sed 's/initiates_from_3NFR_of_and_same_strand_to=//g' |\
===             sed 's/sense_antisense_pair_with=//g' | sed 's|expr_in_YPE/YPD=||g' | sed 's|expr_in_YPGal/YPD=||g' | sed 's|expr_of_SUT_in_sense_antisense_pair_in_YPE/YPD=||g' |\
===             sed 's|expr_of_SUT_in_sense_antisense_pair_in_YPGal/YPD=||g' | sed 's|expr_of_SUT_sharing_5NFR_in_YPE/YPD=||g' | sed 's|expr_of_SUT_sharing_5NFR_in_YPGal/YPD=||g' |\
===              grep -v "^#" | grep -v "^$" > Xu_2009_SUTs_V64.gff4
===     cat Xu_2009_other_transcripts_V64.gff3 | tr ';' '\t' | sed 's/Name=//g' | sed 's/ /_/g' | sed 's/source=//g' | sed 's/ID=//g' | sed 's/end_confidence=//g' |\
===             sed 's/Alias=//g' | sed 's/Shares_5NFR_in_divergent_configuration_with=//g' | sed 's/Shares_5NFR_in_tandem_with=//g' | sed 's/Initiates_from_3NFR_of_and_opposite_strand_to=//g' |\
===             sed 's/Initiates_from_3NFR_of_and_same_strand_to=//g' |\
===              grep -v "^#" | grep -v "^$" > Xu_2009_other_transcripts_V64.gff4
===     cat Xu_2009_CUTs_V64.gff3 | tr ';' '\t' | sed 's/Name=//g' | sed 's/source=//g' | sed 's/ID=//g' | sed 's/end_confidence=//g' |\
===             sed 's/shares_5NFR_in_divergent_configuration_with=//g' | sed 's/shares_5NFR_in_tandem_with=//g' | sed 's/initiates_from_3NFR_of_and_opposite_strand_to=//g' | sed 's/initiates_from_3NFR_of_and_same_strand_to=//g' |\
===              grep -v "^#" | grep -v "^$" > Xu_2009_CUTs_V64.gff4

# Read the four flattened Xu 2009 tables (no header, columns named V1, V2, ...).
x1<-read.table(file='~btaylor/Mutation_Database/Xu_2009_ORF-Ts_V64.gff4', header=F)
x2<-read.table(file='~btaylor/Mutation_Database/Xu_2009_SUTs_V64.gff4', header=F)
x3<-read.table(file='~btaylor/Mutation_Database/Xu_2009_other_transcripts_V64.gff4', header=F)
x4<-read.table(file='~btaylor/Mutation_Database/Xu_2009_CUTs_V64.gff4', header=F)

# Xu2009: named list of GRanges, one element per transcript class (ORF, SUT, OT, CUT).
# In every element V1 is the sequence name, V4/V5 the start/end, V7 the strand,
# V3 the `type` column and V2 the `data` column; columns from V9 onwards are
# stored under the attribute names given below.
# NOTE(review): the pairing of V9+ with attribute names assumes the attribute
# order inside the gff3 files - verify against the downloaded files.
Xu2009<-list(
ORF=GRanges(seqnames=x1$V1, ranges=IRanges(start=x1$V4, end=x1$V5), strand=x1$V7, type=x1$V3, data=x1$V2, 
                    ID=x1$V9,
                    Name=x1$V10,
                    end_confidence=x1$V11,
                    source=x1$V12,
                    shares_5NFR_in_divergent_configuration_with=x1$V13,
                    shares_5NFR_in_tandem_with=x1$V14,
                    initiates_from_3NFR_of_and_opposite_strand_to=x1$V15,
                    initiates_from_3NFR_of_and_same_strand_to=x1$V16,
                    seqinfo=seqinfo(Scerevisiae)),
# SUT carries seven extra columns (V17-V23) compared with ORF
SUT=GRanges(seqnames=x2$V1, ranges=IRanges(start=x2$V4, end=x2$V5), strand=x2$V7, type=x2$V3, data=x2$V2, 
                    ID=x2$V9,
                    Name=x2$V10,
                    end_confidence=x2$V11,
                    source=x2$V12,
                    shares_5NFR_in_divergent_configuration_with=x2$V13,
                    shares_5NFR_in_tandem_with=x2$V14,
                    initiates_from_3NFR_of_and_opposite_strand_to=x2$V15,
                    initiates_from_3NFR_of_and_same_strand_to=x2$V16,
                    sense_antisense_pair_with=x2$V17,
                    expr_YPE_YPD=x2$V18,
                    expr_YPGal_YPD=x2$V19,
                    expr_in_sense_antisense_pair_in_YPE_YPD=x2$V20,
                    expr_in_sense_antisense_pair_in_YPGal_YPD=x2$V21,
                    expr_sharing_5NFR_in_YPE_YPD=x2$V22,
                    expr_sharing_5NFR_in_YPGal_YPD=x2$V23,
                    seqinfo=seqinfo(Scerevisiae)),
# OT has an additional Name2 column (V11), so the remaining attributes are shifted by one
OT=GRanges(seqnames=x3$V1, ranges=IRanges(start=x3$V4, end=x3$V5), strand=x3$V7, type=x3$V3, data=x3$V2, 
                    ID=x3$V9,
                    Name=x3$V10,
                    Name2=x3$V11,
                    end_confidence=x3$V12,
                    source=x3$V13,
                    shares_5NFR_in_divergent_configuration_with=x3$V14,
                    shares_5NFR_in_tandem_with=x3$V15,
                    initiates_from_3NFR_of_and_opposite_strand_to=x3$V16,
                    initiates_from_3NFR_of_and_same_strand_to=x3$V17,
                    seqinfo=seqinfo(Scerevisiae)),
CUT=GRanges(seqnames=x4$V1, ranges=IRanges(start=x4$V4, end=x4$V5), strand=x4$V7, type=x4$V3, data=x4$V2, 
                    ID=x4$V9,
                    Name=x4$V10,
                    end_confidence=x4$V11,
                    source=x4$V12,
                    shares_5NFR_in_divergent_configuration_with=x4$V13,
                    shares_5NFR_in_tandem_with=x4$V14,
                    initiates_from_3NFR_of_and_opposite_strand_to=x4$V15,
                    initiates_from_3NFR_of_and_same_strand_to=x4$V16,
                    seqinfo=seqinfo(Scerevisiae))
)

## Rhee TRANSCRIPTIONAL START SITE Dataset
===     wget http://downloads.yeastgenome.org/published_datasets/Rhee_2012_PMID_22258509/track_files/Rhee_2012_TATA_elements_V64.gff3
===     cat Rhee_2012_TATA_elements_V64.gff3 | tr ";" "\t" | sed '/^#/d' | sed '/^$/d' > Rhee_2012_TATA_elements_V64.2.gff3
# Build `rhee`, a GRanges of TATA elements: the attribute columns of the
# flattened gff3 are stripped of their "key=" prefixes and kept as metadata.
rhee<-local({
    tata<-read.table(file='/lmb/home/btaylor/Mutation_Database/Rhee_2012_TATA_elements_V64.2.gff3')     #### sacCer3 TATA positions
    # delete every match of `pattern` from `field`
    strip<-function(pattern, field) gsub(pattern, "", field)
    GRanges(seqnames=tata$V1, ranges=IRanges(start=tata$V4, end=tata$V5), strand=tata$V7,
            gene=strip("_TATA.*", strip("Name=", tata$V10)),    # text before "_TATA"
            class=strip("rna_class=", tata$V12),
            mismatch=strip("num_TATA_mismatch=", tata$V24),
            type=strip(".*_TATA_", tata$V10),                   # text after "_TATA_"
            taf.class=strip("taf1_class=", tata$V13),
            seqinfo=seqinfo(Scerevisiae))
})

###############################################################
## DETERMINE MUTATIONALLY ENRICHED LOCI
###############################################################
        # count the occurrences of each motif on both strands of the genome
        # --> total number of targetable sites; `matches` becomes a data.frame
        #     with one row per motif (columns number.hits, motif)
        dict0 <- list(DNAString('WRC'), DNAString('YCC'), DNAString('C'))
        matches <- do.call(rbind, mclapply(dict0, mc.cores=3, function(motif) {
            per.chr <- lapply(as.list(seqnames(Scerevisiae)), function(chr) {
                chr.seq <- Scerevisiae[[chr]]
                fwd.hits <- matchPattern(motif, chr.seq, fixed=FALSE)
                rc.hits <- matchPattern(reverseComplement(motif), chr.seq, fixed=FALSE)
                c(GRanges(seqnames=chr, ranges=IRanges(ranges(fwd.hits)), strand='+', seqinfo=seqinfo(Scerevisiae)),
                  GRanges(seqnames=chr, ranges=IRanges(ranges(rc.hits)), strand='-', seqinfo=seqinfo(Scerevisiae)))
            })
            all.hits <- unlist(GRangesList(per.chr))
            data.frame(number.hits=length(all.hits), motif=as.character(motif))
        }))

    # step 1. tile every chromosome with overlapping windows:
    #         width `bin`, a new window starting every ceiling(bin/sep) bases;
    #         trim() clips windows that run past the chromosome end
        bin <- 150
        sep <- 10
        li <- as.list(seqlengths(Scerevisiae))
        histo <- unlist(GRangesList(lapply(names(li), function(chr) {
            window.starts <- seq(1, li[[chr]], ceiling(bin/sep))
            windows <- GRanges(seqnames=chr, ranges=IRanges(start=window.starts, width=bin), seqinfo=seqinfo(Scerevisiae))
            trim(windows)
        })))

    # step 2. per group, count the mutations falling into each window and keep
    #         the windows with at least one mutation
        a <- subset(a, a$ref %in% c('C','G'))
        a$ref <- as.character(a$ref)

        a.s <- split(a, a$group)

        a1 <- mclapply(names(a.s), mc.cores=12, function(grp){
            muts <- a.s[[grp]]
            strand(muts) <- '*'                              # count regardless of strand
            windows <- histo
            windows$count <- countOverlaps(windows, muts)    # mutations per window
            windows$sample <- grp
            windows[windows$count > 0]
        })
        names(a1) <- lapply(a1, function(w){ w$sample[1] } )

        a1 <- unlist(GRangesList(a1))
        # motif targeted by each group (real mutator or its simulated counterpart)
        a1$motif <- ifelse(a1$sample %in% c('sA3G*', 'YCC_med'), 'YCC',
                    ifelse(a1$sample %in% c('AID*', 'WRC_med'), 'WRC',
                    ifelse(a1$sample %in% c('EMS', 'C_med'), 'C', NA)))

    # step 3. threshold #1 by probability
        # one row per group: nmuts.Group.1 = group name, nmuts.x = number of mutations,
        # nsamples = number of distinct samples in the group
        cut.offs2<-data.frame(nmuts=aggregate(a$sample, by=list(a$group), length), nsamples=aggregate(a$sample, by=list(a$group), function(x) { length(unique(x))})$x)
        # mean number of mutations per sample
        cut.offs2$mean<-cut.offs2$nmuts.x/cut.offs2$nsamples
            # motif associated with each group, and its genome-wide count taken from `matches`
            cut.offs2$motif<-NA
            cut.offs2$motif[cut.offs2$nmuts.Group.1 %in% c('sA3G*', 'YCC_med')]<-'YCC'
            cut.offs2$motif[cut.offs2$nmuts.Group.1 %in% c('AID*', 'WRC_med')]<-'WRC'
            cut.offs2$motif[cut.offs2$nmuts.Group.1 %in% c('EMS', 'C_med')]<-'C'
            cut.offs2$size<-matches$number.hits[match(cut.offs2$motif, matches$motif)]
            
            cut.offs2$cut.off<-NA
        # per group: upper-tail binomial quantile at probability 1e-2, with
        # ceiling(mean) trials and per-trial probability ceiling(mean)/size
        for(i in 1:nrow(cut.offs2)){
            cut.offs2$cut.off[i]<-qbinom(1e-2, size=ceiling(cut.offs2$mean[i]), prob=ceiling(cut.offs2$mean[i])/cut.offs2$size[i], lower.tail=F)
            }
            # attach the cut-off of its group to every window
            a1$cut.off<-cut.offs2$cut.off[match(a1$sample, cut.offs2$nmuts.Group.1)]

        #remove fragments below cutoff (windows are kept only when count is strictly above it)
        a2<-a1[a1$count > a1$cut.off,]

    # step 4. per group, merge windows that overlap or lie within 150 bp of each
    #         other, then recount the group's mutations inside each merged region
        a2 <- unlist(GRangesList(lapply(split(a2, a2$sample), function(win){
                grp <- win$sample[1]
                merged <- reduce(win, min.gapwidth=150)
                merged$sample <- grp
                merged$count <- countOverlaps(merged, a[a$group==grp,])
                merged
                })))

    # step 5. threshold #2 MELs to have minimum mutation count of > 3
        a2 <- a2[a2$count > 3,]

    # step 6. remove edge mutations
        # Each merged region is processed on its own: sparsely mutated positions
        # are dropped and the region is rebuilt from the remaining positions.
        a2$code<-1:length(a2)
        a2s<-split(a2, a2$code)
        a1.3<-mclapply(names(a2s),  mc.cores=12, function(x){
                    if( (as.numeric(x) %% 100)==0 ) print(x)    # simple counter every 100
                    x<-a2s[[x]]
                    # z: mutations of the same group that fall inside this region
                    z<-a[a$group==x$sample]
                    z<-z[z %over% x]
                    # y: one 1-bp range per position of the region, with its mutation count
                    y<-GRanges(seqnames=seqnames(x), ranges=IRanges(start=seq(start(x), end(x), 1), width=1), seqinfo=seqinfo(Scerevisiae))
                    y$sample<-x$sample[1]
                    y$count<-countOverlaps(y, z)		# counts at each point in a
                    y<-y[y$count>0]
                    # remove counts below lower quartile
                    # single mutated position: just record how many distinct samples hit it
                    if(length(y)==1) y$nsample<-length(unique(z$sample[z %over% y]))
                    if(length(y)>1) { 
                        y<-y[y$count >= quantile(y$count,0.25)[[1]] ]
                            # try remove counts below peak - 4SD
                        if(length(y)>2) y<-y[y$count >= (max(y$count)-(sd(y$count)*4)),]
                        if(median(y$count)>1) y<-y[y$count > 1 ]        # remove end bases when hotspot is made up of more densily mutated regions  ==> doesn't work if there are lots of flanking 1
                        y<-reduce(y, min.gapwidth=150)     # bring the hotspot together
                        if(length(y)>0) {                               
                                    # recount mutations and distinct samples for each rebuilt region
                                    y$count<-countOverlaps(y, z)
                                    y$sample<-x$sample[1]   
                                    names(y)<-1:length(y)
                                    y<-split(y, names(y))
                                    y<-lapply(y, function(q2){
                                        q2$nsample<-length(unique(z$sample[z %over% q2]))
                                        q2
                                        })
                                    y<-unlist(GRangesList(y))
                                    names(y)<-1:length(y)
                                    }
                        }
                    # returned as a data.frame so the per-region results can be row-bound
                    as.data.frame(y)
                    })
                a1.3<-do.call(rbind, a1.3)
                # back to GRanges; the per-region `nsample` column is stored as `nsamples`
                a1.3<-GRanges(seqnames=a1.3$seqnames, ranges=IRanges(start=a1.3$start, end=a1.3$end), 
                            strand='*', count=a1.3$count, sample=a1.3$sample, 
                            nsamples=a1.3$nsample, seqinfo=seqinfo(Scerevisiae))

    # step 7. threshold #3 MELs to have minimum mutation count of > 5
        a1.3<-a1.3[a1.3$count>5]

    # step 8. threshold #4 MELs to be comprising mutations from at least 4 samples
        a1.3<-a1.3[a1.3$nsamples > 3]

###############################################################
## FIGURE 1D MUTATIONS PER CHROMOSOME
###############################################################

    # Count mutations per group, sample, chromosome and reference base, and attach
    # the chromosome length to each row (tmp3 feeds the plots and correlation tests).
    chrl<-as.data.frame(seqinfo(Scerevisiae))
    tmp <-aggregate(a$sample, by=list(a$group, a$sample, as.character(seqnames(a)), a$ref), length)

    tmp3<-tmp
    # use the full column name: `chrl$seqlength` only worked through partial
    # matching of `$` on the `seqlengths` column
    tmp3$chrl<-chrl$seqlengths[match(tmp3$Group.3, rownames(chrl))]
    tmp3<-subset(tmp3, tmp3$Group.1 %in% c('AID*','sA3G*', 'EMS'))
    # counts at G reference bases are stored as negative numbers
    tmp3$x[tmp3$Group.4=='G']<- -1 * tmp3$x[tmp3$Group.4=='G']
    tmp3$Group.1<-factor(tmp3$Group.1, levels=c('AID*','sA3G*','EMS'))

    # One panel per mutator (p2 = AID*, p3 = sA3G*, p4 = EMS): mutation count per
    # sample and chromosome (y, absolute value) against chromosome length (x),
    # coloured by reference base, with a linear fit per base. The three panels
    # differ only in the group plotted and in the y-axis limit.
    # NOTE(review): `size` is passed twice to geom_jitter (0.5 and 0.2) in each
    # panel; which value takes effect may depend on the ggplot2 version - confirm
    # and drop one of them.
    p2<-ggplot(subset(tmp3, tmp3$Group.1=='AID*'),aes(x=chrl, y=abs(x))) + 
            geom_smooth(se=FALSE, aes(colour=Group.4), size=0.5, method = "lm") +
            geom_jitter(aes(colour=Group.4), alpha=0.3, size=I(0.5), stat='identity', size=I(0.2), position = position_jitter(width = 50000, height=0)) +
            facet_wrap(~Group.1, scales='free_y') +
            scale_colour_manual(name='Base', values=c('G'='red','C'='black')) +
            theme_bw() +
            coord_cartesian(ylim = c(0,90)) + 
            theme(axis.ticks.x=element_blank(),
                    axis.text.x=element_blank(),
                    axis.text.y=element_text(size=6),
                    axis.title.y=element_text(size=7),
                    axis.ticks=element_blank(),
                    strip.text=element_text(size=6),
                    legend.position='none') +
            xlab('') +
            ylab('')  +
            theme(plot.margin = unit(c(0,0,0,0), "lines"))
    p3<-ggplot(subset(tmp3, tmp3$Group.1=='sA3G*'),aes(x=chrl, y=abs(x))) + 
            geom_smooth(se=FALSE, aes(colour=Group.4), size=0.5, method = "lm") +
            geom_jitter(aes(colour=Group.4), alpha=0.3, size=I(0.5), stat='identity', size=I(0.2), position = position_jitter(width = 50000, height=0)) +
            facet_wrap(~Group.1, scales='free_y') +
            scale_colour_manual(name='Base', values=c('G'='red','C'='black')) +
            theme_bw() +
            coord_cartesian(ylim = c(0,80)) + 
            theme(axis.ticks.x=element_blank(),
                    axis.text.x=element_blank(),
                    axis.text.y=element_text(size=6),
                    axis.title.y=element_text(size=7),
                    strip.text=element_text(size=6),
                    axis.ticks=element_blank(),
                    legend.position='none') +
            xlab('') +
            ylab('')  +
            theme(plot.margin = unit(c(0,0,0,0), "lines"))

    p4<-ggplot(subset(tmp3, tmp3$Group.1=='EMS'),aes(x=chrl, y=abs(x))) + 
            geom_smooth(se=FALSE, aes(colour=Group.4), size=0.5, method = "lm") +
            geom_jitter(aes(colour=Group.4), alpha=0.3, size=I(0.5), stat='identity', size=I(0.2), position = position_jitter(width = 50000, height=0)) +
            facet_wrap(~Group.1, scales='free_y') +
            scale_colour_manual(name='Base', values=c('G'='red','C'='black')) +
            theme_bw() +
            coord_cartesian(ylim = c(0,30)) + 
            theme(axis.ticks.x=element_blank(),
                    axis.text.x=element_blank(),
                    axis.text.y=element_text(size=6),
                    axis.title.y=element_text(size=7),
                    strip.text=element_text(size=6),
                    axis.ticks=element_blank(),
                    legend.position='none') +
            xlab('') +
            ylab('')  +
            theme(plot.margin = unit(c(0,0,0,0), "lines"))

        # draw the three panels side by side
        grid.arrange(p2,p3,p4, nrow=1)

 
    # correlation statistics: one-sided (greater) Spearman test of mutation count
    # against chromosome length, run separately for AID*, sA3G* and EMS
        tmpx1<-tmp3[tmp3$Group.1=='AID*',]
        cor.test(tmpx1$chrl, abs(tmpx1$x), method = "spearman", alternative = "greater")

        tmpx1<-tmp3[tmp3$Group.1=='sA3G*',]
        cor.test(tmpx1$chrl, abs(tmpx1$x), method = "spearman", alternative = "greater")

        tmpx1<-tmp3[tmp3$Group.1=='EMS',]
        cor.test(tmpx1$chrl, abs(tmpx1$x), method = "spearman", alternative = "greater")

###############################################################
## FIGURE 2A RADIAL HISTOGRAMS
###############################################################
# split the genome into adjacent 1 kb windows to generate ranges for histogram data
    li <- as.list(seqlengths(Scerevisiae))
    histo <- unlist(GRangesList(lapply(names(li), function(chr){
        window.starts <- seq(1, li[[chr]], by=1000)
        trim(GRanges(seqnames=chr, ranges=IRanges(start=window.starts, width=1000), seqinfo=seqinfo(Scerevisiae)))
    })))

# count the mutations of each mutator group in every window (all windows are kept)
    a.s <- split(a, a$group)[c('AID*','sA3G*','EMS')]
    a1 <- mclapply(names(a.s), mc.cores=3, function(grp){
        muts <- a.s[[grp]]
        strand(muts) <- '*'                              # count regardless of strand
        windows <- histo
        windows$count <- countOverlaps(windows, muts)    # mutations per window
        windows$sample <- grp
        windows
    })
    x <- unlist(GRangesList(a1))

# remove the first and last window of every chromosome (telomeres: no read depth)
tmp<-split(histo, seqnames(histo))
tmp<-lapply(tmp, function(z){
    c(z[1], z[length(z)])
    })
tmp<-unlist(GRangesList(tmp))

x1<-x[!x %over% tmp]

# calculate Z scores (window counts centred and scaled within each group)
x1<-split(x1, x1$sample)
x1<-lapply(x1, function(y){
    y$scale<-scale(y$count, center=TRUE, scale=TRUE)
    y
    })
x1<-unlist(GRangesList(x1))
# two regions to highlight, labelled 'CAN1' (chrV) and 'BEST' (chrII)
tmp2<-GRanges(seqnames=c('chrV','chrII'), ranges=IRanges(start=c(26694,260001), end=c(38466,290000)), names=c('CAN1','BEST'), seqinfo=seqinfo(Scerevisiae))
# `over` records which highlighted region, if any, each window overlaps
x1$over<-'none'
x1$over[x1 %over% tmp2[1] ] <- tmp2$names[1]
x1$over[x1 %over% tmp2[2] ] <- tmp2$names[2]
x1$sample<-factor(x1$sample, levels=c('AID*','sA3G*','EMS'))

# widen every window to 40 kb around its centre
x2<-resize(x1, width=40000, fix='center')

library(ggbio)

# Circular plot, one facet per mutator: chromosome labels on the outside, an
# ideogram ring, and the window Z-scores as bars coloured by highlighted region.
# NOTE(review): `saccerIdeo` and its `seqnames2` column are not defined in the
# part of the file visible here - confirm they are created before this call.
ggplot() + 
        layout_circle(saccerIdeo, geom = "text", aes(label = seqnames2), radius = 35, space.skip = 0.01, size=2) + 
        layout_circle(x2, radius=10, trackWidth = 25, geom = "bar", aes(x=start, y=scale, fill=over, colour=over), size=0, space.skip = 0.01, position='identity', stat='identity', 
            grid = FALSE, grid.n = 3, grid.background='white', grid.line = "gray70") + 
            facet_wrap(~sample) +
        scale_fill_manual(values=c('none'='black','CAN1'='red','BEST'='green')) +
        scale_colour_manual(values=c('none'='black','CAN1'='red','BEST'='green')) +
        layout_circle(saccerIdeo, geom = "ideo", radius = 34,  trackWidth = 0.025, space.skip = 0.01) +
        theme(legend.position='none') 
             
###############################################################
## FIGURE 2B Mutation Z-score on chromosome II
###############################################################
library(ggbio)
# make mutation coverage file: adjacent 150 bp windows over every chromosome
    li <- as.list(seqlengths(Scerevisiae))
    histo <- unlist(GRangesList(mclapply(names(li), mc.cores=12, function(chr){
        window.starts <- seq(1, li[[chr]], 150)
        trim(GRanges(seqnames=chr, ranges=IRanges(start=window.starts, width=150), seqinfo=seqinfo(Scerevisiae)))
    })))
# count mutations per group in every window and record the window midpoint
    a.s <- split(a, a$group)
    a1 <- unlist(GRangesList(mclapply(names(a.s), mc.cores=12, function(grp){
        muts <- a.s[[grp]]
        strand(muts) <- '*'                              # count regardless of strand
        windows <- histo
        windows$count <- countOverlaps(windows, muts)    # mutations per window
        windows$sample <- grp
        windows$mid <- (start(windows)+end(windows))/2
        windows
    })))
# Z-score of the window counts within each group
    a1 <- unlist(GRangesList(mclapply(split(a1, a1$sample), function(grp.windows){
            grp.windows$scale <- scale(grp.windows$count)
            grp.windows
            })))
    names(a1) <- 1:length(a1)

# chromosome II windows of the three mutators, in plotting order
    x <- a1[seqnames(a1)=='chrII']
    x <- x[x$sample %in% c('AID*', 'sA3G*', 'EMS')]
    x$sample <- factor(x$sample, levels=c('AID*','sA3G*','EMS'))
  
    # Z-score along chromosome II, one facet row per mutator; the rectangle
    # marks coordinates 260001-290000.
    ggplot() + 
        geom_rect(data=as.data.frame(x), aes(xmin=260001, xmax=290000, ymin=0, ymax=Inf), fill='cyan', alpha=0.002) +
        geom_area(data=as.data.frame(x), aes(x=mid, y=scale, colour=sample, fill=sample), size=I(0.1)) +
        facet_grid(sample~.) + 
        theme_bw() + 
        ylab('Z-score') +
        xlab('Genomic coordinate') + 
        # only the outline colour is set manually; no manual fill scale is supplied
        scale_colour_manual(values=c('AID*'='red','sA3G*'='black','EMS'='blue')) + 
        theme(legend.position="none",
                axis.text.x=element_text(size=6),
                axis.title.x=element_text(size=7),
                axis.title.y=element_text(size=7),
                axis.text.y=element_text(size=6),
                axis.ticks=element_blank(),
                strip.text=element_text(size=7),
                panel.grid.minor=element_blank(),
                plot.background=element_blank(),
                panel.background=element_blank()
                ) 
            
###############################################################
## FIGURE 2C Chromosome regions
###############################################################
    library(ggbio)

        # select one of the two following regions
        # NOTE(review): both assignments run when the script is sourced, so
        # region 2 silently replaces region 1; comment out the one not wanted.
            # region 1
            zoom<-GRanges(seqnames='chrII', ranges=IRanges(start=275001, end=280000), seqinfo=seqinfo(Scerevisiae))
            # region 2: the MEL with the highest mutation count
            # NOTE(review): this filters on sample 'A3G' while the rest of the
            # script uses 'sA3G*' -- confirm which label a1.3 carries here.
            zoom<-a1.3[a1.3$sample=='A3G'][order(a1.3$count[a1.3$sample=='A3G'], decreasing=TRUE)][1]

    # centre a 5 kb window on the chosen region
    zoom<-resize(zoom, width=5000, fix='center')

    # mutations inside the window; queryHits() is used rather than reading
    # the Hits slot directly, since the slot names differ between
    # S4Vectors releases while the accessor is stable
    y<-a[queryHits(findOverlaps(a, zoom))]
    y<-subset(y, y$group %in% c('AID*', 'sA3G*', 'EMS'))
    y$group<-factor(y$group, levels=c('AID*','sA3G*','EMS'))

    # MELs inside the window; the constant y gives every MEL the same bar height
    x<-a1.3[queryHits(findOverlaps(a1.3, zoom))]
    x<-subset(x, x$sample %in% c('AID*', 'sA3G*', 'EMS'))
    x$y<-1

            # upper track: one point per mutation, coloured by reference base,
            # one facet per mutator group
            p1<-ggplot(data = y , aes(x = start, y = as.factor(sample), colour=ref)) + geom_point(size=I(1.5))+
            scale_colour_manual(name='Base', values=c('G'='red', 'C'='black', "A"='blue', "T"='green', "CC"='orange', "CCC"='pink', "GG"='purple'))+
            scale_y_discrete(name='')+
            facet_grid(group~., scales='free_y', space='free_y') +
            xlim(zoom)+
            theme_bw() +
            theme(	panel.margin=unit(0, "cm"),
                    axis.text.x=element_blank(),
                    axis.text.y=element_blank(),
                    axis.title.x=element_blank(),
                    axis.ticks=element_blank(),
                    panel.grid.major = element_line(colour='grey70'),
                    strip.text.y=element_text(size=8, angle=0),
                    legend.key.size=unit(0.5, "cm"),
                    legend.position='none'
                                        ) 

        # lower track: MEL extents as bars, one facet per sample
        p2<-ggplot(x) + geom_bar(aes(y=y, fill=sample)) +
            xlim(zoom) + 
            scale_fill_manual(values=c('AID*'='orange', 'sA3G*'='purple')) +
            scale_colour_manual(values=c('AID*'='orange', 'sA3G*'='purple')) +
            theme_bw() +
            facet_grid(sample~.) +
            theme(  panel.margin=unit(0, "cm"),
                    legend.title=element_blank(),
                    legend.key.size=unit(0.5, "cm"),
                    legend.position='none',
                    strip.text.y=element_text(size=8, angle=0),
                    axis.text.y=element_blank(),
                    axis.ticks=element_blank(),
                    axis.title.y=element_blank()
                                        )

        # stack both tracks on the shared genomic x-axis
        tracks(p1, p2, heights=c(1,0.2), xlab.height = unit(0.25, "lines")) + xlim(zoom) 

###############################################################
## Figure 2D Base pair resolution MEL
###############################################################
    # Region displayed at base-pair resolution
    zoom<-GRanges(seqnames='chrII', ranges=IRanges(start=278287, end=278438), seqinfo=seqinfo(Scerevisiae))
    # print the MELs overlapping the region
    a1.3[a1.3 %over% zoom]
    # Reference sequence of the region. It has its own name so it is no
    # longer overwritten by the mutation table assigned to `tmp` below.
    zoom.seq<-DNAString(getSeq(Scerevisiae, zoom, as.character=TRUE))

    # for each clone, remove any base that's not mutated
    y<-a[a$group %in% c('sA3G*','AID*')]
    y<-y[y %over% zoom]
    # calculate number of mutations at each base
    # (Group.1 = position, Group.2 = group, Group.3 = ref, Group.4 = mut, x = count)
    tmp<-aggregate(start(y), by=list(start(y), y$group, y$ref, y$mut), length)
    # convert genomic coordinates to 1-based offsets within the region; the
    # offset is taken from `zoom` so it cannot drift from the range above
    tmp$Group.1 <- tmp$Group.1 - start(zoom) + 1
    tmp[order(tmp$Group.1),]

###############################################################
## Figure 2E MEL overlaps
###############################################################
    library(venneuler)

    # Give every MEL a unique per-sample code. MELs of different samples
    # that overlap (within 10 bp) are then given a shared code so that
    # venneuler counts them as the same element.
    x1<-subset(a1.3, a1.3$sample=='AID*')
        x1$code<-paste('AID*', seq_along(x1), sep='_')
    x2<-subset(a1.3, a1.3$sample=='sA3G*')
        x2$code<-paste('sA3G*', seq_along(x2), sep='_')
    x3<-subset(a1.3, a1.3$sample=='EMS')
        x3$code<-paste('EMS', seq_along(x3), sep='_')

    # queryHits()/subjectHits() are used rather than reading the Hits slots
    # directly, since the slot names differ between S4Vectors releases.

    # sA3G* MELs overlapping an AID* MEL inherit the AID* code
    tmp<-findOverlaps(x1,x2, maxgap=10)
    x2$code[subjectHits(tmp)]<-x1$code[queryHits(tmp)]

    # EMS MELs overlapping an AID* MEL inherit the AID* code
    tmp<-findOverlaps(x1,x3, maxgap=10)
    x3$code[subjectHits(tmp)]<-x1$code[queryHits(tmp)]

    # EMS MELs overlapping an sA3G* MEL take that MEL's current code; this
    # runs last, so it wins over the AID* assignment just above
    tmp<-findOverlaps(x2,x3, maxgap=10)
    x3$code[subjectHits(tmp)]<-x2$code[queryHits(tmp)]

    m <- data.frame(elements=c(x1$code, x2$code, x3$code), sets=c(rep('AID*', length(x1$code)), rep('sA3G*', length(x2$code)), rep('EMS', length(x3$code)) ) )
    plot(venneuler(m))

###############################################################
## Figure 2F MEL width
###############################################################
    library(ggbio)
    # MELs of the two deaminase samples, with a fixed plotting order
    tmp<-subset(a1.3, a1.3$sample %in% c('AID*', 'sA3G*'))
    tmp$sample<-factor(tmp$sample, levels=c('AID*','sA3G*'))
    # box plot of MEL width per sample
    ggplot(tmp) +
            geom_boxplot(aes(x=sample, y=width), size=I(0.25), outlier.size = 0.5) +
            theme_bw() +
            xlab('') +
            ylab('MEL width (bp)') +
            theme(  axis.title=element_text(size=7),
                    axis.text.x=element_text(size=6, angle=90),
                    axis.text.y=element_text(size=6),
                    axis.ticks=element_blank(),
                    legend.key=element_blank(),
                    legend.text=element_text(size=6),
                    panel.grid=element_blank())

    # median MEL width per sample
    aggregate(x=width(tmp), by=list(tmp$sample), FUN=median)

###############################################################
## Figure 2G Number of mutations within MELs and amount of Genome covered
###############################################################
    # Mutations and MELs of the two deaminase samples
    a.tmp<-a[a$group %in% c('AID*','sA3G*')]    
    a13.tmp<-a1.3[a1.3$sample %in% c('AID*','sA3G*')]    
    # flag each mutation by whether it lies inside a MEL (this assignment
    # was previously duplicated and the factor rebuilt a second time)
    a.tmp$hot<-factor('no',levels=c('yes','no'))
    a.tmp$hot[a.tmp %over% a13.tmp]<-'yes'
    # ggbio is detached before the plain data.frame plots below are built
    # (presumably so ggplot() resolves to ggplot2 -- confirm). The guard
    # avoids the error detach() raises when the package is not attached.
    if ('package:ggbio' %in% search()) detach(package:ggbio)

    # upper bar: fraction of mutations inside (black) / outside (white) MELs
    p1<-ggplot(as.data.frame(a.tmp)) + geom_bar(aes(x='all', fill=hot), colour='black', size=0.5, position='fill') +
            scale_fill_manual(values=c('yes'='black','no'='white')) +
            theme_bw() +
            theme(legend.position='none',
                    axis.text=element_blank(),
                    axis.title=element_blank(),
                    axis.ticks=element_blank()) +
            coord_flip()

    # base pairs covered by the merged MELs, computed once and reused
    mel.bp<-sum(width(reduce(a13.tmp)))
    # lower bar: base pairs inside MELs versus the remainder of the first
    # 16 entries of seqlengths(Scerevisiae)
    # NOTE(review): [1:16] presumably selects the nuclear chromosomes -- confirm
    muts<-data.frame(area=c(
                        mel.bp,
                        sum(seqlengths(Scerevisiae)[1:16]) - mel.bp),
                        type=factor(c('yes','no'), levels=c('yes','no'))
                        )
    p2<-ggplot(muts, aes(x = 'all', y = area, fill= type)) + geom_bar(colour='black', size=0.5, stat = "identity") +
            scale_fill_manual(values=c('yes'='black','no'='white')) +
            theme_bw() +
            theme(legend.position='none',
                    axis.text=element_blank(),
                    axis.title=element_blank(),
                    axis.ticks=element_blank()) +
            coord_flip()

    grid.arrange(p1,p2)

###############################################################
## MEL homozygous mutations
###############################################################
    # Edit somatic sniper to give phasing input
    # (lines starting with === are shell commands, not R)
    ===     "$bedtools_dir"/bedtools intersect -header -a input.vcf -b <( sed '1d' input.filtered.txt) | cut -f1-9,11 > phasing.vcf
    # Phasing using Hapcompass  http://www.brown.edu/Research/Istrail_Lab/hapcompass.php
    ===     java -Xmx4g -jar ~btaylor/car_group/hapcompass_v0.6.3/hapcompass.jar \
                        --bam input.bam \
                        --vcf phasing.vcf \
                        -o output.compass.phase
    # combine phasing files for all samples
    # The loop globs the solution files directly instead of parsing `ls`,
    # quotes every expansion, and strips only the trailing .txt.
    ===     for solution in *compass.phase_MWER_solution.txt; do
                [ -e "$solution" ] || continue    # glob matched nothing
                line=${solution%.txt}
                # column 7 with blanks filled down from the previous non-empty value, pasted in front of columns 1-6
                paste <( sed '/^$/d' "$line.txt" | cut -f7 | awk -v b=1 ' { a=$1; if ( a != "" ) b=a; print b }' ) <(cut -f1-6 "$line.txt" |  sed '/^$/d') > "$line.f1"
                # prefix a running BLOCK counter, drop the BLOCK header rows, append the sample name
                awk -v b=0 ' { if ( $2== "BLOCK" ) b=b+1; print b"\t"$0 }' "$line.f1" | sed '/BLOCK/d' | sed "s/$/\t$line/g" | sed 's/\.q30\.dedup\.RR\.compass\.phase_MWER_solution//g' > "$line.f2"
                rm -- "$line.f1"
            done
    ===     cat *f2 > combined.phase.f2
    ===     awk ' { if ( $2 == 5 ) print $0 } ' combined.phase.f2
    # phasing then imported into R and added to mutation data

    # Sliding 150 bp windows with a start every ceiling(bin/sep) = 15 bp;
    # these are the candidate positions for the simulated MELs below.
    bin=150
    sep=10
    li<-as.list(seqlengths(Scerevisiae))
    histo<-lapply(names(li), function(x){
        trim(
        GRanges(seqnames=x, ranges=IRanges(start=seq(1,li[[x]], ceiling(bin/sep)), width=bin), seqinfo=seqinfo(Scerevisiae))
        )
    })      
    histo<-unlist(GRangesList(histo))

    # Per-group mutation and MEL lists. The lapply() below rebuilds its own
    # x and y for each mutator and does not read these.
    x<- a[a$group %in% c('AID*', 'sA3G*')]
    x$allele_1_phase[is.na(x$allele_1_phase)]<-0
    x$allele_2_phase[is.na(x$allele_2_phase)]<-0
    x<-split(x, x$group)

    y<-a1.3[a1.3$sample %in% c('AID*', 'sA3G*')]
    y<-split(y, y$sample)

    # For each mutator: percentage of MELs (observed, and 1000 rounds of
    # randomly placed MELs of matching widths) in which at least one clone
    # carries mutations on both phased alleles.
    tmp.x<-lapply(list('AID*','sA3G*'), function(z){
        x<- a[a$group==z]
        # mutations without a phase call count as 0 on both alleles
        x$allele_1_phase[is.na(x$allele_1_phase)]<-0
        x$allele_2_phase[is.na(x$allele_2_phase)]<-0

        y<-a1.3[a1.3$sample==z]
        # Generate randomised MELs by Monte Carlo simulation: each round draws
        # length(y) windows from histo and gives them the observed MEL widths
        boot<-as.list(seq(1, 1000,1))
        tmp<-mclapply(boot, mc.cores=12, function(q){
                        random<-sample(histo, length(y), replace=FALSE)
                        width(random)<-width(y)
                        random$type<-paste('sim', z, sep='_')
                        random$round<-q
                        random
                        })
        sims<-unlist(GRangesList(tmp))
        values(y)<-data.frame(type=z, round=1)
        y$type<-as.character(y$type)
        # Append the simulated MELs of all rounds. The original referenced
        # `random` here, which only exists inside the mclapply() worker above.
        y<-c(y, sims)
        y$code<-1:length(y)
        # one row per (mutation, MEL) overlap
        tmp<-as.data.frame(findOverlaps(x,y))
        tmp$allele_1_phase<-x$allele_1_phase[tmp$queryHits]
        tmp$allele_2_phase<-x$allele_2_phase[tmp$queryHits]
        tmp$sample<-x$clone[tmp$queryHits]

        # per MEL and clone: maximum phase value seen on each allele
        tmp2<-aggregate(tmp$allele_1_phase, by=list(tmp$subjectHits, tmp$sample), max)
        tmp2$code<-paste(tmp2$Group.1, tmp2$Group.2, sep='_')
        tmp3<-aggregate(tmp$allele_2_phase, by=list(tmp$subjectHits, tmp$sample), max)
        tmp3$code<-paste(tmp3$Group.1, tmp3$Group.2, sep='_')
        tmp2$allele1<-tmp2$x
        tmp2$allele2<-tmp3$x[match(tmp2$code, tmp3$code)]
        # a sum of 1 (one allele only) is reset to 0, leaving 0 or 2
        tmp2$sum<-rowSums(tmp2[,c('allele1','allele2')])
        tmp2$sum[tmp2$sum==1]<-0

        tmp2$mel.type<-y$type[tmp2$Group.1]    
        tmp2$round<-y$round[tmp2$Group.1]    

        # per MEL: maximum over its clones, then count MELs per value and type
        tmp3<-aggregate(tmp2$sum, by=list(tmp2$Group.1, tmp2$mel.type, tmp2$round), max)
        tmp4<-aggregate(tmp3$x, by=list(tmp3$x, tmp3$Group.2), length)
        tmp4<-split(tmp4, tmp4$Group.2)
        tmp4<-lapply(tmp4, function(r){
            r$percentage.biallelic.within.MEL<-100* ( r$x / sum(r$x) )
            r[r$Group.1==2,]
            })
        tmp4<-do.call(rbind, tmp4)
        data.frame(mutator=tmp4$Group.2, percentage.biallelic.within.MEL=tmp4$percentage.biallelic.within.MEL)
    })

    do.call(rbind, tmp.x)

###############################################################
## Figure 2H Inter-Motif distance
###############################################################
    ##### DISTANCE TO NEXT YCC - different between a3gs
    # tmp1 = motif occurrences, tmp2 = MELs of the matching mutator.
    # Two pruning passes remove motifs from tmp1 before the MEL-to-motif
    # distance is measured. Removals are skipped when nothing matched,
    # because x[-integer(0)] would drop every element.
    tmp1<-matches[matches$motif=='YCC']
    tmp2<-a1.3[a1.3$sample=='sA3G*']
    tmp3<-as.data.frame(distanceToNearest(tmp1, tmp2))
           # remove matches which are the mutated base (distance 0 to a MEL);
           # here the query is tmp1, so queryHits index tmp1
        tmp3<-tmp3[tmp3$distance==0,]
        tmp3<-tmp3[!is.na(tmp3$distance),]
        if (nrow(tmp3) > 0) tmp1<-tmp1[-tmp3$queryHits]
    tmp3<-as.data.frame(distanceToNearest(tmp2, tmp1))
        # remove matches which are next to the mutated base (distance 1).
        # Here the query is tmp2, so the motifs are the *subject* hits. The
        # original kept the rows with distance != 1 and indexed tmp1 with
        # queryHits, which are positions in tmp2, so it removed unrelated motifs.
        tmp3<-tmp3[tmp3$distance==1,]
        tmp3<-tmp3[!is.na(tmp3$distance),]
        if (nrow(tmp3) > 0) tmp1<-tmp1[-unique(tmp3$subjectHits)]

    tmp3<-as.data.frame(distanceToNearest(tmp2, tmp1))
    tmp3$type<-'sA3G* MEL'
    tmp3.a3g<-tmp3
    # this is distance for hotspot motifs

    # now calculate distances for all YCC motifs that are not in hotspots
    tmp1x<-matches[matches$motif=='YCC']
    tmp1x<-tmp1x[!tmp1x %over% tmp2]
    # nearest-neighbour distance between motifs; drop overlapping (0) and
    # adjacent (1) YCCs
    tmp3x<-as.data.frame(distanceToNearest(tmp1x))
    tmp3x<-tmp3x[tmp3x$distance!=0,]
    tmp3x<-tmp3x[tmp3x$distance!=1,]
    tmp3x$type<-'Genomic YCC'
    tmp3.ycc<-tmp3x

    ##### Same procedure for WRC motifs and AID* MELs
    tmp1<-matches[matches$motif=='WRC']
    tmp2<-a1.3[a1.3$sample=='AID*']
    tmp3<-as.data.frame(distanceToNearest(tmp1, tmp2))
           # remove matches which are the mutated base (distance 0 to a MEL)
        tmp3<-tmp3[tmp3$distance==0,]
        tmp3<-tmp3[!is.na(tmp3$distance),]
        if (nrow(tmp3) > 0) tmp1<-tmp1[-tmp3$queryHits]
    tmp3<-as.data.frame(distanceToNearest(tmp2, tmp1))
        # remove matches which are next to the mutated base (distance 1);
        # motifs are the subject of this search
        tmp3<-tmp3[tmp3$distance==1,]
        tmp3<-tmp3[!is.na(tmp3$distance),]
        if (nrow(tmp3) > 0) tmp1<-tmp1[-unique(tmp3$subjectHits)]

    tmp3<-as.data.frame(distanceToNearest(tmp2, tmp1))
    tmp3$type<-'AID* MEL'
    tmp3.aid<-tmp3
    # this is distance for hotspot motifs

    # now calculate distances for all WRC motifs that are not in hotspots
    tmp1x<-matches[matches$motif=='WRC']
    tmp1x<-tmp1x[!tmp1x %over% tmp2]
    # drop overlapping (0) and adjacent (1) WRCs
    tmp3x<-as.data.frame(distanceToNearest(tmp1x))
    tmp3x<-tmp3x[tmp3x$distance!=0,]
    tmp3x<-tmp3x[tmp3x$distance!=1,]
    tmp3x$type<-'Genomic WRC'
    tmp3.wrc<-tmp3x

    # combine the four distance sets and label each with its motif
    tmp3f<-rbind(tmp3.aid, tmp3.wrc, tmp3.a3g, tmp3.ycc)
    tmp3f$type<-factor(tmp3f$type, levels=c('AID* MEL', 'Genomic WRC', 'sA3G* MEL', 'Genomic YCC'))
    tmp3f$motif<-'WRC'
    tmp3f$motif[tmp3f$type %in% c('Genomic YCC','sA3G* MEL')]<-'YCC'

    # outliers are hidden and the y-axis is cropped to 0-50 bp
    ggplot(tmp3f) + geom_boxplot(aes(x=type, y=distance), outlier.size=NA, notch=F, size=I(0.25)) +
            theme_bw() +
            ylab('Distance between Motifs (bp)') + 
            xlab('') + 
            coord_cartesian(ylim=c(0,50)) +
            theme(axis.title.x=element_blank(),
                    axis.text.x=element_text(angle=90, size=6),
                    axis.title.y=element_text(size=7),
                    axis.text.y=element_text(size=6),
                    axis.ticks=element_blank(),
                    panel.grid=element_blank())
                    
###############################################################
## Figure 2 - figure supplement 1 - diploid / haploid MEL overlap
###############################################################
    # call haploid MELS
    # Load the haploid mutation table (tab-delimited, absolute path on the
    # author's file system) and convert it to 1 bp ranges.
        y4<-read.delim(file='/lmb/home/btaylor/Mutation_Database/haploid_mutation_data.txt', sep='\t')
        y4x<-GRanges(seqnames=y4$seqnames, ranges=IRanges(start=y4$start, width=1), strand=y4$strand, apobec=y4$apobec, sample=y4$sample, ref_base=y4$ref_base, seqinfo=seqinfo(Scerevisiae))

    # Count genome-wide occurrences of each motif on both strands.
    # NOTE(review): this reassigns the global `matches` to a data.frame of
    # hit counts; the Figure 2H code indexes `matches` by $motif as if it
    # were a ranges object -- confirm which definition each section expects.
    dict0 <- list(DNAString('WRC'), DNAString('YCC'), DNAString('C'))
    matches<-mclapply(dict0, mc.cores=3, function(y) {
        seqnames <- as.list(seqnames(Scerevisiae))
        matches<-lapply(seqnames, function(x){
                subject <- Scerevisiae[[x]]        
                # fixed=FALSE: presumably so the ambiguity codes (W, R, Y) in the pattern are expanded
                plus_matches <- matchPattern(y, subject, fixed=FALSE)           
                plus_matches<- GRanges(seqnames=x, ranges=IRanges(ranges(plus_matches)), strand='+', seqinfo=seqinfo(Scerevisiae))
                # minus strand hits: search for the reverse complement of the motif
                minus_matches <- matchPattern(reverseComplement(y), subject, fixed=FALSE)           
                minus_matches<- GRanges(seqnames=x, ranges=IRanges(ranges(minus_matches)), strand='-', seqinfo=seqinfo(Scerevisiae))
                c(plus_matches, minus_matches)
                })
        tmp<-unlist(GRangesList(matches))
        # only the total number of hits is returned, not the ranges
        data.frame(number.hits=length(tmp), motif=as.character(y))
        })
    matches<-do.call(rbind, matches)

    ### ENRICHED LOCI - less stringent filtering as less mutations
    # 1. sliding 150 bp windows with a start every ceiling(bin/sep) = 15 bp
    bin=150    
    sep=10
        li<-as.list(seqlengths(Scerevisiae))
        histo<-lapply(names(li), function(x){
            trim(
            GRanges(seqnames=x, ranges=IRanges(start=seq(1,li[[x]], ceiling(bin/sep)), width=bin), seqinfo=seqinfo(Scerevisiae))
            )
        })      
        histo<-unlist(GRangesList(histo))
    # 2. count overlaps with ranges for histogram like data

        # per deaminase (y4x$apobec): keep windows holding more than 3 mutations
        y4s<-split(y4x, y4x$apobec)
        y41<-lapply(names(y4s), function(x){
            z<-y4s[[x]]
            strand(z)<-'*'
            tmp2<-histo 
            tmp2$count<-countOverlaps(tmp2, z)		# counts at each point in a
            tmp2$sample<-x
            tmp2[tmp2$count>3]
        })   
        y41.2<-unlist(GRangesList(y41))       
        
        # 3. merge retained windows lying within 50 bp of each other into one
        # locus per sample and recount the mutations in each merged locus
        y41.3<-split(y41.2, y41.2$sample)
        y41.3<-lapply(y41.3, function(x){
                nom<-x$sample[1]
                x<-reduce(x, min.gapwidth=50)
                x$sample<-nom
                z<-y4x[y4x$apobec==nom]
                x$count<-countOverlaps(x,z)
                x
                })
        y41.3<-unlist(GRangesList(y41.3))      

        # overlaps between HAPLOID AND DIPLOID
        # x1/x2 = diploid MELs (a1.3), x3/x4 = haploid loci (y41.3)
        # NOTE(review): a1.3 is filtered on 'AID'/'A3G' here, whereas other
        # sections filter a1.3 on 'AID*'/'sA3G*' -- confirm the labels in use,
        # otherwise x1 and x2 may be empty.
                x1<-subset(a1.3, a1.3$sample=='AID')
                x2<-subset(a1.3, a1.3$sample=='A3G')
                x3<-subset(y41.3, y41.3$sample=='AID')
                x4<-subset(y41.3, y41.3$sample=='A3G')

    # Code each locus by its coordinates; a haploid locus whose nearest
    # diploid MEL is closer than 50 bp takes over that MEL's code, so the
    # two are counted as shared in the Venn tables below.
    x1$code<-paste(as.character(seqnames(x1)), start(x1), end(x1), sep='_')
    x2$code<-paste(as.character(seqnames(x2)), start(x2), end(x2), sep='_')
    x3$code<-paste(as.character(seqnames(x3)), start(x3), end(x3), sep='_')
        tmp<-as.data.frame(distanceToNearest(x1,x3))
        tmp<-tmp[tmp$distance < 50,]
        tmp<-tmp[!is.na(tmp$distance),]
        x3$code[tmp$subjectHits]<-x1$code[tmp$queryHits]
    x4$code<-paste(as.character(seqnames(x4)), start(x4), end(x4), sep='_')
        tmp<-as.data.frame(distanceToNearest(x2,x4))
        tmp<-tmp[tmp$distance < 50,]
        tmp<-tmp[!is.na(tmp$distance),]
        x4$code[tmp$subjectHits]<-x2$code[tmp$queryHits]

    # Table Data for AID*
    # NOTE(review): Venn() is not defined in the visible code; presumably it comes from a package attached elsewhere.
    Venn(list('Diploid AID*'=x1$code, 'Haploid AID*'=x3$code))

    # Table Data for sA3G*
    Venn(list('Diploid A3G*'=x2$code, 'Haploid A3G*'=x4$code) )

###############################################################
## Figure 3A - MEL overlap with features
###############################################################
    # Define Gene bodies
    t2<-Xu2009[['ORF']]
    values(t2)<-data.frame(name=t2$Name)
    t2<-t2[seqnames(t2)!='chrM',]
    t2$type<-'Gene Body'

    # Define Promoter regions (500 bp upstream to 50 bp downstream of each ORF start)
    tran2<-promoters(t2, upstream=500, downstream=50)
    tran2$type<-'Promoter'

    # Define Intergenic regions: the unstranded gaps left between gene
    # bodies, promoters and ARS elements
    tmp<-c(t2, tran2, tran[tran$type=='ARS'])
    strand(tmp)<-'*'
    tmp<-gaps(reduce(tmp))
    tmp<-tmp[as.character(strand(tmp))=='*']
    tmp$name<-'gap'
    tmp$type<-'Intergenic'
        
    # combine regions into single object
    tranx<-c(tran2, t2, tran[tran$type=='ARS'], tmp)

    # Observed MELs: one yes/no flag per feature class
    a13x<-subset(a1.3, a1.3$sample %in% c('AID*','sA3G*'))
    a13x$pr.ORF<-factor('no', levels=c('no','yes'))
    a13x$ORF<-factor('no', levels=c('no','yes'))
    a13x$ARS<-factor('no', levels=c('no','yes'))
    a13x$int<-factor('no', levels=c('no','yes'))

    tmp<-as.data.frame(findOverlaps(a13x, tranx, maxgap=0))
    tmp$type<-tranx$type[tmp$subjectHits]
    a13x$pr.ORF[tmp$queryHits[tmp$type=='Promoter'] ] <- 'yes'
    a13x$ORF[tmp$queryHits[tmp$type=='Gene Body'] ] <- 'yes'
    a13x$ARS[tmp$queryHits[tmp$type=='ARS'] ] <- 'yes'
    a13x$int[tmp$queryHits[tmp$type=='Intergenic'] ] <- 'yes'

    # ggbio may already have been detached by an earlier section; detach()
    # raises an error for a package that is not attached, hence the guard.
    if ('package:ggbio' %in% search()) detach(package:ggbio)

    # count MELs per sample, feature class and yes/no flag
    tmp2<-data.frame(sample=a13x$sample, ARS=a13x$ARS, pr.ORF=a13x$pr.ORF, ORF=a13x$ORF, int=a13x$int)
    tmp3<-melt(tmp2, id.var='sample')
    tmp4<-aggregate(tmp3$value, by=list(tmp3$sample, tmp3$variable,tmp3$value), length)

    # Simulated MELs: random windows given the observed MEL widths
            bin=150
            sep=10
    li<-as.list(seqlengths(Scerevisiae))
    histo<-lapply(names(li), function(x){
        trim(
        GRanges(seqnames=x, ranges=IRanges(start=seq(1,li[[x]], ceiling(bin/sep)), width=bin), seqinfo=seqinfo(Scerevisiae))
        )
    })      
    histo<-unlist(GRangesList(histo))
    # NOTE(review): only 2 simulation rounds are run here, whereas the MEL
    # homozygosity simulation uses 1000 -- confirm this is intended.
    boot<-as.list(seq(1, 2,1))
    tmp.2<-mclapply(boot, mc.cores=12, function(i){
                    # observed MELs whose count and widths the simulation copies
                    aid.mel<-a1.3[a1.3$sample=='AID*']
                    a3g.mel<-a1.3[a1.3$sample=='sA3G*']

                    random.aid<-sample(histo, length(aid.mel), replace=FALSE)
                    width(random.aid)<-width(aid.mel)
                    random.aid$type<-'sim.AID'
                    random.aid$round<-i
        
                    random.a3g<-sample(histo, length(a3g.mel), replace=FALSE)
                    width(random.a3g)<-width(a3g.mel)
                    random.a3g$type<-'sim.A3G'
                    random.a3g$round<-i
                    
                    # flag the simulated MELs exactly as the observed ones above
                    sim<-c(random.aid, random.a3g)
                    sim$pr.ORF<-factor('no', levels=c('no','yes'))
                    sim$ORF<-factor('no', levels=c('no','yes'))
                    sim$ARS<-factor('no', levels=c('no','yes'))
                    sim$int<-factor('no', levels=c('no','yes'))

                    hits<-as.data.frame(findOverlaps(sim, tranx, maxgap=0))
                    hits$type<-tranx$type[hits$subjectHits]
                    sim$pr.ORF[hits$queryHits[hits$type=='Promoter'] ] <- 'yes'
                    sim$ORF[hits$queryHits[hits$type=='Gene Body'] ] <- 'yes'
                    sim$ARS[hits$queryHits[hits$type=='ARS'] ] <- 'yes'
                    sim$int[hits$queryHits[hits$type=='Intergenic'] ] <- 'yes'
                    tmp.22<-data.frame(sample=sim$type, ARS=sim$ARS, pr.ORF=sim$pr.ORF, ORF=sim$ORF, int=sim$int)
                    tmp.3<-melt(tmp.22, id.var='sample')
                    tmp.4<-aggregate(tmp.3$value, by=list(tmp.3$sample, tmp.3$variable,tmp.3$value), length)
                    tmp.4$sim<-sim$round[1]
                    tmp.4
                    })

    # average the simulated counts over rounds and stack them under the observed counts
    tmp.2<-do.call(rbind, tmp.2)
    tmp.4<-aggregate(tmp.2$x, by=list(tmp.2$Group.1, tmp.2$Group.2, tmp.2$Group.3), mean)
    tmp.5<-rbind(tmp4, tmp.4)
    tmp.5$Group.3<-gsub('no', 'Non-overlapping', tmp.5$Group.3)
    tmp.5$Group.3<-gsub('yes', 'Overlapping', tmp.5$Group.3)
    tmp.5$Group.2<-gsub('pr.ORF','Promoter', tmp.5$Group.2)
    tmp.5$Group.2<-gsub('ORF','Gene body', tmp.5$Group.2)
    tmp.5$Group.2<-gsub('int','Intergenic', tmp.5$Group.2)
    tmp.5$Group.2<-factor(tmp.5$Group.2, levels=c('Promoter','Gene body','Intergenic','ARS'))
    # Relabel by exact match. The previous gsub('AID','AID*') and
    # gsub('A3G','sA3G*') also rewrote the already-starred observed labels
    # to 'AID**' and 'ssA3G**', which then became NA in the factor below.
    # Unstarred 'AID'/'A3G' are still mapped as before.
    mel.relabel<-c('AID'='AID*', 'A3G'='sA3G*', 'sim.AID'='sim.AID*', 'sim.A3G'='sim.sA3G*')
    tmp.5$Group.1<-as.character(tmp.5$Group.1)
    tmp.5$Group.1<-ifelse(tmp.5$Group.1 %in% names(mel.relabel), unname(mel.relabel[tmp.5$Group.1]), tmp.5$Group.1)
    tmp.5$Group.1<-factor(tmp.5$Group.1, levels=c('AID*', 'sim.AID*', 'sA3G*', 'sim.sA3G*'))
    # stacked 100% bars: overlapping versus non-overlapping MELs per feature class
    ggplot(tmp.5) + 
            geom_bar(aes(x=Group.1, y=x, fill=Group.3), position='fill', stat='identity') + 
            facet_wrap(~Group.2, nrow=1) +
            scale_fill_manual(values=c('Non-overlapping'='grey40', 'Overlapping'='green2')) + 
            scale_y_continuous(name='% hotspots overlapping feature', breaks=c(0,0.5,1), labels=c('0','50','100')) +
            theme_bw() +
            theme(axis.text.x=element_text(angle=90, size=6),
                    axis.title.x=element_blank(),
                    axis.text.y=element_text(size=6),
                    axis.title.y=element_text(size=7),
                    legend.title=element_blank(),
                    strip.text=element_text(size=6),
                    axis.ticks=element_blank(),
                    legend.text=element_text(size=6),
                    legend.key.size=unit(.25, "cm"),
                    legend.position='none')
 
###############################################################
## Figure 3B - mutation alignment to TSS and TTS
###############################################################
    # Build 1 bp anchor ranges for transcript ends (TTS) and starts (TSS)
    transcripts2<-Xu2009[['ORF']]
    # minus strand: keep the start coordinate and shrink to 1 bp
    tts<-transcripts2[as.character(strand(transcripts2))=='-']
    width(tts)<-1
    # plus strand: move the start onto the end coordinate, then shrink to 1 bp
    tssn<-transcripts2[as.character(strand(transcripts2))=='+']
    start(tssn)<-end(tssn)
    width(tssn)<-1
    tts<-c(tts, tssn)
    tts<-tts[order(as.character(seqnames(tts)), start(tts))]
    tts$feature<-'TTS'
    tss<-promoters(transcripts2, upstream=1, downstream=0)
    tss$feature<-'TSS'
    trans<-c(tss, tts)

    # percentage of mutations per group that fall in a promoter window
    # (500 bp upstream to 50 bp downstream, strand ignored)
    pr<-promoters(Xu2009[['ORF']], upstream=500, downstream=50)
    strand(pr)<-'*'
    tmp<-subset(a, a$group %in% c('AID*','sA3G*', 'EMS'))
    tmp$pr<-factor('no', levels=c('yes','no'))
    tmp$pr[tmp %over% pr]<-'yes'
    tmp2<-aggregate(tmp$pr, by=list(tmp$pr, tmp$group), FUN=length)
    tmp3<-aggregate(tmp$pr, by=list(tmp$group), FUN=length)
    tmp2$total<-tmp3$x[match(tmp2$Group.2, tmp3$Group.1)]
    tmp2$percentages<-100 * (tmp2$x / tmp2$total)
    tmp2[tmp2$Group.1=='yes',]

    # nearest TSS/TTS anchor for every mutation (mutation strand ignored)
    a2<-a
    strand(a2)<-'*'
    tmp<-as.data.frame(distanceToNearest(a2, trans))
    tmp<-tmp[!is.na(tmp$subjectHits),]
    tmp$sample<-a$group[tmp$queryHits]
    tmp<-tmp[tmp$sample %in% c('AID*', 'sA3G*', 'EMS'),]
    tmp$mut.pos<-start(a2)[tmp$queryHits]
    tmp$tss.pos<-start(trans)[tmp$subjectHits]
    tmp$tss.strand<-as.character(strand(trans))[tmp$subjectHits]
    # signed distance in the direction of transcription: mutation minus
    # anchor on '+', anchor minus mutation on '-', NA for any other strand
    tmp$dis<-NA
    tmp$dis<-ifelse(tmp$tss.strand=='+',
                    tmp$mut.pos-tmp$tss.pos,
                    ifelse(tmp$tss.strand=='-', tmp$tss.pos-tmp$mut.pos, NA))
    tmp$hotspot<-a2$clus[tmp$queryHits]
    tmp$feature<-trans$type[tmp$subjectHits]
    tmp$t.type<-trans$feature[tmp$subjectHits]

    # mutations per distance, anchor type and sample
    tmp2<-aggregate(tmp$dis, by=list(tmp$dis, tmp$t.type, tmp$sample), FUN=length)
    # normalise by the number of mutations in each sample
    code<-aggregate(tmp$sample, by=list(tmp$sample), FUN=length)
    tmp2$code<-code$x[match(tmp2$Group.3, code$Group.1)]
    tmp2$norm<-tmp2$x/tmp2$code
    tmp3<-tmp2
    tmp3$Group.3<-factor(tmp3$Group.3, levels=c('AID*','sA3G*','EMS'))

    # density profiles: rows = sample, columns = TSS / TTS
    ggplot(tmp3) +
        geom_line(aes(x=Group.1, y=norm), size=I(0.2)) +
        facet_grid(Group.3~Group.2) +
        coord_cartesian(xlim=c(-1000,1000)) +
        ylab('Normalised mutation density') +
        xlab('Distance to feature') +
        scale_x_continuous(breaks=c(-500,0,500)) +
        theme_bw() +
        theme(axis.text.y=element_blank(),
                axis.ticks.y=element_blank(),
                strip.text.x = element_text(size = 6),
                strip.text.y = element_text(size = 6),
                axis.ticks=element_blank(),
                axis.text.x = element_text(size = 6),
                axis.title = element_blank(),
                panel.margin=unit(0, "cm"),
                legend.position='none')

###############################################################
## Figure 3C - alignment of mutations to TATA and TSS
###############################################################
#load(file='~btaylor/Mutation_Database/Rhee.rdata')
        # One-base anchor at the start of every ORF transcript, restricted to
        # genes present in rhee; rhee is restricted to the same gene set.
        trans <- promoters(Xu2009[['ORF']], upstream = 1, downstream = 0)
        trans <- trans[trans$Name %in% rhee$gene]
        rhee  <- rhee[rhee$gene %in% trans$Name]
        rhee$mid <- (start(rhee) + end(rhee)) / 2
        # Strand-less copy of the mutations so the nearest-feature search
        # ignores strand.
        a2 <- a
        strand(a2) <- '*'

# --- mutations relative to the nearest rhee element (midpoint) ------------
tmp <- as.data.frame(distanceToNearest(a2, rhee))
tmp <- subset(tmp, tmp$distance < 1000 & !is.na(tmp$subjectHits))

tmp$sample <- a$group[tmp$queryHits]
tmp <- tmp[tmp$sample %in% c('AID*', 'sA3G*', 'EMS'), ]
tmp$mut.pos <- start(a2)[tmp$queryHits]

tmp$tata.pos   <- rhee$mid[tmp$subjectHits]
tmp$tss.strand <- as.character(strand(rhee))[tmp$subjectHits]
# Offset of the mutation from the element midpoint, oriented by strand.
tmp$dis <- ifelse(tmp$tss.strand == '+', tmp$mut.pos - tmp$tata.pos,
           ifelse(tmp$tss.strand == '-', tmp$tata.pos - tmp$mut.pos, NA))
tmp$gene      <- rhee$gene[tmp$subjectHits]
tmp$tatatype  <- rhee$type[tmp$subjectHits]
tmp$taf.class <- rhee$taf.class[tmp$subjectHits]
            # Offset of the gene's TSS from the same element midpoint.
            tmp$tss <- start(trans)[match(tmp$gene, trans$Name)]
            tmp$dis2tss <- ifelse(tmp$tss.strand == '+', tmp$tss - tmp$tata.pos,
                           ifelse(tmp$tss.strand == '-', tmp$tata.pos - tmp$tss, NA))

    # --- mutations relative to the nearest TSS ------------------------------
            tmp2 <- as.data.frame(distanceToNearest(a2, trans))
            tmp2 <- subset(tmp2, tmp2$distance < 1000 & !is.na(tmp2$subjectHits))
            tmp2$sample <- a$group[tmp2$queryHits]
            tmp2 <- tmp2[tmp2$sample %in% c('AID*', 'sA3G*', 'EMS'), ]
            tmp2$mut.pos <- start(a2)[tmp2$queryHits]
            tmp2$gene    <- trans$Name[tmp2$subjectHits]
            tmp2$tss     <- start(trans)[tmp2$subjectHits]

            tmp2$tata.pos   <- rhee$mid[match(tmp2$gene, rhee$gene)]
            tmp2$tss.strand <- as.character(strand(trans))[tmp2$subjectHits]

            # Offset of the mutation from the TSS, oriented by strand.
            tmp2$dis <- ifelse(tmp2$tss.strand == '+', tmp2$mut.pos - tmp2$tss,
                        ifelse(tmp2$tss.strand == '-', tmp2$tss - tmp2$mut.pos, NA))

            # Offset of the element midpoint from the TSS, oriented by strand.
            tmp2$dis2tata <- ifelse(tmp2$tss.strand == '+', tmp2$tata.pos - tmp2$tss,
                             ifelse(tmp2$tss.strand == '-', tmp2$tss - tmp2$tata.pos, NA))

# Stack both alignments; `final` is reused by the Figure 3D code.
final <- rbind(
        data.frame(type = 'TATA element', gene = tmp$gene, dis = tmp$dis, sample = tmp$sample),
        data.frame(type = 'TSS', gene = tmp2$gene, dis = tmp2$dis, sample = tmp2$sample))
final$sample <- factor(final$sample, levels = c('AID*', 'sA3G*', 'EMS'))

# One dot per mutation: x = offset, y = gene; rows = sample, columns = anchor.
ggplot(final) +
        geom_point(aes(x = dis, y = gene), size = I(0.15), alpha = 0.5) +
        facet_grid(sample ~ type) +
        xlim(c(-250, 250)) +
        theme_bw() +
        theme(panel.margin = unit(0, "cm"),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              strip.text = element_text(size = 6),
              axis.title = element_blank(),
              axis.ticks = element_blank(),
              axis.text.x = element_text(size = 6),
              axis.text.y = element_blank())

###############################################################
## Figure 3 - figure supplement 2 - alignment of mutations to types of TATA like elements
###############################################################
    # Frequency polygons (2 bp bins) of mutation offsets stored in tmp$dis,
    # coloured by sample, with one row per tatatype x taf.class combination.
    ggplot(tmp) +
        geom_freqpoly(aes(x = dis, colour = sample), binwidth = 2) +
        xlim(c(-250, 250)) +
        facet_grid(tatatype*taf.class~., scales = 'free_y') +
        theme_bw() +
        theme(panel.margin = unit(0, "cm"),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              strip.text = element_text(size = 6),
              axis.title = element_blank(),
              axis.ticks = element_blank(),
              axis.text.x = element_text(size = 6),
              axis.text.y = element_blank())


###############################################################
## Figure 3D - Motif penetrance around TATA elements
###############################################################
# requires object final from Figure 3C
        # Mutations aligned to the TATA element, within +/-300 bp of it.
        f2<-final[final$type=='TATA element',]
        f2<-f2[f2$dis > -300 & f2$dis < 300,]

        # For each mutagen, scan the 600 bp around every targeted element in
        # 10 bp windows (step 5) and record whether the mutagen's target motif
        # occurs in the window on either strand.
        apo<-list('AID*','sA3G*','EMS')
        out.f<-mclapply(apo, mc.cores=3, function(z){
            # The target motif depends only on the mutagen, so it is built once
            # here rather than inside every window worker.
            pat<-switch(z, 'AID*'='WRC', 'sA3G*'='YCC', 'EMS'='C')
            if(is.null(pat)) stop('No target motif defined for sample group: ', z)
            motif.fw<-DNAString(pat)
            motif.rc<-reverseComplement(motif.fw)

            # Elements of genes that carry at least one mutation from this mutagen.
            tmp<-rhee[rhee$gene %in% f2$gene[f2$sample==z],]
            tmp<-resize(tmp, width=600, fix='center')
            tmp2<-getSeq(Scerevisiae, tmp)
            lst<-as.list(seq(1, (600-9), 5))
            out2<-mclapply(lst, mc.cores=12, function(x){
                y<-DNAStringSet(substr(tmp2, x, x+9))
                # fixed=FALSE lets the IUPAC codes in the motif (W, R, Y) match
                # as ambiguity codes; the argument name is spelled out in full
                # instead of relying on partial matching of 'fix'.
                out1<-as.data.frame(cbind(vcountPattern(motif.fw, y, fixed=FALSE),
                                          vcountPattern(motif.rc, y, fixed=FALSE) ))
                # Collapse the two strand counts to presence/absence.
                out1$x<-rowSums(out1)
                out1$x[out1$x>0] <- 1
                out1$pos<-x
                out1$apo<-z
                out1$motif<-pat
                out1
                })
            out2<-do.call(rbind, out2)
            out2$yes<-factor('No motif', levels=c('Motif','No motif'))
            out2$yes[out2$x!=0]<-'Motif'
            # Re-express the window start relative to the region midpoint
            # (window start 296 becomes position 0).
            out2$pos<-out2$pos-((600/2)-4)
            out2
            })
        out.f<-do.call(rbind, out.f)


        # Top panel: mutation density around the element (1 bp bins).
        p1<-ggplot(f2) + geom_freqpoly(aes(x=dis), binwidth=1) +
                facet_grid(sample~., scales='free_y') +
                coord_cartesian(xlim=c(-50,50)) +
                theme_bw() +
                theme(  axis.text.y=element_blank(),
                        axis.text.x=element_text(size=6),
                        axis.ticks=element_blank(),
                        axis.title=element_blank(),
                        strip.text=element_text(size=6), 
                        panel.margin=unit(0, "cm"),
                        legend.position='none')

        # Bottom panel: windows with / without the motif at each position.
        p2<-ggplot(out.f) + geom_histogram(aes(x=pos, fill=yes), binwidth=5) +
                coord_cartesian(xlim=c(-50,50)) +
                facet_grid(apo~., scales='free_y') +
                theme_bw() +
                theme(  axis.text.y=element_blank(),
                        axis.text.x=element_text(size=6),
                        axis.ticks=element_blank(),
                        axis.title=element_blank(),
                        strip.text=element_text(size=6), 
                        panel.margin=unit(0, "cm"),
                        legend.title=element_blank(),
                        legend.position='none') +
                scale_fill_manual(values=c('Motif'='blue','No motif'='gold'))

    grid.arrange(p1,p2)

###############################################################
## Figure 3 - figure supplement 1 - ARS motif and mutation density
###############################################################
    # ARS data from Eaton et al. (2010) Conserved nucleosome positioning defines replication origins. Genes Dev. 24(8):748-53.
===     wget http://downloads.yeastgenome.org/published_datasets/Eaton_2010_PMID_20351051/track_files/Eaton_2010_ORC_ACS_V64.gff3
===     grep -v "^#" Eaton_2010_ORC_ACS_V64.gff3 > Eaton_2010_ORC_ACS_V64.gff4

    # Load the ORC-ACS sites (comment lines already stripped into the .gff4
    # file) and reduce each site to its central base.
    ea<-read.delim(file='Eaton_2010_ORC_ACS_V64.gff4', sep='\t', header=F)
    ea<-GRanges(seqnames=ea$V1, ranges=IRanges(start=ea$V4, end=ea$V5), score=ea$V6, strand=ea$V7, id=ea$V9, seqinfo=seqinfo(Scerevisiae))
    ea2<-resize(ea, width=1, fix='center')

    # Locate every WRC and YCC occurrence genome-wide on both strands.
    # fixed=FALSE lets the IUPAC codes W, R and Y match as ambiguity codes.
    dict0 <- list(DNAString('WRC'), DNAString('YCC'))
    matches<-mclapply(dict0, mc.cores=3, function(y) {
        chroms <- as.list(seqnames(Scerevisiae))
        per.chrom<-lapply(chroms, function(x){
                subject <- Scerevisiae[[x]]
                plus_matches <- matchPattern(y, subject, fixed=FALSE)
                plus_matches<- GRanges(seqnames=x, ranges=IRanges(ranges(plus_matches)), strand='+', seqinfo=seqinfo(Scerevisiae))
                minus_matches <- matchPattern(reverseComplement(y), subject, fixed=FALSE)
                minus_matches<- GRanges(seqnames=x, ranges=IRanges(ranges(minus_matches)), strand='-', seqinfo=seqinfo(Scerevisiae))
                c(plus_matches, minus_matches)
                })
        per.chrom<-unlist(GRangesList(per.chrom))
        per.chrom$group<-as.character(y)
        per.chrom
        })
    matches<-unlist(GRangesList(matches))
    # Strand is dropped so the overlap search below ignores it.
    strand(matches)<-'*'

    # Motif occurrences within 1 kb of an ACS centre; distance is signed so
    # that it is oriented along the ACS strand.
    tmp<-as.data.frame(findOverlaps(ea2, matches, maxgap=1000) )
    tmp$start<-start(ea2)[tmp$queryHits]
    tmp$ars.strand<-as.character(strand(ea2))[tmp$queryHits]
    tmp$motif.start<-start(matches)[tmp$subjectHits]
    tmp$distance<- tmp$motif.start - tmp$start
    tmp$distance[tmp$ars.strand=='-']<- tmp$start[tmp$ars.strand=='-'] - tmp$motif.start[tmp$ars.strand=='-']
    tmp$motif<-matches$group[tmp$subjectHits]

    # Same calculation for the AID*/sA3G* mutations themselves; the 'motif'
    # column then holds the mutation's sample group.
    input<-a[a$group %in% c('AID*','sA3G*')]
    strand(input)<-'*'
    tmp2<-as.data.frame(findOverlaps(ea2, input, maxgap=1000) )
    tmp2$start<-start(ea2)[tmp2$queryHits]
    tmp2$ars.strand<-as.character(strand(ea2))[tmp2$queryHits]
    tmp2$motif.start<-start(input)[tmp2$subjectHits]
    tmp2$distance<- tmp2$motif.start - tmp2$start
    tmp2$distance[tmp2$ars.strand=='-']<- tmp2$start[tmp2$ars.strand=='-'] - tmp2$motif.start[tmp2$ars.strand=='-']
    tmp2$motif<-input$group[tmp2$subjectHits]

    tmp3<-rbind(tmp, tmp2)

    # Mutation density around the ACS centre (5 bp bins), one stacked panel
    # per mutagen; the y-axis limits are fixed separately for each panel.
    grid.arrange(
        ggplot(tmp2[tmp2$motif == 'AID*', ]) +
            geom_freqpoly(aes(x = distance, colour = motif), binwidth = 5) +
            xlim(c(-500, 500)) +
            ylim(c(0, 473)),
        ggplot(tmp2[tmp2$motif == 'sA3G*', ]) +
            geom_freqpoly(aes(x = distance, colour = motif), binwidth = 5) +
            xlim(c(-500, 500)) +
            ylim(c(0, 233))
        )

    # Plot showing motif density
    # For each mutagen, scan the 1500 bp around every ACS centre in 10 bp
    # windows (step 5) and record whether the mutagen's target motif occurs in
    # the window on either strand.
    apo<-list('AID*','sA3G*','EMS')
    out.f<-mclapply(apo, mc.cores=3, function(z){
        # The target motif depends only on the mutagen, so it is built once
        # here rather than inside every window worker.
        pat<-switch(z, 'AID*'='WRC', 'sA3G*'='YCC', 'EMS'='C')
        if(is.null(pat)) stop('No target motif defined for sample group: ', z)
        motif.fw<-DNAString(pat)
        motif.rc<-reverseComplement(motif.fw)

        tmp<-resize(ea2, width=1500, fix='center')
        tmp2<-getSeq(Scerevisiae, tmp)
        lst<-as.list(seq(1, (1500-9), 5))
        out2<-mclapply(lst, mc.cores=12, function(x){
            y<-DNAStringSet(substr(tmp2, x, x+9))
            # fixed=FALSE lets the IUPAC codes in the motif (W, R, Y) match as
            # ambiguity codes; the argument name is spelled out in full
            # instead of relying on partial matching of 'fix'.
            out1<-as.data.frame(cbind(vcountPattern(motif.fw, y, fixed=FALSE),
                                      vcountPattern(motif.rc, y, fixed=FALSE) ))
            # Collapse the two strand counts to presence/absence.
            out1$x<-rowSums(out1)
            out1$x[out1$x>0] <- 1
            out1$pos<-x
            out1$apo<-z
            out1$motif<-pat
            out1
            })
        out2<-do.call(rbind, out2)
        out2$yes<-factor('No motif', levels=c('Motif','No motif'))
        out2$yes[out2$x!=0]<-'Motif'
        # Re-express the window start relative to the region midpoint
        # (window start 746 becomes position 0).
        out2$pos<-out2$pos-((1500/2)-4)
        out2
        })
        out.f<-do.call(rbind, out.f)
        # Windows with / without the motif at each position, one row per mutagen.
        ggplot(out.f) + geom_histogram(aes(x=pos, fill=yes), binwidth=5) +
                coord_cartesian(xlim=c(-500,500)) +
                facet_grid(apo~., scales='free_y') +
                theme_bw() +
                theme(  axis.text.y=element_blank(),
                        axis.text.x=element_text(size=6),
                        axis.ticks=element_blank(),
                        axis.title=element_blank(),
                        strip.text=element_text(size=6), 
                        panel.margin=unit(0, "cm"),
                        legend.title=element_blank(),
                        legend.position='none') +
                scale_fill_manual(values=c('Motif'='blue','No motif'='gold'))

###############################################################
## Figure 3E - Transcription rate of MEL-targeted genes
###############################################################
    # TRANSCRIPTION RATE FROM García-Martínez, J; Aranda, A. and Pérez-Ortín, J.E.  Mol. Cell.  15:303-313  (2004)
===     wget http://scsie.uv.es/chipsdna/TR%20final%20data.txt 
===     mv TR*txt TR_final_data.txt
===     sed -i 's/ /_/g' TR_final_data.txt

    # Read the transcription-rate table and keep rows with an ORF name.
    trf <- data.table(read.delim('TR_final_data.txt', header = TRUE, sep = '\t'))
    trf <- trf[ORF_name != '', ]

    # Per-ORF rate taken from column TR_t5: '.' entries become NA, the rest
    # are converted to numbers and log2-transformed.
    exp2 <- data.table(Sistematic_name = trf$ORF_name,
                       log2_TR_14.5h_Galactose = as.character(trf$TR_t5))
    exp2$log2_TR_14.5h_Galactose[exp2$log2_TR_14.5h_Galactose == '.'] <- NA
    exp2$log2_TR_14.5h_Galactose <- log2(as.numeric(exp2$log2_TR_14.5h_Galactose))

    # Promoter windows (500 bp upstream, 50 bp downstream of each ORF start)
    # for which a transcription rate is available.
    regi <- promoters(Xu2009[['ORF']], upstream = 500, downstream = 50)
    regi$tr <- exp2$log2_TR_14.5h_Galactose[match(regi$Name, exp2$Sistematic_name)]
    regi <- regi[!is.na(regi$tr), ]

    # Promoters overlapping an AID* or sA3G* entry of a1.3, plus the full set
    # ('All') as the reference group.
    pr.aid <- regi[regi %over% a1.3[a1.3$sample == 'AID*']]
    pr.aid$name2 <- 'AID*'
    pr.a3g <- regi[regi %over% a1.3[a1.3$sample == 'sA3G*']]
    pr.a3g$name2 <- 'sA3G*'
    regi$name2 <- 'All'

    exp5 <- c(pr.aid, pr.a3g, regi)
    exp5$name2 <- factor(exp5$name2, levels = c('AID*', 'sA3G*', 'All'))

    # ggbio is attached here, before ggplot() is called on a GRanges object.
    library(ggbio)
    ggplot(exp5) +
            geom_boxplot(aes(x = name2, y = tr, fill = name2),
                         size = 0.25, notch = T, notchwidth = 0.01, outlier.size = 0.5) +
            scale_fill_manual(values = c('AID*' = 'red', 'sA3G*' = 'black', 'All' = 'white')) +
            theme_bw() +
            theme(legend.position = 'none',
                  legend.title = element_blank(),
                  legend.text = element_text(size = 6),
                  legend.key = element_blank(),
                  strip.background = element_rect(colour = 'grey80'),
                  axis.title = element_text(size = 7),
                  axis.ticks = element_blank(),
                  axis.text.x = element_blank(),
                  axis.text.y = element_text(size = 6)) +
            labs(x = '', y = 'Relative transcription rate')

###############################################################
## Figure 3F - RNAP enrichment
###############################################################
    # Data from Kim H, Erickson B, Luo W, Seward D, Graber JH, Pollock DD, Megee PC, Bentley DL (2010). Gene-specific RNA polymerase II phosphorylation and the CTD code. Nat Struct Mol Biol. 2010 Oct;17(10):1279-86. 

    # Read one Kim et al. bedgraph track into a GRanges with a 'score' column.
    #   path : bedgraph file whose '#' lines were already removed by the sed
    #          command listed next to each call below.
    # The first two remaining lines are skipped, 'chrmt' is renamed to 'chrM',
    # and ranges are trimmed to the Scerevisiae sequence lengths.
    # NOTE(review): skip=2 is applied after the '#' lines were deleted - confirm
    # that two non-data lines really remain at the top of each file.
    # NOTE(review): column V2 is used directly as a 1-based start, whereas the
    # UCSC bedGraph format defines starts as 0-based - confirm this is intended.
    read.kim.track<-function(path){
        bg<-read.delim(path, header=F, sep='\t', skip=2)
        bg$V1<-as.character(bg$V1)
        bg$V1[bg$V1=='chrmt']<-'chrM'
        trim( GRanges(seqnames=bg$V1, ranges=IRanges(start=bg$V2, end=bg$V3), score=bg$V4, seqinfo=seqinfo(Scerevisiae)) )
    }

    ===     wget http://downloads.yeastgenome.org/published_datasets/Kim_2010_PMID_20835241/track_files/Kim_2010_RNA_PolII_23C_ChIP_chip_V64.bedgraph
    ===     sed -i '/^#/d' Kim_2010_RNA_PolII_23C_ChIP_chip_V64.bedgraph
    polII_23c_k<-read.kim.track('Kim_2010_RNA_PolII_23C_ChIP_chip_V64.bedgraph')

    ===     wget http://downloads.yeastgenome.org/published_datasets/Kim_2010_PMID_20835241/track_files/Kim_2010_RNA_PolII_37C_ChIP_chip_V64.bedgraph
    ===     sed -i '/^#/d' Kim_2010_RNA_PolII_37C_ChIP_chip_V64.bedgraph
    polII_37c_k<-read.kim.track('Kim_2010_RNA_PolII_37C_ChIP_chip_V64.bedgraph')

    ===     wget http://downloads.yeastgenome.org/published_datasets/Kim_2010_PMID_20835241/track_files/Kim_2010_RNA_PolII_Ser7P_25C_ChIP_chip_V64.bedgraph
    ===     sed -i '/^#/d' Kim_2010_RNA_PolII_Ser7P_25C_ChIP_chip_V64.bedgraph
    polII_s7p_k<-read.kim.track('Kim_2010_RNA_PolII_Ser7P_25C_ChIP_chip_V64.bedgraph')

    ===     wget http://downloads.yeastgenome.org/published_datasets/Kim_2010_PMID_20835241/track_files/Kim_2010_RNA_PolII_Ser2P_23C_ChIP_chip_V64.bedgraph
    ===     sed -i '/^#/d' Kim_2010_RNA_PolII_Ser2P_23C_ChIP_chip_V64.bedgraph
    polII_s2p_k<-read.kim.track('Kim_2010_RNA_PolII_Ser2P_23C_ChIP_chip_V64.bedgraph')

    ===     wget http://downloads.yeastgenome.org/published_datasets/Kim_2010_PMID_20835241/track_files/Kim_2010_RNA_PolII_Ser5P_25C_ChIP_chip_V64.bedgraph
    ===     sed -i '/^#/d' Kim_2010_RNA_PolII_Ser5P_25C_ChIP_chip_V64.bedgraph
    polII_s5p_k<-read.kim.track('Kim_2010_RNA_PolII_Ser5P_25C_ChIP_chip_V64.bedgraph')

    # Collect the tracks and tag every range with the name of its track.
    query2<-list('polII_23c_k'=polII_23c_k,
		'polII_37c_k'=polII_37c_k,
		'polII_s7p_k'=polII_s7p_k,
		'polII_s5p_k'=polII_s5p_k,
		'polII_s2p_k'=polII_s2p_k)
    for(i in seq_along(query2)){ query2[[i]]$names<-names(query2)[i] }

    ### Score per promoter       --- split into polII and polIII transcripts
    # For every track in query2, sum the scores of all track ranges that
    # overlap each promoter window (500 bp upstream, 50 bp downstream).
    # Note that the scores are summed, not averaged.
    pr<-promoters(Xu2009[['ORF']], upstream=500, downstream=50)
    pr.out<-mclapply(query2, mc.cores=12, function(x){
        hits<-findOverlaps(x, pr)
        # Results are placed by promoter index so that every track yields a
        # vector of length(pr) in the same order as pr. Promoters without any
        # overlapping range stay NA instead of silently shortening the vector
        # and shifting later promoters onto the wrong rows.
        per.promoter<-rep(NA_real_, length(pr))
        if(length(hits) > 0){
            sums<-tapply(x$score[queryHits(hits)], subjectHits(hits), sum)
            per.promoter[as.integer(names(sums))]<-as.numeric(sums)
        }
        per.promoter
        })
    # One column per track, named after the entries of query2.
    pr.out<-do.call(cbind, pr.out)
    values(pr)<-cbind(as.data.frame(values(pr)), pr.out)

    # Promoters overlapping an AID* or sA3G* entry of a1.3.
    regi<-pr
    pr.aid<-regi[regi %over% a1.3[a1.3$sample=='AID*'] ]
    pr.aid$name2<-'AID*'
    pr.a3g<-regi[regi %over% a1.3[a1.3$sample=='sA3G*'] ]
    pr.a3g$name2<-'sA3G*'

    # Reference set: promoters with a transcription rate, split into five
    # groups by rate. name2 is created before tr so that, once tr is removed
    # again, regi has the same column layout as pr.aid / pr.a3g for c() below.
    regi$name2<-'Hi'
    regi$tr<-exp2$log2_TR_14.5h_Galactose[match(regi$Name, exp2$Sistematic_name)]
    regi<-regi[!is.na(regi$tr),]
    # cut regi into 5 groups on basis of level: each threshold is the largest
    # rate within the lowest 4/5, 3/5, 2/5 and 1/5 of the sorted rates; later
    # assignments overwrite earlier ones for the lower groups.
    tmp<-sort(regi$tr, decreasing=F)
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(4/5) ) ] )] <- 'Medium-Hi'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(3/5) )] )]  <- 'Medium'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(2/5) )] )]  <- 'Medium-Low'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(1/5) )] )]  <- 'Low'
    # Remove the helper column by name rather than by position.
    values(regi)<-values(regi)[, setdiff(colnames(values(regi)), 'tr'), drop=FALSE]

    exp5<-c(pr.aid, pr.a3g, regi)       # pr.b, 
    exp5$name2<-factor(exp5$name2, levels=c('AID*','sA3G*', 'Low','Medium-Low','Medium','Medium-Hi','Hi'))      #,'AID*/sA3G*'

    # Keep the per-track score columns (named after query2) and the group
    # label, selected by name so the result does not depend on how many
    # annotation columns Xu2009 carries.
    exp5<-values(exp5)
    exp5<-exp5[,c(names(query2), 'name2')]
    exp5<-as.data.frame(exp5)
    exp6<-melt(exp5, id.var='name2')

    # Re-level the group label carried through melt() (taken from exp6 itself,
    # not recycled from the shorter exp5 column).
    exp6$name2<-factor(exp6$name2, levels=c('AID*','sA3G*', 'Low','Medium-Low','Medium','Medium-Hi','Hi'))      # 'AID*/sA3G*', 
    # The 37C track is not plotted; the remaining tracks get display names.
    exp6<-exp6[exp6$variable!='polII_37c_k',]
    exp6$variable<-gsub('polII_23c_k', 'RNAP II', exp6$variable)
    exp6$variable<-gsub('polII_s7p_k','S7P', exp6$variable)
    exp6$variable<-gsub('polII_s5p_k','S5P', exp6$variable)
    exp6$variable<-gsub('polII_s2p_k','S2P', exp6$variable)
    exp6$variable<-factor(exp6$variable, levels=c('RNAP II','S2P','S5P','S7P'))
       
    # One panel per track; boxes compare MEL-overlapping promoters with the
    # five transcription-rate groups.
    ggplot(exp6) + geom_boxplot(size=0.25, aes(x=name2, y=value, fill=name2), notch = T,  notchwidth=0.01, outlier.size = 0.5) +
            theme_bw() +
            scale_fill_manual(values=c('AID*'='red', 'sA3G*'='black', 'Low'='grey80','Medium-Low'='grey70','Medium'='grey60','Medium-Hi'='grey50','Hi'='grey40')) +
            facet_wrap(~variable, scales='free_y', nrow=1) + 
            theme(  axis.text.x=element_blank(),
                    axis.text.y=element_text(size=6),
                    axis.ticks = element_blank(),
                    axis.title=element_text(size=7),
                    axis.title.x=element_blank(),
                    legend.key=element_blank(),
                    legend.text=element_text(size=6),
                    legend.title=element_blank(),
                    strip.text=element_text(size=6),
                    legend.position='none',
                    strip.background=element_rect(colour='grey80')) + 
            ylab('Relative enrichment')

###############################################################
## Figure 4A - MEL overlap with tRNA genes
###############################################################
    # Count the tRNA genes (chrM excluded) that do / do not overlap an AID* or
    # sA3G* entry of a1.3.
    tran2 <- tran[tran$type == 'tRNA']
    tran2 <- tran2[seqnames(tran2) != 'chrM', ]
    tran2$aid <- factor(ifelse(tran2 %over% a1.3[a1.3$sample == 'AID*'], 'yes', 'no'),
                        levels = c('no', 'yes'))
    tran2$a3g <- factor(ifelse(tran2 %over% a1.3[a1.3$sample == 'sA3G*'], 'yes', 'no'),
                        levels = c('no', 'yes'))

    # Long table: one row per (sample, overlap status) with the gene count in x.
    tmp2 <- rbind(
        data.frame(type = 'sA3G*', aggregate(tran2$a3g, by = list(tran2$a3g), length)),
        data.frame(type = 'AID*', aggregate(tran2$aid, by = list(tran2$aid), length)))
    # Replace the no/yes flags with the legend labels used in the plot.
    tmp2$Group.1 <- unname(c(no = 'Non-overlapping', yes = 'Overlapping')[as.character(tmp2$Group.1)])

    # 1. Generate randomised MEL database and calculate proportion of tRNA genes for each one
    # Candidate start windows: 150 bp windows every ceiling(bin/sep) = 15 bp
    # along the first 16 sequences of Scerevisiae, trimmed to sequence ends.
    bin=150
    sep=10
    li<-as.list(seqlengths(Scerevisiae)[1:16])   
    histo<-lapply(names(li), function(x){
        trim(
        GRanges(seqnames=x, ranges=IRanges(start=seq(1,li[[x]], ceiling(bin/sep)), width=bin), seqinfo=seqinfo(Scerevisiae))
        )
    })      
    histo<-unlist(GRangesList(histo))
            
    # Keep only windows overlapping tmp1 (tmp1 is defined earlier in the file).
    histo<-histo[histo %over% tmp1]
    boot<-as.list(seq(1, 1000,1))
    tmp.2<-mclapply(boot, mc.cores=12, function(i){
                    # Draw as many windows as there are observed MELs and give
                    # them the observed MEL widths (AID* first, then sA3G*).
                    obs.aid<-a1.3[a1.3$sample=='AID*']
                    random.aid<-sample(histo, length(obs.aid), replace=FALSE)
                    width(random.aid)<-width(obs.aid)

                    obs.a3g<-a1.3[a1.3$sample=='sA3G*']
                    random.a3g<-sample(histo, length(obs.a3g), replace=FALSE)
                    width(random.a3g)<-width(obs.a3g)

                    # now overlap with tRNA genes
                    tran3<-tran[tran$type=='tRNA']
                    tran3<-tran3[seqnames(tran3)!='chrM',]
                    strand(tran3)<-'*'

                    # Always report both the 'no' and the 'yes' count, even
                    # when one of them is zero, so that the mean over rounds
                    # below is taken over all rounds.
                    tally<-function(label, hit){
                        data.frame(type=label,
                                   Group.1=factor(c('no','yes'), levels=c('no','yes')),
                                   x=c(sum(!hit), sum(hit)),
                                   round=i)
                    }
                    rbind(  tally('sim.sA3G*', tran3 %over% random.a3g),
                            tally('sim.AID*', tran3 %over% random.aid) )
                    })
    tmp.2<-do.call(rbind, tmp.2)
    # Mean count over the simulation rounds, per simulated sample and status.
    tmp.3<-aggregate(tmp.2$x, by=list(tmp.2$type, tmp.2$Group.1), mean)
    tmp.3$Group.2<-gsub('no', 'Non-overlapping', tmp.3$Group.2)
    tmp.3$Group.2<-gsub('yes', 'Overlapping', tmp.3$Group.2)
    # Align column names with the observed table (tmp2) and stack the two.
    colnames(tmp.3)<-colnames(tmp2)
    tmp4<-rbind(tmp2,tmp.3)
    tmp4$type<-factor(tmp4$type, levels=c('AID*','sA3G*','sim.AID*','sim.sA3G*'))

    # ggbio is detached before this plain data-frame plot is drawn.
    detach(package:ggbio)
    # Stacked bars: overlapping vs non-overlapping tRNA genes for the observed
    # and the simulated MEL sets.
    ggplot(tmp4) +
            geom_bar(aes(x = type, y = x, fill = Group.1), stat = 'identity') +
            scale_fill_manual(values = c('Non-overlapping' = 'grey40', 'Overlapping' = 'green2')) +
            ylab('tRNA genes') +
            theme_bw() +
            theme(legend.title = element_blank(),
                  legend.text = element_text(size = 6),
                  legend.key.size = unit(.25, "cm"),
                  strip.text = element_text(size = 6),
                  axis.ticks = element_blank(),
                  axis.title.x = element_blank(),
                  axis.title.y = element_text(size = 7),
                  axis.text.x = element_text(angle = 90, size = 6),
                  axis.text.y = element_text(size = 6))

###############################################################
## Figure 4B - Mutation alignment to tRNA TSS
###############################################################
    # One-base anchor at the start of every tRNA gene, and a strand-less copy
    # of the mutations so the nearest-feature search ignores strand.
    trans <- promoters(tran[tran$type == 'tRNA'], upstream = 1, downstream = 0)
    a2 <- a
    strand(a2) <- '*'

    # Nearest tRNA start for every mutation, restricted to the plotted groups.
    tmp <- as.data.frame(distanceToNearest(a2, trans))
    tmp$sample <- a$group[tmp$queryHits]
    tmp <- tmp[tmp$sample %in% c('AID*', 'sA3G*', 'EMS'), ]
    tmp$mut.pos    <- start(a2)[tmp$queryHits]
    tmp$tss.pos    <- start(trans)[tmp$subjectHits]
    tmp$tss.strand <- as.character(strand(trans))[tmp$subjectHits]
    # Signed offset from the tRNA start, oriented by the gene strand
    # (negative = upstream); other strands are left as NA.
    tmp$dis <- ifelse(tmp$tss.strand == '+', tmp$mut.pos - tmp$tss.pos,
               ifelse(tmp$tss.strand == '-', tmp$tss.pos - tmp$mut.pos, NA))

    # Mutations per offset and sample, divided by the sample's total count.
    tmp2 <- aggregate(tmp$dis, by = list(tmp$dis, tmp$sample), length)
    code <- aggregate(tmp$sample, by = list(tmp$sample), length)
    tmp2$code <- code$x[match(tmp2$Group.2, code$Group.1)]
    tmp2$norm <- tmp2$x / tmp2$code
    tmp3 <- tmp2
    tmp3$Group.2 <- factor(tmp3$Group.2, levels = c('AID*', 'sA3G*', 'EMS'))

    # One row per sample, showing +/-250 bp around the tRNA start.
    ggplot(tmp3) +
        geom_line(aes(x = Group.1, y = norm), size = I(0.2)) +
        facet_grid(Group.2~.) +
        coord_cartesian(xlim = c(-250, 250)) +
        scale_x_continuous(breaks = c(-250, 0, 250)) +
        labs(x = 'Distance to feature', y = 'Normalised mutation density') +
        theme_bw() +
        theme(legend.position = 'none',
              strip.text.x = element_text(size = 6),
              strip.text.y = element_text(size = 6),
              axis.ticks = element_blank(),
              axis.title.x = element_text(size = 7),
              axis.title.y = element_text(size = 7),
              axis.text.x = element_text(size = 6),
              axis.text.y = element_blank())

###############################################################
## Figure 4C - Mutation frequency in gene promoters
###############################################################
    # Define promoter regions
    # t0: mRNA promoters (500 bp upstream / 50 bp downstream of each ORF start);
    #     metadata is reduced to a single 'name' column plus type 'mRNA'.
    # t1: tRNA genes resized to 550 bp around their centre.
    # t2: snoRNA promoters, after the exclusions and additions below.
    # t3: snRNA promoters.
        t0<-promoters(Xu2009[['ORF']], upstream=500, downstream=50)
            values(t0)<-data.frame(name=t0$Name)
            t0$type='mRNA'
        t1<-tran[tran$type %in% c('tRNA'),]
        t1<-resize(t1, width=550, fix='center')
        t2<-promoters(tran[tran$type=='snoRNA',], upstream=500, downstream=50)
          # take out downstream polycistronic snoRNA
            t2<-t2[!t2$name %in% c('snR128', 'snR51','snR70','snR53','snR61','snR55','snR77','snR76','snR75','snR74','snR73','snR72') ]
          # take out intronic snoRNA
            t2<-t2[!t2$name %in% c('snR18','snR24','snR59','snR54','snR39','snR38','snR44','snR191') ]
          # add the promoters of the listed ORFs as snoRNA features
          # (presumably the host genes of the intronic snoRNAs removed above - TODO confirm)
            t2<-c(t2, t0[grep('YAL003W|YMR116C|YPL198W|YML056C|YGL076C|YKL081W|YLR367W|YNR053C',t0$name)] )
            t2$type<-'snoRNA'       
        t3<-promoters(tran[tran$type=='snRNA',], upstream=500, downstream=50)

    # Split mRNA promoters into bins defined by the transcription rate
    pr<-t0
    # Strip everything from the first '%' onwards in the promoter name before matching.
    pr$name<-gsub('%.*','',pr$name)     
    # TRANSCRIPTION RATE FROM García-Martínez, J; Aranda, A. and Pérez-Ortín, J.E.  Mol. Cell.  15:303-313  (2004)
    trf<-data.table(read.delim('TR_final_data.txt', header=T, sep='\t')    )
    trf<-trf[ORF_name!='',]
    # The rate is taken from column TR_t5 as read (not log-transformed here);
    # it is only used below to rank promoters.
    exp2<-data.table(Sistematic_name=trf$ORF_name, log2_TR_14.5h_Galactose=trf$TR_t5)
    pr$tr<-exp2$log2_TR_14.5h_Galactose[match(pr$name, exp2$Sistematic_name)]
    # Drop promoters with no entry in the rate table.
    pr<-pr[!is.na(pr$tr)]
    regi<-pr
    # Convert to numeric; entries that are not numbers become NA and are dropped.
    regi$tr<-as.numeric(as.character(regi$tr))
    regi<-regi[!is.na(regi$tr)]
    # Five groups by rate: each threshold is the largest rate within the lowest
    # 4/5, 3/5, 2/5 and 1/5 of the sorted rates; later assignments overwrite
    # earlier ones for the lower groups.
    tmp<-sort(regi$tr, decreasing=F)
    regi$name2<-'Hi'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(4/5) ) ] )] <- 'Medium-Hi'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(3/5) )] )]  <- 'Medium'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(2/5) )] )]  <- 'Medium-Low'
    regi$name2[regi$tr <= max(tmp[1:(length(tmp)*(1/5) )] )]  <- 'Low'
    # No effect at this point: rows with NA rates were already removed above.
    regi$name2[is.na(regi$tr)]<-'Low'
    # Replace the metadata with name='mRNA' and type=<rate group>.
    values(regi)<-data.frame(name='mRNA', type=regi$name2)
    regi$type<-as.character(regi$type)
    # Combine with the tRNA, snoRNA and snRNA regions.
    # NOTE(review): c() here assumes t1/t2/t3 carry metadata columns compatible
    # with name/type - confirm against the definition of tran.
    regi<-c(regi,t1,t2,t3)
    # 'name' is overwritten with the class label (rate group or RNA type) used for plotting.
    regi$name<-factor(regi$type, levels=c('Low','Medium-Low','Medium','Medium-Hi','Hi','tRNA','snoRNA', 'snRNA')) 
    strand(regi)<-'*'

    ## Bootstrapped mutation counts per region: for every mutagen, draw 1000
    ## subsamples of 13928/2 mutations (13928 = size of the EMS database, so
    ## all groups are compared at the same depth) and count the overlaps of
    ## each subsample with every region in regi.
    boot <- as.list(1:1000)
    apo <- list('AID*', 'sA3G*', 'EMS')
    out1 <- lapply(apo, function(z){
        data <- a[a$group == z]
        draws <- mclapply(boot, mc.cores = 12, function(x){
            picked <- sample.int(length(data), (13928/2), replace = FALSE)
            countOverlaps(regi, data[picked])
        })
        # regions in rows, bootstrap rounds in columns
        do.call(cbind, draws)
    })
    names(out1) <- apo

    # Median and mean count per region across the bootstrap rounds; columns
    # are prefixed with the mutagen name.
    out2 <- do.call(cbind,
                    mclapply(out1, mc.cores = 3, function(x){
                        data.frame(median = apply(x, 1, median), mean = apply(x, 1, mean))
                    }))
    values(regi) <- cbind(as.data.frame(values(regi)), out2)

    regi2 <- as.data.frame(regi)

    # Long table of the mean counts, relabelled with the sample names.
    regi3 <- data.frame(name = regi2$name,
                        AID = regi2$AID..mean,
                        A3G = regi2$sA3G..mean,
                        EMS = regi2$EMS.mean)
    tmp2 <- melt(regi3, id.vars = c('name'))
    tmp2$variable <- gsub('AID', 'AID*', tmp2$variable)
    tmp2$variable <- gsub('A3G', 'sA3G*', tmp2$variable)
    tmp2$variable <- factor(tmp2$variable, levels = c('AID*', 'sA3G*', 'EMS'))
    tmp2$name <- factor(tmp2$name, levels = c('Low', 'Medium-Low', 'Medium', 'Medium-Hi', 'Hi', 'tRNA', 'snoRNA', 'snRNA'))

    # One panel per mutagen, one box per region class; outliers are hidden
    # and the view is limited to 0-8.
    ggplot(tmp2, aes(x = name, y = value)) +
                geom_boxplot(aes(fill = name), size = I(0.25), outlier.shape = NA) +
                facet_wrap(~variable, scales = 'free_y') +
                scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 10)) +
                coord_cartesian(ylim = c(0, 8)) +
                scale_fill_manual(values = c('snoRNA'='red', 'tRNA'='green', 'snRNA'='blue', 'Low'='grey80','Medium-Low'='grey70','Medium'='grey60','Medium-Hi'='grey50','Hi'='grey40')) +
                labs(x = '', y = 'Average mutation frequency') +
                theme_bw() +
                theme(legend.position = 'none',
                      legend.title = element_blank(),
                      legend.text = element_text(size = 6),
                      legend.key = element_blank(),
                      legend.key.size = unit(.25, "cm"),
                      panel.grid.major = element_blank(),
                      panel.grid.minor = element_blank(),
                      strip.text = element_text(size = 6),
                      axis.ticks = element_blank(),
                      axis.title.y = element_text(size = 7),
                      axis.text.x = element_blank(),
                      axis.text.y = element_text(angle = 90, size = 6))

###############################################################
## Figure 4D - example tRNA MELs
###############################################################
    # Two example loci are listed. When the script is run top to bottom the Region 2
    # assignment overrides Region 1; to draw Region 1, skip the Region 2 line.
    # Region 1
    zoom<-GRanges(seqnames='chrIV', ranges=IRanges(start=884361, end=884493), seqinfo=seqinfo(Scerevisiae))
    # Region 2
    zoom<-GRanges(seqnames='chrXV', ranges=IRanges(start=594354, end=594425), seqinfo=seqinfo(Scerevisiae))

    # Ignore strand and widen the window to 1 kb centred on the locus.
    strand(zoom) <-'*'
    zoom<-resize(zoom, width=1000, fix='center')

    # Select the records overlapping the window. The queryHits() accessor is used instead of
    # reading the @queryHits slot directly: the slot is an internal detail of the Hits class
    # and is not available under that name in every Bioconductor release.
    # y: entries of `a` in the window, restricted to the three treatment groups.
    y<-a[queryHits(findOverlaps(a, zoom))]
    y<-subset(y, y$group %in% c('AID*', 'sA3G*', 'EMS'))
    # x: entries of `a1.3` in the window (presumably the MEL intervals - confirm), same treatments.
    x<-a1.3[queryHits(findOverlaps(a1.3, zoom))]
    x<-subset(x, x$sample %in% c('AID*', 'sA3G*', 'EMS'))
    # Constant height so each interval can be drawn as a bar.
    x$y<-1
    library(ggbio)

    # Annotation features to draw underneath; ORFs are relabelled as 'transcript'.
    tran2<-tran[tran$type %in% c('ARS','centromere','intron','long_terminal_repeat','ncRNA','noncoding_exon','ORF','retrotransposon','rRNA','snoRNA','snRNA','telomere','tRNA'),]
    tran2$type[tran2$type=='ORF']<-'transcript'

    # Track 1 (p1): one point per record of y at its start coordinate, one row per sample,
    # coloured by reference base and faceted by treatment group in a fixed order.
    y$group<-factor(y$group, levels=c('AID*','sA3G*','EMS'))
            p1<-ggplot(data = y , aes(x = start, y = as.factor(sample), colour=ref)) + geom_point(size=I(1.5))+
                scale_colour_manual(name='Base', values=c('G'='red', 'C'='black', "A"='blue', "T"='green', "CC"='orange', "CCC"='pink', "GG"='purple'))+
                scale_y_discrete(name='')+
                facet_grid(group~., scale='free_y', space='free_y') +
                xlim(zoom)+
                theme_bw() +
                theme(	panel.margin=unit(0, "cm"),
                        axis.text.x=element_blank(),
                        axis.text.y=element_blank(),
                        axis.title.x=element_blank(),
                        axis.ticks=element_blank(),
                        panel.grid.major = element_line(colour='grey70'),
                        strip.text.y=element_text(size=8, angle=0),
                        legend.key.size=unit(0.5, "cm"),
                        legend.position='none'
                                            ) 

            # Track 2 (p2): the intervals in x drawn as bars of height y (set to 1 above), one facet per sample.
            # NOTE(review): x may contain 'EMS' rows but the manual palettes only name 'AID*' and 'sA3G*' - confirm intended.
            p2<-ggplot(x) + geom_bar(aes(y=y, fill=sample)) +
                xlim(zoom) + 
                scale_fill_manual(values=c('AID*'='orange', 'sA3G*'='purple')) +
                scale_colour_manual(values=c('AID*'='orange', 'sA3G*'='purple')) +
                theme_bw() +
                facet_grid(sample~.) +
                theme(  panel.margin=unit(0, "cm"),
                        legend.title=element_blank(),
                        legend.key.size=unit(0.5, "cm"),
                        legend.position='none',
                        strip.text.y=element_text(size=8, angle=0),
                        axis.text.y=element_blank(),
                        axis.ticks=element_blank(),
                        axis.title.y=element_blank()
                                            )

                # Annotation for the window: features from tran2 plus the Xu2009 ORF entries,
                # whose metadata is reduced to (name, type) so the two sets can be concatenated.
                # NOTE(review): the name is read from tmptran2$data here, whereas the Figure 5 section
                # reads $Name from the same Xu2009 ORF set - confirm which column holds the name.
                tmptran <-tran2[tran2 %over% zoom]
                tmptran2 <-Xu2009[['ORF']][Xu2009[['ORF']] %over% zoom]
                values(tmptran2)<-data.frame(name=tmptran2$data, type=tmptran2$type)
                tmptran<-c(tmptran,tmptran2)

            # Track 3 (p_tran): each feature drawn as an arrow on its own row (y=name).
            p_tran<-ggplot(tmptran) + geom_arrow(size=I(1), type='open', aes(fill=type, y=name, colour=name), arrow.rate=0.05, angle=30) + 
                    xlim(zoom) +
                    theme_bw() + 
                    theme(
                        axis.text.x=element_blank(),
                        axis.text.y=element_blank(),
                        axis.title.x=element_blank(),
                        axis.title.y=element_blank(),
                        axis.ticks=element_blank(),
                        panel.grid.major = element_line(colour='grey70'),
                        legend.title=element_blank()) 
           
            # Stack the three tracks on a shared x axis; the point track gets most of the height.
            tracks(p1, p2, p_tran, heights=c(1,0.2, 0.2), xlab.height = unit(0.25, "lines")) + xlim(zoom) 

###############################################################
## Figure 4 - figure supplement 1 - median number of mutable motifs in promoters separated by presence of a MEL
###############################################################
    # Motifs to locate genome-wide (IUPAC codes, matched with fixed=FALSE).
    dict0 <- list(DNAString('WRC'), DNAString('YCC'), DNAString('C'))
    # One worker per motif; each worker scans every chromosome on both strands and
    # returns a single GRanges tagged with the motif string.
    matches<-mclapply(dict0, mc.cores=3, function(motif) {
        per_chromosome<-lapply(as.list(seqnames(Scerevisiae)), function(chr){
                chr_seq <- Scerevisiae[[chr]]
                # plus strand: the motif as written
                plus_hits <- matchPattern(motif, chr_seq, fixed=FALSE)
                plus_gr <- GRanges(seqnames=chr, ranges=IRanges(ranges(plus_hits)), strand='+', seqinfo=seqinfo(Scerevisiae))
                # minus strand: the reverse complement searched on the same sequence
                minus_hits <- matchPattern(reverseComplement(motif), chr_seq, fixed=FALSE)
                minus_gr <- GRanges(seqnames=chr, ranges=IRanges(ranges(minus_hits)), strand='-', seqinfo=seqinfo(Scerevisiae))
                c(plus_gr, minus_gr)
                })
        motif_hits<-unlist(GRangesList(per_chromosome))
        motif_hits$motif<-as.character(motif)
        motif_hits
        })
    # Collapse the per-motif results into one GRanges.
    matches<-unlist(GRangesList(matches))

    ## Define regions
        # t4: the tRNA features themselves, relabelled 'tRNA_slim' so they can be told apart later.
        # NOTE(review): t0-t3 are not defined in this section and must already exist in the
        # session - confirm which definitions are intended here.
        t4<-tran[tran$type %in% c('tRNA'),]
            t4$type<-'tRNA_slim'
    tmp1<-c(t0,t1,t2, t3, t4)
    strand(tmp1)<-'*'               

    # Per region: number of overlapping motif instances (number) and of overlapping entries of
    # `a` for the matching treatment group (muts). The AID* rows (WRC motif) are stacked on top
    # of the sA3G* rows (YCC motif), so tmp2 has 2 * length(tmp1) rows in the same region order.
    m.aid<-matches[matches$motif=='WRC']
    m.a3g<-matches[matches$motif=='YCC']
    tmp2<-rbind(    data.frame(type=tmp1$type, number=countOverlaps(tmp1, m.aid), muts=countOverlaps(tmp1, a[a$group=='AID*']), apo='AID*'), 
                    data.frame(type=tmp1$type, number=countOverlaps(tmp1, m.a3g), muts=countOverlaps(tmp1, a[a$group=='sA3G*']), apo='sA3G*')
                    )
    # target: region has at least one overlapping entry of `a` for that treatment.
    tmp2$target<-factor('no',levels=c('yes','no'))
    tmp2$target[tmp2$muts > 0 ] <- 'yes'

    # hotspot.*: region overlaps at least one a1.3 entry of the given sample.
    # tmp.x has length(tmp1) elements while tmp2 has twice as many rows; the logical index is
    # recycled, so both stacked halves of tmp2 receive the same per-region flag.
    tmp2$hotspot.aid<-factor('no',levels=c('yes','no'))
        tmp.x<-countOverlaps(tmp1, a1.3[a1.3$sample=='AID*'])
        tmp2$hotspot.aid[tmp.x > 0] <- 'yes'
    tmp2$hotspot.a3g<-factor('no',levels=c('yes','no'))
        tmp.x<-countOverlaps(tmp1, a1.3[a1.3$sample=='sA3G*'])
        tmp2$hotspot.a3g[tmp.x > 0] <- 'yes'
            
    ### NUMBER OF MOTIFS IN tRNA TARGETED BY AID and A3G
    # Summary statistics of the motif count, split by hotspot flag and region type.
    # Note that the grouping columns are listed in a different order in the two calls.
    tmp2.x<-tmp2[tmp2$apo == 'AID*',]
    aggregate(tmp2.x$number, by=list(tmp2.x$hotspot.aid, tmp2.x$type, tmp2.x$apo), summary)

    tmp2.x<-tmp2[tmp2$apo == 'sA3G*',]
    aggregate(tmp2.x$number, by=list(tmp2.x$type, tmp2.x$apo, tmp2.x$hotspot.a3g), summary)

###############################################################
## Figure 4 - figure supplement 2 - Number of mutable motifs against number of mutations 
###############################################################
    # number of mutations vs number of motifs in MELs
    # Motif counts per a1.3 interval. `tmp` is not used again in this section; the tests and
    # the plot below work on tmp2 (per-region counts built in the previous section).
    tmp<-a1.3[a1.3$sample %in% c('AID*','sA3G*')]
    tmp$aid.motif<-countOverlaps(tmp, matches[matches$motif=='WRC'] )
    tmp$a3g.motif<-countOverlaps(tmp, matches[matches$motif=='YCC'] )
    #### test correlation
    # Rank correlation between mutation count and motif count, restricted to regions with at
    # least one mutation (target=='yes') and excluding the 'tRNA_slim' regions.
    tmp2.aid<-tmp2[tmp2$apo=='AID*' & tmp2$target=='yes' & tmp2$type!='tRNA_slim',]
        cor.test(tmp2.aid$muts, tmp2.aid$number, alternative='two.sided',method="spearman")       # use Spearman as a normal distribution cannot be assumed
    tmp2.a3g<-tmp2[tmp2$apo=='sA3G*' & tmp2$target=='yes' & tmp2$type!='tRNA_slim',]
        cor.test(tmp2.a3g$muts, tmp2.a3g$number, alternative='two.sided',method="spearman")       # use Spearman as a normal distribution cannot be assumed

    # Jittered scatter of motif count against log2 mutation count, one panel per treatment.
    # Regions with muts == 0 give log2(0) = -Inf, which lies outside xlim(-1, 7).
    tmp2.tmp<-tmp2[tmp2$type!='tRNA_slim',]
    ggplot(tmp2.tmp) + 
                geom_point(aes(x=log2(muts), y=number), position=position_jitter(width=1, height=1), size=I(1), alpha=0.3, colour='red') +
                ylab('Number of deaminase motifs') +
                xlab('Log2 number of mutations') +
                theme_bw() +
                xlim(c(-1,7)) +
                facet_wrap(~apo, scales='free')    

###############################################################
## Figure 4 - figure supplement 3 - Mutations at the rDNA locus
###############################################################
    # Lines prefixed with '===' are shell commands, run once per sample outside R.
    # NOTE(review): $line is not set in this section - presumably the per-sample identifier
    # provided by the calling loop; confirm before running.
    # Index sacCer3 genome
===    bwa index -p sacCer3 -a bwtsw sacCer3.fa
    # Align
    # (-a reports all alignments found, -M marks shorter split hits as secondary)
===    bwa mem -v 1 -a -t 1 -M sacCer3 -R "@RG\tID:$line\tPL:illumina\tPU:illumina\tLB:sample\tSM:sample\tCN:BGI" reads_1.fq.clean.gz reads_2.fq.clean.gz > reads.wg.rDNA.sam
    # sam > bam, sort and index
    # (-F 4 drops unmapped reads; the sort output is given as a prefix, producing reads.wg.rDNA.srt.bam)
===    samtools view -F 4 -b@ 1 -T sacCer3.fa reads.wg.rDNA.sam -o reads.wg.rDNA.bam
===    samtools sort -@ 1 -m 4G reads.wg.rDNA.bam reads.wg.rDNA.srt
===    samtools index reads.wg.rDNA.srt.bam
    # extract rDNA region
===    samtools view -b reads.wg.rDNA.srt.bam chrXII:434839-508289 -o reads.region.bam
    # call mutations using somatic sniper
    # (sample BAM first, reference-strain BAM second, output VCF last)
===    bam-somaticsniper -q 40 -F vcf -N 4000 \
              -f sacCer3.fa \
              reads.region.bam \
              reference.region.bam \
              reads.region.vcf
    # edit vcfs
    # (append the sample identifier as a trailing column, then split the colon-delimited
    #  genotype fields into separate tab-delimited columns)
===    sed "s/$/\t$line/g" reads.region.vcf | tr ":" "\t" > reads.region.vcf.2
    # combine all samples together
===    cat *.region.vcf.2 > all.rDNA.vcf

    # Read the combined, tab-expanded VCF (columns V1..V48; V48 is the appended sample identifier).
    # VCF header lines start with '#' and are skipped by read.table's default comment handling.
    a<-read.table('all.rDNA.vcf', header=F)
    a$ft<-paste(a$V4, a$V5, sep='>')    # mutation type as REF>ALT
    a2<-a
    a2<-subset(a, a$V47 > 50)           # somatic sniper score (repeated on the GRanges below)
    # Orient each site so that C/T reference bases are on '+' and all other reference bases on '-'.
    a2$strand<-'-'
    a2$strand[a2$V4 %in% c('C','T')]<-'+'
    a<-GRanges(seqnames=a2$V1, ranges=IRanges(start=a2$V2, width=1), strand=a2$strand, seqinfo=seqinfo(Scerevisiae))
    # Carry columns V4 onwards (plus ft) as metadata; the last column of a2 (strand) is already stored on the ranges.
    values(a)<-a2[,4:(ncol(a2)-1)]
    # Strip the file-name suffix to leave the sample name.
    a$V48<-gsub('.wg.rDNA.srt.final.region.somatic-sniper.vcf','',a$V48)
    # NOTE(review): with the usual SomaticSniper column order (FORMAT, NORMAL, TUMOR; 13 fields each),
    # V24/V26 belong to the NORMAL (reference) column and V37/V39/V47 to the TUMOR (sample) column,
    # which matches the ref*/mut* naming used below - confirm against the VCF header.
    a<-a[a$V47>50,]     # somatic sniper score
    a<-a[a$V24 > 10,]   # read depth, reference strain
    a<-a[a$V37 > 10,]   # read depth, sample

    # Split the comma-separated per-base counts of the reference strain (V26) into numeric columns.
    b<-(strsplit(as.character(a$V26), ','))
    b<-as.data.frame(do.call(rbind,b))
    colnames(b)<-c('refA','refC','refG','refT')
    b<-apply(b, 2, function(x) { as.numeric(as.character(x)) } )

    # Same for the per-base counts of the sample (V39).
    c<-(strsplit(as.character(a$V39), ','))
    c<-as.data.frame(do.call(rbind,c))
    c<-apply(c, 2, function(x) { as.numeric(as.character(x)) } )
    colnames(c)<-c('mutA','mutC','mutG','mutT')

    values(a)<-cbind(as.data.frame(values(a)), b,c)

    # Remove sites where the reference strain itself shows the mutant base: ref.mut.count is the
    # reference-strain read count for the ALT base (V5), and sites with 3 or more such reads are dropped.
    # Sites whose V5 is not a single A/C/G/T keep a count of 0.

    a$ref.mut.count<- 0
    a$ref.mut.count[a$V5=='A']<-a$refA[a$V5=='A']
    a$ref.mut.count[a$V5=='C']<-a$refC[a$V5=='C']
    a$ref.mut.count[a$V5=='G']<-a$refG[a$V5=='G']
    a$ref.mut.count[a$V5=='T']<-a$refT[a$V5=='T']

    a<-a[a$ref.mut.count < 3]
    # 5-mer context centred on the site, read on the strand assigned above.
    a$seq<-getSeq(Scerevisiae, seqnames(a), strand=as.character(strand(a)), start=start(a)-2, end=end(a)+2, as.character=T)
    # Treatment label from the sample name.
    a$sample<-NA
    a$sample[grep('3G', a$V48)]<-'A3G'
    a$sample[grep('EMS', a$V48)]<-'EMS'
    a$sample[grep('AID', a$V48)]<-'AID'
    # Bases 2-3 of the 5-mer: the base 5' of the site followed by the site itself.
    a$dinuc<-substr(a$seq, 2,3)

        ###############################################################
        ## Figure 4 - figure supplement 3A - Mutation context
        library(seqLogo)

        # Two-letter treatment code taken from the start of the sample name ('AI', '3G' or 'EM').
        # It is derived here because a$group is otherwise only created in the 3B section further
        # down, which left the three subsets below empty when the script was run top to bottom.
        a$group<-substr(as.character(a$V48), 1, 2)

        # For each treatment: tabulate the bases at each position of the 5-mer context,
        # convert the counts to per-position proportions, build a PWM and draw it as a
        # stacked bar chart. Dividing by colSums() relies on every column having the same
        # total (all sequences have the same length), so the recycled division is exact.
        # Rows are reordered from A,C,G,T to G,C,T,A before plotting.
        a.aid<-subset(a, a$group=='AI')
            cmat<-consensusMatrix(as.character(a.aid$seq))
            cmp<-makePWM(cmat/colSums(cmat[1:4,]))
            cmp<-cmp@pwm
            cmp<-cmp[c(3,2,4,1),]
        p1<-barchart(t(cmp ), horizontal=FALSE, 
            scales = list(draw = FALSE), 
            ylab='', 
            col=c('orange','blue', 'red', 'green'),
            box.ratio=10, main='AID*')
        a.a3g<-subset(a, a$group=='3G')
            cmat<-consensusMatrix(as.character(a.a3g$seq))
            cmp<-makePWM(cmat/colSums(cmat[1:4,]))
            cmp<-cmp@pwm
            cmp<-cmp[c(3,2,4,1),]
        p2<-barchart(t(cmp ), horizontal=FALSE, 
            scales = list(draw = FALSE), 
            ylab='', 
            col=c('orange','blue', 'red', 'green'),
            box.ratio=10, main='sA3G*')
        a.ems<-subset(a, a$group=='EM')
            cmat<-consensusMatrix(as.character(a.ems$seq))
            cmp<-makePWM(cmat/colSums(cmat[1:4,]))
            cmp<-cmp@pwm
            cmp<-cmp[c(3,2,4,1),]
        p3<-barchart(t(cmp ), horizontal=FALSE, 
            scales = list(draw = FALSE), 
            ylab='', 
            col=c('orange','blue', 'red', 'green'),
            box.ratio=10, main='EMS')

            # Stack the three panels vertically.
            grid.arrange(p1,p2,p3)

        ###############################################################
        ## Figure 4 - figure supplement 3B - rDNA locus
            # Window on chrXII to display.
            zoom<-GRanges(seqnames='chrXII', ranges=IRanges(start=450575, end=468812), seqinfo=seqinfo(Scerevisiae))               
            # One row per sample, coloured by reference base.
            a$sample<-a$V48
            a$ref<-a$V4
            # Treatment group from the first two letters of the sample name, expanded to the
            # display labels. EMS carries no asterisk so that it matches the factor level below;
            # with 'EMS*' every EMS sample became NA and fell out of its facet.
            a$group<-substr(a$sample, 1,2)
            a$group<-gsub('3G','sA3G*', a$group)
            a$group<-gsub('AI','AID*', a$group)
            a$group<-gsub('EM','EMS', a$group)
            a$group<-factor(a$group, levels=c('AID*','sA3G*','EMS'))
            # Sites inside the window; queryHits() is used rather than the @queryHits slot,
            # which is an internal detail of the Hits class.
            y<-a[queryHits(findOverlaps(a, zoom))]
            # NOTE(review): the palette names A, C and G only; sites with reference base T get the NA colour.
            ggplot(y) + geom_point(aes(x=start, y=sample, colour=ref)) + 
                    facet_grid(group~.) +  
                    theme_bw() +
                    xlim(zoom) + 
                    scale_colour_manual(values=c('A'='green','C'='black','G'='red')) +
                    theme(axis.text.y=element_blank(),
                            axis.ticks=element_blank() ) 

###############################################################
## Figure 5 - figure supplement 1A - Alignment of A3A and A3B mutations to TSSs
###############################################################
        # Define TSS sites
        # promoters(upstream=1, downstream=0) reduces every feature to a 1-bp, strand-aware
        # range at its 5' end. t0: ORF transcripts from Xu2009, metadata reduced to name and
        # type ('mRNA').
        t0<-promoters(Xu2009[['ORF']], upstream=1, downstream=0)
            values(t0)<-data.frame(name=t0$Name)
            t0$type='mRNA'
            t0$name<-as.character(t0$name)
        # t1: tRNA genes; t2: snoRNA genes (edited below); t3: snRNA genes.
        t1<-promoters(tran[tran$type %in% c('tRNA'),], upstream=1, downstream=0)
        t2<-promoters(tran[tran$type=='snoRNA',], upstream=1, downstream=0)
          # take out downstream polycistronic snoRNA
            t2<-t2[!t2$name %in% c('snR128', 'snR51','snR70','snR53','snR61','snR55','snR77','snR76','snR75','snR74','snR73','snR72') ]
          # take out intronic snoRNA
            t2<-t2[!t2$name %in% c('snR18','snR24','snR59','snR54','snR39','snR38','snR44','snR191') ]
          # add the promoters of the listed ORFs as snoRNA features
          # (presumably the host genes of the intronic snoRNAs removed above - confirm)
            t2<-c(t2, t0[grep('YAL003W|YMR116C|YPL198W|YML056C|YGL076C|YKL081W|YLR367W|YNR053C',t0$name)] )
            t2$type<-'snoRNA'       
        t3<-promoters(tran[tran$type=='snRNA',], upstream=1, downstream=0)
        # All TSS classes combined.
        trans<-c(t0,t1,t2,t3)
 
        # Import A3A and A3B mutation data
    y4x<-read.delim(file='A3A_A3B_haploid_mutation_data.txt', sep='\t', header=T)
    a2<-GRanges(seqnames=y4x$seqnames, ranges=IRanges(start=y4x$start, width=1), strand=y4x$strand, apobec=y4x$apobec, sample=y4x$sample,ref_base=y4x$ref_base, seqinfo=seqinfo(Scerevisiae))
    # Ignore the mutation strand when searching for the nearest TSS.
    strand(a2)<-'*'
    # One row per mutation: index of the mutation, index of its nearest TSS, and the distance.
    tmp<-as.data.frame(distanceToNearest(a2, trans))
    tmp<-subset(tmp, !is.na(tmp$subjectHits))
    tmp$sample<-a2$apobec[tmp$queryHits]
    tmp$mut.pos<-start(a2)[tmp$queryHits]
    tmp$tss.pos<-start(trans)[tmp$subjectHits]
    tmp$tss.strand<-as.character(strand(trans))[tmp$subjectHits]
    # Signed distance in the direction of transcription: positive = mutation downstream of the TSS.
    # Rows whose TSS strand is neither '+' nor '-' stay NA.
    tmp$dis<- NA
    tmp$dis[tmp$tss.strand=='+']<-tmp$mut.pos[tmp$tss.strand=='+']-tmp$tss.pos[tmp$tss.strand=='+']
    tmp$dis[tmp$tss.strand=='-']<-tmp$tss.pos[tmp$tss.strand=='-']-tmp$mut.pos[tmp$tss.strand=='-']
    tmp$feature<-trans$type[tmp$subjectHits]
    # NOTE(review): no 'feature' column is created on trans in the visible code; if it is absent
    # this assignment is a no-op - confirm.
    tmp$t.type<-trans$feature[tmp$subjectHits]

    # Number of mutations at each distance per enzyme (tmp2 is not used by the plot below).
    tmp2<-aggregate(tmp$dis, by=list(tmp$dis, tmp$sample), length)
    # Flag mutations whose nearest TSS belongs to a tRNA gene.
    tmp$feat<-factor('no',levels=c('yes','no'))
    tmp$feat[tmp$feature=='tRNA']<-'yes'

    # Frequency polygons of distance to TSS in 20-bp bins, tRNA (red) vs all other TSSs (black),
    # one panel per enzyme, zoomed to +/- 1 kb.
    ggplot(tmp) + geom_freqpoly(aes(x=dis, colour=feat), binwidth=20) + facet_grid(.~sample) +
            coord_cartesian(xlim=c(-1000,1000)) +
        xlab('Distance to TSS') +
        ylab('Mutation count') +
        scale_x_continuous(breaks=c(-500,0,500)) + 
        scale_colour_manual(name='tRNA', values=c('yes'='red','no'='black')) +
        theme_bw() +
        theme(axis.text.y=element_blank(),
                axis.ticks.y=element_blank(),
                strip.text.y = element_text(size = 8),
                strip.text.x = element_text(size = 6),
                axis.text.x = element_text(size = 8),
                axis.title.x = element_text(size = 8),
                axis.title.y = element_text(size = 8),
                legend.position='none')

###############################################################
## Figure 5 - figure supplement 1B - Alignment of PD4120a mutations to TSSs
###############################################################
    library(BSgenome.Hsapiens.UCSC.hg19)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    ### Data from Alexandrov et al., (2013). Signatures of mutational processes in human cancer. Nature 500,415–421.
    # Lines prefixed with '===' are shell commands, run outside R: download the breast cancer
    # mutation table, replace spaces with underscores, and keep the substitution ("subs")
    # records of sample PD4120a.
    ===     wget ftp://ftp.sanger.ac.uk/pub/cancer/AlexandrovEtAl/somatic_mutation_data/Breast/Breast_clean_somatic_mutations_for_signature_analysis.txt
    ===     sed 's/ /_/g' Breast_clean_somatic_mutations_for_signature_analysis.txt | grep "PD4120a" | grep "subs" > Breast_clean_somatic_mutations_for_signature_analysis.ed.txt

    ### Retrieve human transcript data from UCSC Genome Browser
    # Builds a transcript database from the GENCODE Basic V17 track for hg19 (needs network access).
    # NOTE(review): makeTranscriptDbFromUCSC() was renamed makeTxDbFromUCSC() in later
    # GenomicFeatures releases - confirm against the installed version.
    txdb<-makeTranscriptDbFromUCSC(
                 genome="hg19",
                 tablename="wgEncodeGencodeBasicV17",
                 transcript_ids=NULL,
                 circ_seqs=DEFAULT_CIRC_SEQS,
                 url="http://genome.ucsc.edu/cgi-bin/",
                 goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath",
                 miRBaseBuild=NA)
    # 1-bp, strand-aware range at the 5' end of every transcript (recomputed further down before use).
    pr<-promoters(txdb, upstream=1, downstream=0)

    # Columns as used below: V1 = sample id, V2 = mutation type, V3 = chromosome,
    # V4/V5 = start/end, V6 = reference base, V7 = mutant base.
    tmp<-read.table(file='Breast_clean_somatic_mutations_for_signature_analysis.ed.txt')
    # Convert chromosome names to UCSC style ('chr' prefix, mitochondrion as chrM).
    tmp$V3<-paste('chr', tmp$V3, sep='')
    tmp$V3<-gsub('chrMT', 'chrM', tmp$V3)
    d<-GRanges(seqnames=tmp$V3, ranges=IRanges(start=tmp$V4, end=tmp$V5), ref=tmp$V6, mut=tmp$V7, type=tmp$V2, id=tmp$V1, seqinfo=seqinfo(Hsapiens))
    d$ref<-as.character(d$ref)
    # Orient every mutation to its pyrimidine strand: C/T reference on '+', A/G reference on '-'.
    strand(d)<-'-'
    strand(d)[d$ref %in% c('C','T')]<-'+'
    d$seq_base<-getSeq(Hsapiens, d, as.character=T)   
    # Plus-strand trinucleotide centred on the mutated base.
    d$seq<-getSeq(Hsapiens, seqnames(d), start(d)-1, end(d)+1, as.character=T)   ### there was a problem with dinuc here -- believed to be fixed now
    d$seq2<-paste(substr(d$seq, 1,1), d$ref, substr(d$seq, 3,3), sep='')        # this is the -1,0,+1 context with the reported ref base in the middle
    d$seq_b2<-substr(d$seq2, 2,2)       # this is the same as ref
    # Pyrimidine-strand versions: reverse-complement the context and the ref base of '-' strand records.
    d$seq2_pyr<-d$seq2        
    d$seq2_pyr[as.character(strand(d))=='-']<-as.character(reverseComplement(DNAStringSet(d$seq2_pyr[as.character(strand(d))=='-'])) )
    d$ref_pyr<-d$ref 
    d$ref_pyr[as.character(strand(d))=='-']<-as.character(reverseComplement(DNAStringSet(d$ref[as.character(strand(d))=='-'])) )
    # dinuc: 5' neighbour plus mutated base; dinuc.p1: mutated base plus 3' neighbour.
    d$dinuc<-substr(d$seq2_pyr, 1,2)
    d$dinuc.p1<-substr(d$seq2_pyr, 2,3)
    # Mark CpG sites (mutated base followed by G on the pyrimidine strand) and relabel their dinuc.
    d$cpg<-'no'
    d$cpg[d$dinuc.p1=='CG']<-'yes'
    d$dinuc[d$cpg=='yes']<-'CpG'
    # Work on an unstranded copy of the mutations.
    x<-d
    strand(x)<-'*'
    # take only non-cpg mutations
            x<-x[x$cpg=='no',]
    # take only non-kataegic mutations
            # icd = distance to the next mutation on the SAME chromosome. It is NA for the
            # last mutation of each chromosome, including the very last record. Chromosome
            # changes are detected by comparing names: the records are sorted by chromosome
            # name, so adjacent chromosomes do not have consecutive factor codes and a
            # "code difference == 1" test misses most boundaries.
            x<-x[order(as.character(seqnames(x)), start(x) ),]
            chr<-as.character(seqnames(x))
            next.chr<-c(chr[-1], NA)
            next.start<-c(start(x)[-1], NA)
            same.chr<-!is.na(next.chr) & next.chr==chr
            x$icd<-ifelse(same.chr, next.start - start(x), NA)
            # Keep mutations more than 5 kb from the next one, plus those with no next mutation.
            x<-c(subset(x, x$icd > 5000), subset(x, is.na(x$icd)) )     # remove kataegic mutations
            x<-x[order(x$id, as.character(seqnames(x)), start(x) ),]   

    # 1-bp, strand-aware range at the 5' end of every transcript.
    pr<-promoters(txdb, upstream=1, downstream=0)
    # Nearest TSS for each mutation. Results are written back through queryHits so they stay
    # aligned with x even when mutations without any TSS on their chromosome are left out of
    # the result (or returned with an NA subject); those mutations keep NA and are dropped below.
    tmp<-as.data.frame(distanceToNearest(x, pr))
    tmp<-tmp[!is.na(tmp$subjectHits),]
    x$tss<-NA_integer_
    x$tss[tmp$queryHits]<-start(pr)[tmp$subjectHits]
    x$tss.strand<-NA_character_
    x$tss.strand[tmp$queryHits]<-as.character(strand(pr))[tmp$subjectHits]
    x$dis<-start(x)-x$tss       # + strand TSS: mutation position minus TSS position
    x<-x[!is.na(x$dis),]

    x$dis[x$tss.strand=='-']<-x$tss[x$tss.strand=='-'] - start(x)[x$tss.strand=='-']       # - strand TSS: sign flipped so positive is downstream
    # Mutant base on the pyrimidine strand: complement it where the reference base is a purine.
    # Coerced to character first so complemented bases are never assigned into a factor.
    x$mut_pyr <- as.character(x$mut)
    purine<-x$ref %in% c('A','G')
    x$mut_pyr[purine]<-as.character(reverseComplement(DNAStringSet(x$mut_pyr[purine])))
    # Keep only what the plot needs: distance, dinucleotide context and mutation type (ref>mut).
    x<-data.table(dis=x$dis, dinuc=x$dinuc, ft=paste(x$ref_pyr, x$mut_pyr, sep='>'))

    # Frequency polygons of distance to TSS in 100-bp bins, one panel per mutation type and context.
    ggplot(x) + geom_freqpoly(aes(x=dis, colour=ft), binwidth=100, size=I(0.5)) + facet_wrap(nrow=3, ft~dinuc) + #, scales='free_y')  +
            theme_bw() + xlim(c(-5000,5000)) +
            theme(legend.position='none') 

###############################################################
## END
###############################################################
