#!/usr/bin/Rscript

## Written by Matthew J. Maurer, 2013
## Version 0.13

## library() stops immediately if a package is missing; require() merely
## returns FALSE, which would let the script fail later with a confusing error.
library(gsalib)      # gsa.read.gatkreport()
library(Biostrings)  # pairwiseAlignment(), nucleotideSubstitutionMatrix()

################################################################################################
## Functions
################################################################################################
## Get specified columns from a data frame, in the order listed
## d is a data frame
## cols2Get is a vector of fields in the data frame (i.e. column names) in the form of cols2Get = c("Field_1", "Field_2", "Field_3")
## For example, cols2Get = c("SampleID", "Date", "Time")
getCols <- function(d, cols2Get) {
	## Return the requested columns of data frame d, in the order listed in
	## cols2Get. Character indexing on a data frame selects by exact name and
	## preserves the requested order (same result as match()-based indexing).
	d[, cols2Get]
}
################################################################################################
## Group plates into sets based on common conditions
## d is a data frame
## Group is a vector of fields in the data frame (i.e. column names) in the form of Group = c("Field_1", "Field_2", "Field_3")
## For example, Group = c("SampleID", "Date", "Time")
## Sep is a character that will be used to separate the concatenated field names
## Group plates into sets based on common conditions.
## d is a data frame; Group is a vector of column names, e.g.
## Group = c("SampleID", "Date", "Time"); Sep separates the pasted values.
## If d already has a Group column, the new fields are prefixed onto it.
CreateGroup <- function (d, Group, Sep = ".") {
	## Build the label right-to-left so fields appear in the order given by
	## Group. When d$Group does not exist, paste() recycles the zero-length
	## seed to "", which leaves a trailing separator to strip below.
	combined <- d$Group
	for (g in rev(Group)) {
		combined <- paste(d[[g]], combined, sep = Sep)
	}

	## Strip the trailing separator literally. The previous implementation
	## used sub(paste(Sep, "$")), which treated Sep as a regex: the default
	## Sep = "." matched ANY final character (corrupting a pre-existing
	## d$Group value), and separators like "*" were invalid patterns.
	trailing <- endsWith(combined, Sep)
	combined[trailing] <- substr(combined[trailing], 1L,
		nchar(combined[trailing]) - nchar(Sep))

	return(combined)
}
################################################################################################
organizeTable <- function (d) {
	## Keep only the reporting strata of interest: called/raw filter states,
	## novel variants, inside intervals, all one-bp-indel categories.
	keep <- d$Filter %in% c("called", "raw") &
		!(d$IntervalStratification %in% "outside.intervals") &
		d$Novelty %in% "novel" &
		d$OneBPIndel %in% "all"
	d <- d[keep, ]

	## Restrict to the CountVariants fields we report on, in a fixed order.
	wanted <- c("EvalRod", "Filter", "IntervalStratification", "Novelty",
		"nCalledLoci", "nVariantLoci", "nSNPs", "nInsertions", "nDeletions")
	d <- d[, match(wanted, colnames(d))]

	## Sort rows for stable presentation.
	d[order(d$Novelty, d$Filter, d$IntervalStratification), ]
}
################################################################################################
## Clean up the supplementary table: normalize column names, simplify the
## variant-type labels, and annotate each record with the target locus and
## guide sequence for its sample group.
## NOTE(review): URA3GuideSeq / LYP1GuideSeq are globals defined in the main
## script section of this file.
organizeSuppTable <- function(d){
	## Make sure the column for chromosomes is called CHROM and not X.CHROM
	## (read.delim/make.names mangles a leading "#CHROM" header to "X.CHROM")
	colnames(d) <- gsub("X.CHROM", "CHROM", colnames(d))

	## Change the name of the column called EFF....Biotype.... to cas9SiteSeq
	colnames(d) <- gsub("EFF....BIOTYPE...", "cas9SiteSeq", colnames(d))

	## (Disabled) Convert empty cas9SiteSeq entries to NA
	#d$cas9SiteSeq <- ifelse(d$cas9SiteSeq=="", NA, d$cas9SiteSeq)

	## Collapse detailed variant-type labels (e.g. "DEL...") down to DEL / INS
	levels(d$VariantType) <- gsub("DEL.*","DEL",levels(d$VariantType), perl=T)
	levels(d$VariantType) <- gsub("INS.*","INS",levels(d$VariantType), perl=T)

	## Sample groups treated with each guide construct
	URA3Samples <- c("OR01", "OR02", "OR03", "OR04", "OR05")
	LYP1Samples <- c("OR06", "OR07", "OR08", "OR09")

	## Add target locus field. Records not matching either sample group get
	## the combined label. Assigning via d$col[rows] <- value (rather than
	## d[rows, ]$col <- value) is safe when zero rows match, which previously
	## raised "replacement has 1 row, data has 0".
	d$TargetLocus <- "URA3/LYP1"
	d$TargetLocus[d$Samples %in% URA3Samples] <- "URA3"
	d$TargetLocus[d$Samples %in% LYP1Samples] <- "LYP1"

	## Add target locus guide sequence field to match
	d$TargetLocusGuideSeq <- paste(URA3GuideSeq, "/", LYP1GuideSeq, sep = "")
	d$TargetLocusGuideSeq[d$Samples %in% URA3Samples] <- URA3GuideSeq
	d$TargetLocusGuideSeq[d$Samples %in% LYP1Samples] <- LYP1GuideSeq

	return(d)
}
################################################################################################
## Attach global pairwise-alignment scores of each cas9 site sequence against
## the URA3 and LYP1 guide sequences (globals defined in the main script).
## substitutionMatrix and scoreOnly are forwarded to pairwiseAlignment().
calcAlnScores <- function(d, substitutionMatrix = NULL, scoreOnly = TRUE){
	## Score each UNIQUE cas9 site once, then broadcast the per-sequence score
	## back onto every row carrying that sequence via unsplit().
	scoreAgainst <- function(guide) {
		perSeq <- by(d$cas9SiteSeq, d$cas9SiteSeq, function(grp) {
			pairwiseAlignment(as.character(grp[1]), guide,
				substitutionMatrix = substitutionMatrix, scoreOnly = scoreOnly)
		})
		unsplit(perSeq, d$cas9SiteSeq)
	}

	d$AlnScore_URA3 <- scoreAgainst(URA3GuideSeq)
	d$AlnScore_LYP1 <- scoreAgainst(LYP1GuideSeq)

	## Blank out scores for records whose samples were not treated with the
	## corresponding guide sequence construct.
	d$AlnScore_URA3 <- ifelse(d$TargetLocus %in% c("URA3/LYP1", "URA3"), d$AlnScore_URA3, NA)
	d$AlnScore_LYP1 <- ifelse(d$TargetLocus %in% c("URA3/LYP1", "LYP1"), d$AlnScore_LYP1, NA)

	return(d)
}
################################################################################################
## Count, for each unique cas9 site sequence, how many positions are identical
## to each guide sequence (URA3GuideSeq / LYP1GuideSeq, globals from the main
## script), and attach the counts as NumMatches_URA3 / NumMatches_LYP1.
## NOTE(review): substitutionMatrix and scoreOnly are accepted but never used
## in the body — the comparison is purely positional.
## NOTE(review): the cbind() below assumes the site sequence and the guide
## sequence have the same length — confirm cas9SiteSeq entries are 20 nt.
calcNumMatches <- function(d, substitutionMatrix = nucleotideSubstitutionMatrix(match=1,mismatch=0,baseOnly=T), scoreOnly = TRUE){

	## Count positional matches against the URA3 guide seq.
	## by() groups on each unique cas9SiteSeq; l[1] is one representative.
	URA3GuideSeqNumMatches <- by(d$cas9SiteSeq, d$cas9SiteSeq, function(l){
		cas9SiteSeq <- strsplit(as.character(l[1]),"")
		TargetLocusGuideSeq <- strsplit(as.character(URA3GuideSeq),"")
		## one row per base position: col 1 = site base, col 2 = guide base
		d<-cbind(as.data.frame(cas9SiteSeq),as.data.frame(TargetLocusGuideSeq))
		## match() on two single characters yields 1 on a match, NA otherwise,
		NumMatches <- by(d,rownames(d),function(d2){match(d2[[1]],d2[[2]])}) 
		## so the NA-stripped sum is the count of identical positions
		NumMatches <- sum(unsplit(NumMatches,rownames(d)),na.rm=T)
		}
	)

	## Count positional matches against the LYP1 guide seq (same scheme)
	LYP1GuideSeqNumMatches <- by(d$cas9SiteSeq, d$cas9SiteSeq, function(l){
		cas9SiteSeq <- strsplit(as.character(l[1]),"")
		TargetLocusGuideSeq <- strsplit(as.character(LYP1GuideSeq),"")
		d<-cbind(as.data.frame(cas9SiteSeq),as.data.frame(TargetLocusGuideSeq))
		NumMatches <- by(d,rownames(d),function(d2){match(d2[[1]],d2[[2]])}) 
		NumMatches <- sum(unsplit(NumMatches,rownames(d)),na.rm=T)
		}
	)

	## Put the per-sequence counts back into the table, one value per row,
	## mapping each row's cas9SiteSeq to its count
	d$NumMatches_URA3 <- unsplit(URA3GuideSeqNumMatches, d$cas9SiteSeq)
	d$NumMatches_LYP1 <- unsplit(LYP1GuideSeqNumMatches, d$cas9SiteSeq)

	## Remove match counts from records corresponding to samples that were not 
	## treated with the given guide sequence construct
	d$NumMatches_URA3 <- ifelse(d$TargetLocus %in% c("URA3/LYP1", "URA3"), d$NumMatches_URA3, NA)
	d$NumMatches_LYP1 <- ifelse(d$TargetLocus %in% c("URA3/LYP1", "LYP1"), d$NumMatches_LYP1, NA)
	
	return(d)
}
################################################################################################
## Function to calculate probability of a better match for GuideSeq
## amongst all (or a random selection of) Cas9 sequences (randSeqs)
## GuideSeq: a character string (or variable holding one) with the guide sequence
## AlnScores: a vector of alignment scores for sequences whose probability of a
##   better match is wanted, e.g. d3$AlnScore_URA3
## randSeqs: a vector of character strings to align the GuideSeq against
## substitutionMatrix: scoring matrix forwarded to pairwiseAlignment(); defaults
##   to the global ntSubMatrix, which was previously hard-coded inside the body
##   (parameterized for reuse; default keeps existing callers working).
calcProbs <- function(GuideSeq, AlnScores, randSeqs, substitutionMatrix = ntSubMatrix){
	## Score the guide against every background sequence.
	## vapply (unlike sapply) guarantees a numeric vector even for empty input.
	allScores <- vapply(randSeqs, function(Seq){
		pairwiseAlignment(GuideSeq, Seq, substitutionMatrix = substitutionMatrix, scoreOnly = TRUE)
		}, numeric(1)
	)
	## Empirical p-value: fraction of background scores at least as good as
	## each observed score (NA scores yield NA p-values).
	pvals <- vapply(AlnScores, function(AlnScore){
		sum(allScores >= AlnScore)/length(randSeqs)
		}, numeric(1)
	)
	return(pvals)
}
################################################################################################
## Remove filtered variants and simplify table
rmFltVar <- function(d){
	## Keep only variants whose FILTER status is PASS, and drop the
	## now-uniform FILTER column in the same subsetting step
	d <- d[d$FILTER == "PASS", colnames(d) != "FILTER"]

	## Strip sample IDs tagged "filterIn<ID>" out of the Samples factor levels
	## (these mark samples for which the variant was filtered), then remove
	## any comma left dangling at the start of a level
	levels(d$Samples) <- gsub(",?filterIn[a-zA-Z0-9]+", "", levels(d$Samples))
	levels(d$Samples) <- gsub("^,?", "", levels(d$Samples))

	return(d)
}
################################################################################################
################################################################################################
## Main
################################################################################################
#############################################

## Command-line usage: Rscript <script> [dataDir]
## The optional first argument is the directory holding the input files.
args <- commandArgs(TRUE)

## Plain if/else rather than ifelse(): the condition is a scalar, and
## ifelse() is meant for vectors.
dataDir <- if (is.na(args[1])) "." else args[1]

## All input and output paths below are relative to this directory.
setwd(dataDir)

#############################################

## Read in gatkreport; gsa.read.gatkreport() (gsalib) returns a list with one
## element per table in the report
l_window <- gsa.read.gatkreport("Individuals_Window_detailed.gatkreport")

## Grab the CountVariants part of the report and put in a data frame
d_window <- l_window$CountVariants

## Change the names from overlap.interval to "Within_Xbp_GG"
## NOTE(review): this renames the 3rd factor level by POSITION, which silently
## breaks if the level order of IntervalStratification changes — confirm
## against the report before reuse
levels(d_window$IntervalStratification)[3] <- "Xbp_GG"

d1 <- d_window

## Filter to the rows/columns of interest and sort (see organizeTable above)
d1 <- organizeTable(d1)

## Shorten the count column names for the wide-format table built below
colnames(d1)[match(c("nVariantLoci","nSNPs","nInsertions","nDeletions"), colnames(d1))] <- c("nVar", "nSNP", "nINS", "nDEL")

## Keep only the "called" filter stratum, and drop the 5th column
## (nCalledLoci, given the column order set by organizeTable — positional,
## so fragile if the selection changes)
d2 <- d1[d1$Filter %in% "called",]
d2 <- d2[, -5]

## Reshape to one row per EvalRod, with one set of count columns per
## IntervalStratification value (wide columns named e.g. nVar_<stratum>)
d2 <- reshape(d2, 
	varying = NULL, 
	v.names = c("nVar","nSNP","nINS","nDEL"), 
	timevar = "IntervalStratification",
	idvar = c("EvalRod"), 
	drop = c("Filter", "Novelty"), 
	direction="wide",
	sep = "_")

#############################################
#############################################

## d3b
## Each cas9 target seq that falls within the specified distance of a variant site is given
## Samples are grouped so each cas9 target seq is listed only once, but variant sites 
## are redundantly listed

## 20 nt guide (spacer) sequences for the two target loci; used as globals by
## organizeSuppTable(), calcAlnScores(), calcNumMatches() and calcProbs()
URA3GuideSeq <- "ACGTTACAGAAAAGCAGGCT"
LYP1GuideSeq <- "CATAATAACGTCCAATAAAT"


## Build a table with samples grouped according to variants detected
## (tab-delimited; empty fields become NA so records without a cas9 site
## can be dropped later with is.na())
File="pre-suppTable1_onePerLine.tab"
d3 <- read.delim(File, na.strings="")
d3 <- organizeSuppTable(d3)


##############################################################
## Calc global alignment score for URA3 and LYP1 guide seqs ##
##############################################################
## Scoring matrices (Biostrings):
##  - ntSubMatrix: match = +2, mismatch = -1, used for the alignment scores
##  - seqMatchMatrix: match = +1, mismatch = 0, passed to calcNumMatches
##    (which currently ignores its substitutionMatrix argument and counts
##    matches positionally)
#ntSubMatrix <- NULL
ntSubMatrix <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = TRUE)
seqMatchMatrix <- nucleotideSubstitutionMatrix(match = 1, mismatch = 0, baseOnly = TRUE)
#ntSubMatrix <- nucleotideSubstitutionMatrix(match = 1, mismatch = -1, baseOnly = TRUE)

############
## For d3 ##
############
## Annotate d3 with per-site match counts and alignment scores for each guide
d3 <- calcNumMatches(d3, substitutionMatrix = seqMatchMatrix)
d3 <- calcAlnScores(d3, substitutionMatrix = ntSubMatrix)


################################################################################
## Determine the probability of there being a better Cas9 match in the genome ##
################################################################################
## load bed file with cas9 sites (no header; column 5 is sampled below and
## aligned as sequence, so it presumably holds the site sequence)
dCas9Bed <- read.delim("cas9Sites_window.bed", header=F)

## randomly select 10000 cas9 sequences from the genome
## NOTE(review): sampling is without replacement, so the bed file must have
## >= 10000 rows; and no set.seed() is called, so the selection (and hence
## the p-values) differs between runs — confirm this is intended
randCas9Seqs <- as.character(sample(dCas9Bed[,5],10000))
#randCas9Seqs <- as.character(sample(dCas9Bed[,5],10))	# for testing, comment out the above line and use this line

############
## For d3 ##
############
## Run function to calculate probability of better match for GuideSeq
## amongst all (or a random selection of) Cas9 sequences (randSeqs):
## empirical p-value = fraction of background scores >= the observed score
d3$Probs_URA3 <- calcProbs(URA3GuideSeq, d3$AlnScore_URA3, randCas9Seqs)
d3$Probs_LYP1 <- calcProbs(LYP1GuideSeq, d3$AlnScore_LYP1, randCas9Seqs)

## Remove probabilities from records corresponding to samples that were not 
## treated with the given guide sequence construct
d3$Probs_URA3 <- ifelse(d3$TargetLocus %in% c("URA3/LYP1", "URA3"), d3$Probs_URA3, NA)
d3$Probs_LYP1 <- ifelse(d3$TargetLocus %in% c("URA3/LYP1", "LYP1"), d3$Probs_LYP1, NA)

################################################################################
## Reorganize tables 
################################################################################
## Below are the columns in d3 
#c("CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", "VariantType", "Samples", "set", "cas9SiteSeq", "TargetLocus", "TargetLocusGuideSeq", "AlnScore_URA3", "AlnScore_LYP1", "Probs_URA3", "Probs_LYP1")         
## Below are the columns in d4
#c("CHROM", "POS", "REF", "ALT", "QUAL", "FILTER", "VariantType", "Samples", "set", "cas9SiteSeq", "TargetLocus", "TargetLocusGuideSeq", "AlnScore", "Probs")         

d3b <- d3

## Replace d3b$Samples with a copy of d3b$set (sample set membership), with
## "-" separators converted to ",". The "filterIn[SampleID]" tags carried
## over from d3b$set remain here; they are stripped later by rmFltVar() when
## the filtered table (d7Flt) is produced.
d3b$Samples <- d3b$set
levels(d3b$Samples) <- gsub("-",",",levels(d3b$Samples))	#substitute "," for "-"

## Remove irrelevant columns, and reorder the remaining columns
d3b <- getCols (d3b, cols2Get = c("CHROM", "POS", "REF", "ALT", "FILTER", "VariantType", 
	"Samples", "cas9SiteSeq", "TargetLocus", "TargetLocusGuideSeq", "NumMatches_URA3", "NumMatches_LYP1",
	"AlnScore_URA3", "AlnScore_LYP1", "Probs_URA3", "Probs_LYP1")         
)

## remove records that do not have a cas9SiteSeq entry (i.e. the variants that
## were not within x bp from a cas9 sequence)
d3b <- d3b[!is.na(d3b$cas9SiteSeq),]

## Slim supplementary table: position, variant, cas9 site, match counts, samples
d7 <- getCols (d3b, cols2Get = c("CHROM", "POS", "REF", "ALT", "FILTER", "VariantType", 
	"cas9SiteSeq", "NumMatches_URA3", "NumMatches_LYP1", "Samples")         
)

#####################################################
## Remove filtered records and the "FILTER" column
d7Flt <- rmFltVar(d7)

## Re-run organizeSuppTable to regenerate the TargetLocus /
## TargetLocusGuideSeq annotation, which was not carried into d7 by the
## column selection above
d7Flt <- organizeSuppTable(d7Flt)

##################
## Write tables ##
##################
## Tab-separated output, row names suppressed
## (FALSE spelled out rather than F, which is a reassignable binding)
write.table(d7, file="d7.tab", sep = "\t", row.names = FALSE)

write.table(d7Flt, file="d7Flt.tab", sep = "\t", row.names = FALSE)

##################
##  Save RData  ##
##################
## Save the complete workspace so intermediate tables can be re-examined
## interactively without re-running the alignments
save.image(file = "cas9_analysis.RData")