#######################################################################################################################
#######################################################################################################################
### Performs the permutation test of coevolution for putatively interacting sites across the alphaherpesvirinae   #####
### Method and results described in:																			  #####
### Butt BG, Owen DJ, Jeffries CM, Ivanova L, Hill CH, Houghton JW, Ahmed MF, Antrobus R,                         #####
### Svergun DI, Welch JJ, Crump CM, Graham SC                                                                     #####
### "Insights into herpesvirus assembly from the structure of the pUL7:pUL51 complex"                             #####
#######################################################################################################################
#######################################################################################################################
# setwd('Change to site of the "inputfiles folder"')

## This flag corresponds to the 4 data sets listed in Table S4
mydataset <- 2 # [1-4]

# Should we remove UL7 sequences that are short or difficult to align?
removesequences <- switch(mydataset,F,T,T,T)
# Should we remove interactions that do not involve side chain atoms?
sidechainonly <- switch(mydataset,F,F,T,F)
# Should we retain only sections of alignment conserved across viral subfamilies?
useonlyHMMER <- switch(mydataset,F,F,F,T)

# Number of times to randomly permute the interaction pairs
nperms <- 1e6

library(seqinr)
inputdir <- 'inputfiles/'

# Which viral subfamily
vg <- 'alpha' 
# 1 for sidechain only, 0 otherwise (2 for reduced set of interactions alignable across subfamilies)
minannot <- 0
if(sidechainonly)
	minannot <- 1
#######################################################################################################################
#######################################################################################################################
# Calculates the correlation coefficient in allelic sites between states (Cf. Zaykin et al. 2008)
getcor <- function(i,j,t5,t7)
{
	t5 <- t5[,i]
	t7 <- t7[,j]
	
	l <- length(t5)
	u7 <- unique(t7); l7 <- length(u7)
	u5 <- unique(t5); l5 <- length(u5)
	
	# Return 0 if either site is monomorphic
	if(min(l7,l5)==1) return(0)
	
	# These are the possible combinations of two amino acids at this site
	doubs <- as.vector(outer(u7,u5,FUN=paste0))
	# Frequencies....
	pij <- table(factor(paste0(t7,t5),levels=doubs))/l
	pi <- rep(table(factor(t7,levels=u7))/l,times=l5)
	pj <- rep(table(factor(t5,levels=u5))/l,each=l7)
	
	# This is eqs. 1 and 3 of Zaykin et al.
	rij <- as.vector(pij-pi*pj)
	rij <- rij/as.vector(sqrt(pi*(1-pi)*pj*(1-pj)))
	ans <- (l7-1)*(l5-1)*l/l5/l7 * sum(rij*rij)
	
	# An alternative information based measure...
	#ans <- pij*log2(pij/(pi*pj))
	#ans[which(pij==0)] <- 0
	#ans <- sum(ans)
		
	return(ans)	
}	
#######################################################################################################################
#######################################################################################################################
# Return the summed correlation measures for all pairs of interacting sites
getval <- function(i,j,a)
{
	b <- mapply(FUN=function(i,j,a) {return(a[i,j])},i,j,MoreArgs=list(a=a))
	return(sum(b))
}	
#######################################################################################################################
#######################################################################################################################
x <- read.table(paste0(inputdir,'InteractionsLookup.txt'),header=T)
y <- read.table(paste0(inputdir,'virgroups.txt'),header=T)
y <- y[which(y$group==vg),]

# Load in the files that match the annotated interaction sites to the multiple alignment
load(paste0(inputdir,'UL51.',vg,'.Rdata')); A5 <- A; rm(A)
f5 <- read.fasta(paste0(inputdir,'ul51.',vg,'.alignment.fas'))
# Remove short sequence
f5 <- f5[-which(names(f5) %in% c('A0A2Z4H851_HHV1/2-166'))]

if(useonlyHMMER) vg <- paste0(vg,'.HMMER')
load(paste0(inputdir,'UL7.',vg,'.Rdata')); A7 <- A; rm(A)
f7 <- read.fasta(paste0(inputdir,'ul7.',vg,'.alignment.fas'))
# Remove short sequences
f7 <- f7[-which(names(f7) %in% c('A0A120I2R6_HHV2','A0A097HXP5_HHV3','A0A286MM74_9ALPH','A0A2Z4H5E9_HHV1','A0A120I2N0_HHV1'))]


### remove further strains that are too short or unalignable (removing an unacceptable number of interacting sites)
if(removesequences)
	f7 <- f7[-which(names(f7) %in% c('B7FEJ7_9ALPH','A0A0X8E9M8_HHV1'))]

n5 <- as.vector(sapply(names(f5[-1]),FUN=function(z) {strsplit(z,'/',fixed=T)[[1]][1]}))
n7 <- as.vector(sapply(names(f7[-1]),FUN=function(z) {strsplit(z,'/',fixed=T)[[1]][1]}))
strains <- which(y$UniProtIDpUL51 %in% n5 & y$UniProtIDpUL7 %in% n7)

mt5 <- match(y$UniProtIDpUL51[strains],n5)
mt7 <- match(y$UniProtIDpUL7[strains],n7)

f5 <- f5[c(1,mt5+1)]
f7 <- f7[c(1,mt7+1)]
t7 <- matrix(unlist(f7),length(f7),length(f7[[1]]),byrow=T)
t5 <- matrix(unlist(f5),length(f5),length(f5[[1]]),byrow=T)

# Check for interactions involving residues that are missing in one or more sequence
m7 <- apply(t7[-1,],MARGIN=2,FUN=function(z) {length(which(z=='-')) })
m5 <- apply(t5[-1,],MARGIN=2,FUN=function(z) {length(which(z=='-'))})

# Remove any interacting sites that are missing a residue in any sequence in the alignment
A7['annot',which(A7['shared.gene',]==0)] <- 0
wA <- which(A7['annot',]>minannot)
wt <- which(t7[1,]>minannot)
w <- which(m7[wt]>0); 
A7['annot',wA[w]] <- 0

A5['annot',which(A5['shared.gene',]==0)] <- 0
wA <- which(A5['annot',]>minannot)
wt <- which(t5[1,]>minannot)
w <- which(m5[wt]>0)
A5['annot',wA[w]] <- 0

## Get the complete set of potential interactions (after excluding sequences and gaps)
i5 <- A5['HSV1',which(A5['annot',] > minannot)]
i7 <- A7['HSV1',which(A7['annot',] > minannot)]
if(!(all(i7 %in% x[,'UL7'])))
	stop('Problem with UL7 annotation')
if(!(all(i5 %in% x[,'UL51'])))
	stop('Problem with UL51 annotation')

w <- which( (x[,'UL51'] %in% i5) & (x[,'UL7'] %in% i7)); lw <- length(w)
x <- x[w,]

# Get the final set of alignments...
wA <- which(A7['annot',]>minannot)
missing7 <- which(!(A7['HSV1',wA] %in% x[,'UL7']))
wt <- which(t7[1,]>minannot)
t7 <- t7[,wt]; m7 <- m7[wt]
w <- c(missing7,which(m7>0))
if(length(w)>0)
	t7 <- t7[,-w]
t7 <- t7[-1,]

wA <- which(A5['annot',]>minannot)
missing5 <- which(!(A5['HSV1',wA] %in% x[,'UL51']))
wt <- which(t5[1,]>minannot)
t5 <- t5[,wt]; m5 <- m5[wt]
w <- c(missing5,which(m5>0))
if(length(w)>0)
	t5 <- t5[,-w]
t5 <- t5[-1,]

# Create a new version of the interaction lookup table that corresponds to the alignments.
xi <- x[,c('UL51','UL7')]
xi[,'UL7'] <- match(x[,'UL7'],sort(unique(x[,'UL7'])))
xi[,'UL51'] <- match(x[,'UL51'],sort(unique(x[,'UL51'])))

# The number of alleles left in the analysis:
n5 <- dim(t5)[2]
n7 <- dim(t7)[2]

## TEST: Does a sequence coevolve with iteself... it should!
#xi[,'UL7'] <- rep(1:n5,length.out=dim(xi)[1])
#xi[,'UL51'] <- rep(1:n5,length.out=dim(xi)[1])
#t7 <- t5; n7 <- n5

## Populate a matrix with the correlation between all pairs of sites involved in interactions
a <- matrix(NA,n5,n7)
for(i in 1:n5)
{
	for(j in 1:n7)
	{
		a[i,j] <- getcor(i,j,t5,t7)	
	}
}

## The true value of the test statistic "z"
true.val <- getval(xi[,'UL51'],xi[,'UL7'],a)
## Randomly permute the interaction pairs
perms <- sapply(1:nperms,FUN=function(z,i,j,a) { getval(i,sample(j,replace=T),a) } ,a=a,i=xi[,'UL51'],j=xi[,'UL7'])

## Display the result
graphics.off()
hist(c(perms,true.val))
abline(v=true.val,col='red')
cat('Str:',length(strains),' I:',dim(xi)[1],' UL51:',length(unique(xi[,'UL51'])),' UL7:',length(unique(xi[,'UL7'])),'\n')
cat('z=',true.val,'p=',length(which(perms>true.val))/length(perms),'\n')