# Reads reformatted output of CAFE v4 from cafe_output_bash_process.txt into R for processing
# Enrichment analyses assume that the following functions are loaded from the ortho.R code:
# 	enrich.test
#	enriched.terms
#	write.enrich.terms.table

# Load libraries
library(dplyr)
# The biomaRt package must also be installed; it is not attached with library() because some of its function names overlap with dplyr

# Table of p-values for family-wide gene family expansion rate shift
# (the first column of the file supplies the row names)
table2 <- read.table("vert_cafe_1r_table_og-familypval.txt", sep = "\t", header = TRUE, row.names = 1)

# Table of Viterbi p-values for shifts in gene family expansion/contraction rate
# ("-" entries are read as NA)
table3 <- read.table("vert_cafe_1r_table_viterbipval.txt", sep = "\t", header = TRUE, na.strings = "-", row.names = 1)
# read.table() prefixes column names that start with a digit with "X";
# strip only that leading character so an "X" elsewhere in a name survives.
colnames(table3) <- sub("^X", "", colnames(table3))

# The selection below pairs the rows of table2 and table3 by position, so flag
# any disagreement in row names rather than silently mixing up gene families.
if (!identical(rownames(table2), rownames(table3))) {
  warning("Row names of table2 and table3 differ; table3_sig is selected by row position", call. = FALSE)
}

# Get gene families with significant rate shift
# (which() drops families whose family-wide p-value is NA)
table3_sig <- table3[which(table2$Family.wide.P.value < .05),]

# Load cluster annotations for enrichment test
# ("None" is read as NA; quoting and comment characters are disabled so that
# free-text descriptions are read verbatim)
cluster_ann <- read.table("cluster_functional_annotation.tsv",comment.char = "",header=TRUE,sep="\t",quote="",na.strings = "None",stringsAsFactors = FALSE)

# split annotations so that they're easier to process, and reformat
# NOTE(review): this assumes the first cluster_n rows (one per distinct cluster ID)
# hold the GO annotations and every remaining row holds Pfam annotations --
# confirm against the layout of the annotation file.
cluster_n <- length(unique(cluster_ann$X.cluster_id))
# A logical mask is used instead of (cluster_n+1):nrow(cluster_ann) because that
# sequence counts backwards (and selects bogus rows) when nothing follows the
# first section.
in_first_section <- seq_len(nrow(cluster_ann)) <= cluster_n
cluster_GO <- cluster_ann[in_first_section, ]
cluster_Pfam <- cluster_ann[!in_first_section, ]

# Read in orthogroup information so they can be matched with human cancer gene information
# Read in the Orthogroups.csv, an output file from OrthoFinder
orthogroups_csv <- read.table("Orthogroups.csv", stringsAsFactors = FALSE, header = TRUE, sep = "\t")
colnames(orthogroups_csv)[1] <- "clusters"

# One row per (orthogroup, human protein ID): split the ", "-separated hsap field
# into a list column and expand it into rows.
# unnest() belongs to tidyr, which this script does not attach, so it is called
# through its namespace.
ensembl_hsap_ids <- orthogroups_csv %>%
  select(clusters, hsap) %>%
  mutate(hsap = strsplit(hsap, ", ")) %>%
  tidyr::unnest(hsap)

# Look up gene ID, gene name and description for every human peptide ID through
# Ensembl BioMart (requires network access). biomaRt is not attached either, so
# its functions are namespace-qualified as well.
hsap_mart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
BM_hsap_ids <- biomaRt::getBM(
  filters = "ensembl_peptide_id_version",
  attributes = c("ensembl_peptide_id_version", "ensembl_gene_id", "external_gene_name", "description"),
  values = ensembl_hsap_ids$hsap,
  mart = hsap_mart
)

# Download and format cancer census table
# Download version used was from cancer.sanger.ac.uk: cancer_gene_census_2020_04_07_v91.csv
census_raw <- read.csv("cancer_gene_census_2020_04_07_v91.csv")

# Extract one Ensembl gene ID per census gene from the comma-separated Synonyms
# field. Only that column is needed, so it is split directly instead of running
# apply() over the whole data frame (which coerces every column to character).
census_synonyms <- strsplit(as.character(census_raw$Synonyms), ",")
census_ENSG <- vapply(census_synonyms, function(syn) {
  ensg <- grep("ENSG", syn, value = TRUE)
  # Keep the last ENSG entry when several are listed; NA when there is none
  if (length(ensg) > 0) ensg[length(ensg)] else NA_character_
}, character(1))

# Drop the version suffix (everything from the first "." onwards).
# The result is a plain, unnamed character vector with one element per census row.
census_ENSG <- sub("\\..*$", "", census_ENSG)

# Attach the BioMart gene annotation to every (orthogroup, human peptide) pair
ensembl_hsap_ids_gene <- full_join(x = ensembl_hsap_ids, y = BM_hsap_ids, by = c("hsap" = "ensembl_peptide_id_version"))

# Add the Ensembl gene ID and the orthogroup of each census gene to the census table.
# match() keeps only the first hit, so a gene whose peptides sit in several
# orthogroups is assigned the first one listed.
census_tab <- cbind(census_raw, census_ENSG)
census_tab_clusters <- ensembl_hsap_ids_gene[["clusters"]][match(census_ENSG, ensembl_hsap_ids_gene$ensembl_gene_id)]
# Assign the column by name. With cbind(census_tab, census_tab_clusters) the column
# is only called "clusters" when the right-hand side is a one-column table; for a
# plain vector it is named "census_tab_clusters", census_tab$clusters is NULL and
# every family below ends up labelled "noncancer".
census_tab$clusters <- census_tab_clusters

# Orthogroups containing at least one census gene (all tiers / Tier 1 only);
# NA (census genes without an orthogroup) is removed from both.
cancer_ogs <- unique(census_tab$clusters)
cancer_ogs <- cancer_ogs[! is.na(cancer_ogs)]
cancer_ogs_t1 <- unique(census_tab$clusters[which(census_tab$Tier == 1)])
cancer_ogs_t1 <- cancer_ogs_t1[! is.na(cancer_ogs_t1)]

# NOTE(review): `orthogroups` is not created anywhere in the visible part of this
# script (only orthogroups_csv is) -- presumably a per-species gene count table
# loaded elsewhere; confirm it exists before this line runs.
hsap_ogs <- rownames(orthogroups)[which(orthogroups$hsap > 0)]

# Annotation table in the same three-column layout as the functional annotations,
# labelling every annotated cluster as "cancer" or "noncancer"
cluster_cancer_ids <- unique(cluster_ann$X.cluster_id)
cluster_cancer_label <- ifelse(cluster_cancer_ids %in% census_tab$clusters, "cancer", "noncancer")
cluster_cancer <- data.frame("X.cluster_id" = cluster_cancer_ids, "domain_ids" = cluster_cancer_label, "domain_description" = cluster_cancer_label)


# Analysis:

# Get number of shifts for gene families with significant rate shifts
# (number of branches per family with a Viterbi p-value < 0.05; NA entries are not counted)
nshifts <- rowSums(table3_sig < 0.05, na.rm = TRUE)

# Get mean number of shifts
# (the original line referred to the misspelled, non-existent object `nshfits`)
mean(nshifts)

# Get the enrichments of all gene families with a rate shift (result provided as supplementary table):
# cluster_IPR is not created anywhere in this script; it is included only if it
# already exists in the session, otherwise the GO and Pfam tables are tested.
# NOTE(review): confirm whether a separate InterPro table was intended here.
allshift_annotations <- if (exists("cluster_IPR")) {
  list(cluster_GO, cluster_IPR, cluster_Pfam)
} else {
  list(cluster_GO, cluster_Pfam)
}
allshift_vall <- enrich.test(allshift_annotations, rownames(table3_sig), rownames(table3))
enriched.terms(cluster_ann,rownames(table3_sig),allshift_vall)
# NOTE(review): enriched.terms.table is not among the ortho.R functions listed in
# the file header -- confirm it is loaded.
enrich_allshift_tab <- enriched.terms.table(cluster_ann,rownames(table3_sig),allshift_vall)
write.enrich.terms.table(enrich_allshift_tab, "Supp_Table_allshift_fxn.txt")


# Focusing on gene families and branch-specific rate shifts:

# These are relevant gigantism node numbers:
# rtyp<6>: 6
# mmol<28>: 29
# lafr<48>: 49
# bacu<66>: 67
# bmys<68>: 68
# bwls<67>: 66
# 6, 28, 48, (66, 68) or 67

# These are non-giant sister node numbers:
# cpun<4>: 
# dnig<30>: 
# trub<32>
# <31>:  # Tetraodontidae
# pcap<50>: 
# ttru<64>: 
# 4,30,32,31,50,64

# Column labels of the giant branches (including the MRCA of baleen whales) and
# a mask over the columns of table3_sig that marks them
giant_branch_ids <- as.character(c(6,28,48,66,68,67))
on_giant_branch <- colnames(table3_sig) %in% giant_branch_ids

# Gene families with a rate shift on every giant branch and on no other branch
which(apply(table3_sig, MARGIN = 1, function(pvals) {
  shifted_on_giants <- pvals[on_giant_branch] < 0.05
  unshifted_elsewhere <- pvals[! on_giant_branch] >= 0.05
  all(c(shifted_on_giants, unshifted_elsewhere))
}))
#none

# Gene families with a rate shift on at least one giant branch and on no other branch
which(apply(table3_sig, MARGIN = 1, function(pvals) {
  any(pvals[on_giant_branch] < 0.05) & all(pvals[! on_giant_branch] >= 0.05)
}))
# OG0000309 OG0000515 OG0000521 OG0000609 OG0003706 OG0010763 



# Gene families that shifted along the whale shark branch (column "6"),
# regardless of what happened on other branches in the phylogeny:
on_ws_branch <- colnames(table3_sig) %in% "6"
ws_shift <- names(which(apply(table3_sig, MARGIN = 1, function(pvals) pvals[on_ws_branch] < 0.05)))

# enrichment test (whale shark shifts against all families with a significant shift):
ws_shift_vsallsig <- enrich.test(list(cluster_GO,cluster_Pfam), ws_shift, rownames(table3_sig))
enriched.terms(cluster_ann,ws_shift,ws_shift_vsallsig)


# Gene families that shifted along the whale shark branch only:
which(apply(table3_sig, MARGIN = 1, function(pvals) {
  pvals[on_ws_branch] < 0.05 & all(pvals[! on_ws_branch] >= 0.05)
}))
# OG0010763


# Testing for enrichment of cancer genes among gene families with a rate shift:

# Names of the families in table3_sig with a Viterbi p-value < 0.05 on at least
# one of the branches whose column label is in branch_ids
families_shifted_on <- function(branch_ids) {
  in_set <- colnames(table3_sig) %in% branch_ids
  names(which(apply(table3_sig, MARGIN = 1, function(pvals) any(pvals[in_set] < 0.05))))
}

# cancer genes:
# Test for enrichment of genes with a shift along a branch leading to a giant taxon:
enrich_cancer_gshift <- enrich.test(
  list(cluster_cancer),
  families_shifted_on(as.character(c(6,28,48,66,68,67))),
  rownames(table3_sig)
)

# Test for enrichment of genes with a shift along a branch sister to a giant taxon
enrich.test(
  list(cluster_cancer),
  families_shifted_on(as.character(c(4,30,32,31,50,64))),
  rownames(table3_sig)
)

# Test for enrichment of genes with a shift along six random branches
# First, generate 100 random samples of six branch labels each
# (no seed is set here, so the draws differ between runs)
randbranchsamples <- replicate(100, sample(colnames(table3), 6), simplify = FALSE)

# Perform enrichment tests on all 100 random samples
enrichtest_randbranchsamples <- lapply(randbranchsamples, function(branch_ids) {
  enrich.test(list(cluster_cancer), families_shifted_on(branch_ids), rownames(table3_sig))
})

# Proportion of random samples whose odds ratio exceeds the observed odds ratio (aka p-value):
random_odds_ratios <- sapply(enrichtest_randbranchsamples, function(res) res$Fisher.Test$estimate.odds.ratio[1])
observed_odds_ratio <- enrich_cancer_gshift$Fisher.Test$estimate.odds.ratio[1]
sum(random_odds_ratios > observed_odds_ratio, na.rm = TRUE)/length(enrichtest_randbranchsamples)


# Testing for whether gene families that shifted along giant branches were enriched for functional terms:

# Families with a significant shift on at least one giant branch; computed once
# and reused for both the test and the term listing below
gshift_cols <- colnames(table3_sig) %in% as.character(c(6,28,48,66,68,67))
gshift_families <- names(which(apply(table3_sig, MARGIN = 1, function(x) any(x[gshift_cols] < 0.05))))

# cluster_IPR is not created anywhere in this script; it is included only if it
# already exists in the session, otherwise the GO and Pfam tables are tested.
# NOTE(review): confirm whether a separate InterPro table was intended here.
gshift_annotations <- if (exists("cluster_IPR")) {
  list(cluster_GO, cluster_IPR, cluster_Pfam)
} else {
  list(cluster_GO, cluster_Pfam)
}

# Giant-branch shifts are tested against all families with a significant shift
gshift_vsig <- enrich.test(gshift_annotations, gshift_families, rownames(table3_sig))
enriched.terms(cluster_ann, gshift_families, gshift_vsig)



# Comparison of rates along branches of cancer and non-cancer genes on giant vs. non-giant branches:

# Read the per-family CAFE trees. read.tree() comes from the ape package, which
# is not attached in this script's library section, so it is called through its
# namespace. skip = 1 is forwarded to scan() and skips the first line of the
# file (presumably a header line -- confirm).
table_trees <- ape::read.tree("vert_cafe_1r_table_og-trees.txt", skip = 1)

# cafeBranchRates: turn a CAFE gene-family tree into per-branch rates of size change.
#   tree: phylo-like list with $tip.label, $node.label, $edge and $edge.length.
#         The second "_"-separated field of every tip/node label is read as the
#         gene family size at that node.
#   Returns the tree with edge.length replaced by
#   (size at parent - size at child) / original branch length for every edge.
cafeBranchRates <- function(tree) {
	# Second "_"-separated field of each label (NA when a label has no such field)
	size_field <- function(labels) {
		vapply(strsplit(labels, "_"), function(fields) fields[2], character(1))
	}
	# Sizes indexed by node number as used in tree$edge: tips first, then internal nodes
	counts <- as.numeric(c(size_field(tree$tip.label), size_field(tree$node.label)))
	parent <- tree$edge[, 1]
	child <- tree$edge[, 2]
	# Only the returned copy is changed; the caller's tree is left untouched
	tree$edge.length <- (counts[parent] - counts[child]) / tree$edge.length
	tree
}

# Run the function
cafe_rates <- lapply(table_trees, cafeBranchRates)

# Compute mean rates for different categories of giant and non-giant branches for each gene family
caferates_bg_v_giants_abs <- lapply(cafe_rates, function(x) {
	# An edge counts as "giant" when its child node is one of the giant tips or node 69.
	# NOTE(review): 69 is hard-coded; presumably the node number of the baleen whale
	# MRCA in these trees -- confirm.
	giant_nodes <- c(grep("rtyp|mmol|lafr|bacu|bmys", x$tip.label), 69)
	on_giant <- x$edge[,2] %in% giant_nodes
	abs_rates <- abs(x$edge.length)
	c("bg" = mean(abs_rates[! on_giant]), "giant" = mean(abs_rates[on_giant]))
	}
)
# One row per gene family, columns "bg" and "giant"
caferates_bg_v_giants_abs <- do.call(rbind, caferates_bg_v_giants_abs)

# cafe_cancer (cancer / noncancer label per gene family, named by family) is not
# created anywhere in this script. If the session does not already provide it,
# rebuild it from cancer_ogs in the row order of the rate matrix.
# NOTE(review): confirm this matches the definition used for the published analysis.
if (!exists("cafe_cancer")) {
	cafe_cancer <- ifelse(rownames(caferates_bg_v_giants_abs) %in% cancer_ogs, "cancer", "noncancer")
	names(cafe_cancer) <- rownames(caferates_bg_v_giants_abs)
}

# Subset gene family rates for just the ones with a significant rate shift:
# (drop = FALSE keeps a matrix even when a single family is selected)
caferates_bg_v_giants_abs_sig <- caferates_bg_v_giants_abs[rownames(caferates_bg_v_giants_abs) %in% rownames(table3_sig), , drop = FALSE]
cafe_cancer_sig <- cafe_cancer[names(cafe_cancer) %in% rownames(table3_sig)]

# The labels are paired with the rate rows by position below (and again when
# plotting), so refuse to continue if the two are not in the same order.
if (!identical(names(cafe_cancer_sig), rownames(caferates_bg_v_giants_abs_sig))) {
	stop("cafe_cancer_sig and caferates_bg_v_giants_abs_sig do not list the same gene families in the same order", call. = FALSE)
}

# Tidy the object for testing in lmer: one row per (gene family, branch type)
caferates_bg_v_giants_abs_sig_tidy <- data.frame("rates" = c(caferates_bg_v_giants_abs_sig[,1], caferates_bg_v_giants_abs_sig[,2]), 
	"branchtype" = rep(c("bg", "giant"), each = nrow(caferates_bg_v_giants_abs_sig)), 
	"OG" = rep(rownames(caferates_bg_v_giants_abs_sig), 2),
	"cancer" = rep(cafe_cancer_sig, 2)
	)

# Using a mixed model to test whether rates of gene family evolution differs along giant vs. non-giant branches between cancer vs. non-cancer gene families
# (random intercept per gene family; lmer() comes from lme4, which is not attached
# in this script's library section, hence the namespace prefix)
mixed.lmer.sig <- lme4::lmer(rates ~ branchtype*cancer + (1|OG), data = caferates_bg_v_giants_abs_sig_tidy)

# Figure 4: rates of gene family evolution for cancer and non-cancer gene families
# along giant vs. non-giant (background) branches, written to Fig4.pdf
pdf(file = "Fig4.pdf", width = 8, height = 6)

# Wide left margin to leave room for the two-line row labels
par(mar = c(4.5,11,1,1))
# Small offset for 0 rates which cannot be plotted on log scale
xmin <- 0.001

# The four groups of per-family mean rates (column 1 = background, column 2 = giant)
is_noncancer_family <- cafe_cancer_sig %in% "noncancer"
is_cancer_family <- cafe_cancer_sig %in% "cancer"
rates_noncancer_bg <- caferates_bg_v_giants_abs_sig[is_noncancer_family, 1]
rates_noncancer_giant <- caferates_bg_v_giants_abs_sig[is_noncancer_family, 2]
rates_cancer_bg <- caferates_bg_v_giants_abs_sig[is_cancer_family, 1]
rates_cancer_giant <- caferates_bg_v_giants_abs_sig[is_cancer_family, 2]

# Empty plot area with a log-scaled x axis; the four groups sit at y = 4, 3, 2, 1
plot(0, pch = "", xlim = c(xmin, max(caferates_bg_v_giants_abs_sig_tidy$rates)), log = "x", ylim = c(0.25,4.75), yaxs = "i", yaxt = "n", xlab = "Rates of Gene Family Size Evolution", ylab = "")
fig4_row_labels <- c("Non-cancer gene families\nalong non-giant branches", 
                     "Non-cancer gene families\nalong giant branches", 
                     "Cancer gene families\nalong non-giant branches", 
                     "Cancer gene families\nalong giant branches")
axis(2, at = 4:1, labels = fig4_row_labels, las = 2)

# Each row gets a density outline scaled to half a row in height above its
# baseline, a notched boxplot on the baseline, and jittered points 0.25 below it.

# Row 4: non-cancer families, background branches
dens.noncancer.bg.sig <- density(rates_noncancer_bg + xmin, cut  = 0)
polygon(x = c(min(rates_noncancer_bg) + xmin, dens.noncancer.bg.sig$x), 
	y = c(4, dens.noncancer.bg.sig$y/(2*max(dens.noncancer.bg.sig$y)) + 4),
	col = "turquoise"
	)
boxplot(x = rates_noncancer_bg + xmin, at = 4, horizontal = TRUE, add = TRUE, notch = TRUE, boxwex = 0.25, col = "turquoise", outline = FALSE)
points(rates_noncancer_bg + xmin, jitter(numeric(length(rates_noncancer_bg)), factor = 8) + 3.75, pch = 16, cex = 0.5)

# Row 3: non-cancer families, giant branches
dens.noncancer.giant.sig <- density(rates_noncancer_giant + xmin, from = xmin)
polygon(x = c(xmin, dens.noncancer.giant.sig$x[dens.noncancer.giant.sig$x > 0]), 
	y = c(3, dens.noncancer.giant.sig$y/(2*max(dens.noncancer.giant.sig$y)) + 3),
	col = "turquoise"
	)
boxplot(x = rates_noncancer_giant + xmin, at = 3, horizontal = TRUE, add = TRUE, notch = TRUE, boxwex = 0.25, col = "turquoise", outline = FALSE)
points(rates_noncancer_giant + xmin, jitter(numeric(length(rates_noncancer_giant)), factor = 8) + 2.75, pch = 16, cex = 0.5)

# Row 2: cancer families, background branches
dens.cancer.bg.sig <- density(rates_cancer_bg + xmin, cut = 0)
polygon(x = c(min(rates_cancer_bg) + xmin, dens.cancer.bg.sig$x[dens.cancer.bg.sig$x > 0]), 
	y = c(2, dens.cancer.bg.sig$y/(2*max(dens.cancer.bg.sig$y)) + 2),
	col = "sienna2"
	)
boxplot(x = rates_cancer_bg + xmin, at = 2, horizontal = TRUE, add = TRUE, notch = TRUE, boxwex = 0.25, col = "sienna2", outline = FALSE)
points(rates_cancer_bg + xmin, y = jitter(numeric(length(rates_cancer_bg)), factor = 8) + 1.75, pch = 16, cex = 0.5)

# Row 1: cancer families, giant branches
dens.cancer.giant.sig <- density(rates_cancer_giant + xmin, from = xmin)
polygon(x = c(xmin, dens.cancer.giant.sig$x[dens.cancer.giant.sig$x > 0]), 
	y = c(1, dens.cancer.giant.sig$y/(2*max(dens.cancer.giant.sig$y)) + 1),
	col = "sienna2"
	)
boxplot(x = rates_cancer_giant + xmin, at = 1, horizontal = TRUE, add = TRUE, notch = TRUE, boxwex = 0.25, col = "sienna2", outline = FALSE)
points(rates_cancer_giant + xmin, y = jitter(numeric(length(rates_cancer_giant)), factor = 8) + 0.75, pch = 16, cex = 0.5)

# Close the PDF device so the file is written
dev.off()