# before proceeding, set working directory to be wherever RNA-seq and other data files are stored

# clear memory
rm(list=ls())

Import data files

# RNA-seq results, filtered for differentially expressed genes (FDR < 0.05)
dat <- read.table("diff_exp.txt",sep="\t",header=T,stringsAsFactors=F)
head(dat)
##    Symbol log.FC      FC log.CPM     CPM      P.value          FDR
## 1   S100g  10.26 1224.97    0.22    1.17 1.020000e-17 7.300000e-16
## 2  Slc9a4   8.78  439.22   -1.07    0.47 3.470000e-12 1.120000e-10
## 3    Pga5   6.74  106.78   10.73 1697.60 3.350000e-39 1.810000e-36
## 4 Glycam1   6.66  101.37    0.33    1.26 1.064832e-03 5.725028e-03
## 5    Chia   6.60   96.79    9.35  653.29 2.630000e-35 1.120000e-32
## 6    Fgl1   6.57   94.94    1.20    2.29 2.180000e-25 4.070000e-23
# gene signatures from: 
# Singh et al., Cell 2009 (http://www.cell.com/cancer-cell/abstract/S1535-6108%2809%2900111-1, Table S1: top 250 RAS addiction-specific probes)
# Loboda et al., BMC Med Genom 2010 (http://www.biomedcentral.com/1755-8794/3/26, Additional File 3)
# All gene symbols updated to most current HGNC nomenclature (via http://www.genenames.org/cgi-bin/symbol_checker)
singh <- scan("singh.txt",what="character",sep="\n")
#collapse to unique symbols
singh <- unique (singh)
loboda <- scan("loboda.txt",what="character",sep="\n")
ras <- unique (c(singh,loboda))

# PDAC subtype signatures from Collisson et al., Nature Medicine 2011
classical <- scan("Collisson_classical.txt",what="character",sep="\n")
exocrine <- scan("Collisson_exocrine.txt",what="character",sep="\n")

Convert human genes to mouse, via HomoloGene IDs

# species conversion function
convert <- function (symbols,in_spec,out_spec,hom) {
  # symbols = vector of gene symbols from first species
  # in_spec = Taxon ID of first species
  # out_spec = Taxon ID of species to which you wish to convert gene symbols
  # hom = HomoloGene table
  
  hom1 = hom[hom$NCBI.Taxon.ID==in_spec,c("HomoloGene.ID","Symbol")]
  colnames(hom1) = c("HomoloGene.ID","Former.Symbol")
  hom2 = hom[hom$NCBI.Taxon.ID==out_spec,c("HomoloGene.ID","Symbol","EntrezGene.ID")]
  
  matches = toupper(hom1$Former.Symbol) %in% toupper(symbols)
  IDs = hom1$HomoloGene.ID[matches]
  out = hom2[hom2$HomoloGene.ID %in% IDs,]
  
  out2 <- merge (out,hom1,by="HomoloGene.ID")
  
  return (out2)
}

# read mouse-human homology table (downloaded from http://www.informatics.jax.org/homology.shtml)
hom <- read.delim("HOM_MouseHumanSequence.rpt",sep="\t")
ras_mus <- convert (ras,9606,10090,hom)
classical_mus <- convert (classical,9606,10090,hom)
exocrine_mus <- convert (exocrine,9606,10090,hom)

write.table(ras_mus,"ras_mus.txt",sep="\t",row.names=F)
write.table(classical_mus,"classical_mus.txt",sep="\t",row.names=F)
write.table(exocrine_mus,"exocrine_mus.txt",sep="\t",row.names=F)

Label classes of significantly changed genes, set up for plotting

# add class names to gene sets
ras_mus$Class="RAS"
ras_mus$Plot.order=2
classical_mus$Class="classical"
classical_mus$Plot.order=4
exocrine_mus$Class="exocrine"
exocrine_mus$Plot.order=3
# unique = set of specific genes to mark in volcano plot
unique <- data.frame(c("Sox9","Cpa1","Ptf1a","Nr5a2","Bhlha15"))
unique$Class="unique"
unique$Plot.order=5

sets <- rbind(ras_mus,classical_mus,exocrine_mus)
# get rid of extraneous variables
sets <- sets[c(2,5,6)]

# merge gene set info into RNA-seq data-set
dat2<-merge(dat,sets,all.x=T)

# set class of unlabeled genes to "other"
dat2[is.na(dat2$Class),]$Class <- "other"
dat2$Plot.order[dat2$Class == "other"] <- 1

# order data for plotting
dat2 <- dat2[order(dat2$Plot.order),]

Draw pretty plot, suitable for saving to file

#read the color palette file, assigns colors to each class
palette <- read.table("palette.txt",sep="\t",header=T,comment.char="",stringsAsFactors=F)

library(ggplot2)
theplot <- ggplot(dat2, aes(x=dat2$log.FC, y=-log10(dat2$FDR), colour=dat2$Class, size=1.5)) + geom_point() + theme_bw(15) + labs(x="log2 fold change", y="-log10 FDR") + scale_color_manual(values=palette$Color[order(palette$Class)], name="Class") + scale_size(guide="none") + coord_fixed(ratio=0.5)

theplot

Interrogating enrichment of individual classes

# dist = function for returning binomial test results of enrichment of genes in set y among up- vs. down-regulated genes in dataset x

dist <- function (x,y) {
  upreg <- x$Symbol[x$log.FC>0]
  downreg <- x$Symbol[x$log.FC<0]
  upreg.class <- x$Symbol[x$Class == y & x$log.FC>0]
  downreg.class <- x$Symbol[x$Class == y & x$log.FC<0]
  cat("Gene class:",y,"\n")
  cat("Upregulated genes:",length(upreg.class),"out of",length(unique(upreg)),"\n")
  cat("Downregulated genes:",length(downreg.class),"out of",length(unique(downreg)),"\n")
  cat("Binomial test p-value: ",prop.test(c(length(upreg.class),length(downreg.class)),c(length(unique(upreg)),length(unique(downreg))))$p.value,"\n \n")
}

# report on individual classes, excluding "other" and "unique":

with(palette[palette$Class != "other" & palette$Class != "unique",], for (i in 1:length(Class)) dist (dat2,Class[i]))
## Gene class: exocrine 
## Upregulated genes: 3 out of 1589 
## Downregulated genes: 12 out of 1505 
## Binomial test p-value:  0.02949373 
##  
## Gene class: classical 
## Upregulated genes: 8 out of 1589 
## Downregulated genes: 0 out of 1505
## Warning in prop.test(c(length(upreg.class), length(downreg.class)),
## c(length(unique(upreg)), : Chi-squared approximation may be incorrect
## Binomial test p-value:  0.01630234 
##  
## Gene class: RAS 
## Upregulated genes: 59 out of 1589 
## Downregulated genes: 29 out of 1505 
## Binomial test p-value:  0.003989242 
##