# load all required libraries (packages)
library(dplyr)
library(Seurat)
library(patchwork)
library(cellranger)
library(ggplot2)
library(gplots)
library(viridis)

# working directory
# Every data path in this script is relative, so it is resolved against the
# current working directory: start R from (or point your project at) the
# folder that contains the sample directories.
# The former call setwd(paste0(getwd())) set the directory to itself and had
# no effect, so it has been dropped.



#################
##  LOAD DATA  ##
#################

# LCL777_B958
# Provenance: Luftig Lab, B958 infection of healthy donor PBMCs
# Adjust data.dir to wherever this 10x matrix directory lives on your system.
lcl777_b958.data <- Read10X(
  data.dir = './LCL777_B958/outs/filtered_gene_bc_matrices/170310_GRCh38_and_B958/'
)
# Build the Seurat object:
#   min.cells    -- keep a gene only if it is detected in at least this many cells
#   min.features -- keep a cell only if at least this many genes are detected in it
lcl777_b958 <- CreateSeuratObject(
  counts = lcl777_b958.data,
  project = "LCL777_B958",
  min.cells = 3,
  min.features = 200
)
print(lcl777_b958)

# LCL777_M81
# Provenance: Luftig Lab, M81 infection of healthy donor PBMCs
# Adjust data.dir to wherever this 10x matrix directory lives on your system.
lcl777_aggr.data <- Read10X(
  data.dir = './AGGR_LCL777_B958_M81/outs/filtered_gene_bc_matrices_mex/170310_GRCh38_and_B958/'
)
# Only columns 1907-3798 of the aggregated matrix are used for this sample.
# NOTE(review): presumably these columns are the M81 barcodes of the B958+M81
# aggregate -- the positional range is hard-coded, so confirm it whenever the
# aggregation is regenerated.
m81.columns <- 1907:3798
lcl777_m81.data <- lcl777_aggr.data[, m81.columns]
# Build the Seurat object:
#   min.cells    -- keep a gene only if it is detected in at least this many cells
#   min.features -- keep a cell only if at least this many genes are detected in it
lcl777_m81 <- CreateSeuratObject(
  counts = lcl777_m81.data,
  project = "LCL777_M81",
  min.cells = 3,
  min.features = 200
)
print(lcl777_m81)

# LCL461_B958
# Provenance: Luftig Lab, B958 infection of healthy donor PBMCs
# Adjust data.dir to wherever this 10x matrix directory lives on your system.
lcl461_b958.data <- Read10X(
  data.dir = './LCL461_B958/outs/filtered_gene_bc_matrices/GRCh38_B958_mkref_out_170705/'
)
# Build the Seurat object:
#   min.cells    -- keep a gene only if it is detected in at least this many cells
#   min.features -- keep a cell only if at least this many genes are detected in it
lcl461_b958 <- CreateSeuratObject(
  counts = lcl461_b958.data,
  project = "LCL461_B958",
  min.cells = 3,
  min.features = 200
)
print(lcl461_b958)

# GM12878 LCL
# Provenance: Osorio et al, Scientific Data 6:112 (2019)
# https://doi.org/10.1038/s41597-019-0116-4
# Adjust data.dir to wherever this 10x matrix directory lives on your system.
gm12878.data <- Read10X(
  data.dir = './GM12878/outs/filtered_gene_bc_matrices/GM12878_GRCh38/'
)
# Build the Seurat object:
#   min.cells    -- keep a gene only if it is detected in at least this many cells
#   min.features -- keep a cell only if at least this many genes are detected in it
gm12878 <- CreateSeuratObject(
  counts = gm12878.data,
  project = "GM12878",
  min.cells = 3,
  min.features = 200
)
print(gm12878)

# GM18502 LCL
# Provenance: Osorio et al, Scientific Data 6:112 (2019)
# https://doi.org/10.1038/s41597-019-0116-4
# Adjust data.dir to wherever this 10x matrix directory lives on your system.
gm18502.data <- Read10X(
  data.dir = './GM18502/outs/filtered_gene_bc_matrices/GM18502_GRCh38/'
)
# Build the Seurat object:
#   min.cells    -- keep a gene only if it is detected in at least this many cells
#   min.features -- keep a cell only if at least this many genes are detected in it
gm18502 <- CreateSeuratObject(
  counts = gm18502.data,
  project = "GM18502",
  min.cells = 3,
  min.features = 200
)
print(gm18502)



###############################################################
##  ANALYSIS WITHOUT CELL CYCLE REGRESSION -- FOR REFERENCE  ##
###############################################################

# add percent mitochondrial gene expression as a feature to each dataset
# Mitochondrial genes are picked by a regular expression on the feature name,
# anchored at the start of the name so that only the "MT-" family matches.
lcl777_b958[["percent.mt"]] <- PercentageFeatureSet(lcl777_b958, pattern = "^MT-")
lcl777_m81[["percent.mt"]] <- PercentageFeatureSet(lcl777_m81, pattern = "^MT-")
# Feature names of lcl461_b958 carry a "GRCh38-" prefix (the script uses names
# such as "GRCh38-MT-CO1" for this sample), so the anchor must include the
# prefix. The earlier unanchored "-MT-" could match anywhere in a name and
# disagreed with the pattern used for this sample in the cell-cycle section.
lcl461_b958[["percent.mt"]] <- PercentageFeatureSet(lcl461_b958, pattern = "^GRCh38-MT-")
gm12878[["percent.mt"]] <- PercentageFeatureSet(gm12878, pattern = "^MT-")
gm18502[["percent.mt"]] <- PercentageFeatureSet(gm18502, pattern = "^MT-")

# QC violin plots, one panel per metadata column, used to choose the
# filtering thresholds applied below
qc.violin.features <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
VlnPlot(object = lcl777_b958, features = qc.violin.features, ncol = 3)
VlnPlot(object = lcl777_m81, features = qc.violin.features, ncol = 3)
VlnPlot(object = lcl461_b958, features = qc.violin.features, ncol = 3)
VlnPlot(object = gm12878, features = qc.violin.features, ncol = 3)
VlnPlot(object = gm18502, features = qc.violin.features, ncol = 3)

# QC filtering of cells; the same thresholds are applied to every sample
#   nFeature_RNA -- number of genes detected in the cell (kept if > 200 and < 65000)
#   percent.mt   -- percentage of the cell's counts coming from mitochondrial
#                   genes (kept if < 5)
lcl777_b958 <- subset(
  lcl777_b958,
  subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
)
lcl777_m81 <- subset(
  lcl777_m81,
  subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
)
lcl461_b958 <- subset(
  lcl461_b958,
  subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
)
gm12878 <- subset(
  gm12878,
  subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
)
gm18502 <- subset(
  gm18502,
  subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
)

# log-normalize each sample with a common scale factor
norm.scale.factor <- 10000
lcl777_b958 <- NormalizeData(
  lcl777_b958,
  normalization.method = "LogNormalize",
  scale.factor = norm.scale.factor
)
lcl777_m81 <- NormalizeData(
  lcl777_m81,
  normalization.method = "LogNormalize",
  scale.factor = norm.scale.factor
)
lcl461_b958 <- NormalizeData(
  lcl461_b958,
  normalization.method = "LogNormalize",
  scale.factor = norm.scale.factor
)
gm12878 <- NormalizeData(
  gm12878,
  normalization.method = "LogNormalize",
  scale.factor = norm.scale.factor
)
gm18502 <- NormalizeData(
  gm18502,
  normalization.method = "LogNormalize",
  scale.factor = norm.scale.factor
)

# feature selection for PCA: keep the n most variable genes per sample ("vst" method)
n.variable.features <- 2000
lcl777_b958 <- FindVariableFeatures(
  lcl777_b958,
  selection.method = "vst",
  nfeatures = n.variable.features
)
lcl777_m81 <- FindVariableFeatures(
  lcl777_m81,
  selection.method = "vst",
  nfeatures = n.variable.features
)
lcl461_b958 <- FindVariableFeatures(
  lcl461_b958,
  selection.method = "vst",
  nfeatures = n.variable.features
)
gm12878 <- FindVariableFeatures(
  gm12878,
  selection.method = "vst",
  nfeatures = n.variable.features
)
gm18502 <- FindVariableFeatures(
  gm18502,
  selection.method = "vst",
  nfeatures = n.variable.features
)

# scale data
# collect every gene name per sample first ...
all.genes.777_b958 <- rownames(lcl777_b958)
all.genes.777_m81 <- rownames(lcl777_m81)
all.genes.461_b958 <- rownames(lcl461_b958)
all.genes.gm12878 <- rownames(gm12878)
all.genes.gm18502 <- rownames(gm18502)
# ... then scale all genes (not only the variable ones) in each sample
lcl777_b958 <- ScaleData(object = lcl777_b958, features = all.genes.777_b958)
lcl777_m81 <- ScaleData(object = lcl777_m81, features = all.genes.777_m81)
lcl461_b958 <- ScaleData(object = lcl461_b958, features = all.genes.461_b958)
gm12878 <- ScaleData(object = gm12878, features = all.genes.gm12878)
gm18502 <- ScaleData(object = gm18502, features = all.genes.gm18502)

# PCA on the variable features selected above, one sample at a time
lcl777_b958 <- RunPCA(
  object = lcl777_b958,
  features = VariableFeatures(lcl777_b958)
)
lcl777_m81 <- RunPCA(
  object = lcl777_m81,
  features = VariableFeatures(lcl777_m81)
)
lcl461_b958 <- RunPCA(
  object = lcl461_b958,
  features = VariableFeatures(lcl461_b958)
)
gm12878 <- RunPCA(
  object = gm12878,
  features = VariableFeatures(gm12878)
)
gm18502 <- RunPCA(
  object = gm18502,
  features = VariableFeatures(gm18502)
)

# jack straw: resample (100 replicates) and then score PCs 1-20 for each sample
# (samples are processed in the same order as before)
lcl777_b958 <- ScoreJackStraw(
  JackStraw(lcl777_b958, num.replicate = 100),
  dims = 1:20
)
lcl777_m81 <- ScoreJackStraw(
  JackStraw(lcl777_m81, num.replicate = 100),
  dims = 1:20
)
lcl461_b958 <- ScoreJackStraw(
  JackStraw(lcl461_b958, num.replicate = 100),
  dims = 1:20
)
gm12878 <- ScoreJackStraw(
  JackStraw(gm12878, num.replicate = 100),
  dims = 1:20
)
gm18502 <- ScoreJackStraw(
  JackStraw(gm18502, num.replicate = 100),
  dims = 1:20
)

# jack straw plots for the scored PCs of each sample
jackstraw.plot.dims <- 1:20
JackStrawPlot(object = lcl777_b958, dims = jackstraw.plot.dims)
JackStrawPlot(object = lcl777_m81, dims = jackstraw.plot.dims)
JackStrawPlot(object = lcl461_b958, dims = jackstraw.plot.dims)
JackStrawPlot(object = gm12878, dims = jackstraw.plot.dims)
JackStrawPlot(object = gm18502, dims = jackstraw.plot.dims)



########################################### 
##  ANALYSIS WITH CELL CYCLE REGRESSION  ##
########################################### 

# Data Loading and QC 

# 777_b958
# reload the matrix, rebuild the object, apply the QC thresholds, then
# normalize with NormalizeData() defaults
lcl777_b958.data <- Read10X(
  data.dir = './LCL777_B958/outs/filtered_gene_bc_matrices/170310_GRCh38_and_B958/'
)
lcl777_b958.ccr <- CreateSeuratObject(
  counts = lcl777_b958.data,
  project = "LCL777_B958",
  min.cells = 3,
  min.features = 200
)
lcl777_b958.ccr[["percent.mt"]] <- PercentageFeatureSet(lcl777_b958.ccr, pattern = "^MT-")
lcl777_b958.ccr <- NormalizeData(
  subset(
    lcl777_b958.ccr,
    subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
  )
)

# 777_m81
# reload the aggregated matrix, keep columns 1907-3798, rebuild the object,
# apply the QC thresholds, then normalize with NormalizeData() defaults
# NOTE(review): the column range is positional and hard-coded -- presumably the
# M81 barcodes of the aggregate; confirm if the aggregation is regenerated.
lcl777_aggr.data <- Read10X(
  data.dir = './AGGR_LCL777_B958_M81/outs/filtered_gene_bc_matrices_mex/170310_GRCh38_and_B958/'
)
m81.ccr.columns <- 1907:3798
lcl777_m81.data <- lcl777_aggr.data[, m81.ccr.columns]
lcl777_m81.ccr <- CreateSeuratObject(
  counts = lcl777_m81.data,
  project = "LCL777_M81",
  min.cells = 3,
  min.features = 200
)
lcl777_m81.ccr[["percent.mt"]] <- PercentageFeatureSet(lcl777_m81.ccr, pattern = "^MT-")
lcl777_m81.ccr <- NormalizeData(
  subset(
    lcl777_m81.ccr,
    subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
  )
)

# 461_b958
# reload the matrix, rebuild the object, apply the QC thresholds, then
# normalize with NormalizeData() defaults
lcl461_b958.data <- Read10X(data.dir = './LCL461_B958/outs/filtered_gene_bc_matrices/GRCh38_B958_mkref_out_170705/')
lcl461_b958.ccr <- CreateSeuratObject(counts = lcl461_b958.data, project = "LCL461_B958", min.cells = 3, min.features = 200)
# Feature names of this sample carry a "GRCh38-" prefix (e.g. "GRCh38-MT-CO1"),
# so the mitochondrial pattern is anchored on the full prefix. The earlier
# unanchored "MT-" matched any feature name containing that substring.
lcl461_b958.ccr[["percent.mt"]] <- PercentageFeatureSet(lcl461_b958.ccr, pattern = "^GRCh38-MT-")
lcl461_b958.ccr <- subset(lcl461_b958.ccr, subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5)
lcl461_b958.ccr <- NormalizeData(lcl461_b958.ccr)

# gm12878
# reload the matrix, rebuild the object, apply the QC thresholds, then
# normalize with NormalizeData() defaults
gm12878.data <- Read10X(
  data.dir = './GM12878/outs/filtered_gene_bc_matrices/GM12878_GRCh38/'
)
gm12878.ccr <- CreateSeuratObject(
  counts = gm12878.data,
  project = "GM12878",
  min.cells = 3,
  min.features = 200
)
gm12878.ccr[["percent.mt"]] <- PercentageFeatureSet(gm12878.ccr, pattern = "^MT-")
gm12878.ccr <- NormalizeData(
  subset(
    gm12878.ccr,
    subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
  )
)

# gm18502
# reload the matrix, rebuild the object, apply the QC thresholds, then
# normalize with NormalizeData() defaults
gm18502.data <- Read10X(
  data.dir = './GM18502/outs/filtered_gene_bc_matrices/GM18502_GRCh38/'
)
gm18502.ccr <- CreateSeuratObject(
  counts = gm18502.data,
  project = "GM18502",
  min.cells = 3,
  min.features = 200
)
gm18502.ccr[["percent.mt"]] <- PercentageFeatureSet(gm18502.ccr, pattern = "^MT-")
gm18502.ccr <- NormalizeData(
  subset(
    gm18502.ccr,
    subset = nFeature_RNA > 200 & nFeature_RNA < 65000 & percent.mt < 5
  )
)


# QC violin plots of the filtered objects, one panel per metadata column
ccr.violin.features <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
VlnPlot(object = lcl777_b958.ccr, features = ccr.violin.features, ncol = 3)
VlnPlot(object = lcl777_m81.ccr, features = ccr.violin.features, ncol = 3)
VlnPlot(object = lcl461_b958.ccr, features = ccr.violin.features, ncol = 3)
VlnPlot(object = gm12878.ccr, features = ccr.violin.features, ncol = 3)
VlnPlot(object = gm18502.ccr, features = ccr.violin.features, ncol = 3)


# get gene name lists for cell cycle scoring and regression (s and g2m phases)
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes


# lcl461_b958 feature names carry a "GRCh38-" prefix, so build prefixed copies
# of both lists for that sample. paste0() is vectorized, which replaces the
# former preallocate-and-loop code (and its 1:length() index ranges).
s.genes.461 <- paste0('GRCh38-', s.genes)
g2m.genes.461 <- paste0('GRCh38-', g2m.genes)


# Feature Selection with Cell Cycle Regression
# Per sample, in this order: score cell-cycle phase, scale all genes while
# regressing out the two phase scores, select the top 2000 variable genes
# ("vst"), then run PCA on those genes.
cc.regress.vars <- c("S.Score", "G2M.Score")

# 777_b958
lcl777_b958.ccr <- CellCycleScoring(
  lcl777_b958.ccr,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
)
lcl777_b958.ccr <- ScaleData(
  lcl777_b958.ccr,
  features = rownames(lcl777_b958.ccr),
  vars.to.regress = cc.regress.vars
)
lcl777_b958.ccr <- FindVariableFeatures(
  lcl777_b958.ccr,
  selection.method = "vst",
  nfeatures = 2000
)
lcl777_b958.ccr <- RunPCA(
  lcl777_b958.ccr,
  features = VariableFeatures(lcl777_b958.ccr)
)

# 777_m81
lcl777_m81.ccr <- CellCycleScoring(
  lcl777_m81.ccr,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
)
lcl777_m81.ccr <- ScaleData(
  lcl777_m81.ccr,
  features = rownames(lcl777_m81.ccr),
  vars.to.regress = cc.regress.vars
)
lcl777_m81.ccr <- FindVariableFeatures(
  lcl777_m81.ccr,
  selection.method = "vst",
  nfeatures = 2000
)
lcl777_m81.ccr <- RunPCA(
  lcl777_m81.ccr,
  features = VariableFeatures(lcl777_m81.ccr)
)

# 461_b958 (uses the "GRCh38-"-prefixed gene lists)
lcl461_b958.ccr <- CellCycleScoring(
  lcl461_b958.ccr,
  s.features = s.genes.461,
  g2m.features = g2m.genes.461,
  set.ident = TRUE
)
lcl461_b958.ccr <- ScaleData(
  lcl461_b958.ccr,
  features = rownames(lcl461_b958.ccr),
  vars.to.regress = cc.regress.vars
)
lcl461_b958.ccr <- FindVariableFeatures(
  lcl461_b958.ccr,
  selection.method = "vst",
  nfeatures = 2000
)
lcl461_b958.ccr <- RunPCA(
  lcl461_b958.ccr,
  features = VariableFeatures(lcl461_b958.ccr)
)

# gm12878
gm12878.ccr <- CellCycleScoring(
  gm12878.ccr,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
)
gm12878.ccr <- ScaleData(
  gm12878.ccr,
  features = rownames(gm12878.ccr),
  vars.to.regress = cc.regress.vars
)
gm12878.ccr <- FindVariableFeatures(
  gm12878.ccr,
  selection.method = "vst",
  nfeatures = 2000
)
gm12878.ccr <- RunPCA(
  gm12878.ccr,
  features = VariableFeatures(gm12878.ccr)
)

# gm18502
gm18502.ccr <- CellCycleScoring(
  gm18502.ccr,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
)
gm18502.ccr <- ScaleData(
  gm18502.ccr,
  features = rownames(gm18502.ccr),
  vars.to.regress = cc.regress.vars
)
gm18502.ccr <- FindVariableFeatures(
  gm18502.ccr,
  selection.method = "vst",
  nfeatures = 2000
)
gm18502.ccr <- RunPCA(
  gm18502.ccr,
  features = VariableFeatures(gm18502.ccr)
)


# cluster cells: build the neighbor graph on the chosen PCs, then cluster it
#   dims       -- principal components used for the neighbor graph
#   resolution -- clustering granularity; higher values give more clusters
lcl777_b958.ccr <- FindClusters(
  FindNeighbors(lcl777_b958.ccr, dims = 1:20),
  resolution = 0.75
)
lcl777_m81.ccr <- FindClusters(
  FindNeighbors(lcl777_m81.ccr, dims = 1:35),
  resolution = 0.95
)
lcl461_b958.ccr <- FindClusters(
  FindNeighbors(lcl461_b958.ccr, dims = 1:35),
  resolution = 0.55
)
gm12878.ccr <- FindClusters(
  FindNeighbors(gm12878.ccr, dims = 1:15),
  resolution = 0.83
)
gm18502.ccr <- FindClusters(
  FindNeighbors(gm18502.ccr, dims = 1:15),
  resolution = 0.25
)


# tSNE embedding computed from the leading principal components of each
# cell-cycle-corrected sample
lcl777_b958.ccr <- RunTSNE(object = lcl777_b958.ccr, dims = seq_len(20))
lcl777_m81.ccr <- RunTSNE(object = lcl777_m81.ccr, dims = seq_len(35))
lcl461_b958.ccr <- RunTSNE(object = lcl461_b958.ccr, dims = seq_len(35))
# NOTE(review): gm12878 uses PCs 1-20 here but PCs 1-15 in its FindNeighbors
# calls; every other sample uses the same range for both -- confirm intended.
gm12878.ccr <- RunTSNE(object = gm12878.ccr, dims = seq_len(20))
gm18502.ccr <- RunTSNE(object = gm18502.ccr, dims = seq_len(15))


# tSNE scatter plots, colored by the current cluster identities
DimPlot(object = lcl777_b958.ccr, reduction = "tsne")
DimPlot(object = lcl777_m81.ccr, reduction = "tsne")
DimPlot(object = lcl461_b958.ccr, reduction = "tsne")
DimPlot(object = gm12878.ccr, reduction = "tsne")
DimPlot(object = gm18502.ccr, reduction = "tsne")


# UMAP as an alternative embedding; results go into separate ".U" objects so
# the tSNE objects above are left untouched
lcl777_b958.ccr.U <- RunUMAP(object = lcl777_b958.ccr, dims = seq_len(20))
lcl777_m81.ccr.U <- RunUMAP(object = lcl777_m81.ccr, dims = seq_len(35))
DimPlot(object = lcl777_b958.ccr.U, reduction = "umap")
DimPlot(object = lcl777_m81.ccr.U, reduction = "umap")


# Optional -- list top genes in clusters
# apply for n clusters
# cluster0.markers <- FindMarkers(gm18502.ccr, ident.1 = 0, min.pct = 0.25)
# write.csv(cluster0.markers, 'c0_genes.csv')
# cluster1.markers <- FindMarkers(gm18502.ccr, ident.1 = 1, min.pct = 0.25)
# write.csv(cluster1.markers, 'c1_genes.csv')
# cluster2.markers <- FindMarkers(gm18502.ccr, ident.1 = 2, min.pct = 0.25)
# write.csv(cluster2.markers, 'c2_genes.csv')
# cluster3.markers <- FindMarkers(gm18502.ccr, ident.1 = 3, min.pct = 0.25)
# write.csv(cluster3.markers, 'c3_genes.csv')
# cluster4.markers <- FindMarkers(gm18502.ccr, ident.1 = 4, min.pct = 0.25)
# write.csv(cluster4.markers, 'c4_genes.csv')
# cluster5.markers <- FindMarkers(gm18502.ccr, ident.1 = 5, min.pct = 0.25)
# write.csv(cluster5.markers, 'c5_genes.csv')
# cluster6.markers <- FindMarkers(gm12878.ccr, ident.1 = 6, min.pct = 0.25)
# write.csv(cluster6.markers, 'c6_genes.csv')
# cluster7.markers <- FindMarkers(gm12878.ccr, ident.1 = 7, min.pct = 0.25)
# write.csv(cluster7.markers, 'c7_genes.csv')
# cluster8.markers <- FindMarkers(gm12878.ccr, ident.1 = 8, min.pct = 0.25)
# write.csv(cluster8.markers, 'c8_genes.csv')
# cluster9.markers <- FindMarkers(gm12878.ccr, ident.1 = 9, min.pct = 0.25)
# write.csv(cluster9.markers, 'c9_genes.csv')
# cluster10.markers <- FindMarkers(gm12878.ccr, ident.1 = 10, min.pct = 0.25)
# write.csv(cluster10.markers, 'c10_genes.csv')
# cluster11.markers <- FindMarkers(gm12878.ccr, ident.1 = 11, min.pct = 0.25)
# write.csv(cluster11.markers, 'c11_genes.csv')
# cluster12.markers <- FindMarkers(gm12878.ccr, ident.1 = 12, min.pct = 0.25)
# write.csv(cluster12.markers, 'c12_genes.csv')
# cluster13.markers <- FindMarkers(gm12878.ccr, ident.1 = 13, min.pct = 0.25)
# write.csv(cluster13.markers, 'c13_genes.csv')
# cluster14.markers <- FindMarkers(gm12878.ccr, ident.1 = 14, min.pct = 0.25)
# write.csv(cluster14.markers, 'c14_genes.csv')



#############################
##  CODE FOR MAIN FIGURES  ##
#############################

###  FIGURE 1  ###
# 1a
# expression of the three Ig heavy chain genes on each sample's embedding
# (lcl461_b958 uses the "GRCh38-"-prefixed feature names)
fig1a.features <- c("IGHM", "IGHA1", "IGHG1")
fig1a.features.461 <- c("GRCh38-IGHM", "GRCh38-IGHA1", "GRCh38-IGHG1")
FeaturePlot(object = lcl777_b958.ccr, features = fig1a.features, ncol = 3)
FeaturePlot(object = lcl777_m81.ccr, features = fig1a.features, ncol = 3)
FeaturePlot(object = lcl461_b958.ccr, features = fig1a.features.461, ncol = 3)
FeaturePlot(object = gm12878.ccr, features = fig1a.features, ncol = 3)
FeaturePlot(object = gm18502.ccr, features = fig1a.features, ncol = 3)

# 1b
# one table per pie chart; every table has the same four groups and a
# hard-coded value for each of them
pie.groups <- c("IgM", "IgA", "IgG", "Null")
pie1 <- data.frame(group = pie.groups, value = c(69.2, 7.24, 23.55, 0.0))
pie2 <- data.frame(group = pie.groups, value = c(1.0, 35.0, 64.0, 0.0))
pie3 <- data.frame(group = pie.groups, value = c(0, 0, 81.7, 18.3))
pie4 <- data.frame(group = pie.groups, value = c(18.2, 73.2, 6.0, 2.6))
pie5 <- data.frame(group = pie.groups, value = c(3.0, 1.0, 94.3, 1.7))

# ring chart for pie1: a single stacked bar at x = 3 wrapped into polar
# coordinates; the x limits start below the bar, which leaves the hole.
# Only pie1 is drawn here; swap in pie2..pie5 to draw the other tables.
ggplot(data = pie1, mapping = aes(x = 3, y = value, fill = group)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y", start = 0) +
  xlim(0.55, 3.5) +
  theme_void() +
  theme(
    legend.text = element_text(size = 16),
    legend.title = element_blank()
  )

# 1c
# re-cluster every sample at a lower resolution than the initial clustering
# (same PCs for the neighbor graph as before)
lcl777_b958.ccr <- FindClusters(
  FindNeighbors(lcl777_b958.ccr, dims = 1:20),
  resolution = 0.3
)
lcl777_m81.ccr <- FindClusters(
  FindNeighbors(lcl777_m81.ccr, dims = 1:35),
  resolution = 0.2
)
lcl461_b958.ccr <- FindClusters(
  FindNeighbors(lcl461_b958.ccr, dims = 1:35),
  resolution = 0.3
)
gm12878.ccr <- FindClusters(
  FindNeighbors(gm12878.ccr, dims = 1:15),
  resolution = 0.05
)
gm18502.ccr <- FindClusters(
  FindNeighbors(gm18502.ccr, dims = 1:15),
  resolution = 0.07
)

# DimPlot(lcl777_b958.ccr, reduction = "tsne")
# DimPlot(lcl777_m81.ccr, reduction = "tsne")
# DimPlot(lcl461_b958.ccr, reduction = "tsne")
# DimPlot(gm12878.ccr, reduction = "tsne")
# DimPlot(gm18502.ccr, reduction = "tsne")

# DotPlot(lcl777_b958.ccr, features = rev(c("IGHM", "IGHA1", "IGHG1")), scale.min=0, scale.max=100, dot.min=0) + RotatedAxis()
# DotPlot(lcl777_m81.ccr, features = rev(c("IGHM", "IGHA1", "IGHG1")), scale.min=0, scale.max=100, dot.min=0) + RotatedAxis()
# DotPlot(lcl461_b958.ccr, features = rev(c("GRCh38-IGHM", "GRCh38-IGHA1", "GRCh38-IGHG1")), scale.min=0, scale.max=100, dot.min=0) + RotatedAxis()
# DotPlot(gm12878.ccr, features = rev(c("IGHM", "IGHA1", "IGHG1")), scale.min=0, scale.max=100, dot.min=0) + RotatedAxis()
# DotPlot(gm18502.ccr, features = rev(c("IGHM", "IGHA1", "IGHG1")), scale.min=0, scale.max=100, dot.min=0) + RotatedAxis()

# use the following to count the number of cells in clusters for percentage of phenotype in sample
# EXAMPLE:
# table_1 = table(lcl777_b958.ccr@active.ident)
# sum from clusters of interest / sum(table_1)


###  FIGURE 2  ###
# 2a,b
# three marker gene sets; for each one a per-cell score is stored as metadata
Ig_markers <- c("IGHM", "IGHA1", "IGHG1")
kB_markers <- c("NFKB2", "NFKBIA", "EBI3", "ICAM1", "BCL2A1", "TXN")
diff_markers <- c("TNFRSF17", "XBP1", "MZB1", "CD27", "CD38")

# lcl777_b958: score = column sum over the set's genes divided by the column
# sum over all genes (a fraction per cell, despite the "_pct" suffix)
total.777_b958 <- Matrix::colSums(lcl777_b958.ccr)
Ig_markers_pct <- Matrix::colSums(lcl777_b958.ccr[Ig_markers, ]) / total.777_b958
kB_markers_pct <- Matrix::colSums(lcl777_b958.ccr[kB_markers, ]) / total.777_b958
diff_markers_pct <- Matrix::colSums(lcl777_b958.ccr[diff_markers, ]) / total.777_b958
lcl777_b958.ccr <- AddMetaData(lcl777_b958.ccr, metadata = Ig_markers_pct, col.name = "Ig_markers")
lcl777_b958.ccr <- AddMetaData(lcl777_b958.ccr, metadata = kB_markers_pct, col.name = "kB_markers")
lcl777_b958.ccr <- AddMetaData(lcl777_b958.ccr, metadata = diff_markers_pct, col.name = "diff_markers")
# two-channel blended plots of the scores
FeaturePlot(
  lcl777_b958.ccr,
  features = c("Ig_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#ec31e5", "#46cb18")
)
FeaturePlot(
  lcl777_b958.ccr,
  features = c("diff_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#cc5500", "#46cb18")
)

# lcl777_m81: same scores, reusing the marker sets defined above
# (set column sum / all-gene column sum, per cell)
total.777_m81 <- Matrix::colSums(lcl777_m81.ccr)
Ig_markers_pct <- Matrix::colSums(lcl777_m81.ccr[Ig_markers, ]) / total.777_m81
kB_markers_pct <- Matrix::colSums(lcl777_m81.ccr[kB_markers, ]) / total.777_m81
diff_markers_pct <- Matrix::colSums(lcl777_m81.ccr[diff_markers, ]) / total.777_m81
lcl777_m81.ccr <- AddMetaData(lcl777_m81.ccr, metadata = Ig_markers_pct, col.name = "Ig_markers")
lcl777_m81.ccr <- AddMetaData(lcl777_m81.ccr, metadata = kB_markers_pct, col.name = "kB_markers")
lcl777_m81.ccr <- AddMetaData(lcl777_m81.ccr, metadata = diff_markers_pct, col.name = "diff_markers")
FeaturePlot(
  lcl777_m81.ccr,
  features = c("Ig_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#ec31e5", "#46cb18")
)
FeaturePlot(
  lcl777_m81.ccr,
  features = c("diff_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#cc5500", "#46cb18")
)

# lcl461_b958: the marker sets are redefined with the "GRCh38-" prefix used by
# this sample's feature names
Ig_markers <- c("GRCh38-IGHM", "GRCh38-IGHA1", "GRCh38-IGHG1")
kB_markers <- c("GRCh38-NFKB2", "GRCh38-NFKBIA", "GRCh38-EBI3", "GRCh38-ICAM1", "GRCh38-BCL2A1", "GRCh38-TXN")
diff_markers <- c("GRCh38-TNFRSF17", "GRCh38-XBP1", "GRCh38-MZB1", "GRCh38-CD27", "GRCh38-CD38")
# per-cell score: set column sum / all-gene column sum
total.461_b958 <- Matrix::colSums(lcl461_b958.ccr)
Ig_markers_pct <- Matrix::colSums(lcl461_b958.ccr[Ig_markers, ]) / total.461_b958
kB_markers_pct <- Matrix::colSums(lcl461_b958.ccr[kB_markers, ]) / total.461_b958
diff_markers_pct <- Matrix::colSums(lcl461_b958.ccr[diff_markers, ]) / total.461_b958
lcl461_b958.ccr <- AddMetaData(lcl461_b958.ccr, metadata = Ig_markers_pct, col.name = "Ig_markers")
lcl461_b958.ccr <- AddMetaData(lcl461_b958.ccr, metadata = kB_markers_pct, col.name = "kB_markers")
lcl461_b958.ccr <- AddMetaData(lcl461_b958.ccr, metadata = diff_markers_pct, col.name = "diff_markers")
# NOTE(review): the first channel here is the single gene GRCh38-IGHG1, not the
# "Ig_markers" score used for the 777 samples -- presumably deliberate; confirm.
FeaturePlot(
  lcl461_b958.ccr,
  features = c("GRCh38-IGHG1", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#ec31e5", "#46cb18")
)
FeaturePlot(
  lcl461_b958.ccr,
  features = c("diff_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#cc5500", "#46cb18")
)

# gm12878: back to the unprefixed marker sets
Ig_markers <- c("IGHM", "IGHA1", "IGHG1")
kB_markers <- c("NFKB2", "NFKBIA", "EBI3", "ICAM1", "BCL2A1", "TXN")
diff_markers <- c("TNFRSF17", "XBP1", "MZB1", "CD27", "CD38")
# per-cell score: set column sum / all-gene column sum
total.gm12878 <- Matrix::colSums(gm12878.ccr)
Ig_markers_pct <- Matrix::colSums(gm12878.ccr[Ig_markers, ]) / total.gm12878
kB_markers_pct <- Matrix::colSums(gm12878.ccr[kB_markers, ]) / total.gm12878
diff_markers_pct <- Matrix::colSums(gm12878.ccr[diff_markers, ]) / total.gm12878
gm12878.ccr <- AddMetaData(gm12878.ccr, metadata = Ig_markers_pct, col.name = "Ig_markers")
gm12878.ccr <- AddMetaData(gm12878.ccr, metadata = kB_markers_pct, col.name = "kB_markers")
gm12878.ccr <- AddMetaData(gm12878.ccr, metadata = diff_markers_pct, col.name = "diff_markers")
FeaturePlot(
  gm12878.ccr,
  features = c("Ig_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#ec31e5", "#46cb18")
)
FeaturePlot(
  gm12878.ccr,
  features = c("diff_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#cc5500", "#46cb18")
)

# gm18502: same scores, reusing the unprefixed marker sets defined above
# (set column sum / all-gene column sum, per cell)
total.gm18502 <- Matrix::colSums(gm18502.ccr)
Ig_markers_pct <- Matrix::colSums(gm18502.ccr[Ig_markers, ]) / total.gm18502
kB_markers_pct <- Matrix::colSums(gm18502.ccr[kB_markers, ]) / total.gm18502
diff_markers_pct <- Matrix::colSums(gm18502.ccr[diff_markers, ]) / total.gm18502
gm18502.ccr <- AddMetaData(gm18502.ccr, metadata = Ig_markers_pct, col.name = "Ig_markers")
gm18502.ccr <- AddMetaData(gm18502.ccr, metadata = kB_markers_pct, col.name = "kB_markers")
gm18502.ccr <- AddMetaData(gm18502.ccr, metadata = diff_markers_pct, col.name = "diff_markers")
# NOTE(review): the first channel here is the single gene IGHG1, not the
# "Ig_markers" score -- presumably deliberate; confirm.
FeaturePlot(
  gm18502.ccr,
  features = c("IGHG1", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#ec31e5", "#46cb18")
)
FeaturePlot(
  gm18502.ccr,
  features = c("diff_markers", "kB_markers"),
  blend = TRUE, blend.threshold = 0.001,
  cols = c("#cc5500", "#46cb18")
)

# 2c
# Plot a clustered heatmap of the pairwise Pearson correlations between genes.
#
# Arguments:
#   seurat_obj  Seurat object; raw counts are read from
#               seurat_obj@assays$RNA@counts.
#   gene_names  Character vector of feature names; each must be a row name of
#               that counts matrix. Also used as the row/column labels.
# Returns:
#   The value of gplots::heatmap.2(), invisibly. Called for the plot it draws.
# Errors:
#   Stops with the list of offending names if any gene is not in the counts.
make_gene_corrcoef_map <- function(seurat_obj, gene_names){

  counts <- seurat_obj@assays$RNA@counts

  # fail early, naming the genes, instead of erroring deep inside subsetting
  missing_genes <- setdiff(gene_names, rownames(counts))
  if (length(missing_genes) > 0) {
    stop("genes not found in the RNA counts: ",
         paste(missing_genes, collapse = ", "), call. = FALSE)
  }

  # genes x cells matrix of raw counts, rows in the order of gene_names
  count_matrix <- as.matrix(counts[gene_names, , drop = FALSE])

  # cor() correlates columns, so transpose to get gene-vs-gene correlations
  # in one call rather than one cor() call per pair of genes
  pearson_matrix <- cor(t(count_matrix))

  # heatmap.2 draws with base graphics; put the caller's margins back on exit
  old_par <- par(mar = c(1,1,1,1))
  on.exit(par(old_par), add = TRUE)

  # plot the resulting correlation map
  heatmap_result <- gplots::heatmap.2(
    pearson_matrix, labRow = gene_names, labCol = gene_names,
    cexRow = 0.9, cexCol = 0.9, col = viridis::magma(50), tracecol=NA,
    keysize = 1, key.title= "", key.xlab = "Pearson R", key.ylab = "",
    key.ytickfun = ""
  )
  invisible(heatmap_result)
}

# list of genes for correlation coefficient heatmap
gene_names <- c("NFKB2", "NFKBIA", "REL", "ICAM1", "EBI3", "PRDM1", "XBP1", "CD27", "CD38", "TNFRSF17")
# same genes with the "GRCh38-" prefix used by the lcl461_b958 feature names;
# paste0() is vectorized, so no preallocation or index loop is needed
gene_names.461 <- paste0('GRCh38-', gene_names)

# draw one correlation heatmap per sample
# (lcl461_b958 needs the "GRCh38-"-prefixed gene names)
make_gene_corrcoef_map(seurat_obj = lcl777_b958.ccr, gene_names = gene_names)
make_gene_corrcoef_map(seurat_obj = lcl777_m81.ccr, gene_names = gene_names)
make_gene_corrcoef_map(seurat_obj = lcl461_b958.ccr, gene_names = gene_names.461)
make_gene_corrcoef_map(seurat_obj = gm12878.ccr, gene_names = gene_names)
make_gene_corrcoef_map(seurat_obj = gm18502.ccr, gene_names = gene_names)

# 2d
# blended plots of two Ig heavy chain genes per sample
fig2d.ig.cols <- c("#62c2cc", "#820c7e")
FeaturePlot(lcl777_b958.ccr, features = c("IGHM", "IGHG1"), blend = TRUE, cols = fig2d.ig.cols)
FeaturePlot(lcl777_m81.ccr, features = c("IGHA1", "IGHG1"), blend = TRUE, cols = fig2d.ig.cols)
FeaturePlot(gm12878.ccr, features = c("IGHM", "IGHG1"), blend = TRUE, cols = fig2d.ig.cols)

# blended plots of the "diff_markers" and "kB_markers" metadata scores
# (these columns must already have been added to the objects)
fig2d.score.features <- c("diff_markers", "kB_markers")
fig2d.score.cols <- c("#cc5500", "#46cb18")
FeaturePlot(
  lcl777_b958.ccr,
  features = fig2d.score.features,
  blend = TRUE, blend.threshold = 0.001,
  cols = fig2d.score.cols
)
FeaturePlot(
  lcl777_m81.ccr,
  features = fig2d.score.features,
  blend = TRUE, blend.threshold = 0.001,
  cols = fig2d.score.cols
)
FeaturePlot(
  gm12878.ccr,
  features = fig2d.score.features,
  blend = TRUE, blend.threshold = 0.001,
  cols = fig2d.score.cols
)


###  FIGURE 3  ###
# 3a
# re-cluster the two 777 samples at the higher resolutions, then plot on tSNE
lcl777_b958.ccr <- FindClusters(
  FindNeighbors(lcl777_b958.ccr, dims = 1:20),
  resolution = 0.75
)
lcl777_m81.ccr <- FindClusters(
  FindNeighbors(lcl777_m81.ccr, dims = 1:35),
  resolution = 0.95
)

DimPlot(object = lcl777_b958.ccr, reduction = "tsne")
DimPlot(object = lcl777_m81.ccr, reduction = "tsne")

# 3b
# Copy each object and relabel its identities: the last level becomes "Lytic",
# every level before it "Latent". The label vectors have a fixed length
# (8 for B958, 10 for M81), so they assume exactly that many identity levels.
c7_b985.ccr <- lcl777_b958.ccr
new.cluster.ids.b958 <- c(rep("Latent", 7), "Lytic")
names(new.cluster.ids.b958) <- levels(c7_b985.ccr)
c7_b985.ccr <- RenameIdents(c7_b985.ccr, new.cluster.ids.b958)
DimPlot(c7_b985.ccr, reduction = "tsne", pt.size = 0.75) +
  theme(legend.text = element_text(size = 20))

c9_m81.ccr <- lcl777_m81.ccr
new.cluster.ids.m81 <- c(rep("Latent", 9), "Lytic")
names(new.cluster.ids.m81) <- levels(c9_m81.ccr)
c9_m81.ccr <- RenameIdents(c9_m81.ccr, new.cluster.ids.m81)
DimPlot(c9_m81.ccr, reduction = "tsne", pt.size = 0.75) +
  theme(legend.text = element_text(size = 20))

# markers of the cluster treated as lytic in each 777 sample
# (identity 7 in B958, identity 9 in M81)
b958.lytic.host.markers <- FindMarkers(
  object = lcl777_b958.ccr,
  ident.1 = 7,
  min.pct = 0.25
)
m81.lytic.host.markers <- FindMarkers(
  object = lcl777_m81.ccr,
  ident.1 = 9,
  min.pct = 0.25
)

# 3c
# expression of four viral genes on the embeddings of the two 777 samples
viral_lytic_genes <- c("BHRF1", "BLRF1", "BALF1", "BARF1")
FeaturePlot(object = lcl777_b958.ccr, features = viral_lytic_genes, ncol = 2)
FeaturePlot(object = lcl777_m81.ccr, features = viral_lytic_genes, ncol = 2)

# 3d
# host genes compared between the relabeled ("Latent" / "Lytic") identities
host_lytic_genes <- c("IGHM", "IGHG1", "IGHA1", "SFN", "MIER2", "NFATC1", "NHLH1", "SGK1")
# DotPlot(c7_b985.ccr, features = rev(host_lytic_genes)) + RotatedAxis()
# DotPlot(c9_m81.ccr, features = rev(host_lytic_genes)) + RotatedAxis()

# violin plots, genes shown in reverse list order
host_lytic_genes.rev <- rev(host_lytic_genes)
VlnPlot(object = c7_b985.ccr, features = host_lytic_genes.rev, ncol = 4, pt.size = 0.5)
VlnPlot(object = c9_m81.ccr, features = host_lytic_genes.rev, ncol = 4, pt.size = 0.5)


###  FIGURE 4  ###
# 4a
# re-cluster the remaining three samples at the higher resolutions, then plot
lcl461_b958.ccr <- FindClusters(
  FindNeighbors(lcl461_b958.ccr, dims = 1:35),
  resolution = 0.55
)
gm12878.ccr <- FindClusters(
  FindNeighbors(gm12878.ccr, dims = 1:15),
  resolution = 0.83
)
gm18502.ccr <- FindClusters(
  FindNeighbors(gm18502.ccr, dims = 1:15),
  resolution = 0.25
)

DimPlot(object = lcl461_b958.ccr, reduction = "tsne")
DimPlot(object = gm12878.ccr, reduction = "tsne")
DimPlot(object = gm18502.ccr, reduction = "tsne")

# 4b
# mitochondrial percentage of every cell, shown on each sample's embedding
FeaturePlot(object = lcl461_b958.ccr, features = "percent.mt")
FeaturePlot(object = gm12878.ccr, features = "percent.mt")
FeaturePlot(object = gm18502.ccr, features = "percent.mt")

# 4c
# markers of one cluster per sample, each written to its own CSV file
# (cluster 6 of lcl461_b958, cluster 14 of gm12878, cluster 5 of gm18502)
cluster6.markers <- FindMarkers(lcl461_b958.ccr, ident.1 = 6, min.pct = 0.25)
write.csv(cluster6.markers, 'c6_genes.csv')
cluster14.markers <- FindMarkers(gm12878.ccr, ident.1 = 14, min.pct = 0.25)
write.csv(cluster14.markers, 'c14_genes.csv')
cluster5.markers <- FindMarkers(gm18502.ccr, ident.1 = 5, min.pct = 0.25)
# write the gm18502 result; this line used to write cluster6.markers, so
# c5_genes.csv ended up as a copy of c6_genes.csv
write.csv(cluster5.markers, 'c5_genes.csv')

# genes shown in the Figure 4 dot plots
fig_4_genes <- c("MT-CO1", "MT-CO2", "MT-ND1", "MT-ND2", "MALAT1", "CD19", "MS4A1", "PTPRC", "CD74", "HLA-A", "ACTB", "TUBB", "PKM", "ENO1", "PSMA1", "HSP90AB1", "LDHA", "PPIA", "TXN", "PRDX1")
# same genes with the "GRCh38-" prefix used by the lcl461_b958 feature names;
# paste0() is vectorized, so no preallocation or index loop is needed
fig_4_genes.461 <- paste0('GRCh38-', fig_4_genes)

# Copy each object and collapse its identities to two labels: the last level
# becomes "1", every level before it "0". The label vectors have a fixed
# length (7, 15 and 6), so they assume exactly that many identity levels.
c6_461.ccr <- lcl461_b958.ccr
new.cluster.ids.461 <- c(rep("0", 6), "1")
names(new.cluster.ids.461) <- levels(c6_461.ccr)
c6_461.ccr <- RenameIdents(c6_461.ccr, new.cluster.ids.461)
DimPlot(object = c6_461.ccr, reduction = "tsne", label = TRUE, pt.size = 0.5)

c14_12878.ccr <- gm12878.ccr
new.cluster.ids.12878 <- c(rep("0", 14), "1")
names(new.cluster.ids.12878) <- levels(c14_12878.ccr)
c14_12878.ccr <- RenameIdents(c14_12878.ccr, new.cluster.ids.12878)
DimPlot(object = c14_12878.ccr, reduction = "tsne", label = TRUE, pt.size = 0.5)

c5_18502.ccr <- gm18502.ccr
new.cluster.ids.18502 <- c(rep("0", 5), "1")
names(new.cluster.ids.18502) <- levels(c5_18502.ccr)
c5_18502.ccr <- RenameIdents(c5_18502.ccr, new.cluster.ids.18502)
DimPlot(object = c5_18502.ccr, reduction = "tsne", label = TRUE, pt.size = 0.5)

# 4d
# dot plots of the Figure 4 genes (reverse list order) for the two-label
# objects; lcl461_b958 uses the "GRCh38-"-prefixed names
dot_plot_feats.461 <- rev(fig_4_genes.461)
dot_plot_feats.12878 <- rev(fig_4_genes)
dot_plot_feats.18502 <- rev(fig_4_genes)
DotPlot(object = c6_461.ccr, features = dot_plot_feats.461) +
  RotatedAxis()
DotPlot(object = c14_12878.ccr, features = dot_plot_feats.12878) +
  RotatedAxis()
DotPlot(object = c5_18502.ccr, features = dot_plot_feats.18502) +
  RotatedAxis()


###  SEE PYTHON CODE ("ig_evo_sim.py") FOR SIMULATION USED TO GENERATE FIGURE 5 PLOTS  ###



######################################
##  CODE FOR SUPPLEMENTARY FIGURES  ##
######################################

###  FIGURE S1  ###
# merge the five samples into one object; add.cell.ids supplies a per-sample
# prefix for the cell names
all.lcls <- merge(
  lcl777_b958.ccr,
  y = c(lcl777_m81.ccr, lcl461_b958.ccr, gm12878.ccr, gm18502.ccr),
  add.cell.ids = c("777_B958", "777_M81", "461_B958", "GM12878", "GM18502"),
  project = "LCLs"
)

# violin plots of the three QC values for the merged object
VlnPlot(
  all.lcls,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3,
  pt.size = 0.15
)


###  FIGURE S2  ###
# ridge plots of the same four genes in every sample; the LCL461_B958 names
# are the same genes with a "GRCh38-" prefix
fig_s2_genes <- c("PCNA", "TOP2A", "MCM6", "MKI67")
fig_s2_genes.461 <- paste0("GRCh38-", fig_s2_genes)

RidgePlot(lcl777_b958.ccr, features = fig_s2_genes, ncol = 2)
RidgePlot(lcl777_m81.ccr, features = fig_s2_genes, ncol = 2)
RidgePlot(lcl461_b958.ccr, features = fig_s2_genes.461, ncol = 2)
RidgePlot(gm12878.ccr, features = fig_s2_genes, ncol = 2)
RidgePlot(gm18502.ccr, features = fig_s2_genes, ncol = 2)


###  FIGURE S3  ###
# elbow plots for PCA usage
# (one plot per sample, each showing 40 dimensions)
ElbowPlot(lcl777_b958.ccr, ndims = 40)
ElbowPlot(lcl777_m81.ccr, ndims = 40)
ElbowPlot(lcl461_b958.ccr, ndims = 40)
ElbowPlot(gm12878.ccr, ndims = 40)
ElbowPlot(gm18502.ccr, ndims = 40)
# compute jack straw scores for PCAs
# For each sample: JackStraw runs with 100 replicates, then ScoreJackStraw
# scores dimensions 1-20. Both results are assigned back to the same object
# so that JackStrawPlot below can read them.
lcl777_b958.ccr <- JackStraw(lcl777_b958.ccr, num.replicate = 100)
lcl777_b958.ccr <- ScoreJackStraw(lcl777_b958.ccr, dims = 1:20)
lcl777_m81.ccr <- JackStraw(lcl777_m81.ccr, num.replicate = 100)
lcl777_m81.ccr <- ScoreJackStraw(lcl777_m81.ccr, dims = 1:20)
lcl461_b958.ccr <- JackStraw(lcl461_b958.ccr, num.replicate = 100)
lcl461_b958.ccr <- ScoreJackStraw(lcl461_b958.ccr, dims = 1:20)
gm12878.ccr <- JackStraw(gm12878.ccr, num.replicate = 100)
gm12878.ccr <- ScoreJackStraw(gm12878.ccr, dims = 1:20)
gm18502.ccr <- JackStraw(gm18502.ccr, num.replicate = 100)
gm18502.ccr <- ScoreJackStraw(gm18502.ccr, dims = 1:20)
# plot jack straw for each sample
# (same dimension range, 1-20, that was scored above)
JackStrawPlot(lcl777_b958.ccr, dims = 1:20)
JackStrawPlot(lcl777_m81.ccr, dims = 1:20)
JackStrawPlot(lcl461_b958.ccr, dims = 1:20)
JackStrawPlot(gm12878.ccr, dims = 1:20)
JackStrawPlot(gm18502.ccr, dims = 1:20)


###  FIGURE S4-S8  ###
# four feature panels that are plotted for every sample
master_features_1 <- c(
  "CD19", "MS4A1", "CR2", "CD27", "CD38", "CD40",
  "PTPRC", "CD74", "CD80", "TNFRSF17", "XBP1", "MZB1"
)
master_features_2 <- c(
  "TXN", "NFKB2", "NFKBIA", "ICAM1", "ICAM2", "EBI3",
  "TRAF1", "FCER2", "HLA-A", "HLA-DRA", "HLA-DRB1", "HLA-DOA"
)
master_features_3 <- c(
  "IGHD", "IGHM", "IGHA1", "IGHA2", "IGHG1", "IGHG2",
  "IGHG3", "IGHG4", "IGKC", "IGLC2", "IGLC3", "IGLC5"
)
master_features_4 <- c(
  "BHRF1", "BHLF1", "BARF1", "BALF1",
  "NFATC1", "MIER2", "SFN", "SGK1"
)

# LCL461_B958 versions of panels 1-3: every name gets a "GRCh38-" prefix
master_features_1.461 <- paste0("GRCh38-", master_features_1)
master_features_2.461 <- paste0("GRCh38-", master_features_2)
master_features_3.461 <- paste0("GRCh38-", master_features_3)

# replace IGHD --> IGDCC4 for 461 (No IgD in this expression dataset)
master_features_3.461[1] <- "GRCh38-IGDCC4"

# panel 4: the first four names take the "B958---" prefix, the remaining
# names take "GRCh38-"
master_features_4.461 <- paste0(
  ifelse(seq_along(master_features_4) < 5, "B958---", "GRCh38-"),
  master_features_4
)

# master gene expression profiles - plot 1
FeaturePlot(lcl777_b958.ccr, features = master_features_1)
FeaturePlot(lcl777_m81.ccr, features = master_features_1)
FeaturePlot(lcl461_b958.ccr, features = master_features_1.461)
FeaturePlot(gm12878.ccr, features = master_features_1)
FeaturePlot(gm18502.ccr, features = master_features_1)

# master gene expression profiles - plot 2
FeaturePlot(lcl777_b958.ccr, features = master_features_2)
FeaturePlot(lcl777_m81.ccr, features = master_features_2)
FeaturePlot(lcl461_b958.ccr, features = master_features_2.461)
FeaturePlot(gm12878.ccr, features = master_features_2)
FeaturePlot(gm18502.ccr, features = master_features_2)

# master gene expression profiles - plot 3
FeaturePlot(lcl777_b958.ccr, features = master_features_3)
FeaturePlot(lcl777_m81.ccr, features = master_features_3)
FeaturePlot(lcl461_b958.ccr, features = master_features_3.461)
FeaturePlot(gm12878.ccr, features = master_features_3)
FeaturePlot(gm18502.ccr, features = master_features_3)

# master gene expression profiles - plot 4 (four panels per row)
FeaturePlot(lcl777_b958.ccr, features = master_features_4, ncol = 4)
FeaturePlot(lcl777_m81.ccr, features = master_features_4, ncol = 4)
FeaturePlot(lcl461_b958.ccr, features = master_features_4.461, ncol = 4)
FeaturePlot(gm12878.ccr, features = master_features_4, ncol = 4)
FeaturePlot(gm18502.ccr, features = master_features_4, ncol = 4)


###  FIGURE S9-S13  ###
# see function "make_gene_corrcoef_map" on lines 441-466

# input gene list
mega_genes <- c(
  "CD19", "MS4A1", "CR2", "CD27", "CD38", "CD40", "PTPRC", "CD74", "CD80",
  "TNFRSF17", "XBP1", "MZB1", "TXN", "NFKB2", "NFKBIA", "REL", "RELA", "RELB",
  "EBI3", "TRAF1", "FCER2", "HLA-A", "HLA-B", "HLA-C", "HLA-DRB1", "HLA-DOA",
  "IGHM", "IGHA1", "IGHG1", "IGHG2", "IGHG3", "IGHG4", "IGKC", "IGLC2",
  "IGLC3", "IGLC5", "NFATC1", "MIER2", "PRDM1", "PRDX1", "PKM", "LDHA",
  "ENO1", "HSP90AB1", "BCL2", "BCL2L1", "BCL2A1", "MCL1", "MTHFD2", "RUNX3",
  "RGS1", "ATP1B1", "PTK2B", "PAX5", "MYC", "CIITA", "ID3", "SPIB", "IRF8",
  "IRF4"
)
# same list with the "GRCh38-" prefix used for LCL461_B958 feature names
mega_genes.461 <- paste0("GRCh38-", mega_genes)

# function call for each dataset
make_gene_corrcoef_map(lcl777_b958.ccr, mega_genes)
make_gene_corrcoef_map(lcl777_m81.ccr, mega_genes)
make_gene_corrcoef_map(lcl461_b958.ccr, mega_genes.461)
make_gene_corrcoef_map(gm12878.ccr, mega_genes)
make_gene_corrcoef_map(gm18502.ccr, mega_genes)


###  FIGURE S14-S18  ###
# heatmap of dimension 8 for each sample, using 1000 cells and balanced = TRUE
# NOTE(review): the LCL461_B958 call passes ncol=2 while the other four pass
# ncol=1 -- confirm this difference is intentional
DimHeatmap(lcl777_b958.ccr, dims = 8, cells = 1000, balanced = TRUE, ncol=1)
DimHeatmap(lcl777_m81.ccr, dims = 8, cells = 1000, balanced = TRUE, ncol=1)
DimHeatmap(lcl461_b958.ccr, dims = 8, cells = 1000, balanced = TRUE, ncol=2)
DimHeatmap(gm12878.ccr, dims = 8, cells = 1000, balanced = TRUE, ncol=1)
DimHeatmap(gm18502.ccr, dims = 8, cells = 1000, balanced = TRUE, ncol=1)


###  FIGURE S19  ###
# S19a-b
# two gene sets plotted per sample; the ".461" vectors are the same genes
# with the "GRCh38-" prefix used for LCL461_B958
fig_s19a_genes <- c("CD27", "TNFRSF17", "XBP1", "MZB1", "PRDM1")
fig_s19b_genes <- c("NFKB2", "NFKBIA", "EBI3", "TXN")
fig_s19a_genes.461 <- paste0("GRCh38-", fig_s19a_genes)
fig_s19b_genes.461 <- paste0("GRCh38-", fig_s19b_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s19a_genes, ncol = 5)
FeaturePlot(lcl777_b958.ccr, features = fig_s19b_genes, ncol = 4)
FeaturePlot(lcl777_m81.ccr, features = fig_s19a_genes, ncol = 5)
FeaturePlot(lcl777_m81.ccr, features = fig_s19b_genes, ncol = 4)
FeaturePlot(lcl461_b958.ccr, features = fig_s19a_genes.461, ncol = 5)
FeaturePlot(lcl461_b958.ccr, features = fig_s19b_genes.461, ncol = 4)
FeaturePlot(gm12878.ccr, features = fig_s19a_genes, ncol = 5)
FeaturePlot(gm12878.ccr, features = fig_s19b_genes, ncol = 4)
FeaturePlot(gm18502.ccr, features = fig_s19a_genes, ncol = 5)
FeaturePlot(gm18502.ccr, features = fig_s19b_genes, ncol = 4)

# S19c
# scatter plots of IGHG1 against each comparison gene in GM18502
plot1 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "NFKB2")
plot2 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "NFKBIA")
plot3 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "EBI3")
plot4 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "TXN")
plot5 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "CD27")
plot6 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "TNFRSF17")
plot7 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "XBP1")
plot8 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "MZB1")
plot9 <- FeatureScatter(gm18502.ccr, feature1 = "IGHG1", feature2 = "PRDM1")
# Arrange the panels in single rows with patchwork, which is attached at the
# top of this script. grid.arrange() belongs to gridExtra, which is not among
# the libraries attached there, so the previous calls could fail with
# "could not find function".
wrap_plots(plot1, plot2, plot3, plot4, ncol = 4)
wrap_plots(plot5, plot6, plot7, plot8, plot9, ncol = 5)


###  FIGURE S20  ###
# six genes per sample in one row; LCL461_B958 uses the "GRCh38-" prefixed names
fig_s20_genes <- c("TXN", "PRDX1", "PKM", "LDHA", "ENO1", "HSP90AB1")
fig_s20_genes.461 <- paste0("GRCh38-", fig_s20_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s20_genes, ncol = 6)
FeaturePlot(lcl777_m81.ccr, features = fig_s20_genes, ncol = 6)
FeaturePlot(lcl461_b958.ccr, features = fig_s20_genes.461, ncol = 6)
FeaturePlot(gm12878.ccr, features = fig_s20_genes, ncol = 6)
FeaturePlot(gm18502.ccr, features = fig_s20_genes, ncol = 6)


###  FIGURE S21  ###
# REL, RELA and RELB per sample; LCL461_B958 uses the "GRCh38-" prefixed names
fig_s21_genes <- c("REL", "RELA", "RELB")
fig_s21_genes.461 <- paste0("GRCh38-", fig_s21_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s21_genes, ncol = 3)
FeaturePlot(lcl777_m81.ccr, features = fig_s21_genes, ncol = 3)
FeaturePlot(lcl461_b958.ccr, features = fig_s21_genes.461, ncol = 3)
FeaturePlot(gm12878.ccr, features = fig_s21_genes, ncol = 3)
FeaturePlot(gm18502.ccr, features = fig_s21_genes, ncol = 3)


###  FIGURE S22  ###
# five genes per sample in one row; LCL461_B958 uses the "GRCh38-" prefixed names
fig_s22_genes <- c("BCL2", "BCL2L1", "BCL2L2", "BCL2A1", "MCL1")
fig_s22_genes.461 <- paste0("GRCh38-", fig_s22_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s22_genes, ncol = 5)
FeaturePlot(lcl777_m81.ccr, features = fig_s22_genes, ncol = 5)
FeaturePlot(lcl461_b958.ccr, features = fig_s22_genes.461, ncol = 5)
FeaturePlot(gm12878.ccr, features = fig_s22_genes, ncol = 5)
FeaturePlot(gm18502.ccr, features = fig_s22_genes, ncol = 5)


###  FIGURE S23  ###
# six genes per sample, three per row; LCL461_B958 uses the "GRCh38-" prefixed names
fig_s23_genes <- c("PAX5", "IRF4", "XBP1", "IRF8", "MKI67", "PRDM1")
fig_s23_genes.461 <- paste0("GRCh38-", fig_s23_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s23_genes, ncol = 3)
FeaturePlot(lcl777_m81.ccr, features = fig_s23_genes, ncol = 3)
FeaturePlot(lcl461_b958.ccr, features = fig_s23_genes.461, ncol = 3)
FeaturePlot(gm12878.ccr, features = fig_s23_genes, ncol = 3)
FeaturePlot(gm18502.ccr, features = fig_s23_genes, ncol = 3)


###  FIGURE S24  ###
# two gene sets per sample; the ".461" vectors carry the "GRCh38-" prefix used
# for LCL461_B958
fig_s24a_genes <- c("HES1", "FCER2", "MTHFD2", "RUNX3")
fig_s24b_genes <- c("RGS1", "ATP1B1", "PTK2B")
fig_s24a_genes.461 <- paste0("GRCh38-", fig_s24a_genes)
fig_s24b_genes.461 <- paste0("GRCh38-", fig_s24b_genes)

# first set: four panels per row
FeaturePlot(lcl777_b958.ccr, features = fig_s24a_genes, ncol = 4)
FeaturePlot(lcl777_m81.ccr, features = fig_s24a_genes, ncol = 4)
FeaturePlot(lcl461_b958.ccr, features = fig_s24a_genes.461, ncol = 4)
FeaturePlot(gm12878.ccr, features = fig_s24a_genes, ncol = 4)
FeaturePlot(gm18502.ccr, features = fig_s24a_genes, ncol = 4)

# second set: three panels per row
FeaturePlot(lcl777_b958.ccr, features = fig_s24b_genes, ncol = 3)
FeaturePlot(lcl777_m81.ccr, features = fig_s24b_genes, ncol = 3)
FeaturePlot(lcl461_b958.ccr, features = fig_s24b_genes.461, ncol = 3)
FeaturePlot(gm12878.ccr, features = fig_s24b_genes, ncol = 3)
FeaturePlot(gm18502.ccr, features = fig_s24b_genes, ncol = 3)


###  FIGURE S25  ###
# four genes per sample in one row; LCL461_B958 uses the "GRCh38-" prefixed names
fig_s25_genes <- c("ADAMDEC1", "CXCL9", "CXCL10", "BCL2L11")
fig_s25_genes.461 <- paste0("GRCh38-", fig_s25_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s25_genes, ncol = 4)
FeaturePlot(lcl777_m81.ccr, features = fig_s25_genes, ncol = 4)
FeaturePlot(lcl461_b958.ccr, features = fig_s25_genes.461, ncol = 4)
FeaturePlot(gm12878.ccr, features = fig_s25_genes, ncol = 4)
FeaturePlot(gm18502.ccr, features = fig_s25_genes, ncol = 4)


###  FIGURE S26-S30   ###
# Recompute the neighbor graph and clusters for every sample with
# sample-specific dims and resolution, then show the result on the t-SNE
# embedding. The results are assigned back to the same objects, so the
# clustering from here on is the one produced by these calls.

# LCL777_B958: dims 1-20, resolution 0.75
lcl777_b958.ccr <- FindNeighbors(lcl777_b958.ccr, dims = 1:20)
lcl777_b958.ccr <- FindClusters(lcl777_b958.ccr, resolution = 0.75)
DimPlot(lcl777_b958.ccr, reduction = "tsne")

# LCL777_M81: dims 1-35, resolution 0.95
lcl777_m81.ccr <- FindNeighbors(lcl777_m81.ccr, dims = 1:35)
lcl777_m81.ccr <- FindClusters(lcl777_m81.ccr, resolution = 0.95)
DimPlot(lcl777_m81.ccr, reduction = "tsne")

# LCL461_B958: dims 1-35, resolution 0.55
lcl461_b958.ccr <- FindNeighbors(lcl461_b958.ccr, dims = 1:35)
lcl461_b958.ccr <- FindClusters(lcl461_b958.ccr, resolution = 0.55)
DimPlot(lcl461_b958.ccr, reduction = "tsne")

# GM12878: dims 1-15, resolution 0.65
gm12878.ccr <- FindNeighbors(gm12878.ccr, dims = 1:15)
gm12878.ccr <- FindClusters(gm12878.ccr, resolution = 0.65)
DimPlot(gm12878.ccr, reduction = "tsne")

# GM18502: dims 1-15, resolution 0.2
# (this object was clustered at resolution 0.25 earlier in the script)
gm18502.ccr <- FindNeighbors(gm18502.ccr, dims = 1:15)
gm18502.ccr <- FindClusters(gm18502.ccr, resolution = 0.2)
DimPlot(gm18502.ccr, reduction = "tsne")


###  FIGURE S31  ###
# HLA-A, HLA-B and HLA-C per sample; LCL461_B958 uses the "GRCh38-" prefixed names
fig_s31_genes <- c("HLA-A", "HLA-B", "HLA-C")
fig_s31_genes.461 <- paste0("GRCh38-", fig_s31_genes)

FeaturePlot(lcl777_b958.ccr, features = fig_s31_genes, ncol = 3)
FeaturePlot(lcl777_m81.ccr, features = fig_s31_genes, ncol = 3)
FeaturePlot(lcl461_b958.ccr, features = fig_s31_genes.461, ncol = 3)
FeaturePlot(gm12878.ccr, features = fig_s31_genes, ncol = 3)
FeaturePlot(gm18502.ccr, features = fig_s31_genes, ncol = 3)


###  FIGURE S32  ###
# the same three per-cell QC values for every sample (identical names in all
# five objects, so no prefixed variant is needed)
fig_s32_feats <- c("nCount_RNA", "nFeature_RNA", "percent.mt")

FeaturePlot(lcl777_b958.ccr, features = fig_s32_feats, ncol = 3)
FeaturePlot(lcl777_m81.ccr, features = fig_s32_feats, ncol = 3)
FeaturePlot(lcl461_b958.ccr, features = fig_s32_feats, ncol = 3)
FeaturePlot(gm12878.ccr, features = fig_s32_feats, ncol = 3)
FeaturePlot(gm18502.ccr, features = fig_s32_feats, ncol = 3)




