#Supplementary file - TCR analysis
#Script by Daniel Radtke


#-------------start session info
#> sessionInfo()
#R version 3.5.1 (2018-07-02)
#Platform: x86_64-w64-mingw32/x64 (64-bit)
#Running under: Windows 10 x64 (build 19044)
#
#Matrix products: default
#
#locale:
#[1] LC_COLLATE=German_Germany.1252  LC_CTYPE=German_Germany.1252   
#[3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C                   
#[5] LC_TIME=German_Germany.1252    
#
#attached base packages:
#[1] stats     graphics  grDevices utils     datasets  methods   base     
#
#other attached packages:
#[1] ggplot2_3.2.1
#
#loaded via a namespace (and not attached):
# [1] nlme_3.1-137        tsne_0.1-3          bitops_1.0-6       
# [4] RcppAnnoy_0.0.14    RColorBrewer_1.1-2  httr_1.4.1         
# [7] sctransform_0.2.0   tools_3.5.1         R6_2.4.1           
#[10] irlba_2.3.3         KernSmooth_2.23-15  uwot_0.1.4         
#[13] lazyeval_0.2.2      colorspace_1.4-1    npsurv_0.4-0       
#[16] withr_2.1.2         tidyselect_0.2.5    gridExtra_2.3      
#[19] compiler_3.5.1      plotly_4.9.1        labeling_0.3       
#[22] Seurat_3.1.1        caTools_1.17.1.3    scales_1.1.0       
#[25] lmtest_0.9-37       ggridges_0.5.1      pbapply_1.4-2      
#[28] stringr_1.4.0       digest_0.6.25       R.utils_2.9.0      
#[31] pkgconfig_2.0.2     htmltools_0.4.0     bibtex_0.4.2       
#[34] htmlwidgets_1.5.1   rlang_0.4.5         farver_2.0.1       
#[37] zoo_1.8-6           jsonlite_1.6.1      ica_1.0-2          
#[40] gtools_3.8.1        dplyr_0.8.3         R.oo_1.23.0        
#[43] magrittr_1.5        Matrix_1.2-14       Rcpp_1.0.1         
#[46] munsell_0.5.0       ape_5.3             reticulate_1.13    
#[49] lifecycle_0.1.0     R.methodsS3_1.7.1   stringi_1.4.3      
#[52] gbRd_0.4-11         MASS_7.3-50         gplots_3.0.1.1     
#[55] Rtsne_0.15          plyr_1.8.4          grid_3.5.1         
#[58] parallel_3.5.1      gdata_2.18.0        listenv_0.7.0      
#[61] ggrepel_0.8.1       crayon_1.3.4        lattice_0.20-35    
#[64] cowplot_1.0.0       splines_3.5.1       SDMTools_1.1-221.2 
#[67] pillar_1.4.2        igraph_1.2.4.2      future.apply_1.3.0 
#[70] reshape2_1.4.3      codetools_0.2-15    leiden_0.3.1       
#[73] glue_1.4.0          lsei_1.2-0          metap_1.1          
#[76] data.table_1.12.2   RcppParallel_4.4.4  vctrs_0.2.4        
#[79] png_0.1-7           Rdpack_0.11-0       gtable_0.3.0       
#[82] RANN_2.6.1          purrr_0.3.3         tidyr_1.0.0        
#[85] future_1.15.1       assertthat_0.2.1    rsvd_1.0.2         
#[88] survival_2.42-3     viridisLite_0.3.0   tibble_2.1.3       
#[91] cluster_2.0.7-1     globals_0.12.4      fitdistrplus_1.0-14
#[94] ROCR_1.0-7      
#-------------end session info



#optional set options: 
setwd('K:/LabInfBio/Results/10xNatalie_Seurat')
memory.limit(30000)
set.seed(42)

#load libraries
library(ggplot2)

#-------------start load and filter contig data
#load in Cell Ranger 3.0.1 output of TCR analysis "all_contig_annotations.csv" files, relabel and combine
contig_files<-list()
contig_files[[1]]<-read.csv('K:/217393_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F)
contig_files[[2]]<-read.csv('K:/217394_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F)
contig_files[[3]]<-read.csv('K:/217395_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F)
contig_files[[4]]<-read.csv('K:/217396_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F)
#also added a publicly available PBMC naive reference dataset
#10xGenomics TCR reference data set via the 10xGenomics website: PBMCs from C57BL/6 mice (v1, 150 x150), Single Cell Immune Profiling Dataset by Cell Ranger 3.0.0, 10x Genomics, (2018, November 19).
#source: https://www.10xgenomics.com/resources/datasets/pbm-cs-from-c-57-bl-6-mice-tcr-enrichment-from-amplified-c-dna-1-standard #also referenced in main manuscript
contig_files[[5]]<-read.csv('K:/vdj_v1_mm_c57bl6_pbmc_t_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F)

#visual inspection
lapply(contig_files, dim)

#give every sample unique labels
contig_files[1:5]<-lapply(1:5, function(x) {contig_files[[x]][,"barcode"]<-gsub("1", x, contig_files[[x]][,"barcode"]); return(contig_files[[x]])}     )
contig_files[1:5]<-lapply(1:5, function(x) {contig_files[[x]][,"contig_id"]<-gsub("-1", paste0("-", x), contig_files[[x]][,"contig_id"]); return(contig_files[[x]])}     )

#check loaded data
lapply(contig_files, function(x) head(x))
lapply(contig_files, dim)

#load filtered seurat file to extract barcodes belonging to cells that were included in the manuscript after QC (QC described in Radtke et al. 2022 main maunuscript)
SeuratObject<-readRDS("K:/210304_QC_TCR_trad_pipeline.Rdata")
#subset contigs based on loaded barcodes
subsetSeuratObject<-SeuratObject[[]]
contig_files[1:4]<-lapply(1:4, function(x) {contig_files[[x]]<-contig_files[[x]][contig_files[[x]][,"barcode"]%in%rownames(subsetSeuratObject),]; return(contig_files[[x]])}   )


#concanatate all contig information
contigs<-do.call(rbind, contig_files[c(1:5)])

#filter contigs for quality based on Cell Ranger output
head(contigs)
dim(contigs)
contigs<-contigs[ contigs[,"is_cell"]=="True" & contigs[,"productive"]=="True" & contigs[,"full_length"]=="True" &  contigs[,"high_confidence"]=="True" &  contigs[,"umis"]>1 ,]
contigs<-contigs[  unlist(lapply(contigs[,"cdr3"], function(x) nchar(x)>5)) ,]
dim(contigs)

#store contigs in one data.frame for later use
contigs_one_frame<-contigs
#-------------end load and filter contig data


#--plot (optional)
#pdf(file ="210526_TCR_complete_run.pdf")
#--

#-------------start determine occurance of chain pairs
#get all contigs for each cell in a list
contigs<-split(contigs, contigs[,"barcode"])
length(contigs)
#visually inspect:
contigs[1:2]
#remove cells with only one chain
chain_combs<-lapply(contigs, function(x) if(length(x[,"cdr3"])>1){combn(x[,"cdr3"], 2, simplify = FALSE)}else{""})
length(chain_combs)
chain_combs<-chain_combs[!unlist(lapply(chain_combs, function(x) nchar(x[[1]][1])<2))]
length(chain_combs)
#order that a combination of chains is always in the same order
chain_combs<-lapply(chain_combs, function(x) lapply(x, function(y) y[order(y)]  ))
#combine chains to one string for simpler analysis
chain_combs_linked<-lapply(chain_combs, function(x) unlist(lapply(x, function(y) paste(y[1],y[2],sep="_")  ),recursive=T))
all_single_combs<-unique(unlist(chain_combs_linked))
#determine how often a chain combination is present
how_often_comb_present<-lapply(all_single_combs, function(x) as.vector(table(unlist(lapply(chain_combs_linked, function(y) x%in%y )))[2])  )
how_often_comb_present<-unlist(how_often_comb_present)
names(how_often_comb_present)<-all_single_combs
how_often_comb_present<-how_often_comb_present[order(how_often_comb_present, decreasing = TRUE)]
#get common chain combinations
head(how_often_comb_present)
#filter high abundant chain combinations
high_abundant_combs<-names(how_often_comb_present[how_often_comb_present>1])

#get cell identifier for high abundant chain combinations
combination_in_which_cells<-lapply(high_abundant_combs, function(x) unlist(lapply(chain_combs_linked, function(y) x%in%y))  )
combination_in_which_cells<-lapply(combination_in_which_cells, function(x) names(x[x==TRUE]))
names(combination_in_which_cells)<-high_abundant_combs
head(combination_in_which_cells)
#-------------end determine occurance of chain pairs


#-------------start check how often a single chain is found combined in all expanded combinations
exp_comb<-how_often_comb_present[how_often_comb_present>1]
chains_in_exp_combs<-strsplit(names(exp_comb), split='_', fixed=TRUE)
found_in_exp_combs<-lapply(unique(unlist(chains_in_exp_combs)), function(x) exp_comb[unlist(lapply(chains_in_exp_combs, function(y) x%in%y ))])
names(found_in_exp_combs)<-unique(unlist(chains_in_exp_combs))
combinations_to_cells<-lapply(found_in_exp_combs, function(x) combination_in_which_cells[names(combination_in_which_cells)%in%names(x)])
combinations_to_cells<-lapply(combinations_to_cells, function(x) unlist(x, use.names = FALSE))
fair_rank_expanded_combs<-lapply(combinations_to_cells, function(x) length(unique(x)))
fair_rank_expanded_combs<-unlist(fair_rank_expanded_combs)
fair_rank_expanded_combs<-fair_rank_expanded_combs[order(fair_rank_expanded_combs, decreasing = TRUE)]
#inspect top 10 selected chains
fair_rank_expanded_combs[fair_rank_expanded_combs>10]
#-------------end check how often a single chain is found combined in all expanded combinations


#-------------start plot chains (loop thought to be plotted to PDF)
#manually selected chains combined
chain_names_to_plot1<-c("CALSDSNTNKVVF", "CAIDPSGSWQLIF", "CAASGPSGSWQLIF", "CAAEAGTGGYKVVF", "CAIDSSGSWQLIF") # could optionally use this to plot chains found in more than 8 cells: chain_names_to_plot1<-names(fair_rank_expanded_combs[fair_rank_expanded_combs>8])
#manually selected expanded single chains
chain_names_to_plot2<-c("CAAASSGSWQLIF", "CAASDTNTGKLTF", "CALGSSGSWQLIF")
#invariant NKT
chain_names_to_plot3<-c("CVVGDRGSALGRLHF")
#chain_names_to_plot<-c("CASEDRYNSPLYF", "CASSDAGQQDTQYF")

chain_names_to_plot_all<-list(chain_names_to_plot1, chain_names_to_plot2, chain_names_to_plot3)

for(i in 1:length(chain_names_to_plot_all)){

chain_names_to_plot<-chain_names_to_plot_all[[i]]

info_to_plot<-lapply(chain_names_to_plot , function(x) do.call(rbind, contigs[unlist(lapply(contigs, function(y) x%in%y[,"cdr3"]) )])   )
info_to_plot<-lapply(1:length(info_to_plot), function(x) info_to_plot[[x]][!info_to_plot[[x]][,"cdr3"]%in%chain_names_to_plot[x],]  )
info_to_plot<-lapply(1:length(info_to_plot), function(x) {info_to_plot[[x]][,"y"]<-chain_names_to_plot[x];return(info_to_plot[[x]])})
info_to_plot<-do.call(rbind,info_to_plot)
info_to_plot[,"y"]<-factor(info_to_plot[,"y"], chain_names_to_plot)

#provide organ and sample info for plot
organ_sample<-as.numeric(gsub("\\D", "", info_to_plot[,"barcode"]))
info_to_plot[,"sample"]<-factor(  c("Lung1", "MLN1", "Lung2", "MLN2", "b6")[organ_sample]  ,c("Lung1", "MLN1", "Lung2", "MLN2", "b6"))
info_to_plot[,"organ"]<-factor(  c("Lung", "MLN", "Lung", "MLN", "b6")[organ_sample]  ,c("Lung", "MLN", "b6"))

info_to_plot[,"cdr3"]<-factor(info_to_plot[,"cdr3"],   names(table(info_to_plot[,"cdr3"])[order(table(info_to_plot[,"cdr3"]), decreasing=T)])  )
pairs_plt = ggplot(info_to_plot, aes(x = y, y = cdr3)) + geom_jitter(aes(color = sample, shape = organ), width = .2, height = .2) + theme_minimal() + xlab('CDR3 aa sequence') + ylab('Motif paired with expanded') + theme(axis.text.x = element_text(angle = 45))
pairs_plt = pairs_plt + scale_color_manual(values = c("#F8766D", "#00BFC4", "#7CAE00", "#C77CFF", "#979797")) + scale_shape_manual(values=c(15,16,17,18,25))

#provide info about TRA or TRB chain
get_chains_for_colour<-contigs_one_frame[contigs_one_frame[,"cdr3"]%in%chain_names_to_plot,c("cdr3", "chain")]
get_chains_for_colour<-get_chains_for_colour[!duplicated(get_chains_for_colour[,"cdr3"]),]
rownames(get_chains_for_colour)<-get_chains_for_colour[,"cdr3"]
get_chains_for_colour<-get_chains_for_colour[chain_names_to_plot,"chain"]


a <- ifelse(get_chains_for_colour == "TRA", "black", "grey")
pairs_plt = pairs_plt + theme(axis.text.x = element_text(angle = 45, hjust = 1, colour = a))
to_color_chains<-pairs_plt$data[!duplicated(pairs_plt$data[,"cdr3"]),c("cdr3", "chain")]
rownames(to_color_chains)<-to_color_chains[,"cdr3"]
to_color_chains<-to_color_chains[levels(pairs_plt$data[,"cdr3"]),]
to_color_chains<-to_color_chains[,"chain"]
b <- ifelse(to_color_chains== "TRA", "black", "grey")

pairs_plt = pairs_plt + theme(axis.text.y = element_text(angle = 0, hjust = 1, colour = b))
pairs_plt
print(pairs_plt)
}
#-------------end plot chains (loop thought to be plotted to PDF)



#-------------start determine occurance of single chains
occurance_single_chains<-table(contigs_one_frame[,"cdr3"])
occurance_single_chains<-occurance_single_chains[order(occurance_single_chains, decreasing=T)]
occurance_single_chains[occurance_single_chains>12]

#provide organ and sample info for plot
organ_sample<-as.numeric(gsub("\\D", "", contigs_one_frame[,"barcode"]))
contigs_one_frame[,"sample"]<-factor(  c("Lung1", "MLN1", "Lung2", "MLN2", "b6")[organ_sample]  ,rev(c("Lung1", "MLN1", "Lung2", "MLN2", "b6")))
contigs_one_frame[,"organ"]<-factor(  c("Lung", "MLN", "Lung", "MLN", "b6")[organ_sample]  ,c("Lung", "MLN", "b6"))

contigs_one_frame[,"occurance_single_chains"]<-occurance_single_chains[match(contigs_one_frame[,"cdr3"], names(occurance_single_chains))]
contigs_one_frame<-contigs_one_frame[order(contigs_one_frame[,"occurance_single_chains"], decreasing=T),]
#-------------end determine occurance of single chains


#-------------start plot single chain occurance
chains_to_plot_here<-rev(names(occurance_single_chains)[1:10])

to_transform<-contigs_one_frame[contigs_one_frame[,"cdr3"]%in%chains_to_plot_here,]
to_transform[,"count"]<-1
chains_to_plot_occurance<-occurance_single_chains[names(occurance_single_chains)%in%chains_to_plot_here]
chains_to_plot_occurance<-chains_to_plot_occurance[order(chains_to_plot_occurance)]
to_transform[,"cdr3"]<-factor(to_transform[,"cdr3"],names(chains_to_plot_occurance))
ggplot(to_transform, aes(x = cdr3, y = count, fill = sample, label = occurance_single_chains)) +  geom_bar(stat = "identity") + facet_wrap(~to_transform[,"organ"]) + coord_flip() + scale_fill_manual(values = c("#979797", "#C77CFF", "#7CAE00", "#00BFC4", "#F8766D")) + theme_minimal()


#!!!careful as a lot of manual transformation is involved - will not work e.g. if population is thrown out - always double check vs plot above
#test manually check contig info:
number_per_organ<-data.frame(table(contigs_one_frame[,"sample"]))

to_transform<-split(to_transform, to_transform[,"sample"])
number_per_organ<-number_per_organ[match(number_per_organ[,"Var1"], names(to_transform)),]
number_per_organ<-number_per_organ[,"Freq"]
to_transform<-lapply(1:length(to_transform), function(x) {to_transform[[x]][,"count"]<-to_transform[[x]][,"count"]/number_per_organ[x]*100;return(to_transform[[x]])})
to_transform<-do.call(rbind, to_transform)
ggplot(to_transform, aes(x = cdr3, y = count, fill = sample, label = cdr3)) +  geom_bar(stat = "identity") + facet_wrap(~to_transform[,"sample"]) + coord_flip() + scale_fill_manual(values = c("#979797", "#C77CFF", "#7CAE00", "#00BFC4", "#F8766D")) + theme_minimal()
#-------------end plot single chain occurance


#-------------
#-------------
#-------------
#additional analysis
#-------------
#-------------
#-------------


#-------------start get length of all TRA or TRB and export data for external plotting
aa_length<-split(contigs_one_frame, contigs_one_frame[,"chain"])
aa_length<-lapply(aa_length, function(x) split(x,x[,"organ"]) )
aa_length<-lapply(aa_length, function(x) lapply(x, function(y) nchar(y[,"cdr3"])  ))
aa_length<-unlist(aa_length, recursive=F)
aa_length<-lapply(aa_length, function(x) c(x,rep(NA, max(unlist(lapply(aa_length, function(x) length(x)))-length(x)))   ))
unlist(lapply(aa_length, length))
aa_length<-do.call(cbind,aa_length)
write.csv(aa_length, "K:/210526_aa_length.csv")
#-------------end get length of all TRA or TRB and export data for external plotting

#--optional stop plot (if optional Pdf line above was executed)
#dev.off()
#--


#-------------start check data for defined chain - in which cells is it found with which other chains
contigs_one_frame[contigs_one_frame[,"cdr3"]=="CAASDTNTGKLTF",]
contigs_one_frame[contigs_one_frame[,"cdr3"]=="CAAEAGTGGYKVVF",]
cells<-contigs_one_frame[contigs_one_frame[,"cdr3"]=="CAAEAGTGGYKVVF","barcode"]
cells_with_chain<-contigs_one_frame[contigs_one_frame[,"barcode"]%in%cells,]
cells_with_chain<-split(cells_with_chain, cells_with_chain[,"barcode"])
cells_with_chain
cell_chains<-lapply(cells_with_chain, function(x) x[,"cdr3"])
cell_chains<-lapply(cell_chains, function(x) x[order(x)])
unique_combs<-lapply(cell_chains, function(x) paste(x,collapse="_"))
unique(unlist(unique_combs))
#-------------end check data for defined chain


#-------------
#-------------
#-------------
#Workflow is similar to 'https://bioconductor.org/packages/release/bioc/vignettes/CellaRepertorium/inst/doc/cdr3_clustering.html' but using this independent script for higher flexibility
#the referred clustering vignette belongs to the CellaRepertorium package:
#Andrew McDavid, Yu Gu and Erik VonKaenel (2021). CellaRepertorium:
#Data structures, clustering and testing for single cell immune
#receptor repertoires (scRNAseq RepSeq/AIRR-seq). R package version
#1.1.2. https://github.com/amcdavid/CellaRepertorium

