#Supplementary file - TCR analysis #Script by Daniel Radtke #-------------start session info #> sessionInfo() #R version 3.5.1 (2018-07-02) #Platform: x86_64-w64-mingw32/x64 (64-bit) #Running under: Windows 10 x64 (build 19044) # #Matrix products: default # #locale: #[1] LC_COLLATE=German_Germany.1252 LC_CTYPE=German_Germany.1252 #[3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C #[5] LC_TIME=German_Germany.1252 # #attached base packages: #[1] stats graphics grDevices utils datasets methods base # #other attached packages: #[1] ggplot2_3.2.1 # #loaded via a namespace (and not attached): # [1] nlme_3.1-137 tsne_0.1-3 bitops_1.0-6 # [4] RcppAnnoy_0.0.14 RColorBrewer_1.1-2 httr_1.4.1 # [7] sctransform_0.2.0 tools_3.5.1 R6_2.4.1 #[10] irlba_2.3.3 KernSmooth_2.23-15 uwot_0.1.4 #[13] lazyeval_0.2.2 colorspace_1.4-1 npsurv_0.4-0 #[16] withr_2.1.2 tidyselect_0.2.5 gridExtra_2.3 #[19] compiler_3.5.1 plotly_4.9.1 labeling_0.3 #[22] Seurat_3.1.1 caTools_1.17.1.3 scales_1.1.0 #[25] lmtest_0.9-37 ggridges_0.5.1 pbapply_1.4-2 #[28] stringr_1.4.0 digest_0.6.25 R.utils_2.9.0 #[31] pkgconfig_2.0.2 htmltools_0.4.0 bibtex_0.4.2 #[34] htmlwidgets_1.5.1 rlang_0.4.5 farver_2.0.1 #[37] zoo_1.8-6 jsonlite_1.6.1 ica_1.0-2 #[40] gtools_3.8.1 dplyr_0.8.3 R.oo_1.23.0 #[43] magrittr_1.5 Matrix_1.2-14 Rcpp_1.0.1 #[46] munsell_0.5.0 ape_5.3 reticulate_1.13 #[49] lifecycle_0.1.0 R.methodsS3_1.7.1 stringi_1.4.3 #[52] gbRd_0.4-11 MASS_7.3-50 gplots_3.0.1.1 #[55] Rtsne_0.15 plyr_1.8.4 grid_3.5.1 #[58] parallel_3.5.1 gdata_2.18.0 listenv_0.7.0 #[61] ggrepel_0.8.1 crayon_1.3.4 lattice_0.20-35 #[64] cowplot_1.0.0 splines_3.5.1 SDMTools_1.1-221.2 #[67] pillar_1.4.2 igraph_1.2.4.2 future.apply_1.3.0 #[70] reshape2_1.4.3 codetools_0.2-15 leiden_0.3.1 #[73] glue_1.4.0 lsei_1.2-0 metap_1.1 #[76] data.table_1.12.2 RcppParallel_4.4.4 vctrs_0.2.4 #[79] png_0.1-7 Rdpack_0.11-0 gtable_0.3.0 #[82] RANN_2.6.1 purrr_0.3.3 tidyr_1.0.0 #[85] future_1.15.1 assertthat_0.2.1 rsvd_1.0.2 #[88] survival_2.42-3 viridisLite_0.3.0 tibble_2.1.3 #[91] cluster_2.0.7-1 globals_0.12.4 fitdistrplus_1.0-14 #[94] ROCR_1.0-7 #-------------end session info #optional set options: setwd('K:/LabInfBio/Results/10xNatalie_Seurat') memory.limit(30000) set.seed(42) #load libraries library(ggplot2) #-------------start load and filter contig data #load in Cell Ranger 3.0.1 output of TCR analysis "all_contig_annotations.csv" files, relabel and combine contig_files<-list() contig_files[[1]]<-read.csv('K:/217393_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F) contig_files[[2]]<-read.csv('K:/217394_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F) contig_files[[3]]<-read.csv('K:/217395_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F) contig_files[[4]]<-read.csv('K:/217396_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F) #also added a publicly available PBMC naive reference dataset #10xGenomics TCR reference data set via the 10xGenomics website: PBMCs from C57BL/6 mice (v1, 150 x150), Single Cell Immune Profiling Dataset by Cell Ranger 3.0.0, 10x Genomics, (2018, November 19). #source: https://www.10xgenomics.com/resources/datasets/pbm-cs-from-c-57-bl-6-mice-tcr-enrichment-from-amplified-c-dna-1-standard #also referenced in main manuscript contig_files[[5]]<-read.csv('K:/vdj_v1_mm_c57bl6_pbmc_t_all_contig_annotations.csv', row.names=NULL, header=TRUE, sep=",", fill=TRUE, stringsAsFactors=F) #visual inspection lapply(contig_files, dim) #give every sample unique labels contig_files[1:5]<-lapply(1:5, function(x) {contig_files[[x]][,"barcode"]<-gsub("1", x, contig_files[[x]][,"barcode"]); return(contig_files[[x]])} ) contig_files[1:5]<-lapply(1:5, function(x) {contig_files[[x]][,"contig_id"]<-gsub("-1", paste0("-", x), contig_files[[x]][,"contig_id"]); return(contig_files[[x]])} ) #check loaded data lapply(contig_files, function(x) head(x)) lapply(contig_files, dim) #load filtered seurat file to extract barcodes belonging to cells that were included in the manuscript after QC (QC described in Radtke et al. 2022 main maunuscript) SeuratObject<-readRDS("K:/210304_QC_TCR_trad_pipeline.Rdata") #subset contigs based on loaded barcodes subsetSeuratObject<-SeuratObject[[]] contig_files[1:4]<-lapply(1:4, function(x) {contig_files[[x]]<-contig_files[[x]][contig_files[[x]][,"barcode"]%in%rownames(subsetSeuratObject),]; return(contig_files[[x]])} ) #concanatate all contig information contigs<-do.call(rbind, contig_files[c(1:5)]) #filter contigs for quality based on Cell Ranger output head(contigs) dim(contigs) contigs<-contigs[ contigs[,"is_cell"]=="True" & contigs[,"productive"]=="True" & contigs[,"full_length"]=="True" & contigs[,"high_confidence"]=="True" & contigs[,"umis"]>1 ,] contigs<-contigs[ unlist(lapply(contigs[,"cdr3"], function(x) nchar(x)>5)) ,] dim(contigs) #store contigs in one data.frame for later use contigs_one_frame<-contigs #-------------end load and filter contig data #--plot (optional) #pdf(file ="210526_TCR_complete_run.pdf") #-- #-------------start determine occurance of chain pairs #get all contigs for each cell in a list contigs<-split(contigs, contigs[,"barcode"]) length(contigs) #visually inspect: contigs[1:2] #remove cells with only one chain chain_combs<-lapply(contigs, function(x) if(length(x[,"cdr3"])>1){combn(x[,"cdr3"], 2, simplify = FALSE)}else{""}) length(chain_combs) chain_combs<-chain_combs[!unlist(lapply(chain_combs, function(x) nchar(x[[1]][1])<2))] length(chain_combs) #order that a combination of chains is always in the same order chain_combs<-lapply(chain_combs, function(x) lapply(x, function(y) y[order(y)] )) #combine chains to one string for simpler analysis chain_combs_linked<-lapply(chain_combs, function(x) unlist(lapply(x, function(y) paste(y[1],y[2],sep="_") ),recursive=T)) all_single_combs<-unique(unlist(chain_combs_linked)) #determine how often a chain combination is present how_often_comb_present<-lapply(all_single_combs, function(x) as.vector(table(unlist(lapply(chain_combs_linked, function(y) x%in%y )))[2]) ) how_often_comb_present<-unlist(how_often_comb_present) names(how_often_comb_present)<-all_single_combs how_often_comb_present<-how_often_comb_present[order(how_often_comb_present, decreasing = TRUE)] #get common chain combinations head(how_often_comb_present) #filter high abundant chain combinations high_abundant_combs<-names(how_often_comb_present[how_often_comb_present>1]) #get cell identifier for high abundant chain combinations combination_in_which_cells<-lapply(high_abundant_combs, function(x) unlist(lapply(chain_combs_linked, function(y) x%in%y)) ) combination_in_which_cells<-lapply(combination_in_which_cells, function(x) names(x[x==TRUE])) names(combination_in_which_cells)<-high_abundant_combs head(combination_in_which_cells) #-------------end determine occurance of chain pairs #-------------start check how often a single chain is found combined in all expanded combinations exp_comb<-how_often_comb_present[how_often_comb_present>1] chains_in_exp_combs<-strsplit(names(exp_comb), split='_', fixed=TRUE) found_in_exp_combs<-lapply(unique(unlist(chains_in_exp_combs)), function(x) exp_comb[unlist(lapply(chains_in_exp_combs, function(y) x%in%y ))]) names(found_in_exp_combs)<-unique(unlist(chains_in_exp_combs)) combinations_to_cells<-lapply(found_in_exp_combs, function(x) combination_in_which_cells[names(combination_in_which_cells)%in%names(x)]) combinations_to_cells<-lapply(combinations_to_cells, function(x) unlist(x, use.names = FALSE)) fair_rank_expanded_combs<-lapply(combinations_to_cells, function(x) length(unique(x))) fair_rank_expanded_combs<-unlist(fair_rank_expanded_combs) fair_rank_expanded_combs<-fair_rank_expanded_combs[order(fair_rank_expanded_combs, decreasing = TRUE)] #inspect top 10 selected chains fair_rank_expanded_combs[fair_rank_expanded_combs>10] #-------------end check how often a single chain is found combined in all expanded combinations #-------------start plot chains (loop thought to be plotted to PDF) #manually selected chains combined chain_names_to_plot1<-c("CALSDSNTNKVVF", "CAIDPSGSWQLIF", "CAASGPSGSWQLIF", "CAAEAGTGGYKVVF", "CAIDSSGSWQLIF") # could optionally use this to plot chains found in more than 8 cells: chain_names_to_plot1<-names(fair_rank_expanded_combs[fair_rank_expanded_combs>8]) #manually selected expanded single chains chain_names_to_plot2<-c("CAAASSGSWQLIF", "CAASDTNTGKLTF", "CALGSSGSWQLIF") #invariant NKT chain_names_to_plot3<-c("CVVGDRGSALGRLHF") #chain_names_to_plot<-c("CASEDRYNSPLYF", "CASSDAGQQDTQYF") chain_names_to_plot_all<-list(chain_names_to_plot1, chain_names_to_plot2, chain_names_to_plot3) for(i in 1:length(chain_names_to_plot_all)){ chain_names_to_plot<-chain_names_to_plot_all[[i]] info_to_plot<-lapply(chain_names_to_plot , function(x) do.call(rbind, contigs[unlist(lapply(contigs, function(y) x%in%y[,"cdr3"]) )]) ) info_to_plot<-lapply(1:length(info_to_plot), function(x) info_to_plot[[x]][!info_to_plot[[x]][,"cdr3"]%in%chain_names_to_plot[x],] ) info_to_plot<-lapply(1:length(info_to_plot), function(x) {info_to_plot[[x]][,"y"]<-chain_names_to_plot[x];return(info_to_plot[[x]])}) info_to_plot<-do.call(rbind,info_to_plot) info_to_plot[,"y"]<-factor(info_to_plot[,"y"], chain_names_to_plot) #provide organ and sample info for plot organ_sample<-as.numeric(gsub("\\D", "", info_to_plot[,"barcode"])) info_to_plot[,"sample"]<-factor( c("Lung1", "MLN1", "Lung2", "MLN2", "b6")[organ_sample] ,c("Lung1", "MLN1", "Lung2", "MLN2", "b6")) info_to_plot[,"organ"]<-factor( c("Lung", "MLN", "Lung", "MLN", "b6")[organ_sample] ,c("Lung", "MLN", "b6")) info_to_plot[,"cdr3"]<-factor(info_to_plot[,"cdr3"], names(table(info_to_plot[,"cdr3"])[order(table(info_to_plot[,"cdr3"]), decreasing=T)]) ) pairs_plt = ggplot(info_to_plot, aes(x = y, y = cdr3)) + geom_jitter(aes(color = sample, shape = organ), width = .2, height = .2) + theme_minimal() + xlab('CDR3 aa sequence') + ylab('Motif paired with expanded') + theme(axis.text.x = element_text(angle = 45)) pairs_plt = pairs_plt + scale_color_manual(values = c("#F8766D", "#00BFC4", "#7CAE00", "#C77CFF", "#979797")) + scale_shape_manual(values=c(15,16,17,18,25)) #provide info about TRA or TRB chain get_chains_for_colour<-contigs_one_frame[contigs_one_frame[,"cdr3"]%in%chain_names_to_plot,c("cdr3", "chain")] get_chains_for_colour<-get_chains_for_colour[!duplicated(get_chains_for_colour[,"cdr3"]),] rownames(get_chains_for_colour)<-get_chains_for_colour[,"cdr3"] get_chains_for_colour<-get_chains_for_colour[chain_names_to_plot,"chain"] a <- ifelse(get_chains_for_colour == "TRA", "black", "grey") pairs_plt = pairs_plt + theme(axis.text.x = element_text(angle = 45, hjust = 1, colour = a)) to_color_chains<-pairs_plt$data[!duplicated(pairs_plt$data[,"cdr3"]),c("cdr3", "chain")] rownames(to_color_chains)<-to_color_chains[,"cdr3"] to_color_chains<-to_color_chains[levels(pairs_plt$data[,"cdr3"]),] to_color_chains<-to_color_chains[,"chain"] b <- ifelse(to_color_chains== "TRA", "black", "grey") pairs_plt = pairs_plt + theme(axis.text.y = element_text(angle = 0, hjust = 1, colour = b)) pairs_plt print(pairs_plt) } #-------------end plot chains (loop thought to be plotted to PDF) #-------------start determine occurance of single chains occurance_single_chains<-table(contigs_one_frame[,"cdr3"]) occurance_single_chains<-occurance_single_chains[order(occurance_single_chains, decreasing=T)] occurance_single_chains[occurance_single_chains>12] #provide organ and sample info for plot organ_sample<-as.numeric(gsub("\\D", "", contigs_one_frame[,"barcode"])) contigs_one_frame[,"sample"]<-factor( c("Lung1", "MLN1", "Lung2", "MLN2", "b6")[organ_sample] ,rev(c("Lung1", "MLN1", "Lung2", "MLN2", "b6"))) contigs_one_frame[,"organ"]<-factor( c("Lung", "MLN", "Lung", "MLN", "b6")[organ_sample] ,c("Lung", "MLN", "b6")) contigs_one_frame[,"occurance_single_chains"]<-occurance_single_chains[match(contigs_one_frame[,"cdr3"], names(occurance_single_chains))] contigs_one_frame<-contigs_one_frame[order(contigs_one_frame[,"occurance_single_chains"], decreasing=T),] #-------------end determine occurance of single chains #-------------start plot single chain occurance chains_to_plot_here<-rev(names(occurance_single_chains)[1:10]) to_transform<-contigs_one_frame[contigs_one_frame[,"cdr3"]%in%chains_to_plot_here,] to_transform[,"count"]<-1 chains_to_plot_occurance<-occurance_single_chains[names(occurance_single_chains)%in%chains_to_plot_here] chains_to_plot_occurance<-chains_to_plot_occurance[order(chains_to_plot_occurance)] to_transform[,"cdr3"]<-factor(to_transform[,"cdr3"],names(chains_to_plot_occurance)) ggplot(to_transform, aes(x = cdr3, y = count, fill = sample, label = occurance_single_chains)) + geom_bar(stat = "identity") + facet_wrap(~to_transform[,"organ"]) + coord_flip() + scale_fill_manual(values = c("#979797", "#C77CFF", "#7CAE00", "#00BFC4", "#F8766D")) + theme_minimal() #!!!careful as a lot of manual transformation is involved - will not work e.g. if population is thrown out - always double check vs plot above #test manually check contig info: number_per_organ<-data.frame(table(contigs_one_frame[,"sample"])) to_transform<-split(to_transform, to_transform[,"sample"]) number_per_organ<-number_per_organ[match(number_per_organ[,"Var1"], names(to_transform)),] number_per_organ<-number_per_organ[,"Freq"] to_transform<-lapply(1:length(to_transform), function(x) {to_transform[[x]][,"count"]<-to_transform[[x]][,"count"]/number_per_organ[x]*100;return(to_transform[[x]])}) to_transform<-do.call(rbind, to_transform) ggplot(to_transform, aes(x = cdr3, y = count, fill = sample, label = cdr3)) + geom_bar(stat = "identity") + facet_wrap(~to_transform[,"sample"]) + coord_flip() + scale_fill_manual(values = c("#979797", "#C77CFF", "#7CAE00", "#00BFC4", "#F8766D")) + theme_minimal() #-------------end plot single chain occurance #------------- #------------- #------------- #additional analysis #------------- #------------- #------------- #-------------start get length of all TRA or TRB and export data for external plotting aa_length<-split(contigs_one_frame, contigs_one_frame[,"chain"]) aa_length<-lapply(aa_length, function(x) split(x,x[,"organ"]) ) aa_length<-lapply(aa_length, function(x) lapply(x, function(y) nchar(y[,"cdr3"]) )) aa_length<-unlist(aa_length, recursive=F) aa_length<-lapply(aa_length, function(x) c(x,rep(NA, max(unlist(lapply(aa_length, function(x) length(x)))-length(x))) )) unlist(lapply(aa_length, length)) aa_length<-do.call(cbind,aa_length) write.csv(aa_length, "K:/210526_aa_length.csv") #-------------end get length of all TRA or TRB and export data for external plotting #--optional stop plot (if optional Pdf line above was executed) #dev.off() #-- #-------------start check data for defined chain - in which cells is it found with which other chains contigs_one_frame[contigs_one_frame[,"cdr3"]=="CAASDTNTGKLTF",] contigs_one_frame[contigs_one_frame[,"cdr3"]=="CAAEAGTGGYKVVF",] cells<-contigs_one_frame[contigs_one_frame[,"cdr3"]=="CAAEAGTGGYKVVF","barcode"] cells_with_chain<-contigs_one_frame[contigs_one_frame[,"barcode"]%in%cells,] cells_with_chain<-split(cells_with_chain, cells_with_chain[,"barcode"]) cells_with_chain cell_chains<-lapply(cells_with_chain, function(x) x[,"cdr3"]) cell_chains<-lapply(cell_chains, function(x) x[order(x)]) unique_combs<-lapply(cell_chains, function(x) paste(x,collapse="_")) unique(unlist(unique_combs)) #-------------end check data for defined chain #------------- #------------- #------------- #Workflow is similar to 'https://bioconductor.org/packages/release/bioc/vignettes/CellaRepertorium/inst/doc/cdr3_clustering.html' but using this independent script for higher flexibility #the referred clustering vignette belongs to the CellaRepertorium package: #Andrew McDavid, Yu Gu and Erik VonKaenel (2021). CellaRepertorium: #Data structures, clustering and testing for single cell immune #receptor repertoires (scRNAseq RepSeq/AIRR-seq). R package version #1.1.2. https://github.com/amcdavid/CellaRepertorium