import import_ipynb
import Scanpy_functions_v03262021 as sc_pipe
import scvelo as scv
scv.logging.print_version()
import warnings
import scirpy as ir
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import bbknn
import logging
from sklearn.mixture import GaussianMixture
from scipy.stats import norm
import glob
import os
import hvplot.pandas
import docx
from docx import Document
from docx.shared import Inches
from docx.shared import Pt
from scipy import sparse
import scanpy.external as sce
import holoviews as hv
import panel as pn
import bokeh
from bokeh.resources import INLINE
import scanorama
import gseapy
importing Jupyter notebook from Scanpy_functions_v03262021.ipynb Running scvelo 0.2.3 (python 3.8.5) on 2021-11-25 10:09. Running scvelo 0.2.3 (python 3.8.5) on 2021-11-25 10:09.
sc.logging.print_versions()
WARNING: If you miss a compact list, please try `print_header`! ----- anndata 0.7.6 scanpy 1.8.1 sinfo 0.3.1 ----- Levenshtein NA PIL 8.3.1 Scanpy_functions_v03262021 NA adjustText NA airr 1.3.1 anndata 0.7.6 annoy NA anyio NA appdirs 1.4.4 async_generator 1.10 attr 21.2.0 babel 2.9.1 backcall 0.2.0 bbknn NA bioservices 1.7.11 bokeh 2.3.3 bottleneck 1.3.2 brotli NA bs4 4.9.3 cairo 1.19.1 certifi 2021.05.30 cffi 1.14.6 chardet 4.0.0 cloudpickle 1.6.0 colorama 0.4.4 colorcet 1.0.0 colorlog NA cycler 0.10.0 cython_runtime NA cytoolz 0.11.0 dask 2.30.0 dateutil 2.8.2 decorator 5.0.9 defusedxml 0.7.1 docutils 0.17.1 docx 0.8.10 easydev 0.11.0 entrypoints 0.3 fbpca NA google NA gseapy 0.10.4 h5py 2.10.0 holoviews 1.14.2 html5lib 1.1 hvplot 0.7.1 idna 2.10 igraph 0.9.6 import_ipynb NA intervaltree NA ipykernel 5.3.4 ipython_genutils 0.2.0 ipywidgets 7.6.3 jedi 0.17.1 jinja2 3.0.1 joblib 1.0.1 json5 NA jsonschema 3.2.0 jupyter_server 1.4.1 jupyterlab_pygments 0.1.2 jupyterlab_server 2.6.1 kiwisolver 1.3.1 leidenalg 0.8.2 llvmlite 0.34.0 louvain 0.7.0 lxml 4.6.3 markupsafe 2.0.1 matplotlib 3.3.2 mistune 0.8.4 mkl 2.3.0 mpl_toolkits NA natsort 7.1.1 nbclassic NA nbclient 0.5.3 nbconvert 6.1.0 nbformat 5.1.3 networkx 2.6.2 numba 0.51.2 numexpr 2.7.3 numpy 1.19.2 packaging 21.0 pandas 1.2.4 pandocfilters NA panel 0.10.3 param 1.11.1 parasail 1.2.4 parso 0.7.0 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prometheus_client NA prompt_toolkit 3.0.17 psutil 5.8.0 ptyprocess 0.7.0 pvectorc NA pycparser 2.20 pygments 2.9.0 pylab NA pynndescent 0.5.2 pyparsing 2.4.7 pyrsistent NA pytoml NA pytz 2021.1 pyviz_comms 2.0.2 requests 2.25.1 requests_cache 0.5.2 scanorama 1.7.1 scanpy 1.8.1 scipy 1.6.2 scirpy 0.9.1 scvelo 0.2.3 seaborn 0.11.0 send2trash NA setuptools_scm NA sinfo 0.3.1 six 1.16.0 sklearn 0.24.2 sniffio 1.2.0 socks 1.7.1 sortedcontainers 2.4.0 soupsieve 2.2.1 sphinxcontrib NA statsmodels 0.12.2 storemagic NA tables 3.6.1 tblib 1.7.0 testpath 0.5.0 texttable 1.6.4 threadpoolctl 2.2.0 tlz 0.11.0 toolz 0.11.1 tornado 6.1 tqdm 4.61.2 tracerlib NA traitlets 5.0.5 typing_extensions NA umap 0.4.6 urllib3 1.26.6 wcwidth 0.2.5 webencodings 0.5.1 wrapt 1.11.2 yaml 5.4.1 yamlordereddictloader NA zipp NA zmq 20.0.0 zope NA ----- IPython 7.22.0 jupyter_client 6.1.12 jupyter_core 4.7.1 jupyterlab 3.0.14 notebook 6.4.0 ----- Python 3.8.5 (default, Sep 4 2020, 07:30:14) [GCC 7.3.0] Linux-3.10.0-514.2.2.el7.x86_64-x86_64-with-glibc2.10 352 logical CPU cores, x86_64 ----- Session information updated at 2021-11-25 10:09
# define sample metadata. Usually read from a file.
exclude_genes = ['RPL', 'RPS', 'TRAV', 'TRAJ', 'TRBJ', 'TRBV','MRP','FAU','DAP3','UBA52','IGHV', 'IGKV', 'IGLV']
antilist = [
]
In_path = '/user/ifrec/liuyuchen/scRNASeq_DATA/Yamazaki_lu_scRNASeq_all_pfizer/'
out_path = '/user/ifrec/liuyuchen/Analysis_Reports/Yamazaki_lu_scRNASeq_all_pfizer/'
samples = {}
for line in open(In_path+'sample_list.txt'):
sample = line.strip()
samples[sample]={}
samples[sample]['gex']=sample
samples[sample]['TCR']=sample.replace('5DE','TCR')
samples
{'25_and_28_1_5DE': {'gex': '25_and_28_1_5DE', 'TCR': '25_and_28_1_TCR'}, '25_and_28_2_5DE': {'gex': '25_and_28_2_5DE', 'TCR': '25_and_28_2_TCR'}, '25_and_28_3_5DE': {'gex': '25_and_28_3_5DE', 'TCR': '25_and_28_3_TCR'}, '25_and_28_4_5DE': {'gex': '25_and_28_4_5DE', 'TCR': '25_and_28_4_TCR'}, '25_and_28_5_5DE': {'gex': '25_and_28_5_5DE', 'TCR': '25_and_28_5_TCR'}, '25_and_28_6_5DE': {'gex': '25_and_28_6_5DE', 'TCR': '25_and_28_6_TCR'}, '25_and_28_7_5DE': {'gex': '25_and_28_7_5DE', 'TCR': '25_and_28_7_TCR'}, '25_and_28_8_5DE': {'gex': '25_and_28_8_5DE', 'TCR': '25_and_28_8_TCR'}, '27_1_5DE': {'gex': '27_1_5DE', 'TCR': '27_1_TCR'}, '27_2_5DE': {'gex': '27_2_5DE', 'TCR': '27_2_TCR'}, '4_and_17_1_5DE': {'gex': '4_and_17_1_5DE', 'TCR': '4_and_17_1_TCR'}, '4_and_17_2_5DE': {'gex': '4_and_17_2_5DE', 'TCR': '4_and_17_2_TCR'}, '4_and_17_3_5DE': {'gex': '4_and_17_3_5DE', 'TCR': '4_and_17_3_TCR'}, '4_and_17_4_5DE': {'gex': '4_and_17_4_5DE', 'TCR': '4_and_17_4_TCR'}, '4_and_17_5_5DE': {'gex': '4_and_17_5_5DE', 'TCR': '4_and_17_5_TCR'}, '4_and_17_6_5DE': {'gex': '4_and_17_6_5DE', 'TCR': '4_and_17_6_TCR'}, '4_and_17_7_5DE': {'gex': '4_and_17_7_5DE', 'TCR': '4_and_17_7_TCR'}, '4_and_17_8_5DE': {'gex': '4_and_17_8_5DE', 'TCR': '4_and_17_8_TCR'}, '8and_13and_15_1_5DE': {'gex': '8and_13and_15_1_5DE', 'TCR': '8and_13and_15_1_TCR'}, '8and_13and_15_2_5DE': {'gex': '8and_13and_15_2_5DE', 'TCR': '8and_13and_15_2_TCR'}, '8and_13and_15_3_5DE': {'gex': '8and_13and_15_3_5DE', 'TCR': '8and_13and_15_3_TCR'}, '8and_13and_15_4_5DE': {'gex': '8and_13and_15_4_5DE', 'TCR': '8and_13and_15_4_TCR'}, '8and_13and_15_5_5DE': {'gex': '8and_13and_15_5_5DE', 'TCR': '8and_13and_15_5_TCR'}, '8and_13and_15_6_5DE': {'gex': '8and_13and_15_6_5DE', 'TCR': '8and_13and_15_6_TCR'}, '8and_13and_15_7_5DE': {'gex': '8and_13and_15_7_5DE', 'TCR': '8and_13and_15_7_TCR'}, '8and_13and_15_8_5DE': {'gex': '8and_13and_15_8_5DE', 'TCR': '8and_13and_15_8_TCR'}}
adatalist = []
for sample, sample_meta in samples.items():
gex_file = In_path+sample_meta["gex"]+'/outs/filtered_feature_bc_matrix.h5'
adata = sc.read_10x_h5(gex_file, gex_only=False)
tcr_file = In_path+sample_meta["TCR"]+'/outs/filtered_contig_annotations.csv'
adata_tcr = ir.io.read_10x_vdj(tcr_file)
ir.pp.merge_with_ir(adata, adata_tcr)
adata.var_names_make_unique()
adata.obs["Sample"] = sample
hashtag = adata.var_names[adata.var_names.str.contains('Hashtag')].tolist()
for h in hashtag:
adata.obs[h] = adata[:, h].X.A
adata.obs['Hashtag'] = adata.obs[hashtag].idxmax(axis=1)
antis = adata.var_names[adata.var_names.str.contains('human')].tolist()
for a in antis:
adata.obs[a] = adata[:, a].X.A
data = adata[:, a].X.A
data = np.interp(data, (data.min(), data.max()), (0, 10))
adata.obs[a+'_normalized'] = data
adata = adata[:,adata.var[adata.var['feature_types']!='Antibody Capture'].index]
adatalist.append(adata)
Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical Variable names are not unique. To make them unique, call `.var_names_make_unique`. ... storing 'feature_types' as categorical ... storing 'genome' as categorical
#sc.set_figure_params(scanpy=True, dpi=200, figsize=[12.8,9.6])
sc.settings.verbosity = 3
adata = sc_pipe.unify_value(adatalist)
adata.obs['Sample'].value_counts()
8and_13and_15_3_5DE 25160 8and_13and_15_6_5DE 24872 8and_13and_15_8_5DE 24789 8and_13and_15_4_5DE 23650 8and_13and_15_7_5DE 23401 8and_13and_15_5_5DE 23164 8and_13and_15_1_5DE 22018 8and_13and_15_2_5DE 20872 25_and_28_1_5DE 12018 25_and_28_2_5DE 11776 25_and_28_8_5DE 11718 25_and_28_7_5DE 11636 25_and_28_3_5DE 11076 25_and_28_5_5DE 10171 27_1_5DE 10171 4_and_17_4_5DE 10152 25_and_28_6_5DE 10151 25_and_28_4_5DE 9899 4_and_17_5_5DE 9484 4_and_17_3_5DE 9328 4_and_17_1_5DE 9120 4_and_17_2_5DE 8624 4_and_17_6_5DE 8582 4_and_17_8_5DE 8209 4_and_17_7_5DE 8006 27_2_5DE 6746 Name: Sample, dtype: int64
sc.pl.highest_expr_genes(adata)
normalizing counts per cell finished (0:00:08)
adata.obs['Hashtag'].value_counts()
_8_3wks_Hashtag_1 44273 _15_6wks_Hashtag_8 38721 17_24wks_Hashtag_6 35986 _8_6wks_Hashtag_2 31020 _28_3wks_Hashtag_4 25259 _13_24wks_Hashtag_6 22591 _28_6wks_Hashtag_5 21037 _25_6wks_Hashtag_2 20554 17_6wks_Hashtag_5 20396 _13_3wks_Hashtag_4 19101 _25_24wks_Hashtag_3 14779 _8_24wks_Hashtag_3 11740 27_6wks_Hashtag_2 10775 _13_6wks_Hashtag_5 9022 _15_24wks_Hashtag_9 8325 _28_24wks_Hashtag_6 6136 27_24wks_Hashtag_3 5886 4_6wks_Hashtag_2 4964 4_24wks_Hashtag_3 3643 17_3wks_Hashtag_4 3322 4_3wks_Hashtag_1 3194 _15_3wks_Hashtag_7 3133 _25_3wks_Hashtag_1 680 27_3wks_Hashtag_1 256 Name: Hashtag, dtype: int64
hashtag = adata.obs['Hashtag'].unique().tolist()
tags ={x:x.split('_Hashtag_')[0] for x in hashtag}
hashtag
['_25_6wks_Hashtag_2', '_28_24wks_Hashtag_6', '_28_6wks_Hashtag_5', '_25_24wks_Hashtag_3', '_28_3wks_Hashtag_4', '_25_3wks_Hashtag_1', '27_6wks_Hashtag_2', '27_24wks_Hashtag_3', '27_3wks_Hashtag_1', '17_24wks_Hashtag_6', '17_6wks_Hashtag_5', '4_3wks_Hashtag_1', '17_3wks_Hashtag_4', '4_6wks_Hashtag_2', '4_24wks_Hashtag_3', '_8_6wks_Hashtag_2', '_8_3wks_Hashtag_1', '_13_24wks_Hashtag_6', '_15_6wks_Hashtag_8', '_8_24wks_Hashtag_3', '_13_3wks_Hashtag_4', '_13_6wks_Hashtag_5', '_15_24wks_Hashtag_9', '_15_3wks_Hashtag_7']
adata.obs['Batch']=adata.obs['Sample']
adata.obs['Sample']=adata.obs['Hashtag'].map(tags)
adata.obs['Sample'].value_counts()
_8_3wks 44273 _15_6wks 38721 17_24wks 35986 _8_6wks 31020 _28_3wks 25259 _13_24wks 22591 _28_6wks 21037 _25_6wks 20554 17_6wks 20396 _13_3wks 19101 _25_24wks 14779 _8_24wks 11740 27_6wks 10775 _13_6wks 9022 _15_24wks 8325 _28_24wks 6136 27_24wks 5886 4_6wks 4964 4_24wks 3643 17_3wks 3322 4_3wks 3194 _15_3wks 3133 _25_3wks 680 27_3wks 256 Name: Sample, dtype: int64
adata = sc_pipe.qc_and_preprocess(adata,out_path,multi_sample=True)
normalizing counts per cell finished (0:00:08) WARNING: saving figure to file figures/highest_expr_genes_before_filter.png
Running Scrublet filtered out 9560 genes that are detected in less than 3 cells normalizing counts per cell finished (0:00:05) extracting highly variable genes finished (0:03:17) --> added 'highly_variable', boolean vector (adata.var) 'means', float vector (adata.var) 'dispersions', float vector (adata.var) 'dispersions_norm', float vector (adata.var) normalizing counts per cell finished (0:00:00) normalizing counts per cell finished (0:01:05) Embedding transcriptomes using PCA... Detected doublet rate = 0.3% Estimated detectable doublet fraction = 24.0% Overall doublet rate: Expected = 5.0% Estimated = 1.1% Scrublet finished (1:06:37)
... storing 'IR_VJ_1_c_call' as categorical ... storing 'IR_VJ_1_j_call' as categorical ... storing 'IR_VJ_2_j_call' as categorical ... storing 'IR_VDJ_2_j_call' as categorical ... storing 'IR_VJ_1_junction' as categorical ... storing 'IR_VJ_2_junction' as categorical ... storing 'IR_VDJ_1_junction' as categorical ... storing 'IR_VDJ_2_junction' as categorical ... storing 'IR_VJ_1_junction_aa' as categorical ... storing 'IR_VJ_2_junction_aa' as categorical ... storing 'IR_VDJ_1_junction_aa' as categorical ... storing 'IR_VDJ_2_junction_aa' as categorical ... storing 'IR_VJ_1_v_call' as categorical ... storing 'IR_VJ_2_v_call' as categorical ... storing 'IR_VDJ_1_v_call' as categorical ... storing 'IR_VDJ_2_v_call' as categorical ... storing 'Sample' as categorical ... storing 'Hashtag' as categorical ... storing 'Batch' as categorical
WARNING: saving figure to file figures/violin_QC_of_entire_set.pdf
WARNING: saving figure to file figures/violin_QC_by_sample_1.png
WARNING: saving figure to file figures/violin_QC_by_sample_2.png
WARNING: saving figure to file figures/violin_QC_by_sample_3.png
WARNING: saving figure to file figures/violin_QC_by_sample_4.png
WARNING: saving figure to file figures/violin_scrublet_sample.png
WARNING: saving figure to file figures/scatter_count_to_mito_1.png
WARNING: saving figure to file figures/scatter_count_to_gene.png
filtered out 0 cells that have less than 67 counts filtered out 9560 genes that are detected in less than 3 cells filtered out 9560 genes that are detected in less than 3 cells filtered out 0 cells that have more than 41504487 counts filtered out 13510 cells that has over 19% reads belong to mitochondrial genes
Trying to set attribute `.obs` of view, copying.
filtered out 0 cells that have less than 176 genes expressed filtered out 9967 cells that have over 5784 genes expressed WARNING: saving figure to file figures/violin_QC_of_entire_set_after_filtration.pdf
adata = sc_pipe.feature_selection(adata, out_path, exclude_genes)
#adata = adata[:, adata.var.highly_variable]
#adata = sc_pipe.clustering(adata,hash_samples)
adata = sc_pipe.BBKNN_clustering(adata,out_path, resol=1, multi_sample=True)
normalizing counts per cell finished (0:00:03) normalizing counts per cell finished (0:00:07) WARNING: saving figure to file figures/highest_expr_genes_after_QC.png
normalizing counts per cell finished (0:00:06) WARNING: saving figure to file figures/highest_expr_genes_after_exclude.png
If you pass `n_top_genes`, all cutoffs are ignored. extracting highly variable genes finished (0:00:08) --> added 'highly_variable', boolean vector (adata.var) 'means', float vector (adata.var) 'dispersions', float vector (adata.var) 'dispersions_norm', float vector (adata.var) Number of highly variable genes: 4000 WARNING: saving figure to file figures/filter_genes_dispersion_highly_variable_genes.png
computing PCA on highly variable genes with n_comps=50 finished (0:00:51) computing neighbors using 'X_pca' with n_pcs = 50 finished: added to `.uns['neighbors']` `.obsp['distances']`, distances for each pair of neighbors `.obsp['connectivities']`, weighted adjacency matrix (0:01:51) computing UMAP finished: added 'X_umap', UMAP coordinates (adata.obsm) (0:05:17) computing batch balanced neighbors finished: added to `.uns['neighbors']` `.obsp['distances']`, distances for each pair of neighbors `.obsp['connectivities']`, weighted adjacency matrix (0:06:22) running Leiden clustering finished: found 16 clusters and added 'leiden', the cluster labels (adata.obs, categorical) (0:21:20) running PAGA finished: added 'paga/connectivities', connectivities adjacency (adata.uns) 'paga/connectivities_tree', connectivities subtree (adata.uns) (0:00:45) --> added 'pos', the PAGA positions (adata.uns['paga']) WARNING: saving figure to file figures/paga_Graph.png
computing UMAP finished: added 'X_umap', UMAP coordinates (adata.obsm) (0:06:28) WARNING: saving figure to file figures/umap_Leiden_clustering.png
WARNING: saving figure to file figures/umap_no_bbknn.png
WARNING: saving figure to file figures/umap_by_sample.png
adata = sc_pipe.differential(adata,out_path, multi_sample = True)
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' ranking genes finished: added to `.uns['rank_genes_groups']` 'names', sorted np.recarray to be indexed by group ids 'scores', sorted np.recarray to be indexed by group ids 'logfoldchanges', sorted np.recarray to be indexed by group ids 'pvals', sorted np.recarray to be indexed by group ids 'pvals_adj', sorted np.recarray to be indexed by group ids (0:03:01) WARNING: saving figure to file figures/rank_genes_groups_leiden_by_cluster.png
WARNING: saving figure to file figures/umap_top2_in_clusters.png
WARNING: dendrogram data not found (using key=dendrogram_leiden). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently. using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_leiden']` WARNING: saving figure to file figures/dotplot_in_clusters.png
WARNING: Gene labels are not shown when more than 50 genes are visualized. To show gene labels set `show_gene_labels=True` WARNING: saving figure to file figures/heatmap_in_clusters.png
WARNING: saving figure to file figures/matrixplot_in_clusters.png
WARNING: saving figure to file figures/stacked_violin_in_clusters.png
WARNING: saving figure to file figures/tracksplot_in_clusters.png
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' ranking genes finished: added to `.uns['rank_genes_samples']` 'names', sorted np.recarray to be indexed by group ids 'scores', sorted np.recarray to be indexed by group ids 'logfoldchanges', sorted np.recarray to be indexed by group ids 'pvals', sorted np.recarray to be indexed by group ids 'pvals_adj', sorted np.recarray to be indexed by group ids (0:04:22) WARNING: saving figure to file figures/rank_genes_groups_Sample_compare.png
ir.tl.chain_qc(adata)
ax = ir.pl.group_abundance(adata, groupby="receptor_subtype", target_col="Sample")
... storing 'receptor_type' as categorical ... storing 'receptor_subtype' as categorical ... storing 'chain_pairing' as categorical
ax = ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="Sample")
# using default parameters, `ir_dist` will compute nucleotide sequence identity
ir.pp.ir_dist(adata)
ir.tl.define_clonotypes(adata, receptor_arms="all", dual_ir="primary_only")
Computing sequence x sequence distance matrix for VJ sequences. Computing sequence x sequence distance matrix for VDJ sequences. Initializing lookup tables. --> Done initializing lookup tables. (0:00:05) Computing clonotype x clonotype distances. NB: Computation happens in chunks. The progressbar only advances when a chunk has finished.
0%| | 0/34406 [00:00<?, ?it/s]
--> Done computing clonotype x clonotype distances. (0:01:33) Stored clonal assignments in `adata.obs["clone_id"]`.
ir.tl.clonal_expansion(adata)
sc.pl.umap(adata, color=["clonal_expansion", "clone_id_size"])
... storing 'clone_id' as categorical ... storing 'clonal_expansion' as categorical
ir.pl.clonal_expansion(adata, groupby="Sample", clip_at=4, normalize=False)
<AxesSubplot:>
adata.obs
is_cell | high_confidence | multi_chain | extra_chains | IR_VJ_1_c_call | IR_VJ_2_c_call | IR_VDJ_1_c_call | IR_VDJ_2_c_call | IR_VJ_1_consensus_count | IR_VJ_2_consensus_count | ... | pct_counts_in_top_200_genes | pct_counts_in_top_500_genes | log_counts | leiden | receptor_type | receptor_subtype | chain_pairing | clone_id | clone_id_size | clonal_expansion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AAACCTGAGAGCTGCA-1-0 | True | True | False | [] | TRAC | NaN | TRBC1 | NaN | 5681.0 | NaN | ... | 44.478503 | 59.537845 | 9.451167 | 10 | TCR | TRA+TRB | single pair | 0 | 28.0 | >= 3 |
AAACCTGAGCGTAGTG-1-0 | True | True | False | [] | TRAC | NaN | TRBC2 | NaN | 777.0 | NaN | ... | 47.661587 | 64.766159 | 8.244334 | 2 | TCR | TRA+TRB | single pair | 1 | 54.0 | >= 3 |
AAACCTGAGCGTGAGT-1-0 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 45.398517 | 60.137888 | 9.166702 | 9 | no IR | no IR | no IR | NaN | NaN | nan |
AAACCTGAGCGTTTAC-1-0 | True | True | False | [] | TRAC | NaN | TRBC1 | NaN | 2828.0 | NaN | ... | 45.864066 | 59.704965 | 9.913487 | 6 | TCR | TRA+TRB | single pair | 2 | 8.0 | >= 3 |
AAACCTGAGGACAGCT-1-0 | True | True | False | [] | TRAC | NaN | TRBC2 | NaN | 3331.0 | NaN | ... | 46.676249 | 62.468559 | 8.624432 | 7 | TCR | TRA+TRB | single pair | 3 | 4.0 | >= 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
TTTGTCATCGGCGCAT-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 53.904282 | 91.687657 | 6.677083 | 1 | no IR | no IR | no IR | NaN | NaN | nan |
TTTGTCATCGTTGCCT-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 57.100592 | 100.000000 | 6.516193 | 5 | no IR | no IR | no IR | NaN | NaN | nan |
TTTGTCATCTCGATGA-1-25 | True | True | False | [] | TRAC | NaN | TRBC2 | NaN | 3682.0 | NaN | ... | 48.046595 | 63.172043 | 8.626944 | 8 | TCR | TRA+TRB | single pair | 23806 | 6059.0 | >= 3 |
TTTGTCATCTGCGTAA-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 52.649870 | 78.714162 | 7.048387 | 1 | no IR | no IR | no IR | NaN | NaN | nan |
TTTGTCATCTGGAGCC-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 55.387931 | 87.715517 | 6.833032 | 5 | no IR | no IR | no IR | NaN | NaN | nan |
341316 rows × 106 columns
sc.pl.umap(adata, color=["CD4", "CD8A"],use_raw=False,color_map= 'OrRd')
sc.pl.umap(adata, color=antis,use_raw=False,color_map= 'OrRd')
ad_ucsc=sc.read_h5ad('/user/ifrec/liuyuchen/Azimuth_ref/PBMC/Azimuth_ref.h5ad')
types = ['celltype.l1','celltype.l2','celltype.l3']
ad_ingest = adata[:, adata.var.highly_variable].copy()
var_names = ad_ucsc.var_names.intersection(ad_ingest.var_names)
ad_ucsc = ad_ucsc[:, var_names]
ad_ingest= ad_ingest[:, var_names]
sc.tl.ingest(ad_ingest, ad_ucsc, obs=types)
adata.obs = ad_ingest.obs
sc.pl.umap(adata, color=types,legend_loc='on data')
running ingest finished (0:07:26)
ad_ucsc=sc.read_h5ad('/user/ifrec/liuyuchen/scRNASeq_DATA/COVID19/public_B_science/E-MTAB-10026/covid_portal_210320_with_raw.h5ad')
sc.pp.highly_variable_genes(ad_ucsc)
types = ['full_clustering']
sc.tl.pca(ad_ucsc)
sc.pp.neighbors(ad_ucsc)
ad_ingest = adata.copy()
var_names = ad_ucsc.var_names.intersection(ad_ingest.var_names)
ad_ucsc = ad_ucsc[:, var_names]
ad_ingest= ad_ingest[:, var_names]
sc.tl.ingest(ad_ingest, ad_ucsc, obs=types)
adata.obs = ad_ingest.obs
sc.pl.umap(adata, color=types)
extracting highly variable genes finished (0:00:37) --> added 'highly_variable', boolean vector (adata.var) 'means', float vector (adata.var) 'dispersions', float vector (adata.var) 'dispersions_norm', float vector (adata.var) computing PCA on highly variable genes with n_comps=50 finished (0:00:24) computing neighbors using 'X_pca' with n_pcs = 50 finished: added to `.uns['neighbors']` `.obsp['distances']`, distances for each pair of neighbors `.obsp['connectivities']`, weighted adjacency matrix (0:16:21) running ingest finished (0:10:24)
adata.obs["Top_expanded_clone"]= 'Others'
adata.obs.loc[adata.obs["clone_id_size"]>100,"Top_expanded_clone"]=adata.obs.loc[adata.obs["clone_id_size"]>100,'clone_id']
#gene_list = pd.read_csv('/user/ifrec/liuyuchen/Analysis_Reports/Yamazaki_1108/gene_list.csv')
gene_list = pd.read_csv('/user/ifrec/liuyuchen/Analysis_Reports/Yamazaki_1108/Tfh_list_211110_2.csv')
#TFH_genes = gene_list['Genes upregulated in TFH cells'].dropna().tolist()
TFH_genes = gene_list['list1'].dropna().tolist()
feature_genes = adata.var_names.intersection(TFH_genes)
feature_genes
data = np.sum(adata[:, feature_genes].X, axis=1).A1 / len(feature_genes)
data = np.interp(data, (data.min(), data.max()), (0, 10))
adata.obs['Tfh_score_1'] = data
#TFH_genes = gene_list['Genes upregulated in TFH cells'].dropna().tolist()
TFH_genes = gene_list['list2'].dropna().tolist()
feature_genes = adata.var_names.intersection(TFH_genes)
feature_genes
data = np.sum(adata[:, feature_genes].X, axis=1).A1 / len(feature_genes)
data = np.interp(data, (data.min(), data.max()), (0, 10))
adata.obs['Tfh_score_2'] = data
sc.pl.violin(adata,keys='Tfh_score_1',groupby='leiden',rotation=90)
... storing 'Top_expanded_clone' as categorical
sc.pl.violin(adata,keys='Tfh_score_2',groupby='leiden',rotation=90)
adata.obs['time'] = adata.obs['Sample'].apply(lambda x: x.split('_')[1])
sc.pl.umap(adata, groups ='3wks' ,color='time')
... storing 'time' as categorical
adata.write(out_path+'/Merged_pfizer_tcells.h5ad')
adata.obs
is_cell | high_confidence | multi_chain | extra_chains | IR_VJ_1_c_call | IR_VJ_2_c_call | IR_VDJ_1_c_call | IR_VDJ_2_c_call | IR_VJ_1_consensus_count | IR_VJ_2_consensus_count | ... | clone_id_size | clonal_expansion | celltype.l1 | celltype.l2 | celltype.l3 | full_clustering | Top_expanded_clone | Tfh_score_1 | Tfh_score_2 | time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cell | |||||||||||||||||||||
AAACCTGAGAGCTGCA-1-0 | True | True | False | [] | TRAC | NaN | TRBC1 | NaN | 5681.0 | NaN | ... | 28.0 | >= 3 | CD8 T | CD8 TEM | CD8 TEM_2 | CD8.TE | Others | 0.630983 | 0.561950 | 25 |
AAACCTGAGCGTAGTG-1-0 | True | True | False | [] | TRAC | NaN | TRBC2 | NaN | 777.0 | NaN | ... | 54.0 | >= 3 | CD4 T | CD4 TCM | CD4 TCM_1 | CD4.CM | Others | 0.938146 | 0.742752 | 28 |
AAACCTGAGCGTGAGT-1-0 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | nan | other T | gdT | gdT_1 | NK_16hi | Others | 1.033337 | 0.873725 | 25 |
AAACCTGAGCGTTTAC-1-0 | True | True | False | [] | TRAC | NaN | TRBC1 | NaN | 2828.0 | NaN | ... | 8.0 | >= 3 | CD4 T | CD4 TCM | CD4 TEM_3 | CD4.EM | Others | 0.725423 | 0.745934 | 28 |
AAACCTGAGGACAGCT-1-0 | True | True | False | [] | TRAC | NaN | TRBC2 | NaN | 3331.0 | NaN | ... | 4.0 | >= 3 | CD4 T | CD4 TCM | CD4 TEM_3 | CD4.CM | Others | 0.353632 | 0.363630 | 28 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
TTTGTCATCGGCGCAT-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | nan | CD4 T | CD4 TEM | CD4 TEM_3 | CD4.Naive | Others | 0.497584 | 0.511653 | 8 |
TTTGTCATCGTTGCCT-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | nan | CD4 T | CD4 TEM | CD4 TEM_3 | CD4.Naive | Others | 0.000000 | 0.000000 | 8 |
TTTGTCATCTCGATGA-1-25 | True | True | False | [] | TRAC | NaN | TRBC2 | NaN | 3682.0 | NaN | ... | 6059.0 | >= 3 | CD8 T | CD8 TEM | CD8 TEM_4 | NK_16hi | 23806 | 1.287892 | 1.324306 | 13 |
TTTGTCATCTGCGTAA-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | nan | CD4 T | CD4 TEM | CD4 TEM_3 | CD4.CM | Others | 0.765889 | 0.787544 | 13 |
TTTGTCATCTGGAGCC-1-25 | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | nan | CD4 T | CD4 TEM | CD4 TEM_3 | CD4.Naive | Others | 0.772122 | 0.793953 | 8 |
341316 rows × 114 columns