import pandas as pd

bulk_path = "data/bulkRNAseq/bulk_gse120502.txt"

df = pd.read_table(bulk_path, index_col=0)
df.head()

import scanpy as sc
sc_path = "data/scRNAseq/sampled_100_data8k.h5ad"
adata = sc.read(sc_path)
adata

AnnData object with n_obs × n_vars = 1000 × 18159
    obs: 'cell_type'

adata.obs

adata.X.max()

733.0

import dissect

config = dissect.config

config

{'experiment_folder': '/home/user/experiment',
 'simulation_params': {'scdata': '/home/user/experiment/data.h5ad',
  'n_samples': None,
  'type': 'bulk',
  'celltype_col': 'celltype',
  'batch_col': None,
  'cells_per_sample': None,
  'downsample': None,
  'preprocess': None,
  'filter': {'min_genes': 200, 'min_cells': 3, 'mt_cutoff': 5, 'min_expr': 0},
  'concentration': None,
  'prop_sparse': 0.5,
  'generate_component_figures': True},
 'deconv_params': {'test_dataset': '../bulk.txt',
  'test_dataset_format': 'txt',
  'test_dataset_type': 'bulk',
  'duplicated': 'first',
  'normalize_simulated': 'cpm',
  'normalize_test': 'cpm',
  'var_cutoff': 0.1,
  'test_in_mix': None,
  'simulated': True,
  'sig_matrix': False,
  'mix': 'srm',
  'save_config': True,
  'network_params': {'n_hidden_layers': 4,
   'hidden_units': [512, 256, 128, 64],
   'hidden_activation': 'relu6',
   'output_activation': 'softmax',
   'loss': 'kldivergence',
   'n_steps': 5000,
   'lr': 1e-05,
   'batch_size': 64,
   'dropout': None,
   'n_steps_expr': 5000},
  'alpha_range': [0.1, 0.9],
  'normalization_per_batch': 'log1p-MinMax',
  'models': [1, 2, 3, 4, 5]}}

config["experiment_folder"] # this prints the current folder

'/home/user/experiment'

# we will set it to "tutorial_bulkRNAseq"

config["experiment_folder"] = "tutorial_bulkRNAseq"

config["simulation_params"]["scdata"] = "data/scRNAseq/sampled_100_data8k.h5ad"
config["simulation_params"]["type"] = "bulk"
config["simulation_params"]["celltype_col"] = "cell_type"

# by default the single-cell data passes through following QC:
config["simulation_params"]["filter"]

{'min_genes': 200, 'min_cells': 3, 'mt_cutoff': 5, 'min_expr': 0}

dissect.simulate(config)

Number of batches in single-cell data is 1. If this is incorrect, please specify name of the batch column as in the single-cell data object (.obs)

100%|████████████████████████████████████████████████████████████████████████████████| 6000/6000 [01:29<00:00, 67.40it/s]

config["deconv_params"]["test_dataset"] = "data/bulkRNAseq/bulk_gse120502.txt"
config["deconv_params"]["test_dataset_format"] = "txt"

# preprocess data
# It does CPM normalization and filters out gene with low variance (default cutoff 0.1).

dissect.dataset(config)

Removing genes which have less than 0.1 variance in their expressions.
test dataset has 16872 distinct and variable genes.
simulated dataset has 13669 distinct genes.
There are 11505 common genes between simulated and test dataset.
Saving numpy files.
Done.

# Estimate fractions
dissect.run_dissect_frac(config)

Loading prepared datasets...
Starting training model 0

step: 5000| loss: 0.0104: 100%|██████████████████████████████████████████████████████| 5000/5000 [02:43<00:00, 30.57it/s]

Starting training model 1

step: 5000| loss: 0.0112: 100%|██████████████████████████████████████████████████████| 5000/5000 [02:42<00:00, 30.84it/s]

Starting training model 2

step: 5000| loss: 0.0127: 100%|██████████████████████████████████████████████████████| 5000/5000 [02:40<00:00, 31.21it/s]

Starting training model 3

step: 5000| loss: 0.0135: 100%|██████████████████████████████████████████████████████| 5000/5000 [02:41<00:00, 30.90it/s]

Starting training model 4

step: 5000| loss: 0.0106: 100%|██████████████████████████████████████████████████████| 5000/5000 [02:39<00:00, 31.36it/s]

import os
result_path = os.path.join(config["experiment_folder"],
                          "dissect_fractions.txt")

df = pd.read_table(result_path, index_col=0)
df.head()

## Show cell type composition of each sample

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", font_scale=1.3) # set figure params

show_samples = 50 # We will show composition of the first 50 samples. 
# show estimates of first 50 samples. More gets hard to see.
df.iloc[0:show_samples,:].plot(kind='bar', stacked=True, figsize=(15,5))
plt.legend(bbox_to_anchor=(1,1.02), frameon=False)
plt.ylabel("Estimate")

Text(0, 0.5, 'Estimate')

## Show distribution of cell type proportions

sc.set_figure_params(dpi=100)
melted = df.melt(var_name="Cell type", value_name="Estimation")
ax = sns.boxplot(data=melted, x="Cell type", y="Estimation")

sns.stripplot(data=melted, x="Cell type", y="Estimation", linewidth=1, edgecolor="gray", ax=ax, hue="Cell type", legend=True)
ax.set_xticks([],[])
plt.legend(bbox_to_anchor=(1,1))
plt.show()

config["deconv_params"]["reference"] = "tutorial_bulkRNAseq/simulation/simulated.h5ad"

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Estimate cell type specific gene expression
dissect.run_dissect_expr(config)

Estimating expression of 11529 common genes
Preparing training datasets.

100%|████████████████████████████████████████████████████████████████████████████████████| 55/55 [35:30<00:00, 38.74s/it]

Estimated gene expression per cell type is saved at tutorial_bulkRNAseq/est_expression_layered.h5ad and is included in the layers attribute of the anndata object.
Estimated gene expression per cell type for all samples is saved at tutorial_bulkRNAseq/est_expression.h5ad, corresponding samples are in sample attribute of the anndata.obs.

adata_est_layered = sc.read("tutorial_bulkRNAseq/est_expression_layered.h5ad")
print(adata_est_layered)

adata_est = sc.read("tutorial_bulkRNAseq/est_expression.h5ad")
print(adata_est)

AnnData object with n_obs × n_vars = 250 × 11529
    layers: 'Bcells', 'CD4Tcells', 'CD8Tcells', 'Monocytes', 'NK', 'Unknown'
AnnData object with n_obs × n_vars = 1500 × 11529
    obs: 'sample', 'cell_type'
    layers: 'scaled_counts'

sc.set_figure_params(dpi=50)

sc.pp.normalize_total(adata_est, target_sum=1e6)
sc.pp.log1p(adata_est)
sc.pp.highly_variable_genes(adata_est)

n_hvg = adata_est.var[adata_est.var.highly_variable].sort_values(by="dispersions",
                                                        ascending=False).index[0:100]

sc.tl.pca(adata_est)
sc.pl.pca(adata_est, color="cell_type")

sc.pl.clustermap(adata_est[:, n_hvg], 
                obs_keys="cell_type", cmap="RdBu_r")

	T63_LPS_FC1_4	T81_LPS_FC1_4	T104_LPS_FC1_4	T4_GARD_BATCH_2	K47_LPS_FC1_4	T4_CTL_FC5	T12_CTL_B_BATCH_2	T84_LPS_FC1_4	K44_LPS_FC1_4	K10_LPS_FC1_4	...	T70_LPS_FC1_4	K10_GARD_BATCH_2	K37_CTL_FC1_4	T5_CTL_FC5	K53_LPS_FC1_4	T120_LPS_FC5	K7_GARD_BATCH_2	K12_GARD_BATCH_2	T35_GARD_BATCH_2	K50_CTL_FC1_4
ATF7-NPFF	29.0	28.0	38.0	71.0	40.0	34.0	39.0	25.0	58.0	38.0	...	30.0	52.0	31.0	27.0	34.0	22.0	56.0	50.0	50.0	24.0
B4GAT1-DT	15.0	16.0	28.0	44.0	7.0	24.0	44.0	28.0	23.0	18.0	...	32.0	24.0	18.0	36.0	13.0	4.0	43.0	37.0	47.0	16.0
BTBD8	0.0	3.0	4.0	36.0	13.0	6.0	30.0	23.0	37.0	13.0	...	5.0	12.0	16.0	25.0	0.0	0.0	23.0	22.0	46.0	32.0
C12orf42-AS1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
C4orf54	0.0	2.0	0.0	6.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	2.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0

	Bcells	CD4Tcells	CD8Tcells	Monocytes	NK	Unknown
T63_LPS_FC1_4	0.050998	0.264567	0.298605	0.185140	0.094879	0.105810
T81_LPS_FC1_4	0.116595	0.326707	0.239222	0.145019	0.054338	0.118119
T104_LPS_FC1_4	0.041435	0.470600	0.228636	0.106191	0.062100	0.091038
T4_GARD_BATCH_2	0.079613	0.358756	0.294246	0.090234	0.083928	0.093222
K47_LPS_FC1_4	0.073276	0.292910	0.236181	0.200853	0.076844	0.119935

Datasets needed for tutorial¶

A note on the supported formats¶

Import DISSECT module¶

Set experiment folder¶

Do simulation¶

Estimation of fractions¶

Estimation of cell type specific gene expression¶

Show PCA¶

Show highly variable genes¶

	cell_type
cell_7406	CD4Tcells
cell_4119	CD4Tcells
cell_3032	CD4Tcells
cell_3795	NK
cell_318	Monocytes
...	...
cell_6912	CD4Tcells
cell_5269	CD8Tcells
cell_7541	Monocytes
cell_4880	Monocytes
cell_3135	Bcells