"""Loading single-cell RNA-seq datasets with necessary pre-processing steps."""
import h5py
import loompy
import numpy as np
import pandas as pd
import torch
from pyro.distributions import ZeroInflatedNegativeBinomial as ZINB
from scipy.sparse import csc_matrix, csr_matrix, vstack
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, Sampler
class CORTEX(Dataset):
"""
    Loads the CORTEX dataset.

    A class with the necessary pre-processing steps for the gold-standard
    Zeisel data set, which contains 3005 mouse cortex cells and labels for
    seven distinct cell types. Each cell type corresponds to a cluster to
    recover.

    The pre-processing steps are:

    1. extracting the cell-type labels from the data
    2. keeping the genes that are transcribed in more than 25 cells
    3. selecting the ``n_genes`` (558 by default) genes with the highest
       variance among the remaining genes
    4. randomly permuting the order of the genes
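
    Steps 2-4, expressed directly in NumPy (a sketch, assuming ``counts``
    is a genes-by-cells array):

    >>> keep = np.count_nonzero(counts, axis=1) > 25
    >>> counts = counts[keep]
    >>> top = np.argsort(np.var(counts, axis=1) * -1)[:558]
    >>> counts = counts[np.random.permutation(top)]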

    Parameters
    ----------
    file_dir : str
        The path to the tab-delimited expression file.
    n_genes : int
        Number of highly variable genes to select. Default: 558.

    Attributes
    ----------
    n_cells : int
        Total number of cells.
    data : torch.Tensor
        The count matrix of shape ``(n_cells, n_genes)``.
    labels : torch.Tensor
        The integer-encoded cell-type labels.
Examples
--------
>>> import data_prep
>>> cortex = data_prep.CORTEX()
    >>> dl = DataLoader(cortex, batch_size=128, shuffle=True)
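
    A mini-batch is a ``(counts, labels)`` pair:

    >>> x, y = next(iter(dl))
    >>> x.shape
    torch.Size([128, 558])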
"""
def __init__(
self,
file_dir="/home/longlab/Data/Thesis/Data/expression_mRNA_17-Aug-2014.txt",
n_genes=558,
):
self.file_dir = file_dir
self.n_genes = n_genes
df = pd.read_csv(self.file_dir, delimiter="\t", low_memory=False)
# Groups of cells, i.e. labels
        Groups = df.iloc[7].values[2:]
_, labels = np.unique(Groups, return_inverse=True)
df1 = df.iloc[10:].copy()
df1.drop(["tissue", "Unnamed: 0"], inplace=True, axis=1)
df1 = df1.astype(float)
        # keeping genes that are transcribed in more than 25 cells,
        # matching the documented pre-processing step
        rows = np.count_nonzero(df1, axis=1) > 25
        df1 = df1[rows]
data = df1.values
        # sorting the genes by variance, in descending order
        rows = np.argsort(np.var(data, axis=1) * -1)
        data = data[rows, :]
        data = data[0 : self.n_genes, :]  # keeping the most variable genes
# Permutation on features (they were sorted)
np.random.seed(197)
p = np.random.permutation(data.shape[0])
data = data[p, :]
y = data.T
self.n_cells = y.shape[0]
self.data = torch.from_numpy(y)
self.labels = torch.tensor(labels)
def __len__(self):
return self.n_cells
def __getitem__(self, index):
return self.data[index], self.labels[index]
class Brain_Large(Dataset):
"""
    Loads the BRAIN data set.

    A class with the necessary pre-processing steps for the large brain
    dataset. The variances of the genes are estimated on a sub-sample of
    10**5 cells, and the most variable genes (720 by default) are selected.

    The dataset can be downloaded from the following URL:
    http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5

    The data is in HDF5 format, which can be easily accessed and processed
    with the `h5py` library.
    The HDF5 file contains (under the ``mm10`` group):

    - ``barcodes``: the cell barcodes, which carry the batch number and can
      be used for batch correction.
    - ``gene_names``: the gene names.
    - ``genes``: the Ensembl gene ids, e.g. 'ENSMUSG00000089699'.
    - ``data``: all the non-zero elements of the sparse genes-by-cells
      matrix.
    - ``indices``: the row (gene) index of each element in ``data``.
    - ``indptr``: maps the elements of ``data`` and ``indices`` to the
      columns (cells) of the sparse matrix.
    - ``shape``: the dimensions of the sparse matrix.

    For more information please visit
    https://stackoverflow.com/questions/52299420/scipy-csr-matrix-understand-indptr
    and
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
    Parameters
    ----------
    file_dir : str
        The path to the HDF5 file, which should be provided by the user.
    n_sub_samples : int
        Number of cells in the sub-sample used to estimate the gene
        variances. Default: 10**5.
    n_select_genes : int
        Number of highly variable genes to select in the pre-processing
        step. Default: 720.
    low_memory : bool
        If ``False``, the whole data set is loaded into memory (a large
        amount of memory is required); ``True`` by default.

    Attributes
    ----------
    file_dir : str
        The path to the HDF5 file.
    n_select_genes : int
        Number of highly variable genes to select in the pre-processing step.
    n_genes : int
        Total number of genes in the data set.
    n_cells : int
        Total number of cells.
    selected_genes : ndarray
        The indices of the selected highly variable genes.
    low_memory : bool
        Whether to load the data lazily, one cell at a time.
    matrix : scipy.sparse.csr_matrix
        The whole data set if ``low_memory`` is ``False``, otherwise ``None``.
Examples
--------
>>> import data_prep
>>> brain = data_prep.Brain_Large()
    >>> dl = DataLoader(brain, batch_size=128, shuffle=True)
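
    In ``low_memory`` mode each item is read from disk on the fly; a batch
    is a ``(counts, indices)`` pair (a minimal usage sketch):

    >>> x, idx = next(iter(dl))
    >>> x.shape  # (batch_size, 1, n_select_genes)
    torch.Size([128, 1, 720])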
"""
def __init__(
self,
        file_dir=(
            "/home/longlab/Data/Thesis/Data/"
            "1M_neurons_filtered_gene_bc_matrices_h5.h5"
        ),
n_sub_samples=10**5,
n_select_genes=720,
low_memory=True,
):
self.file_dir = file_dir
self.n_select_genes = n_select_genes
self.low_memory = low_memory
with h5py.File(self.file_dir, "r") as f:
data = f["mm10"]
self.n_genes, self.n_cells = data["shape"]
            # sub-sampling to estimate the gene variances: the first
            # n_sub_samples indptr entries delimit n_sub_samples - 1 cells
            indptr_sub_samp = data["indptr"][:n_sub_samples]
last_indx = indptr_sub_samp[-1]
sub_sampled = csc_matrix(
(
data["data"][:last_indx],
data["indices"][:last_indx],
indptr_sub_samp,
),
shape=(self.n_genes, n_sub_samples - 1),
)
            # per-gene variances, fitted on the (cells x genes) matrix;
            # with_mean=False keeps the input sparse
            scale = StandardScaler(with_mean=False)
            scale.fit(sub_sampled.T)
            # choosing the highly variable genes (largest variance first)
            self.selected_genes = np.argsort(scale.var_ * -1)[: self.n_select_genes]
if not low_memory:
del sub_sampled, scale, indptr_sub_samp
print("Loading the whole dataset")
            # number of 10**5-cell chunks needed to cover all cells
            n_iters = self.n_cells // 10**5 + (self.n_cells % 10**5 > 0)
with h5py.File(self.file_dir, "r") as f:
data = f["mm10"]
index_partitioner = data["indptr"][...]
# The following code is from the scvi package
# (https://github.com/scverse/scvi-tools)
                matrices = []
                for i in range(n_iters):
print(f"Reading batch {i+1}")
index_partitioner_batch = index_partitioner[
(i * 10**5) : ((1 + i) * 10**5 + 1)
]
first_index_batch = index_partitioner_batch[0]
last_index_batch = index_partitioner_batch[-1]
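                    # re-base the chunk's indptr so it starts at zero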
index_partitioner_batch = (
index_partitioner_batch - first_index_batch
).astype(np.int32)
n_cells_batch = len(index_partitioner_batch) - 1
data_batch = data["data"][
first_index_batch:last_index_batch
].astype(np.float32)
indices_batch = data["indices"][
first_index_batch:last_index_batch
].astype(np.int32)
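                    # build the chunk as a CSR (cells x genes) matrix and
                    # keep only the selected highly variable genes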
matrix_batch = csr_matrix(
(data_batch, indices_batch, index_partitioner_batch),
shape=(n_cells_batch, self.n_genes),
)[:, self.selected_genes]
                    # collect the per-chunk matrices and stack them once at
                    # the end, avoiding repeated re-allocation
                    matrices.append(matrix_batch)
            self.matrix = vstack(matrices)
else:
self.matrix = None
def __len__(self):
return self.n_cells
def __getitem__(self, index):
if self.low_memory:
with h5py.File(self.file_dir, "r") as f:
data = f["mm10"]
                # TODO: open the HDF5 file once in __init__ and reuse the
                # handle here instead of re-opening it for every item
                indptr_cell = data["indptr"][index : index + 2]
                first_indx = indptr_cell[0]
                last_indx = indptr_cell[-1]
                indptr_cell = (indptr_cell - first_indx).astype(np.int32)
                # one cell, as a genes-by-1 sparse column
                matrix_batch = csc_matrix(
                    (
                        data["data"][first_indx:last_indx],
                        data["indices"][first_indx:last_indx],
                        indptr_cell,
                    ),
                    shape=(self.n_genes, 1),
                )
matrix_batch = matrix_batch.toarray().T[:, self.selected_genes]
matrix_batch = torch.tensor(matrix_batch, dtype=torch.float32)
return matrix_batch, index
else:
            return torch.tensor(self.matrix[index].toarray(), dtype=torch.float32), index
class Indice_Sampler(Sampler):
"""
    A sampler for the PyTorch ``DataLoader`` that draws the samples of a
    ``Dataset`` at the indices given by ``mask``.

    Parameters
    ----------
    mask : torch.Tensor
        The indices of the samples to draw.

    Examples
    --------
    >>> from torch.utils.data import DataLoader
    >>> brain = Brain_Large()
    >>> dataloader = DataLoader(brain, batch_size=64, shuffle=True)
    >>> a, b = next(iter(dataloader))
    >>> trainloader_sampler1 = DataLoader(
    ...     brain, batch_size=64, sampler=Indice_Sampler(b), shuffle=False
    ... )
    >>> c, d = next(iter(trainloader_sampler1))
"""
def __init__(self, mask):
self.mask = mask
def __iter__(self):
return (i for i in self.mask)
def __len__(self):
return len(self.mask)
class RETINA(Dataset):
"""
    Loads the RETINA data set.

    A class with the necessary pre-processing steps for the RETINA dataset.
    It separates the raw count data, the cluster numbers, and the batches.
    The data provided by Lopez et al. is used, which can be downloaded from
    https://github.com/YosefLab/scVI-data/raw/master/retina.loom and is in
    the loom format (http://linnarssonlab.org/loompy/index.html).

    The original dataset is available under the GEO accession number GSE81904
    (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81904), which
    provides the raw matrix, BAM files, Rdata files, and R code. To reproduce
    the pre-processing steps and analyses of the original paper, one can use
    the following GitHub repository:
    https://github.com/broadinstitute/BipolarCell2016

Parameters
----------
    file_dir : str
        The path to the .loom file, which should be provided by the user.

    Attributes
    ----------
    file_dir : str
        The path to the .loom file.
    n_genes : int
        Total number of genes in the data set.
    n_cells : int
        Total number of cells.
    batchID : ndarray
        One-dimensional array of the batch IDs of the cells.
    cluster : ndarray
        One-dimensional array of the cluster numbers assigned by the authors
        of the original paper.
Notes
-----
Since the data is not very large (approximately 2GB in memory), the class will
load the whole data into memory.
Examples
--------
>>> import data_prep
>>> retina = data_prep.RETINA()
    >>> dl = DataLoader(retina, batch_size=128, shuffle=True)
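
    A batch is a ``(counts, indices)`` pair of the cells' expression values
    and their positions in the data set:

    >>> x, idx = next(iter(dl))
    >>> x.shape  # (batch_size, n_genes)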
"""
def __init__(self, file_dir="/home/longlab/Data/Thesis/Data/retina.loom"):
self.file_dir = file_dir
with loompy.connect(self.file_dir) as ds:
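            # loom files store genes x cells; transpose to cells x genes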
self.data = torch.from_numpy(ds[:, :].T).float()
self.n_genes, self.n_cells = ds.shape
self.batchID = ds.ca["BatchID"].ravel()
self.cluster = ds.ca["ClusterID"].ravel()
def __len__(self):
return self.n_cells
def __getitem__(self, index):
matrix_batch = self.data[index]
return matrix_batch, index
class SyntheticData(Dataset):
"""
    Generates a synthetic data set for testing.

    Samples counts from a zero-inflated negative binomial (ZINB)
    distribution with randomly drawn parameters. The resulting data set is
    intended for testing only, not for research purposes.

    Parameters
    ----------
    device : torch.device
        The device on which the tensors are allocated.
    n_features : int
        Number of features (genes). Default: 50.
    n_samples : int
        Number of samples (cells). Default: 100.
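
    Examples
    --------
    >>> synth = SyntheticData(torch.device("cpu"))
    >>> synth.data.shape
    torch.Size([100, 50])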
"""
def __init__(self, device, n_features=50, n_samples=100):
self.n_genes = n_features
self.n_samples = n_samples
        # success probabilities of the negative binomial component
        self.p = torch.rand(n_samples, n_features).to(device)
        # dispersion (total_count) parameter of the negative binomial
        self.log_theta = torch.FloatTensor(1, n_features).uniform_(0, 100).to(device)
        # logits of the zero-inflation gate
        self.gate_logits = \
            torch.FloatTensor(n_samples, n_features).uniform_(0, 50).to(device)
        # the first positional argument of ZINB is total_count
        self.dist = ZINB(self.log_theta, gate_logits=self.gate_logits, probs=self.p)
        self.data = self.dist.sample().to(device)
def __len__(self):
return self.n_samples
def __getitem__(self, index):
return self.data[index]