diff --git a/gemmapy/gemmapy_api.py b/gemmapy/gemmapy_api.py index 72a0993..27c961e 100644 --- a/gemmapy/gemmapy_api.py +++ b/gemmapy/gemmapy_api.py @@ -7,14 +7,18 @@ import logging import os import subprocess +import tarfile +import tempfile import warnings from getpass import getpass -from io import StringIO -from typing import Optional, List, Callable +from io import StringIO, BytesIO +from os.path import join +from typing import Optional, List, Callable, Any import anndata as ad import numpy as np import pandas as pd +import scanpy from anndata import AnnData from pandas import DataFrame @@ -1667,7 +1671,46 @@ def make_anndata(pack): pass return out - def get_differential_expression_values(self, + def get_single_cell_dataset_object(self, dataset: str | int, + download_dir=None) -> AnnData: + """ + :param download_dir: Directory where datasets can be downloaded, or else + the data will be retrieved in-memory. + :return: + """ + + def resolve(): + if download_dir: + dest = join(download_dir, dataset + '.tar') + if not os.path.exists(dest): + logger.info('Downloading single-cell data for %s to %s...', + dataset, download_dir) + with open(dest, 'wb') as f: + f.write(self.raw.get_dataset_single_cell_expression( + dataset)) + return open(dest, 'rb') + else: + logger.info("Downloading single-cell data data for %s...", + str(dataset)) + return BytesIO( + self.raw.get_dataset_single_cell_expression(dataset)) + + with (resolve() as f, tarfile.open(fileobj=f) as tf, + tempfile.TemporaryDirectory() as tmpdir): + logger.info('Extracting TAR file for %s to %s...', str(dataset), + tmpdir) + tf.extractall(tmpdir) + samples = [] + for sample_dir in os.listdir(tmpdir): + logger.info('Reading MEX data for %s...', sample_dir) + # Gemma already guarantees unicity of cell identifiers and + # scanpy cannot deal with numeric gene identifiers when + # make_unique is True, so we skip that part + samples.append(scanpy.read_10x_mtx(join(tmpdir, sample_dir), + make_unique=False)) + return scanpy.concat(samples, axis="var") + + def get_differential_expression_values(self, dataset:Optional[str|int] = None, keep_non_specific:bool = False, result_sets:Optional[List[str|int]] = None, diff --git a/setup.cfg b/setup.cfg index 7fae1e6..63e45b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,6 +17,7 @@ install_requires = pandas numpy anndata + scanpy typing #[options.packages.find] diff --git a/tests/test_basic.py b/tests/test_basic.py index 9bc468a..fc27855 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -70,6 +70,19 @@ def test_auth(monkeypatch): monkeypatch.setitem(os.environ, 'GEMMA_PASSWORD_CMD', '') gemmapy.GemmaPy() +def test_get_single_cell_data(): + # TODO: use a publicly available dataset + client = gemmapy.GemmaPy() + ad = client.get_single_cell_dataset_object('GSE227313', download_dir='.') + print(ad) + +def test_get_genes(): + assert len(api.get_genes('BRCA1')) > 0 + assert len(api.get_genes(['BRCA1'])) > 0 + assert len(api.get_genes(672)) > 0 + assert len(api.get_genes([672])) > 0 + assert len(api.get_genes([672, 'BRCA1'])) > 0 + def test_get_result_sets(): res = api.get_result_sets([200])