Skip to content

Commit

Permalink
(feat): tracking peak memory, clean up args
Browse files Browse the repository at this point in the history
  • Loading branch information
ilan-gold committed Apr 25, 2024
1 parent 3eb9d9a commit c19307c
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 71 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__pycache__/
/*cache/
.ipynb_checkpoints/
/data/
data/

# Distribution / packaging
/dist/
Expand All @@ -16,3 +16,6 @@ __pycache__/

# Venvs
*venv/

# asv
.asv/
54 changes: 34 additions & 20 deletions benchmarks/benchmarks/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import rapids_singlecell as rsc

from .utils import track_peakmem


class PreprocessingSuite:
_data_dict = dict(pbmc68k_reduced=sc.datasets.pbmc68k_reduced())
Expand All @@ -21,69 +23,81 @@ def setup(self, input_data: str):
def time_calculate_qc_metrics(self, *_):
self.adata.var["mt"] = self.adata.var_names.str.startswith("MT-")
rsc.pp.calculate_qc_metrics(
self.adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
self.adata, qc_vars=["mt"], log1p=False
)

def peakmem_calculate_qc_metrics(self, *_):
@track_peakmem
def track_peakmem_calculate_qc_metrics(self, *_):
self.adata.var["mt"] = self.adata.var_names.str.startswith("MT-")
rsc.pp.calculate_qc_metrics(
self.adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
self.adata, qc_vars=["mt"], log1p=False
)

def time_filter_cells(self, *_):
rsc.pp.filter_cells(self.adata, min_genes=200)
rsc.pp.filter_cells(self.adata, qc_var="n_counts", min_count=200)

@track_peakmem
def track_peakmem_filter_cells(self, *_):
rsc.pp.filter_cells(self.adata, qc_var="n_counts", min_count=200)

def peakmem_filter_cells(self, *_):
rsc.pp.filter_cells(self.adata, min_genes=200)

def time_filter_genes(self, *_):
rsc.pp.filter_genes(self.adata, min_cells=3)
rsc.pp.filter_genes(self.adata, qc_var="n_counts", min_count=3)

def peakmem_filter_genes(self, *_):
rsc.pp.filter_genes(self.adata, min_cells=3)
@track_peakmem
def track_peakmem_filter_genes(self, *_):
rsc.pp.filter_genes(self.adata, qc_var="n_counts", min_count=3)

def time_normalize_total(self, *_):
rsc.pp.normalize_total(self.adata, target_sum=1e4)

def peakmem_normalize_total(self, *_):
@track_peakmem
def track_peakmem_normalize_total(self, *_):
rsc.pp.normalize_total(self.adata, target_sum=1e4)

def time_log1p(self, *_):
rsc.pp.log1p(self.adata)

def peakmem_time_log1p(self, *_):
@track_peakmem
def track_peakmem_time_log1p(self, *_):
rsc.pp.log1p(self.adata)

def time_pca(self, *_):
rsc.pp.pca(self.adata, svd_solver="arpack")
rsc.pp.pca(self.adata)

def peakmem_pca(self, *_):
rsc.pp.pca(self.adata, svd_solver="arpack")
@track_peakmem
def track_peakmem_pca(self, *_):
rsc.pp.pca(self.adata)

def time_highly_variable_genes(self, *_):
rsc.pp.highly_variable_genes(
self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5
)

def peakmem_highly_variable_genes(self, *_):
@track_peakmem
def track_peakmem_highly_variable_genes(self, *_):
rsc.pp.highly_variable_genes(
self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5
)

def time_regress_out(self, *_):
rsc.pp.regress_out(self.adata, ["n_counts", "percent_mito"])

def peakmem_regress_out(self, *_):
@track_peakmem
def track_peakmem_regress_out(self, *_):
rsc.pp.regress_out(self.adata, ["n_counts", "percent_mito"])

def time_scale(self, *_):
rsc.pp.scale(self.adata, max_value=10)

def peakmem_scale(self, *_):
@track_peakmem
def track_peakmem_scale(self, *_):
rsc.pp.scale(self.adata, max_value=10)

def time_neighbors(self, *_):
rsc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=100)
rsc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=50)

@track_peakmem
def track_peakmem_neighbors(self, *_):
rsc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=50)

def peakmem_neighbors(self, *_):
rsc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=100)
34 changes: 7 additions & 27 deletions benchmarks/benchmarks/readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,43 +22,23 @@

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import anndata
import scanpy as sc

from rapids_singlecell.get import anndata_to_GPU

if TYPE_CHECKING:
from collections.abc import Callable
from pathlib import Path

import pathlib

sc.settings.datasetdir = pathlib.Path(__file__).parent.resolve() / "data"

@dataclass
class Dataset:
path: Path
get: Callable[[], anndata.AnnData]

path="/p/project/training2406/team_scverse/gold2/rapids_singlecell/benchmarks/data/pbmc3k_raw.h5ad"

from .utils import track_peakmem

class ToGPUSuite:
_data_dict = dict(pbmc3k=anndata.read_h5ad(path))
_data_dict = dict(obmc68k_reduced=sc.datasets.pbmc68k_reduced())
params = _data_dict.keys()
param_names = ["input_data"]

def setup(self, input_data: str):
self.data = self._data_dict[input_data]
self.adata = self._data_dict[input_data]

def time_to_gpu(self, *_):
anndata_to_GPU(self.data)
anndata_to_GPU(self.adata)

def peakmem_to_gpu(self, *_):
anndata_to_GPU(self.data)
@track_peakmem
def track_peakmem_to_gpu(self, *_):
anndata_to_GPU(self.adata)

def mem_to_gpu(self, *_):
anndata_to_GPU(self.data)
40 changes: 23 additions & 17 deletions benchmarks/benchmarks/squidpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,52 +7,58 @@

from itertools import product

import anndata as ad
import scanpy as sc

import rapids_singlecell as rsc

import pathlib
from .utils import track_peakmem

class ToolsSuite:
_data_dict = dict(
visium_sge=ad.read_h5ad("/p/project/training2406/team_scverse/gold2/rapids_singlecell/benchmarks/data/paul15.h5ad"),
pbmc68k_reduced=sc.datasets.pbmc68k_reduced(),
)
params = _data_dict.keys()
param_names = ["input_data"]

def setup(self, input_data):
self.adata = rsc.get.anndata_to_GPU(self._data_dict[input_data].copy(), copy=True)
self.cpu_adata = self._data_dict[input_data].copy()
self.gpu_adata = rsc.get.anndata_to_GPU(self.cpu_adata, copy=True)

def time_ligrec(self, *_):
gene_ids = self.adata.var.index
gene_ids = self.cpu_adata.var.index
interactions = tuple(product(gene_ids[:5], gene_ids[:5]))
rsc.gr.ligrec(
self.adata,
"leiden",
self.cpu_adata,
"louvain",
interactions=interactions,
n_perms=5,
use_raw=False,
)

def peakmem_ligrec(self, *_):
gene_ids = self.adata.var.index
@track_peakmem
def track_peakmem_ligrec(self, *_):
gene_ids = self.cpu_adata.var.index
interactions = tuple(product(gene_ids[:5], gene_ids[:5]))
rsc.gr.ligrec(
self.adata,
"leiden",
self.cpu_adata,
"louvain",
interactions=interactions,
n_perms=5,
use_raw=False,
)


def time_autocorr_moran(self, *_):
rsc.gr.spatial_autocorr(self.adata, mode="moran")
rsc.gr.spatial_autocorr(self.gpu_adata, mode="moran", connectivity_key="connectivities")

def peakmem_autocorr_moran(self, *_):
rsc.gr.spatial_autocorr(self.adata, mode="moran")
@track_peakmem
def track_peakmem_autocorr_moran(self, *_):
rsc.gr.spatial_autocorr(self.gpu_adata, mode="moran", connectivity_key="connectivities")

def time_autocorr_geary(self, *_):
rsc.gr.spatial_autocorr(self.adata, mode="geary")
rsc.gr.spatial_autocorr(self.gpu_adata, mode="geary", connectivity_key="connectivities")

@track_peakmem
def track_peakmem_autocorr_geary(self, *_):
rsc.gr.spatial_autocorr(self.gpu_adata, mode="geary", connectivity_key="connectivities")

def peakmem_autocorr_geary(self, *_):
rsc.gr.spatial_autocorr(self.adata, mode="geary")
16 changes: 10 additions & 6 deletions benchmarks/benchmarks/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import rapids_singlecell as rsc

import pathlib
from .utils import track_peakmem


class ToolsSuite:
Expand All @@ -21,28 +21,32 @@ class ToolsSuite:

def setup(self, input_data):
self.adata = rsc.get.anndata_to_GPU(self._data_dict[input_data].copy(), copy=True)
assert "X_pca" in self.adata.obsm

def time_umap(self, *_):
rsc.tl.umap(self.adata)

def peakmem_umap(self, *_):
@track_peakmem
def track_peakmem_umap(self, *_):
rsc.tl.umap(self.adata)

def time_diffmap(self, *_):
rsc.tl.diffmap(self.adata)

def peakmem_diffmap(self, *_):
@track_peakmem
def track_peakmem_diffmap(self, *_):
rsc.tl.diffmap(self.adata)

def time_leiden(self, *_):
rsc.tl.leiden(self.adata)

def peakmem_leiden(self, *_):
@track_peakmem
def track_peakmem_leiden(self, *_):
rsc.tl.leiden(self.adata)

def time_embedding_denity(self, *_):
rsc.tl.embedding_density(self.adata, basis="umap")

def peakmem_embedding_denity(self, *_):
@track_peakmem
def track_peakmem_embedding_denity(self, *_):
rsc.tl.embedding_density(self.adata, basis="umap")

74 changes: 74 additions & 0 deletions benchmarks/benchmarks/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# From https://github.com/rapidsai/benchmark/blob/570531ba4bc90c508245e943d2aaa11d68a24286/rapids_pytest_benchmark/rapids_pytest_benchmark/rmm_resource_analyzer.py#L29

import os
import csv
import rmm
import tempfile


class RMMResourceAnalyzer:
"""
Class to control enabling, disabling, & parsing RMM resource
logs.
"""

def __init__(self, benchmark_name):
self.max_gpu_util = -1
self.max_gpu_mem_usage = 0
self.leaked_memory = 0
log_file_name = benchmark_name
self._log_file_prefix = os.path.join(tempfile.gettempdir(), log_file_name)

def enable_logging(self):
"""
Enable RMM logging. RMM creates a CSV output file derived from
provided file name that looks like: log_file_prefix + ".devX", where
X is the GPU number.
"""
rmm.enable_logging(log_file_name=self._log_file_prefix)

def disable_logging(self):
"""
Disable RMM logging
"""
log_output_files = rmm.get_log_filenames()
rmm.mr._flush_logs()
rmm.disable_logging()
# FIXME: potential improvement here would be to only parse the log files for
# the gpu ID that's passed in via --benchmark-gpu-device
self._parse_results(log_output_files)
for _, log_file in log_output_files.items():
os.remove(log_file)

def _parse_results(self, log_files):
"""
Parse CSV results. CSV file has columns:
Thread,Time,Action,Pointer,Size,Stream
"""
current_mem_usage = 0
for _, log_file in log_files.items():
with open(log_file, mode="r") as csv_file:
csv_reader = csv.DictReader(csv_file)
for row in csv_reader:
row_action = row["Action"]
row_size = int(row["Size"])

if row_action == "allocate":
current_mem_usage += row_size
if current_mem_usage > self.max_gpu_mem_usage:
self.max_gpu_mem_usage = current_mem_usage

if row_action == "free":
current_mem_usage -= row_size
self.leaked_memory = current_mem_usage

def track_peakmem(fn):
from functools import wraps
@wraps(fn)
def wrapper(self, *args, **kwargs):
resource_analyzer = RMMResourceAnalyzer(benchmark_name=fn.__name__)
resource_analyzer.enable_logging()
fn(self, *args, **kwargs)
resource_analyzer.disable_logging()
return resource_analyzer.max_gpu_mem_usage
return wrapper

0 comments on commit c19307c

Please sign in to comment.