diff --git a/.github/workflows/python-conda-build.yaml b/.github/workflows/python-conda-build.yaml new file mode 100644 index 0000000..cfdf1c9 --- /dev/null +++ b/.github/workflows/python-conda-build.yaml @@ -0,0 +1,39 @@ +name: Python Package using Conda + +on: [ pull_request ] + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + conda env create -f scAR-cpu.yml + - name: Run binary + run: | + export PATH="$CONDA/bin:$PATH" + source activate scAR + scar --help + - name: Run pylint + run: | + export PATH="$CONDA/bin:$PATH" + source activate scAR + conda install pylint + pylint scAR --fail-under 0.5 + - name: Run unit tests + run: | + export PATH="$CONDA/bin:$PATH" + source activate scAR + conda install pytest + pytest scAR \ No newline at end of file diff --git a/README.md b/README.md index dde3252..e68f1e6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # scAR -[![scAR](https://img.shields.io/badge/scAR-005AF0?style=for-the-badge&logo=dependabot&logoColor=white.svg)](https://github.com/CaibinSh/scAR) +[![scAR](https://img.shields.io/badge/scAR-005AF0?style=for-the-badge&logo=dependabot&logoColor=white.svg)](https://github.com/Novartis/scAR) ![single-cell omics](https://img.shields.io/badge/single_cell_omics-005AF0?style=for-the-badge.svg) ![machine learning](https://img.shields.io/badge/machine_learning-005AF0?style=for-the-badge.svg) ![variational autoencoders](https://img.shields.io/badge/variational_autoencoders-005AF0?style=for-the-badge.svg) @@ -25,24 +25,33 @@ Clone this repository, ```sh -$ git clone https://github.com/CaibinSh/scAR.git +$ git clone https://github.com/Novartis/scAR.git ``` -To install the dependencies, create a conda environment: +Enter the cloned directory: + ```sh -$ conda env create -f scAR.yml +$ cd scAR ``` -Locate to scAR directory: +To install the dependencies, create a conda environment: + +> Please use `scAR-gpu` if you have an NVIDIA graphics card and the corresponding driver installed. ```sh -$ cd scAR +$ conda env create -f scAR-gpu.yml ``` -Pip install scAR: +or + +> Please use `scAR-cpu` if you don't have a graphics card available. +```sh +$ conda env create -f scAR-cpu.yml +``` +To activate the scAR conda environment, run: ```sh -$ pip install . +$ conda activate scAR ``` ## Usage @@ -52,7 +61,8 @@ There are two ways to run scAR. 1) Use scAR API if you are Python users ```sh ->>> scarObj = scAR.model(adata.X.to_df(), empty_profile) +>>> from scAR import model +>>> scarObj = model(adata.X.to_df(), empty_profile) >>> scarObj.train() >>> scarObj.inference() >>> adata.layers["X_scAR_denoised"] = scarObj.native_counts @@ -101,7 +111,7 @@ The output folder contains four (or five) files: - [Denoising protein data for CITE-seq](https://github.com/CaibinSh/scAR-reproducibility/blob/main/reproducibility/scAR_tutorial_denoising_CITEseq.ipynb) - [Denoising mRNA data for scRNAseq](https://github.com/CaibinSh/scAR-reproducibility/blob/main/reproducibility/scAR_tutorial_mRNA_denoising.ipynb) - If you'd like to contribute, please contact Caibin (caibin.sheng@novartis.com). -- Please use the [issues](https://github.com/CaibinSh/scAR/issues) to submit bug reports. 
+- Please use the [issues](https://github.com/Novartis/scAR/issues) to submit bug reports. ## License diff --git a/scAR-cpu.yml b/scAR-cpu.yml new file mode 100644 index 0000000..0c2a2b2 --- /dev/null +++ b/scAR-cpu.yml @@ -0,0 +1,23 @@ +name: scAR +channels: +- nvidia +- pytorch +- conda-forge +dependencies: +- conda-forge::python>=3.8.6 +- nvidia::cudatoolkit>=11.1 +- pytorch::pytorch>=1.10.0 +- pytorch::torchvision>=0.9.0 +- pytorch::torchaudio>=0.8.0 +- pytorch-mutex=*=cpu +- conda-forge::tqdm>=4.62.3 +- conda-forge::pandas>=1.3.4 +- conda-forge::seaborn>=0.11.2 +- conda-forge::tensorboard>=2.2.1 +- conda-forge::scikit-learn>=1.0.1 +- conda-forge::anndata +- conda-forge::scanpy +- conda-forge::pyro-ppl>=1.8.0 +- conda-forge::pip +- pip: + - . diff --git a/scAR-gpu.yml b/scAR-gpu.yml new file mode 100644 index 0000000..79bc182 --- /dev/null +++ b/scAR-gpu.yml @@ -0,0 +1,23 @@ +name: scAR +channels: +- nvidia +- pytorch +- conda-forge +dependencies: +- conda-forge::python>=3.8.6 +- nvidia::cudatoolkit>=11.1 +- pytorch::pytorch>=1.10.0 +- pytorch::torchvision>=0.9.0 +- pytorch::torchaudio>=0.8.0 +- pytorch-mutex=*=cuda +- conda-forge::tqdm>=4.62.3 +- conda-forge::pandas>=1.3.4 +- conda-forge::seaborn>=0.11.2 +- conda-forge::tensorboard>=2.2.1 +- conda-forge::scikit-learn>=1.0.1 +- conda-forge::anndata +- conda-forge::scanpy +- conda-forge::pyro-ppl>=1.8.0 +- conda-forge::pip +- pip: + - . diff --git a/scAR.yml b/scAR.yml deleted file mode 100644 index 7bb65c9..0000000 --- a/scAR.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: scAR -channels: -- nvidia -- pytorch -- conda-forge -- anaconda -dependencies: -- python=3.8.6 -- cudatoolkit=11.1 -- pytorch>=1.10.0 -- torchvision>=0.9.0 -- torchaudio>=0.8.0 -- tqdm=4.62.3 -- gpyopt=1.2.6 -- pandas=1.3.4 -- seaborn>=0.11.2 -- tensorboard>=2.2.1 -- scikit-learn>=1.0.1 -- anndata -- scanpy -- pip: - - pyro-ppl==1.8.0 diff --git a/scAR/__init__.py b/scAR/__init__.py index e23488e..94a49f0 100644 --- a/scAR/__init__.py +++ b/scAR/__init__.py @@ -1,6 +1 @@ -# -*- coding: utf-8 -*- - -__version__ = '0.1.6' - -from scAR import _data_generater as DataSimulator -from scAR._scAR import model \ No newline at end of file +from scAR.main._scAR import model \ No newline at end of file diff --git a/scAR/_helper_functions.py b/scAR/_helper_functions.py deleted file mode 100644 index 3dcf1d9..0000000 --- a/scAR/_helper_functions.py +++ /dev/null @@ -1,165 +0,0 @@ -# -*- coding: utf-8 -*- - -import torch -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -from sklearn.metrics import r2_score, median_absolute_error -from scipy.stats import pearsonr - -## plot estimated noise ratio -def histgram_noise_ratio(epoch, nr_pred, nr_truth=None, xlim=(0.5, 1), return_obj=False): - - epsilon = nr_pred.detach().cpu().numpy().squeeze() - nosie_df = pd.DataFrame(epsilon, columns=['noise ratio']) - - fig = plt.figure(figsize=(3,2), tight_layout=True); - ax = sns.histplot(nosie_df['noise ratio'], bins=200, label='inferenced'); - - plt.legend(); - plt.title(f'epoch: {epoch}') - # plt.xlim(xlim) - plt.xlabel('noise ratio') - plt.ylabel('cell counts') - - if nr_truth is not None: - nosie_df["ground truth"] = nr_truth - ax.vlines(nr_truth, ymin=0., ymax=180, colors='r', ls='--', lw=2, label='ground truth'); - - if return_obj: - return fig - -def jointplot_noise_ratio(epoch, nr_pred, nr_truth, celltype, xlim=(0.3, 0.55), return_obj=False): - - noise_ratio_inf = nr_pred.cpu().numpy().squeeze() - nosie_df = pd.DataFrame(np.array([noise_ratio_inf, 
nr_truth]).T, columns=['inferenced', 'ground truth']) - nosie_df["celltype"] = celltype - - ax = sns.jointplot(data=nosie_df, x="ground truth", y="inferenced", hue="celltype", kind='scatter', s=2, alpha=0.5); - ax.ax_joint.plot(xlim, xlim, 'b--', linewidth = 2); - plt.tight_layout() - - if return_obj: - return ax.fig - -def get_correlation_btn_native_ambient(epoch, native_signals_pred, empty_frequencies, scRNAseq_tech): - - # assign the highest guide - def maxvalue(x): - return (x == x.max(dim=1).values.view(-1,1)).type(x.dtype) - - if scRNAseq_tech.lower() == 'cropseq': - native_clean = maxvalue(native_signals_pred.detach()).cpu().numpy() - else: - native_clean = native_signals_pred.cpu().numpy() - - frac_native = native_clean.sum(axis=0)/native_clean.sum() - freq_truth = empty_frequencies.detach().cpu().numpy() - - # metric to evaluate the predection - r2 = r2_score(freq_truth, frac_native) - # mae = median_absolute_error(freq_truth, frac_native) - pr = pearsonr(freq_truth, frac_native)[0] - - return pr, r2, frac_native, freq_truth - - -def plt_correlation_btn_native_ambient(epoch, native_frequencies_pred, empty_frequencies, scRNAseq_tech, return_obj=False): - - pr, r2, frac_native, freq_truth = get_correlation_btn_native_ambient(epoch, native_frequencies_pred, empty_frequencies, scRNAseq_tech=scRNAseq_tech) - - fig = plt.figure(figsize=(3,3), tight_layout=True); - plt.title(f'epoch: {epoch}') - plt.scatter(freq_truth, frac_native, s=10); - xmin, xmax, ymin, ymax = plt.axis() - plt.plot([xmin, xmax], [ymin, ymax], 'r--'); - plt.text(1.15*xmin, 0.85*ymax, f'corr: {pr:.4f} \nR2: {r2:.4f}\n') - plt.xlabel('empty profile'); - plt.ylabel('expected native frequencies'); - - if return_obj: - return fig - -def assignment_accuracy(probs, celltype_truth): - return (np.argmax(probs, axis=1) == celltype_truth).sum()/len(celltype_truth) - -def naive_assignment_accuracy(count_matrix, celltype_truth): - row_ct = np.argmax(count_matrix, axis=1) - return (row_ct == celltype_truth).sum()/len(celltype_truth) - -def plot_a_sample(epoch, native_frequencies_pred, syn_data_obj, idx=None, return_obj=False): - - if idx is None: - idx = np.random.randint(syn_data_obj.n_samples) - - plt.figure(figsize=(8,12)) - plt.subplot(5,1,1) - plt.title(f'cell: {idx} epoch: {epoch}') - plt.scatter(np.linspace(0, syn_data_obj.n_fbs-1, syn_data_obj.n_fbs), syn_data_obj.ambient_signals[idx], c='gray', alpha=0.5, label='ambient signals'); - plt.scatter(syn_data_obj.celltype[idx], syn_data_obj.count_matrix[idx][syn_data_obj.celltype[idx]], c='b', alpha=0.5, label='observation'); - plt.scatter(syn_data_obj.celltype[idx], syn_data_obj.native_signals[idx][syn_data_obj.celltype[idx]], c='r', alpha=0.5, label='native signals'); - plt.ylabel('counts'); - plt.legend(); - - plt.subplot(5,1,2) - plt.scatter(np.linspace(0, n_fbs-1, n_fbs), syn_data_obj.empty_profile, c='gray', alpha=0.5, label='ambient frequencies'); - plt.scatter(np.linspace(0, n_fbs-1, n_fbs), native_frequencies_pred.cpu().numpy()[idx], c='r', alpha=0.5, label='native frequencies'); - plt.ylabel('frequencies') - plt.legend() - - if return_obj: - return plt.gcf() - -def barplot_native_freq_by_celltype(epoch, dec_prob, syn_data_obj, return_obj=False): - n_fbs = syn_data_obj.n_fbs - n_celltypes = syn_data_obj.n_celltypes - fig, axs = plt.subplots(ncols=n_celltypes, figsize=(n_celltypes*5, 5), sharey=True, tight_layout=True); - type_specific_profile = dec_prob.cpu() - - for i in np.unique(syn_data_obj.celltype): - axs[i].barh(y = np.linspace(1,n_fbs,n_fbs), width = 
syn_data_obj.native_profile[syn_data_obj.celltype==i][0,...], height=0.9, alpha=0.2); - axs[i].boxplot(type_specific_profile[syn_data_obj.celltype==i].T, vert=False, notch=False, widths=0.7, medianprops=dict(linestyle='-.', linewidth=2.5, color='firebrick')); - axs[i].set_ylabel("ADT or genes") - axs[i].set_title(f"cell type {i}") - axs[i].text(0.6*axs[i].get_xlim()[1], 1, f"box: inference \nbar: ground truth") - axs[i].set_xlabel("frequencies") - if return_obj: - return fig - -def heatmap_native_pred(expected_native_counts, syn_data_obj, return_obj=False): - - idx_ct_sorted = np.argsort(syn_data_obj.celltype) - - fake_signals_nv = syn_data_obj.native_signals - fake_signals_bg = syn_data_obj.ambient_signals - fake_signals_Obs = syn_data_obj.count_matrix - - log_native_signals = np.log2(fake_signals_nv + 1) - log_ambient_sigals = np.log2(fake_signals_bg + 1) - log_obs = np.log2(fake_signals_Obs + 1) - - fig, axs = plt.subplots(ncols=4, nrows=1, figsize=(12,3),tight_layout=True); - - sns.heatmap(log_obs[idx_ct_sorted],yticklabels=False, - vmin=0, vmax=10, cmap="coolwarm", center=2,ax=axs[0]); - axs[0].set_title("Observation"); - axs[0].set_ylabel("cells"); - axs[0].set_xlabel("protein markers"); - sns.heatmap(log_ambient_sigals[idx_ct_sorted], yticklabels=False, - vmin=0, vmax=10, cmap="coolwarm", center=2,ax=axs[1]); - axs[1].set_title("ambient signals"); - axs[1].set_xlabel("protein markers"); - - sns.heatmap(log_native_signals[idx_ct_sorted], yticklabels=False, - vmin=0, vmax=10, cmap="coolwarm", center=2,ax=axs[2]); - axs[2].set_title("native signals"); - axs[2].set_xlabel("protein markers"); - - sns.heatmap(np.log2(expected_native_counts.cpu().numpy()+1)[idx_ct_sorted], yticklabels=False, - vmin=0, vmax=10, cmap="coolwarm", center=2,ax=axs[3]); - axs[3].set_title("Expected native signals"); - axs[3].set_xlabel("protein markers"); - - if return_obj: - return fig \ No newline at end of file diff --git a/scAR/_hyperparams_optimization.py b/scAR/_hyperparams_optimization.py deleted file mode 100644 index b8cad00..0000000 --- a/scAR/_hyperparams_optimization.py +++ /dev/null @@ -1,156 +0,0 @@ -# -*- coding: utf-8 -*- - -import GPy -import GPyOpt -from sklearn.metrics import r2_score -from scipy.stats import pearsonr -import numpy as np - -import torch.nn as nn -import torch -from torch.distributions import Normal, kl_divergence, Multinomial, Binomial, Poisson - -from ._vae import VAE -from ._loss_functions import loss_fn -from tqdm import tqdm - -######################################################################### -### Objective function for Hyperparams Optimization using GpyOpt -######################################################################### - -def obj_func(native_signals_pred, probs, scRNAseq_tech, empty_frequencies): - - if scRNAseq_tech.lower() == 'cropseq': - native_clean = (probs == probs.max(axis=1).reshape(-1,1)) - else: - native_clean = native_signals_pred #.cpu().numpy() - - frac_native = native_clean.sum(axis=0)/native_clean.sum() - freq_truth = empty_frequencies.detach().cpu().numpy() - - # metric to evaluate the predection - # mae = median_absolute_error(freq_truth, frac_native) - # pr = pearsonr(freq_truth, frac_native)[0] - r2 = r2_score(freq_truth, frac_native) - - return r2 #mae, pr - -######################################################################### -### -######################################################################### - -def training_model_with_a_set_params(train_set, total_set, num_input_feature=99, - NN_layer1=200, - NN_layer2=50, - 
latent_space=10, - kld_weight=0, - frac_match_weight=0, - dropout_prob=0, - lr=0.002, - reconstruction_weight=1, - scRNAseq_tech='CROPseq', - lr_step_size=5, - lr_gamma=0.97, - epochs = 50): - - ''' - train the model with a set of parameters - ''' - - # Define model - model = VAE(num_input_feature, NN_layer1, NN_layer2, latent_space, scRNAseq_tech=scRNAseq_tech).cuda() - - # Define optimizer - optim = torch.optim.Adam(model.parameters(), lr=lr) - scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=lr_step_size, gamma=lr_step_size) - - print("......kld_weight: ", kld_weight) - print("......lr: ", lr) - print("......lr_step_size: ", lr_step_size) - print("......lr_gamma: ", lr_gamma) - # Training - for epoch in tqdm(range(epochs)): - - model.train() - for x_batch, ambient_freq in train_set: - optim.zero_grad() - z, dec_nr, dec_prob, mu, var = model(x_batch) - recon_loss_minibatch, kld_loss_minibatch, match_loss_minibatch, loss_minibatch = loss_fn(x_batch, dec_nr, dec_prob, mu, var, ambient_freq, - reconstruction_weight=reconstruction_weight, kld_weight=kld_weight, frac_match_weight=frac_match_weight) - loss_minibatch.backward() - optim.step() - scheduler.step() - - # evaluation - model.eval() - with torch.no_grad(): - expected_native_counts, probs = model.inference(total_set, ambient_freq[0,:]) - - r2 = obj_func(expected_native_counts, probs, scRNAseq_tech, empty_frequencies=ambient_freq[0,:]) - - return r2 - - -############################################################################################################ -### Bayesian optimization of hyperparameters -############################################################################################################ - -class ParamsOpt(): - ''' - Use Bayesian Optimization for hyperparams optimization - ''' - def __init__(self, train_set, total_set, num_input_feature, scRNAseq_tech, epochs=150, initial_design_numdata=30, max_iter=50, lr_step_size=5, lr_gamma=0.97): - self.train_set = train_set - self.total_set = total_set - self.num_input_feature = num_input_feature - self.scRNAseq_tech = scRNAseq_tech - self.epochs = epochs - self.initial_design_numdata = initial_design_numdata - self.max_iter = max_iter -# self.lr = lr lr=0.002, - self.lr_step_size = lr_step_size - self.lr_gamma = lr_gamma - self.domain = [{'name': 'NN_layer1', 'type': 'continuous', 'domain': (np.log(50), np.log(250))}, # interval of NN_layer1 is [10, 1000] - {'name': 'NN_layer2', 'type': 'continuous', 'domain': (np.log(50), np.log(250))}, # interval of NN_layer2 is [10, 1000] - {'name': 'latent_space', 'type': 'continuous', 'domain': (np.log(5), np.log(20))}, # interval of latent_space is [10, 100] - {'name': 'kld_weight', 'type': 'continuous', 'domain': (np.log(1e-6), np.log(1e-2))}, # interval of KLD weight is [1e-6, 1e-2] - {'name': 'lr', 'type': 'continuous', 'domain': (np.log(1e-3), np.log(1e-2))}] # interval of KLD weight is [1e-6, 1e-2] - - def BayesianOptimization(self): - - def hyperparams_opt(domain): - params = np.atleast_2d(np.exp(domain)) - fs = np.zeros((params.shape[0],1)) - for i in range(params.shape[0]): - try: - fs[i] = training_model_with_a_set_params(self.train_set, - self.total_set, - num_input_feature=self.num_input_feature, ## num of feature barcodes - NN_layer1=int(params[i, 0]), - NN_layer2=int(params[i, 1]), - latent_space=int(params[i, 2]), - kld_weight=params[i, 3], - lr=params[i, 4], - epochs = self.epochs, - reconstruction_weight=1, - scRNAseq_tech=self.scRNAseq_tech, - lr_step_size=self.lr_step_size, - lr_gamma=self.lr_gamma) - 
- ## if the model doesn't succeed, return a very low R2_score value. - except: - fs[i] = -1 - - return fs - - opt = GPyOpt.methods.BayesianOptimization(f = hyperparams_opt, # function to optimize - domain = self.domain, # box-constraints of the problem - initial_design_numdata = self.initial_design_numdata, - acquisition_type ='LCB', # LCB acquisition - maximize = True) - - opt.run_optimization(max_iter=self.max_iter) #eps = self.tolerance - x_best = np.exp(opt.X[np.argmin(opt.Y)]) - best_params = dict(zip([el['name'] for el in self.domain], x_best)) - self.opt = opt - self.best_params = best_params \ No newline at end of file diff --git a/scAR/main/__init__.py b/scAR/main/__init__.py new file mode 100644 index 0000000..a04d931 --- /dev/null +++ b/scAR/main/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +__version__ = '0.2.0' + +from scAR.main import _data_generater as DataSimulator \ No newline at end of file diff --git a/scAR/__main__.py b/scAR/main/__main__.py similarity index 96% rename from scAR/__main__.py rename to scAR/main/__main__.py index f1c1f43..03a3899 100644 --- a/scAR/__main__.py +++ b/scAR/main/__main__.py @@ -3,7 +3,7 @@ import argparse, os import pandas as pd from ._scAR import model -from .__init__ import __version__ +from scAR.main import __version__ def main(): @@ -24,7 +24,6 @@ def main(): parser.add_argument('-ls', '--latent_space', type=int, default=None, help='dimension of latent space') parser.add_argument('-epo', '--epochs', type=int, default=800, help='training epochs') parser.add_argument('-s', '--save_model', type=int, default=False, help='whether save the trained model') - parser.add_argument('-plot', '--plot_every_epoch', type=int, default=50, help='plot every epochs') parser.add_argument('-batchsize', '--batchsize', type=int, default=64, help='batch size') parser.add_argument('-adjust', '--adjust', type=str, default='micro', help='''Only used for calculating Bayesfactors to improve performance, @@ -51,7 +50,6 @@ def main(): latent_space = args.latent_space epochs = args.epochs save_model = args.save_model - plot_every_epoch = args.plot_every_epoch batch_size = args.batchsize adjust = args.adjust feature_type = args.feature_type @@ -83,7 +81,6 @@ def main(): scARObj.train(batch_size=batch_size, epochs=epochs, - plot_every_epoch=plot_every_epoch, TensorBoard=TensorBoard, save_model=save_model ) diff --git a/scAR/_activation_functions.py b/scAR/main/_activation_functions.py similarity index 100% rename from scAR/_activation_functions.py rename to scAR/main/_activation_functions.py diff --git a/scAR/_data_generater.py b/scAR/main/_data_generater.py similarity index 100% rename from scAR/_data_generater.py rename to scAR/main/_data_generater.py diff --git a/scAR/_loss_functions.py b/scAR/main/_loss_functions.py similarity index 100% rename from scAR/_loss_functions.py rename to scAR/main/_loss_functions.py diff --git a/scAR/_scAR.py b/scAR/main/_scAR.py similarity index 89% rename from scAR/_scAR.py rename to scAR/main/_scAR.py index a316b75..e2cba6e 100644 --- a/scAR/_scAR.py +++ b/scAR/main/_scAR.py @@ -9,9 +9,6 @@ from sklearn.model_selection import train_test_split from ._vae import VAE from ._loss_functions import loss_fn -from ._helper_functions import (histgram_noise_ratio, get_correlation_btn_native_ambient, - plt_correlation_btn_native_ambient, assignment_accuracy, - naive_assignment_accuracy, plot_a_sample) import contextlib from tqdm import tqdm @@ -62,7 +59,8 @@ class model(): Examples -------- - >>> scarObj = scAR.model(adata.X.to_df(), 
empty_profile) + >>> from scAR import model + >>> scarObj = model(adata.X.to_df(), empty_profile) >>> scarObj.train() >>> scarObj.inference() >>> adata.layers["X_scAR_denoised"] = scarObj.native_counts @@ -131,7 +129,6 @@ def train(self, epochs: int=800, reconstruction_weight: float=1, dropout_prob: float=0, - plot_every_epoch: int=50, TensorBoard: bool=False, save_model: bool=False): @@ -162,8 +159,6 @@ def train(self, Dropout probability of nodes (float). Default: 0 TensorBoard Whether output training details through Tensorboard (bool). Default: False. Under development. - plot_every_epoch - The epochs by which diagnostic plots will be generated in TensorBoard (int). Default: 50. Under development. save_model Whether to save trained models. Default: False. Under development. @@ -173,7 +168,8 @@ def train(self, Examples -------- - >>> scarObj = scAR.model(adata.X.to_df(), empty_profile) + >>> from scAR import model + >>> scarObj = model(adata.X.to_df(), empty_profile) >>> scarObj.train() >>> scarObj.inference() >>> adata.layers["X_scAR_denoised"] = scarObj.native_counts @@ -202,12 +198,12 @@ def train(self, f'NN_layer1={self.NN_layer1}, NN_layer2={self.NN_layer2}, latent_space={self.latent_space}, kld_weight={kld_weight}, lr={lr}, epochs={epochs}, reconstruction_weight={reconstruction_weight}, dropout_prob={dropout_prob}', 0) # Define model - VAE_model = VAE(self.num_input_feature, self.NN_layer1, self.NN_layer2, self.latent_space, self.scRNAseq_tech, dropout_prob, model=self.model).cuda() + VAE_model = VAE(self.num_input_feature, self.NN_layer1, self.NN_layer2, self.latent_space, self.scRNAseq_tech, dropout_prob, model=self.model).to(self.device) # Define optimizer optim = torch.optim.Adam(VAE_model.parameters(), lr=lr) scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=lr_step_size, gamma=lr_gamma) - + print("......kld_weight: ", kld_weight) print("......lr: ", lr) print("......lr_step_size: ", lr_step_size) @@ -273,28 +269,6 @@ def train(self, writer.add_scalar('ValLoss/kld_loss', val_kld_loss, epoch) writer.flush() - ################################################################################ - ## Save intermediate results every 50 epoch... 
- ################################################################################ - if (epoch % plot_every_epoch == plot_every_epoch-1) and TensorBoard: - - step = epoch // plot_every_epoch - with torch.no_grad(): - z_eval, dec_nr_eval, dec_prob_eval, mu_eval, var_eval, dec_dp_eval = VAE_model(self.total_set.dataset.tensors[0]) - - pr, r2, _, _ = get_correlation_btn_native_ambient(epoch, dec_prob_eval, empty_frequencies=ambient_freq_val[0,:], scRNAseq_tech=self.scRNAseq_tech) - - writer.add_scalar('RegressionError/correlation coeffcient', pr, epoch) - writer.add_scalar('RegressionError/R2', r2, epoch) - - # ..log a Matplotlib Figure showing the model's predictions on the full dataset - # 1, noise ratio - writer.add_figure('noise ratio', histgram_noise_ratio(epoch, dec_nr_eval, return_obj=True), global_step=epoch) - - # 2, native frequencies - writer.add_figure('correlation', plt_correlation_btn_native_ambient(epoch, dec_prob_eval, scRNAseq_tech=self.scRNAseq_tech, empty_frequencies=ambient_freq_val[0,:], return_obj=True), global_step=epoch) - writer.flush() - if save_model: torch.save(VAE_model, save_model) @@ -338,7 +312,8 @@ def inference(self, batch_size=None, model='poisson', adjust='micro', feature_ty Examples -------- - >>> scarObj = scAR.model(adata.X.to_df(), empty_profile) + >>> from scAR import model + >>> scarObj = model(adata.X.to_df(), empty_profile) >>> scarObj.train() >>> scarObj.inference() >>> adata.layers["X_scAR_denoised"] = scarObj.native_counts diff --git a/scAR/_vae.py b/scAR/main/_vae.py similarity index 100% rename from scAR/_vae.py rename to scAR/main/_vae.py diff --git a/scAR/test/test_activation_functions.py b/scAR/test/test_activation_functions.py new file mode 100644 index 0000000..9d75beb --- /dev/null +++ b/scAR/test/test_activation_functions.py @@ -0,0 +1,35 @@ +""" This tests the activation functions. """ +import decimal +import unittest +import numpy +import torch +from decimal import Decimal +from scAR.main._activation_functions import mytanh, hnormalization, mySoftplus + + +class ActivationFunctionsTest(unittest.TestCase): + """ + Test activation_functions.py functions. + """ + + def test_mytanh(self): + """ + Test mytanh(). + """ + self.assertEqual(mytanh(Decimal(1)).quantize(decimal.Decimal('.01'), rounding=decimal.ROUND_DOWN), + Decimal(0.88).quantize(decimal.Decimal('.01'), + rounding=decimal.ROUND_DOWN)) + + def test_hnormalization(self): + """ + Test hnormalization(). + """ + self.assertTrue(torch.allclose(hnormalization(torch.tensor(numpy.full((20, 8), 1))).double(), + torch.tensor(numpy.full((20, 8), 0.1250)))) + + def test_mySoftplus(self): + """ + Test mySoftplus(). 
+ """ + self.assertTrue(torch.allclose(mySoftplus(torch.tensor(numpy.full((20, 8), 0.1))).double(), + torch.tensor(numpy.full((20, 8), 0.7444)))) diff --git a/setup.py b/setup.py index 3183e49..b4a6d0a 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import os.path # Set __version__ -exec(open('scAR/__init__.py').read()) +exec(open('scAR/main/__init__.py').read()) setup( name='scAR', @@ -13,21 +13,20 @@ description="single cell Ambient Remover (scAR): remove ambient signals for single-cell omics data", packages=find_packages(), entry_points = { - 'console_scripts': ['scar=scAR.__main__:main'],}, + 'console_scripts': ['scar=scAR.main.__main__:main'],}, include_package_data=True, url='https://github.com/CaibinSh/scAR', license='MIT', install_requires=[ - "torch==1.10.0", + "torch>=1.10.0", "pandas>=1.3.4", "torchvision>=0.9.0", "torchaudio>=0.8.0", - "tqdm==4.62.3", - "gpyopt", + "tqdm>=4.62.3", "seaborn>=0.11.2", "tensorboard>=2.2.1", "scikit-learn>=1.0.1", - "pyro-ppl==1.8.0" + "pyro-ppl>=1.8.0" ], classifiers=[ "Programming Language :: Python :: 3",