Parallel and download function (#10)
* cpu count

* cpu count default

* add chrM columns in create script

* Update README.md

* Update README.md

* source db

* utils

* packages

* packages

* rm package

* package

* Update README.md

* rename

* Update README.md

* variable names

* final adj

Co-authored-by: Kalin Nonchev <[email protected]>
KalinNonchev and Kalin Nonchev authored Jun 30, 2021
1 parent 3890d6d commit 79860bb
Showing 9 changed files with 164 additions and 18 deletions.
28 changes: 25 additions & 3 deletions README.md
@@ -7,7 +7,27 @@ It extracts from a gnomAD vcf the ["AF", "AF_afr", "AF_eas", "AF_fin", "AF_nfe",

###### The package works for all currently available gnomAD releases (2021).

-## 1. Data preprocessing and SQL database creation
+## 1. Download SQLite preprocessed files

I have preprocessed gnomAD v2.1.1 and v3.1.1 and created sqlite3 files, which you can simply download. They contain all variants on the 24 standard chromosomes.

gnomAD v3.1.1 (hg38, **759'302'267** variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \
gnomAD v2.1.1 (hg19, **261'942'336** variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1

You can download a database as follows:

```python
from gnomad_db.database import gnomAD_DB
download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1"
output_dir = "test_dir" # database_location
gnomAD_DB.download_and_unzip(download_link, output_dir)
```
#### NB: this takes ~30 min (at a network speed of 10 MB/s)
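
Once the download finishes, you can open the database and query it directly. A minimal sketch (the variant string is illustrative; the `chrom:pos:ref>alt` format comes from `get_maf_from_str`, shown in `database.py` below):

```python
from gnomad_db.database import gnomAD_DB

database_location = "test_dir"  # directory containing the downloaded gnomad_db.sqlite3
db = gnomAD_DB(database_location)

# look up the allele frequency of a single variant
db.get_maf_from_str("21:9825790:C>T", query="AF")
```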


Alternatively, you can create the database yourself. **However, I recommend using the preprocessed files to save resources and time.** If you use them, you can skip ahead to **2. API usage** and explore the package and its features!

## 1.1 Data preprocessing and SQL database creation

Start by downloading the VCF files from gnomAD into a single directory:

@@ -60,7 +80,8 @@ database_location = "test_dir"
db = gnomAD_DB(database_location)
```

-3. Insert some test variants to run the examples below
+3. Insert some test variants to run the examples below \
+**If you have downloaded the preprocessed sqlite3 files, you can skip this step as you already have variants**
```python
# get some variants
var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=db.columns, index_col=False)
@@ -69,7 +90,8 @@ var_df = pd.read_csv("data/test_vcf_gnomad_chr21_10000.tsv.gz", sep="\t", names=
db.insert_variants(var_df)
```
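
To sanity-check the insert, `query_direct` accepts raw SQL. A sketch (the table name `gnomad_db` is an assumption here; check the create script for the actual schema):

```python
# count the rows we just inserted (table name "gnomad_db" is assumed)
db.query_direct("SELECT count(*) FROM gnomad_db;")
```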

-4. Query variant minor allele frequency
+4. Query variant minor allele frequency \
+**These example variants are in hg38 coordinates!**
```python
# query some MAF scores
dummy_var_df = pd.DataFrame({
```
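
Besides single-variant lookups, the API also supports interval queries. A sketch with illustrative coordinates, using the `get_mafs_for_interval` signature visible in `database.py` below:

```python
# all AF values for a 100-bp window on chromosome 21
db.get_mafs_for_interval(chrom="21", interval_start=9825700, interval_end=9825800, query="AF")
```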
19 changes: 15 additions & 4 deletions gnomad_db/database.py
@@ -4,13 +4,14 @@
import pandas as pd
import multiprocessing
from joblib import Parallel, delayed
from . import utils


class gnomAD_DB:

-    def __init__(self, genodb_path, parallel=True):
+    def __init__(self, genodb_path, parallel=True, cpu_count=512):

-        self.cpu_count = int(multiprocessing.cpu_count())
+        self.cpu_count = min(cpu_count, int(multiprocessing.cpu_count()))
        self.parallel = parallel

        self.db_file = os.path.join(genodb_path, 'gnomad_db.sqlite3')
@@ -182,7 +183,9 @@ def get_mafs_for_interval(self, chrom: str, interval_start: int, interval_end: i


    def get_maf_from_str(self, var: str, query: str="AF") -> float:
-        # variant in form chrom:pos:ref>alt
+        """
+        get AF for variant in form chrom:pos:ref>alt
+        """

        chrom, pos, ref, alt = self._pack_from_str(var)

@@ -193,4 +196,12 @@ def get_maf_from_str(self, var: str, query: str="AF") -> float:
        where chrom = '{chrom}' AND pos = {pos} AND ref = '{ref}' AND alt = '{alt}';
        """

        return self.query_direct(sql_query).squeeze()


    @staticmethod
    def download_and_unzip(url, output_path):
        """
        download database and unzip file
        """
        utils.download_and_unzip_file(url, output_path)
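
The new `cpu_count` argument is a cap rather than a fixed worker count: the constructor takes the minimum of the requested value and `multiprocessing.cpu_count()`, so the default of 512 effectively means "use every available core". A sketch of limiting parallelism on a shared machine:

```python
from gnomad_db.database import gnomAD_DB

# use at most 8 cores, even if the machine has more
db = gnomAD_DB("test_dir", parallel=True, cpu_count=8)
```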
38 changes: 38 additions & 0 deletions gnomad_db/utils.py
@@ -0,0 +1,38 @@
import urllib.request
import gzip
import shutil
from tqdm import tqdm
import os
import time


class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)


def download_url(url, output_dir):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=f"{output_dir}/gnomad_db.sqlite3.gz", reporthook=t.update_to)
    time.sleep(5)

def unzip(output_dir):
    file_name_in = f"{output_dir}/gnomad_db.sqlite3.gz"
    file_name_out = f"{output_dir}/gnomad_db.sqlite3"
    with gzip.open(file_name_in, 'rb') as f_in:
        with open(file_name_out, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    time.sleep(5)
    os.remove(file_name_in)
    print(f"Database location: {file_name_out}")

def download_and_unzip_file(url, output_dir):
    print("Starting downloading...")
    download_url(url, output_dir)
    print("Starting unzipping. This can take some time...")
    unzip(output_dir)
    print("Done!")
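
`update_to` follows the `reporthook` contract of `urllib.request.urlretrieve`: the hook receives the number of blocks transferred so far (`b`), the block size (`bsize`), and the total file size (`tsize`), so `b * bsize - self.n` turns the absolute byte count into the increment that `tqdm.update` expects. The same pattern as a standalone sketch (URL and filename are placeholders):

```python
import urllib.request
from tqdm import tqdm

def fetch(url, filename):
    with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:
        def hook(b=1, bsize=1, tsize=None):
            if tsize is not None:
                t.total = tsize        # total size becomes known from the response headers
            t.update(b * bsize - t.n)  # advance by the delta since the last call
        urllib.request.urlretrieve(url, filename=filename, reporthook=hook)
```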

1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ numpy
pyyaml
pytest
joblib
tqdm
30 changes: 28 additions & 2 deletions scripts/GettingStartedwithGnomAD_DB.ipynb
@@ -3,7 +3,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "98d3ae32-d439-41bf-be89-3e6f52a3591f",
"id": "12c65da9-cb0c-42f6-a92c-cf0c2535ba13",
"metadata": {},
"outputs": [],
"source": [
@@ -12,6 +12,32 @@
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "20ffe529-064d-482b-9267-9b0fd729070c",
"metadata": {},
"source": [
"# Download SQLite preprocessed files\n",
"\n",
"I have preprocessed and created sqlite3 files for gnomAD v2.1.1 and 3.1.1 for you, which can be easily downloaded from here. They contain all variants on the 24 standard chromosomes.\n",
"\n",
"gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1 \n",
"gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1 "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e525076f-8540-444f-912a-b411a224f0ec",
"metadata": {},
"outputs": [],
"source": [
"# uncomment if you actually want to download it\n",
"# download_link = \"https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1\"\n",
"# output_dir = \"test_dir\" # database_location\n",
"# gnomAD_DB.download_and_unzip(download_link, output_dir) "
]
},
{
"cell_type": "markdown",
"id": "3fa70b8a-1d43-4929-bfee-4df0d071f9fa",
@@ -241,7 +267,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.9.5"
}
},
"nbformat": 4,
14 changes: 14 additions & 0 deletions scripts/GettingStartedwithGnomAD_DB.py
@@ -18,6 +18,20 @@
import pandas as pd
import numpy as np

# %% [markdown]
# # Download SQLite preprocessed files
#
# I have preprocessed gnomAD v2.1.1 and v3.1.1 and created sqlite3 files, which you can simply download. They contain all variants on the 24 standard chromosomes.
#
# gnomAD v3.1.1 (hg38, 759'302'267 variants) 25G zipped, 56G in total - https://zenodo.org/record/5045170/files/gnomad_db_v3.1.1.sqlite3.gz?download=1
# gnomAD v2.1.1 (hg19, 261'942'336 variants) 9G zipped, 20G in total - https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1

# %%
# uncomment if you actually want to download it
# download_link = "https://zenodo.org/record/5045102/files/gnomad_db_v2.1.1.sqlite3.gz?download=1"
# output_dir = "test_dir" # database_location
# gnomAD_DB.download_and_unzip(download_link, output_dir)

# %% [markdown]
# # Initialize Database

35 changes: 30 additions & 5 deletions scripts/createTSVtables.ipynb
@@ -21,6 +21,7 @@
"from subprocess import PIPE, Popen\n",
"import pandas as pd\n",
"from joblib import Parallel, delayed\n",
"import multiprocessing\n",
"import os"
]
},
@@ -92,7 +93,27 @@
{
"cell_type": "code",
"execution_count": null,
"id": "efc70048-1fde-4f0e-af94-cc111f3a9250",
"id": "f3b2f898-f126-476b-bab5-d087e27bb0a7",
"metadata": {
"papermill": {
"duration": 0.008863,
"end_time": "2021-05-05T20:00:58.715794",
"exception": false,
"start_time": "2021-05-05T20:00:58.706931",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"cpu_count = int(multiprocessing.cpu_count())\n",
"cpu_count"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae0419e4-55bd-484b-b2f2-981d67457504",
"metadata": {
"papermill": {
"duration": 0.008863,
@@ -108,11 +129,15 @@
"# extract needed columns\n",
"# if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd\n",
"def create_table(file, table_location):\n",
" if \"chrM\" in file:\n",
" query_string = \"%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF_hom\\t%AF_het\\n\"\n",
" else:\n",
" query_string = \"%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AF_afr\\t%AF_eas\\t%AF_fin\\t%AF_nfe\\t%AF_asj\\t%AF_oth\\t%AF_popmax\\n\"\n",
" if not os.path.exists(table_location):\n",
" cmd = f\"bcftools query -f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%AF\\t%AF_afr\\t%AF_eas\\t%AF_fin\\t%AF_nfe\\t%AF_asj\\t%AF_oth\\t%AF_popmax\\n' {file} | gzip > {table_location}\"\n",
" cmd = f\"bcftools query -f '{query_string}' {file} | gzip > {table_location}\"\n",
" p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)\n",
" print(p.communicate())\n",
" \n"
" "
]
},
{
@@ -132,7 +157,7 @@
"outputs": [],
"source": [
"# run bcftools in parallel\n",
"Parallel(12)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))"
"Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))"
]
}
],
@@ -155,7 +180,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.9.5"
},
"papermill": {
"default_parameters": {},
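
`create_table` switches the `bcftools query` format string per input file: gnomAD's mitochondrial VCF reports homoplasmic/heteroplasmic allele frequencies (`AF_hom`/`AF_het`) rather than the population `AF_*` fields, hence the `chrM` branch. Example calls with hypothetical file paths:

```python
# picks the AF_hom/AF_het column set because "chrM" is in the path
create_table("vcf/gnomad.genomes.chrM.vcf.bgz", "tables/gnomad.genomes.chrM.tsv.gz")

# picks the population AF column set
create_table("vcf/gnomad.genomes.chr21.vcf.bgz", "tables/gnomad.genomes.chr21.tsv.gz")
```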
13 changes: 11 additions & 2 deletions scripts/createTSVtables.py
@@ -19,6 +19,7 @@
from subprocess import PIPE, Popen
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing
import os

# %% papermill={"duration": 0.014665, "end_time": "2021-05-05T20:00:58.675108", "exception": false, "start_time": "2021-05-05T20:00:58.660443", "status": "completed"} tags=["parameters"]
@@ -36,18 +37,26 @@
tables_location = [f'{tables_location}/{file.split("/")[-1].replace(".vcf.bgz", "")}.tsv.gz' for file in files]
tables_location

# %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[]
cpu_count = int(multiprocessing.cpu_count())
cpu_count


# %% papermill={"duration": 0.008863, "end_time": "2021-05-05T20:00:58.715794", "exception": false, "start_time": "2021-05-05T20:00:58.706931", "status": "completed"} tags=[]
# extract needed columns
# if running DIRECTLY from notebook, add module load i12g/bcftools; in the beginning of cmd
def create_table(file, table_location):
    if "chrM" in file:
        query_string = "%CHROM\t%POS\t%REF\t%ALT\t%AF_hom\t%AF_het\n"
    else:
        query_string = "%CHROM\t%POS\t%REF\t%ALT\t%AF\t%AF_afr\t%AF_eas\t%AF_fin\t%AF_nfe\t%AF_asj\t%AF_oth\t%AF_popmax\n"
    if not os.path.exists(table_location):
-        cmd = f"bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%AF\t%AF_afr\t%AF_eas\t%AF_fin\t%AF_nfe\t%AF_asj\t%AF_oth\t%AF_popmax\n' {file} | gzip > {table_location}"
+        cmd = f"bcftools query -f '{query_string}' {file} | gzip > {table_location}"
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        print(p.communicate())



# %% papermill={"duration": 0.329741, "end_time": "2021-05-05T20:00:59.051392", "exception": false, "start_time": "2021-05-05T20:00:58.721651", "status": "completed"} tags=[]
# run bcftools in parallel
-Parallel(12)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))
+Parallel(cpu_count)(delayed(create_table)(file, table_location) for file, table_location in tqdm(zip(files, tables_location)))
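
`Parallel(cpu_count)(delayed(create_table)(...) ...)` fans the per-chromosome jobs out over `cpu_count` worker processes and returns results in input order. The joblib pattern in isolation, as a minimal sketch:

```python
from joblib import Parallel, delayed

def square(x):
    return x * x

# dispatch ten calls across 4 workers; results keep the input order
results = Parallel(4)(delayed(square)(x) for x in range(10))
assert results == [x * x for x in range(10)]
```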
4 changes: 2 additions & 2 deletions setup.py
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='gnomad_db',
-     version='0.0.4',
+     version='0.0.5',
      description='This package scales the huge gnomAD files to a SQLite database, which is easy and fast to query. It extracts from a gnomAD vcf the minor allele frequency for each variant.',
      author='KalinNonchev',
      author_email='[email protected]',
@@ -11,6 +11,6 @@
url="https://github.com/KalinNonchev/gnomAD_MAF",
packages=find_packages(), # find packages
include_package_data=True,
install_requires=['pandas', 'numpy', 'joblib'], # external packages as dependencies,
install_requires=['pandas', 'numpy', 'joblib', 'tqdm'], # external packages as dependencies,
python_requires='>=3.6'
)
