Skip to content

Commit

Permalink
Merge pull request #37 from abearab/master
Browse files Browse the repository at this point in the history
Install option to download data from the Harvard Dataverse
  • Loading branch information
abearab authored Apr 8, 2024
2 parents 41a4b78 + 7e3a526 commit 6a2a6fd
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 22 deletions.
Binary file modified .gitignore
Binary file not shown.
2 changes: 1 addition & 1 deletion CanDI/candi/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self):
#self._verify_install()
self._init_sources()
self._init_depmap_paths()
self._init_index_tables()
# self._init_index_tables()

def _verify_install(self): #ensures data being loaded is present
try:
Expand Down
File renamed without changes.
134 changes: 134 additions & 0 deletions CanDI/setup/dataverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Metadata and scripts to collect datasets for CanDI
https://doi.org/10.7910/DVN/JIAT0H
"""
import os
import requests
from tqdm import tqdm
import sys


# DOI of the CanDI dataset collection on the Harvard Dataverse.
CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'

# DepMap-derived files hosted in the dataverse collection.
depmap_dataset_names = [
    'CCLE_expression',
    'CCLE_fusions',
    'CCLE_gene_cn',
    'CCLE_mutations',
    'CCLE_RNAseq_reads',
    'CRISPR_gene_dependency',
    'CRISPR_gene_effect',
    'sample_info',
    'README',
]

# File extension per dataset: every dataset is a CSV except the README.
name2type = {
    name: ('txt' if name == 'README' else 'csv')
    for name in depmap_dataset_names
}

# Dataverse file ids — the numeric suffix of an /api/access/datafile/ URL.
name2id = {
    'CCLE_expression': 8076862,
    'CCLE_fusions': 10085763,
    'CCLE_gene_cn': 8076861,
    'CCLE_mutations': 8076857,
    'CCLE_RNAseq_reads': 8076859,
    'CRISPR_gene_dependency': 8076863,
    'CRISPR_gene_effect': 8076860,
    'sample_info': 10085764,
    'README': 8151459,
}


def print_sys(s):
    """Write *s* to stderr, flushing immediately so progress messages
    interleave correctly with the tqdm progress bar.

    Args:
        s (str): the string to print
    """
    print(s, file=sys.stderr, flush=True)


def dataverse_download(url, path, name, types):
    """dataverse download helper with progress bar

    Args:
        url (str): the url of the dataset
        path (str): the directory to save the dataset into
        name (str): the dataset name
        types (dict): a dictionary mapping from the dataset name to the file format

    Returns:
        str: the path of the downloaded file

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    save_path = os.path.join(path, f"{name}.{types[name]}")
    block_size = 1024  # stream in 1 KiB chunks
    with requests.get(url, stream=True) as response:
        # Fail fast instead of silently saving an HTML error page to disk.
        response.raise_for_status()
        # Servers may omit content-length; tqdm then shows count without total.
        total_size_in_bytes = int(response.headers.get("content-length", 0))
        with tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) as progress_bar:
            with open(save_path, "wb") as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
    return save_path


def download_wrapper(name, path, return_type=None):
    """Download a single dataset by name, skipping files already on disk.

    Args:
        name (str): the dataset name (a key of ``name2id``/``name2type``)
        path (str): the directory to save the dataset into
        return_type (optional): ``"url"``, ``"name"``, or ``["url", "name"]``.
            Defaults to None.

    Returns:
        The download url, the local file name, or an ``(url, file_name)``
        tuple, depending on ``return_type``; ``None`` when it is ``None``.
    """
    server_path = "https://dataverse.harvard.edu/api/access/datafile/"

    url = server_path + str(name2id[name])

    # Create the target directory (and any missing parents) if needed;
    # os.mkdir would fail when intermediate directories are absent.
    os.makedirs(path, exist_ok=True)

    file_name = f"{name}.{name2type[name]}"

    if os.path.exists(os.path.join(path, file_name)):
        # Already downloaded — reuse the local copy.
        print_sys("Found local copy...")
    else:
        print_sys("Downloading...")
        dataverse_download(url, path, name, name2type)

    if return_type == "url":
        return url
    elif return_type == "name":
        return file_name
    elif return_type == ["url", "name"]:
        return url, file_name


def depmap_dataverse_download(path, return_type=None):
    """Download every DepMap dataset in ``depmap_dataset_names`` into *path*.

    Args:
        path (str): the directory to save the datasets into
        return_type (optional): ``"url"``, ``"name"``, or ``["url", "name"]``.
            Defaults to None.

    Returns:
        The list of urls, the list of file names, or both, depending on
        ``return_type``; ``None`` when it is ``None``.
    """
    pairs = [
        download_wrapper(dataset, path, return_type=["url", "name"])
        for dataset in depmap_dataset_names
    ]
    url_list = [url for url, _ in pairs]
    file_names = [fname for _, fname in pairs]

    if return_type == "url":
        return url_list
    if return_type == "name":
        return file_names
    if return_type == ["url", "name"]:
        return url_list, file_names
29 changes: 22 additions & 7 deletions CanDI/setup/install.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
import argparse
from manager import Manager

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--source", help="Specify the download source", default="dataverse")
args = parser.parse_args()

m = Manager()
m.get_depmap_info()
m.write_config(m.cfig_path, m.parser)
m.download_defaults()
m.write_config(m.cfig_path, m.parser)
m.depmap_autoformat()
m.write_config(m.cfig_path, m.parser)
if args.source == 'dataverse':
print("Downloading data from Dataverse")
m = Manager(download_source=args.source)
m.download_reformatted_data()
m.write_config(m.cfig_path, m.parser)

elif args.source == 'depmap':
print("Downloading data from DepMap")
m = Manager(download_source=args.source)
m.get_depmap_info()
m.write_config(m.cfig_path, m.parser)
m.download_defaults()
m.write_config(m.cfig_path, m.parser)
m.depmap_autoformat()
m.write_config(m.cfig_path, m.parser)

else:
raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'")
52 changes: 38 additions & 14 deletions CanDI/setup/manager.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import os
import sys
import configparser
import json
import time
import requests
import shutil
import pandas as pd
from time import sleep
from pathlib import Path
import contextlib
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import requests
from dataverse import depmap_dataverse_download

class Manager(object):
"""The Manager class handles interations with the datasources
and the config file. It is used to setup of the config file upon installation.
All data downloading is done by Manager
"""
def __init__(self):
def __init__(self, download_source=None):

manager_path = os.path.dirname(os.path.realpath(__file__))
cfig_path = manager_path + "/data/config.ini"
parser = configparser.ConfigParser()
parser.read(cfig_path)
parser.read(cfig_path.replace(".ini", ".draft.ini"))

self.manager_path = manager_path
self.cfig_path = Path(cfig_path)
self.parser = parser


self.download_source = download_source
def sanger_download():
pass

Expand Down Expand Up @@ -217,16 +217,40 @@ def format_depmap_data(self, df, path):
formatted[path.split("/")[-1]] = path


def download_reformatted_data(self, depmap_release=''):
    """Download the pre-formatted DepMap datasets from the Harvard Dataverse
    and record their urls and local locations in ``self.parser``.

    Args:
        depmap_release (str): unused on the dataverse path; kept for
            interface compatibility with the DepMap download workflow.

    Raises:
        RuntimeError: if ``self.download_source`` is not ``"dataverse"``.
    """
    if self.download_source == "dataverse":
        # Fetch every dataset into <manager_path>/data/depmap/, collecting
        # both the remote urls and the resulting local file names.
        urls, file_names = depmap_dataverse_download(
            self.manager_path + '/data/depmap/',
            return_type= ["url", "name"]
        )

        # file name -> source url, for the [depmap_urls] config section.
        depmap_urls = {
            file: url for url, file in zip(urls, file_names)
        }

        # Strip the extension and the CCLE_/CRISPR_ prefixes to build short
        # keys (e.g. "expression", "gene_effect") for [depmap_files].
        depmap_files = {}
        for file in file_names:
            f_key = file.split('.')[0]
            f_key = f_key.replace('CCLE_','')
            f_key = f_key.replace('CRISPR_','')
            depmap_files[f_key] = file

        # NOTE(review): this maps full path -> file name, but
        # format_depmap_data writes the opposite orientation
        # (file name -> path) into "formatted" — confirm which orientation
        # the config readers expect before relying on this section.
        formatted = {
            f'{self.manager_path}/data/depmap/{file}': file for file in file_names
            if 'readme' not in file.lower()
        }

        self.parser["depmap_urls"] = depmap_urls
        self.parser["depmap_files"] = depmap_files
        self.parser["formatted"] = formatted

    else:
        # NOTE(review): message says "download_formated_data" (typo) — the
        # method is named download_reformatted_data.
        raise RuntimeError("Set download source to 'dataverse' before running download_formated_data")

@staticmethod
def write_config(cfig_path, parser):

print("Writing config file")
with open(cfig_path, "w") as f:
parser.write(f)
f.close()

if __name__ == "__main__":
    # Ad-hoc entry point: reformat already-downloaded DepMap data and
    # persist the resulting config. Note this constructs Manager() with no
    # download_source, so it only exercises the autoformat path.
    m = Manager()
    #m.depmap_download("fusions")
    m.depmap_autoformat()
    m.write_config(m.cfig_path, m.parser)

0 comments on commit 6a2a6fd

Please sign in to comment.