Merge pull request #144 from databio/dev
Release 0.3.0
khoroshevskyi authored Apr 4, 2024
2 parents 5940bf9 + 4728857 commit 6f650a5
Showing 100 changed files with 2,044 additions and 2,440 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-pytest.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.8", "3.11"]
python-version: ["3.9", "3.11"]
os: [ubuntu-latest]

steps:
2 changes: 1 addition & 1 deletion geniml/__init__.py
@@ -1,6 +1,6 @@
from logging import getLogger

from .const import PKG_NAME
from ._version import __version__
from .const import PKG_NAME

_LOGGER = getLogger(PKG_NAME)
2 changes: 1 addition & 1 deletion geniml/_version.py
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.0"
6 changes: 3 additions & 3 deletions geniml/assess/assess.py
@@ -204,7 +204,7 @@ def get_rbs_from_assessment_file(file, cs_each_file=False, flexible=False):
else:
df["f_t_u"] = df["median_dist_file_to_universe"]
df["u_t_f"] = df["median_dist_universe_to_file"]
df["RBS"] = get_rbs(df["f_t_u"], df["u_t_t"])
df["RBS"] = get_rbs(df["f_t_u"], df["u_t_f"])
if cs_each_file:
return df
else:
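The fix above swaps the nonexistent column "u_t_t" for "u_t_f". In pandas, selecting a column that is not in the DataFrame raises a KeyError, so the old line would fail on any assessment file that reached this branch. A minimal sketch of the corrected call, with get_rbs stubbed out as a hypothetical placeholder rather than the real geniml implementation:

```python
import pandas as pd

def get_rbs(f_t_u, u_t_f):
    # Hypothetical stand-in for geniml's get_rbs, for illustration only.
    return 1 / (1 + f_t_u + u_t_f)

df = pd.DataFrame(
    {
        "median_dist_file_to_universe": [10, 25],
        "median_dist_universe_to_file": [5, 40],
    }
)
df["f_t_u"] = df["median_dist_file_to_universe"]
df["u_t_f"] = df["median_dist_universe_to_file"]

# df["u_t_t"] would raise KeyError('u_t_t'); the corrected column name works:
df["RBS"] = get_rbs(df["f_t_u"], df["u_t_f"])
print(df[["f_t_u", "u_t_f", "RBS"]])
```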
@@ -247,8 +247,8 @@ def get_f_10_score_from_assessment_file(file, f10_each_file=False):
:param bool f10_each_file: if report F10 for each file, not average for the collection
"""
df = pd.read_csv(file, index_col=(0))
r = df["A&U/A"]
p = df["A&U/U"]
r = df["universe&file"] / (df["universe&file"] + df["file/universe"])
p = df["universe&file"] / (df["universe&file"] + df["univers/file"])
df["F_10"] = (1 + 10**2) * (p * r) / ((10**2 * p) + r)
if f10_each_file:
return df["F_10"]
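The rewritten recall and precision lines feed the F_10 score computed just below them, which is the standard F-beta measure with beta = 10, so recall counts far more than precision. A short sketch of that formula with illustrative numbers, not taken from any real assessment file:

```python
def f_beta(precision: float, recall: float, beta: float = 10.0) -> float:
    # F-beta as used in get_f_10_score_from_assessment_file: (1 + b^2) * p * r / (b^2 * p + r)
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# Illustrative values only.
print(f_beta(precision=0.5, recall=0.9))  # ~0.893, dominated by recall
print(f_beta(precision=0.9, recall=0.5))  # ~0.502, low recall drags the score down
```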
8 changes: 4 additions & 4 deletions geniml/assess/distance.py
@@ -15,7 +15,7 @@
def flexible_distance_between_two_regions(region, query):
"""Calculate distance between region and flexible region from flexible universe
:param [int, int] region: region from flexible universe
:param int query: analysed region
:param int query: analyzed region
:return int: distance
"""
if region[0] <= query <= region[1]:
@@ -40,8 +40,8 @@ def distance_to_closest_region(
Calculate distance from given peak to the closest region in database
:param file db: database file
:param list db_queue: queue of three last positions in database
:param i: analysed position from the query
:param str current_chrom: current analysed chromosome from query
:param i: analyzed position from the query
:param str current_chrom: current analyzed chromosome from query
:param list unused_db: list of positions from universe that were not compared to query
:param list pos_index: which indexes from universe region use to calculate distance
:param bool flexible: whether the universe if flexible
@@ -90,7 +90,7 @@ def read_in_new_universe_regions(
Read in new universe regions closest to the peak
:param file db: universe file
:param str q_chrom: new peak's chromosome
:param str current_chrom: chromosome that was analysed so far
:param str current_chrom: chromosome that was analyzed so far
:param list unused_db: list of positions from universe that were not compared to query
:param list db_queue: que of three last positions in universe
:param bool waiting: whether iterating through file, without calculating
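The docstring fixes in this file touch flexible_distance_between_two_regions, which returns 0 when the analyzed position falls inside a flexible universe region (the branch visible at the top of the hunk). The rest of the function is collapsed, so the following is only a plausible sketch of point-to-interval distance, assuming distance outside the region is measured to the nearest edge:

```python
def point_to_interval_distance(region: tuple[int, int], query: int) -> int:
    """Distance from a query position to a [start, end] region; 0 if the position is inside."""
    start, end = region
    if start <= query <= end:  # matches the condition shown in the diff
        return 0
    return min(abs(query - start), abs(query - end))

print(point_to_interval_distance((100, 200), 150))  # 0
print(point_to_interval_distance((100, 200), 260))  # 60
```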
8 changes: 4 additions & 4 deletions geniml/assess/intersection.py
@@ -70,10 +70,10 @@ def two_region_intersection_diff(
:param bool inside_d: whether there is still part of the region from universe to analyse
:param bool inside_q: whether there is still part of the region from query to analyse
:param int overlap: size of overlap
:param int start_d: start position of currently analysed universe region
:param int start_q: start position of currently analysed query region
:param bool waiting_d: whether waiting for the query to finish chrom
:param bool waiting_q: whether waiting for the universe to finish chrom
:param int start_d: start position of currently analyzed universe region
:param int start_q: start position of currently analyzed query region
:param bool waiting_d: whether waiting for the query to finish chromosome
:param bool waiting_q: whether waiting for the universe to finish chromosome
"""
if waiting_q:
only_in_d += region_d[1] - region_d[0]
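The renamed parameters describe two_region_intersection_diff, which accumulates overlap and universe-only / query-only coverage while streaming both files chromosome by chromosome. The core arithmetic, the overlap length of two intervals, can be sketched on its own; this helper is illustrative and is not the function from the diff:

```python
def interval_overlap(region_d: tuple[int, int], region_q: tuple[int, int]) -> int:
    """Length of the overlap between two [start, end) regions, 0 if they are disjoint."""
    start = max(region_d[0], region_q[0])
    end = min(region_d[1], region_q[1])
    return max(0, end - start)

print(interval_overlap((100, 200), (150, 300)))  # 50
print(interval_overlap((100, 200), (250, 300)))  # 0
```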
10 changes: 5 additions & 5 deletions geniml/assess/likelihood.py
@@ -41,11 +41,11 @@ def calc_likelihood_hard(
:param coverage_folder: path to a folder with genome coverage by tracks
:param str name: suffix of model file name, which contains information
about model type
:param int s_index: from which position in univers line take assess region
:param int s_index: from which position in universe line take assess region
start position
:param int e_index: from which position in univers line take assess region
:param int e_index: from which position in universe line take assess region
end position
:return float: likelihood of univers for given model
:return float: likelihood of universe for given model
"""
current_chrom = ""
missing_chrom = ""
@@ -178,9 +178,9 @@ def weigh_livelihood(start, end, model_process, model_cove, model_out, reverse):
Calculate weighted likelihood of flexible part of the region
:param int start: start of the region
:param int end: end of the region
:param array model_process: model for analysed type of flexible region
:param array model_process: model for analyzed type of flexible region
:param array model_cove: model for coverage
:param array model_out: model for flexible region that is not being analysed
:param array model_out: model for flexible region that is not being analyzed
:param bool reverse: if model_process corespondents to end we have to reverse the weighs
:return float: likelihood of flexible part of the region
"""
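The corrected docstrings say that s_index and e_index choose which columns of a universe (BED-like) line supply the assessed region's start and end, which matters for flexible universes that carry extra coordinate columns. A hedged sketch of that column-selection idea; the field layout below is invented for illustration and is not the actual geniml parser:

```python
def region_from_universe_line(line: str, s_index: int = 1, e_index: int = 2):
    """Pick chrom, start, and end out of a tab-separated universe line by column index."""
    fields = line.rstrip("\n").split("\t")
    return fields[0], int(fields[s_index]), int(fields[e_index])

# Invented flexible-universe line: chrom, outer start, outer end, name, score, strand, inner start, inner end.
line = "chr1\t1000\t2000\tregion_1\t0\t.\t1100\t1900"
print(region_from_universe_line(line))                        # ('chr1', 1000, 2000)
print(region_from_universe_line(line, s_index=6, e_index=7))  # ('chr1', 1100, 1900)
```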
14 changes: 7 additions & 7 deletions geniml/atacformer/main.py
@@ -1,5 +1,5 @@
import os
from typing import Union, List
from typing import List, Union

import torch
import torch.nn as nn
@@ -9,16 +9,16 @@
from ..models.main import ExModel
from ..tokenization.main import ITTokenizer
from .const import (
POOLING_TYPES,
POOLING_METHOD_KEY,
CONFIG_FILE_NAME,
D_MODEL_KEY,
VOCAB_SIZE_KEY,
NUM_LAYERS_KEY,
NHEAD_KEY,
DEFAULT_EMBEDDING_DIM,
CONFIG_FILE_NAME,
MODEL_FILE_NAME,
NHEAD_KEY,
NUM_LAYERS_KEY,
POOLING_METHOD_KEY,
POOLING_TYPES,
UNIVERSE_FILE_NAME,
VOCAB_SIZE_KEY,
)


13 changes: 7 additions & 6 deletions geniml/atacformer/utils.py
@@ -1,13 +1,14 @@
import os
from glob import glob
from math import ceil
from typing import List, Tuple

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from genimtools.utils import read_tokens_from_gtok
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

from .const import MASK_RATE, REPLACE_WITH_MASK_RATE, REPLACE_WITH_RANDOM_RATE, KEEP_RATE
from .const import KEEP_RATE, MASK_RATE, REPLACE_WITH_MASK_RATE, REPLACE_WITH_RANDOM_RATE


class AtacformerMLMDataset(Dataset):
@@ -44,7 +45,7 @@ def __init__(
def __len__(self):
return len(self.files)

def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
This should return a tuple of (tokens, masked_tokens, mask_ids).
"""
@@ -72,8 +73,8 @@ def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
return tokens, masked_tokens, mask_ids

def collate_batch(
batch: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]], padding_token: int
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
self, batch: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]], padding_token: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Collate function for the MLM dataset. This should take a batch of
(tokens, masked_tokens, mask_ids) and return a tuple of (tokens, masked_tokens, mask_ids) that are padded
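Two changes land in this hunk: the annotations switch from the built-in tuple[...] generics to typing.Tuple (matching the List/Tuple import added at the top of the file), and collate_batch gains its missing self parameter. The padding it relies on, torch.nn.utils.rnn.pad_sequence, right-pads variable-length token tensors to the longest sequence in the batch; a minimal sketch with invented token ids, not the dataset's actual masking logic:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Invented token tensors of different lengths, as a tokenized-region dataset might yield.
batch = [torch.tensor([5, 9, 2]), torch.tensor([7, 1]), torch.tensor([3, 8, 4, 6])]

padding_token = 0  # assumed padding id, purely for illustration
padded = pad_sequence(batch, batch_first=True, padding_value=padding_token)
print(padded.shape)  # torch.Size([3, 4])
print(padded)        # shorter rows are right-padded with 0
```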
112 changes: 101 additions & 11 deletions geniml/bbclient/bbclient.py
@@ -4,7 +4,9 @@
from logging import getLogger
from typing import List, NoReturn, Union

import boto3
import requests
from botocore.exceptions import ClientError
from ubiquerg import is_url

from .._version import __version__
@@ -13,23 +15,25 @@
from .const import (
BEDFILE_URL_PATTERN,
BEDSET_URL_PATTERN,
DEFAULT_BUCKET_NAME,
DEFAULT_BEDBASE_API,
DEFAULT_BEDFILE_EXT,
DEFAULT_BEDFILE_SUBFOLDER,
DEFAULT_BEDSET_EXT,
DEFAULT_BEDSET_SUBFOLDER,
DEFAULT_BUCKET_FOLDER,
DEFAULT_CACHE_FOLDER,
MODULE_NAME,
)
from .utils import BedCacheManager, get_bbclient_path_folder
from .utils import BedCacheManager, get_abs_path

_LOGGER = getLogger(MODULE_NAME)


class BBClient(BedCacheManager):
def __init__(
self,
cache_folder: str = DEFAULT_CACHE_FOLDER,
cache_folder: Union[str, os.PathLike] = DEFAULT_CACHE_FOLDER,
bedbase_api: str = DEFAULT_BEDBASE_API,
):
"""
@@ -39,23 +43,24 @@ def __init__(
if not given it will be the environment variable `BBCLIENT_CACHE`
:param bedbase_api: url to bedbase
"""
# get default cache folder from environment variable set by user
super().__init__(get_bbclient_path_folder(cache_folder))
cache_folder = get_abs_path(cache_folder)
super().__init__(cache_folder)

self.bedbase_api = bedbase_api

def load_bedset(self, bedset_id: str) -> BedSet:
"""
Loads a BED set from cache, or downloads and caches it plus BED files in it if it doesn't exist
Load a BEDset from cache, or download and add it to the cache with its BED files
:param bedset_id: unique identifier of BED set
:param BedSet: BedSet object
"""

file_path = self._bedset_path(bedset_id)

if os.path.exists(file_path):
_LOGGER.info(f"BED set {bedset_id} already exists in cache.")
with open(file_path, "r") as file:
extracted_data = file.readlines()
extracted_data = file.read().splitlines()
else:
extracted_data = self._download_bedset_data(bedset_id)
# write the identifiers of BED files in the BedSet to a local .txt file
@@ -79,8 +84,8 @@ def _download_bedset_data(self, bedset_id: str) -> List[str]:
"""
bedset_url = BEDSET_URL_PATTERN.format(bedbase_api=self.bedbase_api, bedset_id=bedset_id)
response = requests.get(bedset_url)
data = response.json()
extracted_data = [entry.get("record_identifier") for entry in data["bedfile_metadata"]]
data = response.json()["results"]
extracted_data = [entry.get("id") for entry in data]

return extracted_data

@@ -89,6 +94,7 @@ def load_bed(self, bed_id: str) -> RegionSet:
Loads a BED file from cache, or downloads and caches it if it doesn't exist
:param bed_id: unique identifier of a BED file
:return: the RegionSet object
"""
file_path = self._bedfile_path(bed_id)

@@ -130,8 +136,8 @@ def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:
"""
Add a BED file to the cache
:param bedfile: a RegionSet class or a path to a BED file to be added to cache
:return: the identifier if the BedFile object
:param bedfile: a RegionSet object or a path or url to the BED file
:return: the RegionSet identifier
"""
if isinstance(bedfile, str):
bedfile = RegionSet(bedfile)
@@ -163,12 +169,96 @@ def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:

return bedfile_id

def add_bed_to_s3(
self,
identifier: str,
bucket: str = DEFAULT_BUCKET_NAME,
endpoint_url: str = None,
aws_access_key_id: str = None,
aws_secret_access_key: str = None,
s3_path: str = DEFAULT_BUCKET_FOLDER,
) -> str:
"""
Add a cached BED file to S3
:param identifier: the unique identifier of the BED file
:param bucket: the name of the bucket
:param endpoint_url: the URL of the S3 endpoint [Default: set up by the environment vars]
:param aws_access_key_id: the access key of the AWS account [Default: set up by the environment vars]
:param aws_secret_access_key: the secret access key of the AWS account [Default: set up by the environment vars]
:param s3_path: the path on S3
:return: full path on S3
"""
s3_client = boto3.client(
"s3",
endpoint_url=endpoint_url,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
local_file_path = self.seek(identifier)
bed_file_name = os.path.basename(local_file_path)
s3_bed_path = os.path.join(identifier[0], identifier[1], bed_file_name)
if s3_path:
s3_bed_path = os.path.join(s3_path, s3_bed_path)

s3_client.upload_file(local_file_path, bucket, s3_bed_path)
_LOGGER.info(f"Project was uploaded successfully to s3://{bucket}/{s3_bed_path}")
return s3_bed_path

def get_bed_from_s3(
self,
identifier: str,
bucket: str = DEFAULT_BUCKET_NAME,
endpoint_url: str = None,
aws_access_key_id: str = None,
aws_secret_access_key: str = None,
s3_path: str = DEFAULT_BUCKET_FOLDER,
) -> str:
"""
Get a cached BED file from S3 and cache it locally
:param identifier: the unique identifier of the BED file
:param bucket: the name of the bucket
:param endpoint_url: the URL of the S3 endpoint [Default: set up by the environment vars]
:param aws_access_key_id: the access key of the AWS account [Default: set up by the environment vars]
:param aws_secret_access_key: the secret access key of the AWS account [Default: set up by the environment vars]
:param s3_path: the path on S3
:return: bed file id
:raise FileNotFoundError: if the identifier does not exist in cache
"""
s3_bed_path = os.path.join(
identifier[0], identifier[1], f"{identifier}{DEFAULT_BEDFILE_EXT}"
)
if s3_path:
s3_bed_path = os.path.join(s3_path, s3_bed_path)

s3_client = boto3.client(
"s3",
endpoint_url=endpoint_url,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
try:
s3_client.download_file(
bucket, s3_bed_path, self._bedfile_path(identifier, create=True)
)
except ClientError as e:
if e.response["Error"]["Code"] == "404":
raise FileNotFoundError(f"{identifier} does not exist in S3.")
else:
raise e

return identifier

def seek(self, identifier: str) -> str:
"""
Get local path to BED file or BED set with specific identifier
:param identifier: the unique identifier
:return: the local path of the file
:raise FileNotFoundError: if the identifier does not exist in cache
"""

# check if any BED set has that identifier
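The new add_bed_to_s3 and get_bed_from_s3 methods wrap boto3's upload_file and download_file around the existing cache layout (the first two characters of the identifier become subfolders). A hedged usage sketch assembled only from names visible in this diff; the import path, cache folder, bucket, and file names are placeholders, and credentials are assumed to come from the usual AWS environment variables when the keyword arguments are left at their defaults:

```python
from geniml.bbclient import BBClient  # import path assumed from the package layout

bbc = BBClient(cache_folder="~/.bbcache", bedbase_api="https://api.bedbase.org")  # illustrative values

# Cache a local BED file and look up where it landed.
bed_id = bbc.add_bed_to_cache("example_peaks.bed")  # hypothetical input file
local_path = bbc.seek(bed_id)                       # local path inside the cache

# Push the cached file to S3, then fetch it back into a cache later.
s3_key = bbc.add_bed_to_s3(bed_id, bucket="my-bed-bucket")  # placeholder bucket
bbc.get_bed_from_s3(bed_id, bucket="my-bed-bucket")         # raises FileNotFoundError on a missing key
```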