Merge pull request #144 from databio/dev
Release 0.3.0
khoroshevskyi authored Apr 4, 2024
2 parents 5940bf9 + 4728857 commit 6f650a5
Showing 100 changed files with 2,044 additions and 2,440 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-pytest.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.8", "3.11"]
python-version: ["3.9", "3.11"]
os: [ubuntu-latest]

steps:
2 changes: 1 addition & 1 deletion geniml/__init__.py
@@ -1,6 +1,6 @@
from logging import getLogger

from .const import PKG_NAME
from ._version import __version__
from .const import PKG_NAME

_LOGGER = getLogger(PKG_NAME)
2 changes: 1 addition & 1 deletion geniml/_version.py
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.0"
6 changes: 3 additions & 3 deletions geniml/assess/assess.py
@@ -204,7 +204,7 @@ def get_rbs_from_assessment_file(file, cs_each_file=False, flexible=False):
else:
df["f_t_u"] = df["median_dist_file_to_universe"]
df["u_t_f"] = df["median_dist_universe_to_file"]
df["RBS"] = get_rbs(df["f_t_u"], df["u_t_t"])
df["RBS"] = get_rbs(df["f_t_u"], df["u_t_f"])
if cs_each_file:
return df
else:
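The fix above swaps the nonexistent column "u_t_t" for "u_t_f". In pandas, selecting a column that is not in the DataFrame raises a KeyError, so the old line would fail on any assessment file that reached this branch. A minimal sketch of the corrected call, with get_rbs stubbed out as a hypothetical placeholder rather than the real geniml implementation:

```python
import pandas as pd

def get_rbs(f_t_u, u_t_f):
    # Hypothetical stand-in for geniml's get_rbs, for illustration only.
    return 1 / (1 + f_t_u + u_t_f)

df = pd.DataFrame(
    {
        "median_dist_file_to_universe": [10, 25],
        "median_dist_universe_to_file": [5, 40],
    }
)
df["f_t_u"] = df["median_dist_file_to_universe"]
df["u_t_f"] = df["median_dist_universe_to_file"]

# df["u_t_t"] would raise KeyError('u_t_t'); the corrected column name works:
df["RBS"] = get_rbs(df["f_t_u"], df["u_t_f"])
print(df[["f_t_u", "u_t_f", "RBS"]])
```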
@@ -247,8 +247,8 @@ def get_f_10_score_from_assessment_file(file, f10_each_file=False):
:param bool f10_each_file: if report F10 for each file, not average for the collection
"""
df = pd.read_csv(file, index_col=(0))
r = df["A&U/A"]
p = df["A&U/U"]
r = df["universe&file"] / (df["universe&file"] + df["file/universe"])
p = df["universe&file"] / (df["universe&file"] + df["univers/file"])
df["F_10"] = (1 + 10**2) * (p * r) / ((10**2 * p) + r)
if f10_each_file:
return df["F_10"]
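The rewritten recall and precision lines feed the F_10 score computed just below them, which is the standard F-beta measure with beta = 10, so recall counts far more than precision. A short sketch of that formula with illustrative numbers, not taken from any real assessment file:

```python
def f_beta(precision: float, recall: float, beta: float = 10.0) -> float:
    # F-beta as used in get_f_10_score_from_assessment_file: (1 + b^2) * p * r / (b^2 * p + r)
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# Illustrative values only.
print(f_beta(precision=0.5, recall=0.9))  # ~0.893, dominated by recall
print(f_beta(precision=0.9, recall=0.5))  # ~0.502, low recall drags the score down
```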
8 changes: 4 additions & 4 deletions geniml/assess/distance.py
@@ -15,7 +15,7 @@
def flexible_distance_between_two_regions(region, query):
"""Calculate distance between region and flexible region from flexible universe
:param [int, int] region: region from flexible universe
:param int query: analysed region
:param int query: analyzed region
:return int: distance
"""
if region[0] <= query <= region[1]:
@@ -40,8 +40,8 @@ def distance_to_closest_region(
Calculate distance from given peak to the closest region in database
:param file db: database file
:param list db_queue: queue of three last positions in database
:param i: analysed position from the query
:param str current_chrom: current analysed chromosome from query
:param i: analyzed position from the query
:param str current_chrom: current analyzed chromosome from query
:param list unused_db: list of positions from universe that were not compared to query
:param list pos_index: which indexes from universe region use to calculate distance
:param bool flexible: whether the universe if flexible
@@ -90,7 +90,7 @@ def read_in_new_universe_regions(
Read in new universe regions closest to the peak
:param file db: universe file
:param str q_chrom: new peak's chromosome
:param str current_chrom: chromosome that was analysed so far
:param str current_chrom: chromosome that was analyzed so far
:param list unused_db: list of positions from universe that were not compared to query
:param list db_queue: que of three last positions in universe
:param bool waiting: whether iterating through file, without calculating
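The docstring fixes in this file touch flexible_distance_between_two_regions, which returns 0 when the analyzed position falls inside a flexible universe region (the branch visible at the top of the hunk). The rest of the function is collapsed, so the following is only a plausible sketch of point-to-interval distance, assuming distance outside the region is measured to the nearest edge:

```python
def point_to_interval_distance(region: tuple[int, int], query: int) -> int:
    """Distance from a query position to a [start, end] region; 0 if the position is inside."""
    start, end = region
    if start <= query <= end:  # matches the condition shown in the diff
        return 0
    return min(abs(query - start), abs(query - end))

print(point_to_interval_distance((100, 200), 150))  # 0
print(point_to_interval_distance((100, 200), 260))  # 60
```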
8 changes: 4 additions & 4 deletions geniml/assess/intersection.py
@@ -70,10 +70,10 @@ def two_region_intersection_diff(
:param bool inside_d: whether there is still part of the region from universe to analyse
:param bool inside_q: whether there is still part of the region from query to analyse
:param int overlap: size of overlap
:param int start_d: start position of currently analysed universe region
:param int start_q: start position of currently analysed query region
:param bool waiting_d: whether waiting for the query to finish chrom
:param bool waiting_q: whether waiting for the universe to finish chrom
:param int start_d: start position of currently analyzed universe region
:param int start_q: start position of currently analyzed query region
:param bool waiting_d: whether waiting for the query to finish chromosome
:param bool waiting_q: whether waiting for the universe to finish chromosome
"""
if waiting_q:
only_in_d += region_d[1] - region_d[0]
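The renamed parameters describe two_region_intersection_diff, which accumulates overlap and universe-only / query-only coverage while streaming both files chromosome by chromosome. The core arithmetic, the overlap length of two intervals, can be sketched on its own; this helper is illustrative and is not the function from the diff:

```python
def interval_overlap(region_d: tuple[int, int], region_q: tuple[int, int]) -> int:
    """Length of the overlap between two [start, end) regions, 0 if they are disjoint."""
    start = max(region_d[0], region_q[0])
    end = min(region_d[1], region_q[1])
    return max(0, end - start)

print(interval_overlap((100, 200), (150, 300)))  # 50
print(interval_overlap((100, 200), (250, 300)))  # 0
```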
10 changes: 5 additions & 5 deletions geniml/assess/likelihood.py
@@ -41,11 +41,11 @@ def calc_likelihood_hard(
:param coverage_folder: path to a folder with genome coverage by tracks
:param str name: suffix of model file name, which contains information
about model type
:param int s_index: from which position in univers line take assess region
:param int s_index: from which position in universe line take assess region
start position
:param int e_index: from which position in univers line take assess region
:param int e_index: from which position in universe line take assess region
end position
:return float: likelihood of univers for given model
:return float: likelihood of universe for given model
"""
current_chrom = ""
missing_chrom = ""
@@ -178,9 +178,9 @@ def weigh_livelihood(start, end, model_process, model_cove, model_out, reverse):
Calculate weighted likelihood of flexible part of the region
:param int start: start of the region
:param int end: end of the region
:param array model_process: model for analysed type of flexible region
:param array model_process: model for analyzed type of flexible region
:param array model_cove: model for coverage
:param array model_out: model for flexible region that is not being analysed
:param array model_out: model for flexible region that is not being analyzed
:param bool reverse: if model_process corespondents to end we have to reverse the weighs
:return float: likelihood of flexible part of the region
"""
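The corrected docstrings say that s_index and e_index choose which columns of a universe (BED-like) line supply the assessed region's start and end, which matters for flexible universes that carry extra coordinate columns. A hedged sketch of that column-selection idea; the field layout below is invented for illustration and is not the actual geniml parser:

```python
def region_from_universe_line(line: str, s_index: int = 1, e_index: int = 2):
    """Pick chrom, start, and end out of a tab-separated universe line by column index."""
    fields = line.rstrip("\n").split("\t")
    return fields[0], int(fields[s_index]), int(fields[e_index])

# Invented flexible-universe line: chrom, outer start, outer end, name, score, strand, inner start, inner end.
line = "chr1\t1000\t2000\tregion_1\t0\t.\t1100\t1900"
print(region_from_universe_line(line))                        # ('chr1', 1000, 2000)
print(region_from_universe_line(line, s_index=6, e_index=7))  # ('chr1', 1100, 1900)
```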
14 changes: 7 additions & 7 deletions geniml/atacformer/main.py
@@ -1,5 +1,5 @@
import os
from typing import Union, List
from typing import List, Union

import torch
import torch.nn as nn
@@ -9,16 +9,16 @@
from ..models.main import ExModel
from ..tokenization.main import ITTokenizer
from .const import (
POOLING_TYPES,
POOLING_METHOD_KEY,
CONFIG_FILE_NAME,
D_MODEL_KEY,
VOCAB_SIZE_KEY,
NUM_LAYERS_KEY,
NHEAD_KEY,
DEFAULT_EMBEDDING_DIM,
CONFIG_FILE_NAME,
MODEL_FILE_NAME,
NHEAD_KEY,
NUM_LAYERS_KEY,
POOLING_METHOD_KEY,
POOLING_TYPES,
UNIVERSE_FILE_NAME,
VOCAB_SIZE_KEY,
)


13 changes: 7 additions & 6 deletions geniml/atacformer/utils.py
@@ -1,13 +1,14 @@
import os
from glob import glob
from math import ceil
from typing import List, Tuple

import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from genimtools.utils import read_tokens_from_gtok
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

from .const import MASK_RATE, REPLACE_WITH_MASK_RATE, REPLACE_WITH_RANDOM_RATE, KEEP_RATE
from .const import KEEP_RATE, MASK_RATE, REPLACE_WITH_MASK_RATE, REPLACE_WITH_RANDOM_RATE


class AtacformerMLMDataset(Dataset):
@@ -44,7 +45,7 @@ def __init__(
def __len__(self):
return len(self.files)

def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
This should return a tuple of (tokens, masked_tokens, mask_ids).
"""
@@ -72,8 +73,8 @@ def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
return tokens, masked_tokens, mask_ids

def collate_batch(
batch: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]], padding_token: int
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
self, batch: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]], padding_token: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Collate function for the MLM dataset. This should take a batch of
(tokens, masked_tokens, mask_ids) and return a tuple of (tokens, masked_tokens, mask_ids) that are padded
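Two changes land in this hunk: the annotations switch from the built-in tuple[...] generics to typing.Tuple (matching the List/Tuple import added at the top of the file), and collate_batch gains its missing self parameter. The padding it relies on, torch.nn.utils.rnn.pad_sequence, right-pads variable-length token tensors to the longest sequence in the batch; a minimal sketch with invented token ids, not the dataset's actual masking logic:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Invented token tensors of different lengths, as a tokenized-region dataset might yield.
batch = [torch.tensor([5, 9, 2]), torch.tensor([7, 1]), torch.tensor([3, 8, 4, 6])]

padding_token = 0  # assumed padding id, purely for illustration
padded = pad_sequence(batch, batch_first=True, padding_value=padding_token)
print(padded.shape)  # torch.Size([3, 4])
print(padded)        # shorter rows are right-padded with 0
```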
112 changes: 101 additions & 11 deletions geniml/bbclient/bbclient.py
@@ -4,7 +4,9 @@
from logging import getLogger
from typing import List, NoReturn, Union

import boto3
import requests
from botocore.exceptions import ClientError
from ubiquerg import is_url

from .._version import __version__
@@ -13,23 +15,25 @@
from .const import (
BEDFILE_URL_PATTERN,
BEDSET_URL_PATTERN,
DEFAULT_BUCKET_NAME,
DEFAULT_BEDBASE_API,
DEFAULT_BEDFILE_EXT,
DEFAULT_BEDFILE_SUBFOLDER,
DEFAULT_BEDSET_EXT,
DEFAULT_BEDSET_SUBFOLDER,
DEFAULT_BUCKET_FOLDER,
DEFAULT_CACHE_FOLDER,
MODULE_NAME,
)
from .utils import BedCacheManager, get_bbclient_path_folder
from .utils import BedCacheManager, get_abs_path

_LOGGER = getLogger(MODULE_NAME)


class BBClient(BedCacheManager):
def __init__(
self,
cache_folder: str = DEFAULT_CACHE_FOLDER,
cache_folder: Union[str, os.PathLike] = DEFAULT_CACHE_FOLDER,
bedbase_api: str = DEFAULT_BEDBASE_API,
):
"""
@@ -39,23 +43,24 @@ def __init__(
if not given it will be the environment variable `BBCLIENT_CACHE`
:param bedbase_api: url to bedbase
"""
# get default cache folder from environment variable set by user
super().__init__(get_bbclient_path_folder(cache_folder))
cache_folder = get_abs_path(cache_folder)
super().__init__(cache_folder)

self.bedbase_api = bedbase_api

def load_bedset(self, bedset_id: str) -> BedSet:
"""
Loads a BED set from cache, or downloads and caches it plus BED files in it if it doesn't exist
Load a BEDset from cache, or download and add it to the cache with its BED files
:param bedset_id: unique identifier of BED set
:param BedSet: BedSet object
"""

file_path = self._bedset_path(bedset_id)

if os.path.exists(file_path):
_LOGGER.info(f"BED set {bedset_id} already exists in cache.")
with open(file_path, "r") as file:
extracted_data = file.readlines()
extracted_data = file.read().splitlines()
else:
extracted_data = self._download_bedset_data(bedset_id)
# write the identifiers of BED files in the BedSet to a local .txt file
@@ -79,8 +84,8 @@ def _download_bedset_data(self, bedset_id: str) -> List[str]:
"""
bedset_url = BEDSET_URL_PATTERN.format(bedbase_api=self.bedbase_api, bedset_id=bedset_id)
response = requests.get(bedset_url)
data = response.json()
extracted_data = [entry.get("record_identifier") for entry in data["bedfile_metadata"]]
data = response.json()["results"]
extracted_data = [entry.get("id") for entry in data]

return extracted_data

@@ -89,6 +94,7 @@ def load_bed(self, bed_id: str) -> RegionSet:
Loads a BED file from cache, or downloads and caches it if it doesn't exist
:param bed_id: unique identifier of a BED file
:return: the RegionSet object
"""
file_path = self._bedfile_path(bed_id)

@@ -130,8 +136,8 @@ def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:
"""
Add a BED file to the cache
:param bedfile: a RegionSet class or a path to a BED file to be added to cache
:return: the identifier if the BedFile object
:param bedfile: a RegionSet object or a path or url to the BED file
:return: the RegionSet identifier
"""
if isinstance(bedfile, str):
bedfile = RegionSet(bedfile)
@@ -163,12 +169,96 @@ def add_bed_to_cache(self, bedfile: Union[RegionSet, str]) -> str:

return bedfile_id

def add_bed_to_s3(
self,
identifier: str,
bucket: str = DEFAULT_BUCKET_NAME,
endpoint_url: str = None,
aws_access_key_id: str = None,
aws_secret_access_key: str = None,
s3_path: str = DEFAULT_BUCKET_FOLDER,
) -> str:
"""
Add a cached BED file to S3
:param identifier: the unique identifier of the BED file
:param bucket: the name of the bucket
:param endpoint_url: the URL of the S3 endpoint [Default: set up by the environment vars]
:param aws_access_key_id: the access key of the AWS account [Default: set up by the environment vars]
:param aws_secret_access_key: the secret access key of the AWS account [Default: set up by the environment vars]
:param s3_path: the path on S3
:return: full path on S3
"""
s3_client = boto3.client(
"s3",
endpoint_url=endpoint_url,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
local_file_path = self.seek(identifier)
bed_file_name = os.path.basename(local_file_path)
s3_bed_path = os.path.join(identifier[0], identifier[1], bed_file_name)
if s3_path:
s3_bed_path = os.path.join(s3_path, s3_bed_path)

s3_client.upload_file(local_file_path, bucket, s3_bed_path)
_LOGGER.info(f"Project was uploaded successfully to s3://{bucket}/{s3_bed_path}")
return s3_bed_path

def get_bed_from_s3(
self,
identifier: str,
bucket: str = DEFAULT_BUCKET_NAME,
endpoint_url: str = None,
aws_access_key_id: str = None,
aws_secret_access_key: str = None,
s3_path: str = DEFAULT_BUCKET_FOLDER,
) -> str:
"""
Get a cached BED file from S3 and cache it locally
:param identifier: the unique identifier of the BED file
:param bucket: the name of the bucket
:param endpoint_url: the URL of the S3 endpoint [Default: set up by the environment vars]
:param aws_access_key_id: the access key of the AWS account [Default: set up by the environment vars]
:param aws_secret_access_key: the secret access key of the AWS account [Default: set up by the environment vars]
:param s3_path: the path on S3
:return: bed file id
:raise FileNotFoundError: if the identifier does not exist in cache
"""
s3_bed_path = os.path.join(
identifier[0], identifier[1], f"{identifier}{DEFAULT_BEDFILE_EXT}"
)
if s3_path:
s3_bed_path = os.path.join(s3_path, s3_bed_path)

s3_client = boto3.client(
"s3",
endpoint_url=endpoint_url,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
try:
s3_client.download_file(
bucket, s3_bed_path, self._bedfile_path(identifier, create=True)
)
except ClientError as e:
if e.response["Error"]["Code"] == "404":
raise FileNotFoundError(f"{identifier} does not exist in S3.")
else:
raise e

return identifier

def seek(self, identifier: str) -> str:
"""
Get local path to BED file or BED set with specific identifier
:param identifier: the unique identifier
:return: the local path of the file
:raise FileNotFoundError: if the identifier does not exist in cache
"""

# check if any BED set has that identifier
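The new add_bed_to_s3 and get_bed_from_s3 methods wrap boto3's upload_file and download_file around the existing cache layout (the first two characters of the identifier become subfolders). A hedged usage sketch assembled only from names visible in this diff; the import path, cache folder, bucket, and file names are placeholders, and credentials are assumed to come from the usual AWS environment variables when the keyword arguments are left at their defaults:

```python
from geniml.bbclient import BBClient  # import path assumed from the package layout

bbc = BBClient(cache_folder="~/.bbcache", bedbase_api="https://api.bedbase.org")  # illustrative values

# Cache a local BED file and look up where it landed.
bed_id = bbc.add_bed_to_cache("example_peaks.bed")  # hypothetical input file
local_path = bbc.seek(bed_id)                       # local path inside the cache

# Push the cached file to S3, then fetch it back into a cache later.
s3_key = bbc.add_bed_to_s3(bed_id, bucket="my-bed-bucket")  # placeholder bucket
bbc.get_bed_from_s3(bed_id, bucket="my-bed-bucket")         # raises FileNotFoundError on a missing key
```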