Merge 'read_sequencer/main' into scRNAsim_toolz

zavolanlab · Sep 19, 2023 · e3d9ec4 · e3d9ec4
2 parents a4fecb4 + a5e6c6b
commit e3d9ec4
Show file tree

Hide file tree

Showing 14 changed files with 692 additions and 0 deletions.
diff --git a/read_sequencer/.gitignore b/read_sequencer/.gitignore
@@ -0,0 +1,175 @@
+.DS_Store
+.idea/
+read_sequencer.egg-info
+readsequencer.egg-info
+# Created by https://www.toptal.com/developers/gitignore/api/python
+# Edit at https://www.toptal.com/developers/gitignore?templates=python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+
+# End of https://www.toptal.com/developers/gitignore/api/python
diff --git a/read_sequencer/.gitlab-ci.yml b/read_sequencer/.gitlab-ci.yml
@@ -0,0 +1,34 @@
+default:         # Set default
+  tags:
+    - docker
+  image: python:3.10-slim-buster
+
+stages:          # List of stages for jobs, and their order of execution
+  - build
+  - test
+
+build-job:       # This job runs in the build stage, which runs first.
+  stage: build
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements-dev.txt
+    - pip install -e .
+
+unit-test-job:   # This job runs in the test stage.
+  stage: test    # It only starts when the job in the build stage completes successfully.
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements-dev.txt
+    - pip install -e .
+    - coverage run --source readsequencer -m pytest
+    - coverage report -m
+
+lint-test-job:   # This job also runs in the test stage.
+  stage: test    # It can run at the same time as unit-test-job (in parallel).
+  script:
+    - pip install -r requirements.txt
+    - pip install -r requirements-dev.txt
+    - pip install -e .
+    - flake8 --docstring-convention google readsequencer/ tests/
+    - pylint readsequencer/ tests/
+    - mypy readsequencer/ tests/
diff --git a/read_sequencer/LICENSE.txt b/read_sequencer/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Clara Serger, Michael Sandholzer and Christoph Harmel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/read_sequencer/README.md b/read_sequencer/README.md
@@ -0,0 +1,57 @@
+# Read Sequencer
+
+## Overview
+
+Read Sequencer is a python package to simulate sequencing. 
+It reads FASTA files, simulates sequencing with a specified read length, and writes the resulting sequences into a new FASTA file.
+
+
+## Installation from GitLab
+
+Read Sequencer requires Python 3.9 or later.
+
+Install Read Sequencer from the GitLab repository using:
+
+```
+git clone https://git.scicore.unibas.ch/zavolan_group/tools/read-sequencer.git
+cd read-sequencer
+pip install . 
+```
+
+## Usage
+
+```
+usage: readsequencer [-h] [-i INPUT] [-r READ_LENGTH] [-n N_RANDOM] [-s CHUNK_SIZE] output 
+Simulates sequencing of DNA sequences specified by an FASTA file.
+
+positional arguments:
+  output                path to FASTA file
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -i INPUT, --input INPUT
+                        path to FASTA file
+  -r READ_LENGTH, --read-length READ_LENGTH
+                        read length for sequencing
+  -n N_RANDOM, --n_random N_RANDOM
+                        n random sequences. Just used if input fasta file is not specified.
+  -s CHUNK_SIZE, --chunk-size CHUNK_SIZE
+                        chunk_size for batch processing
+
+```
+
+## Docker
+
+The docker image is available on docker hub: https://hub.docker.com/r/grrchrr/readsequencer
+
+```
+docker pull grrchrr/readsequencer
+docker run grrchrr/readsequencer readsequencer --help
+```
+
+## Contributors and Contact Information
+
+Christoph Harmel - [email protected]  
+Michael Sandholzer - [email protected]  
+Clara Serger - [email protected]  
+
diff --git a/read_sequencer/readsequencer/__init__.py b/read_sequencer/readsequencer/__init__.py
@@ -0,0 +1 @@
+"""Initialise read-sequencer."""
diff --git a/read_sequencer/readsequencer/cli.py b/read_sequencer/readsequencer/cli.py
@@ -0,0 +1,80 @@
+"""Receive command line arguments."""
+import argparse
+import logging
+from readsequencer.read_sequencer import ReadSequencer
+
+logging.basicConfig(
+    format='[%(asctime)s: %(levelname)s] %(message)s \
+        (module "%(module)s")',
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+LOG = logging.getLogger(__name__)
+
+
+def main():
+    """Use CLI arguments to simulate sequencing."""
+    parser = argparse.ArgumentParser(
+        prog="readsequencer",
+        description="Simulates sequencing of DNA sequences specified \
+            by an FASTA file.",
+    )
+    parser.add_argument(
+        "output",
+        help="path to FASTA file"
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        default=None,
+        help="path to FASTA file"
+    )
+    parser.add_argument(
+        "-r",
+        "--read-length",
+        type=int,
+        default=100,
+        help="read length for sequencing",
+    )
+    parser.add_argument(
+        "-n",
+        "--n_random",
+        default=100,
+        type=int,
+        help="n random sequences. Just used if input"
+             "fasta file is not specified.",
+    )
+    parser.add_argument(
+        "-s",
+        "--chunk-size",
+        default=10000,
+        type=int,
+        help="chunk_size for batch processing",
+    )
+    args = parser.parse_args()
+    LOG.info("Read sequencer started.")
+    if args.input is not None:
+        read_sequencer = ReadSequencer(
+            fasta=args.input,
+            output=args.output,
+            read_length=args.read_length,
+            chunk_size=args.chunk_size,
+        )
+        read_sequencer.get_n_sequences()
+    else:
+        read_sequencer = ReadSequencer(
+            fasta=args.input,
+            output=args.output,
+            read_length=args.read_length,
+            chunk_size=args.chunk_size,
+        )
+        read_sequencer.define_random_sequences(n_seq=args.n_random)
+
+    read_sequencer.run_sequencing()
+
+    LOG.info("Read sequencer finished.")
+
+
+if __name__ == '__main__':  # run the CLI when executed as a script
+    main()