Skip to content

Commit

Permalink
significant updates to the model structure and location
Browse files Browse the repository at this point in the history
  • Loading branch information
chg60 committed Dec 19, 2021
1 parent c97ca41 commit 760455d
Show file tree
Hide file tree
Showing 35 changed files with 638 additions and 1,506,258 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ numpy~=1.20.2
pandas~=1.2.4
pretty-html-table==0.9.10
plotly~=5.1.0
kaleido~=0.2.1
scipy~=1.7.0
setuptools~=58.2.0
50 changes: 50 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
[metadata]
license_file = LICENSE
name = depht
version = 1.0.0
author = Christian Gauthier
author_email = [email protected]
description = Discovery and Extraction of Phages Tool
long_description = file:README.md
long_description_content_type = text/markdown
url = https://github.com/chg60/DEPhT
project_urls =
classifiers =
Intended Audience :: Science/Research
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Operating System :: MacOS
Operating System :: POSIX :: Linux
Programming Language :: Python
Programming Language :: Python :: 3
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9

[options]
python_requires = >=3.6
package_dir =
=src
packages = find:
install_requires =
biopython~=1.79
bitarray~=2.0.0
bokeh~=2.2.2
dna-features-viewer~=3.0.3
kaleido~=0.2.1
matplotlib~=3.4.1
numpy~=1.20.2
pandas~=1.2.4
pretty-html-table~=0.9.10
plotly~=5.1.0
scipy~=1.7.0

[options.packages.find]
where = src
include = depht, depht.classes, depht.functions
exclude =

[options.entry_points]
console_scripts =
depht = depht.__main__:main
18 changes: 0 additions & 18 deletions setup.py

This file was deleted.

134 changes: 79 additions & 55 deletions src/depht/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from Bio import SeqIO

from depht import PACKAGE_DIR
from depht.classes.contig import CODING_FEATURE_TYPES, Contig
from depht.classes.prophage import ANNOTATIONS, DEFAULT_PRODUCT, Prophage
from depht.functions.annotation import annotate_record, MIN_LENGTH
Expand All @@ -23,14 +22,27 @@
from depht.functions.mmseqs import assemble_bacterial_mask
from depht.functions.multiprocess import PHYSICAL_CORES
from depht.functions.prophage_prediction import predict_prophage_coords, WINDOW
from depht.functions.sniff_format import sniff_format
from depht.functions.visualization import draw_complete_diagram

# GLOBAL VARIABLES
# -----------------------------------------------------------------------------
# For temporary file I/O
TMP_DIR = pathlib.Path("/tmp/prophicient")
CONTIG_DATA_HEADER = ["Gene ID", "Start", "End", "Prediction",
"Bacterial Homology", "Phage Homology"]
DEPHT_DIR = pathlib.Path().home().joinpath(".depht")
if not DEPHT_DIR.is_dir():
DEPHT_DIR.mkdir()

MODEL_DIR = DEPHT_DIR.joinpath("models")
if not MODEL_DIR.is_dir():
MODEL_DIR.mkdir()

LOCAL_MODELS = [x.name for x in MODEL_DIR.iterdir() if x.name != ".DS_Store"]
if len(LOCAL_MODELS) == 0:
print("No DEPhT models found in ~/.depht/models. Please create one using "
"'depht_train', or download models from https://osf.io/zt4n3/")
sys.exit(0)

# Where temporary file I/O should happen
TMP_DIR = DEPHT_DIR.joinpath("tmp")

# Model can't scan contigs with fewer CDS than window size
MIN_CDS_FEATURES = WINDOW
Expand All @@ -49,15 +61,8 @@
MIN_PRODUCTS_NORMAL = 5
MIN_PRODUCTS_STRICT = 10

# DEFAULT FILE PATHS
# -----------------------------------------------------------------------------
BLASTN_DB = PACKAGE_DIR.joinpath("data/blastn/mycobacteria")

ESSENTIAL_DB = PACKAGE_DIR.joinpath("data/hhsearch/essential/essential")
EXTENDED_DB = PACKAGE_DIR.joinpath("data/hhsearch/extended/extended")

BACTERIAL_REF_FASTA = PACKAGE_DIR.joinpath("data/mmseqs/bacterial_genes.fasta")
BACTERIAL_REF_VALUES = PACKAGE_DIR.joinpath("data/mmseqs/bacterial_genes.pbv")
CONTIG_DATA_HEADER = ["Gene ID", "Start", "End", "Prediction",
"Bacterial Homology", "Phage Homology"]


def parse_args():
Expand All @@ -66,42 +71,46 @@ def parse_args():
:return: parsed_args
"""
p = argparse.ArgumentParser(description=__doc__)
p.add_argument("-i", "--infile",
type=pathlib.Path, nargs="+", required=True,
p = argparse.ArgumentParser(description=__doc__, prog="depht")
# Positional arguments
p.add_argument("infile", type=pathlib.Path, nargs="+",
help="path to genome file(s) to scan for prophages")
p.add_argument("-f", "--input-format",
choices=("fasta", "genbank"), default="fasta", type=str,
help="input which format your input file(s) are in")
p.add_argument("-o", "--outdir",
type=pathlib.Path, required=True,
help="path where outputs can be written")
p.add_argument("outdir", type=pathlib.Path,
help="path where output files can be written")

# Optional arguments
p.add_argument("--model", type=str,
choices=set(LOCAL_MODELS), default=LOCAL_MODELS[0],
help=f"which local model should be used [default: "
f"{LOCAL_MODELS[0]}]")
p.add_argument("-c", "--cpus",
default=PHYSICAL_CORES, type=int,
metavar="", default=PHYSICAL_CORES, type=int,
help=f"number of CPU cores to use [default: "
f"{PHYSICAL_CORES}]")
p.add_argument("-n", "--no-draw",
action="store_false", dest="draw",
p.add_argument("-n", "--no-draw", action="store_false", dest="draw",
help="don't draw genome diagram for identified prophage(s)")
p.add_argument("-m", "--mode",
choices=("fast", "normal", "strict"), default="normal",
help="select a runmode that favors speed or accuracy")
p.add_argument("-s", "--att_sensitivity", type=float,
default=ATT_SENSITIVITY,
p.add_argument("-s", "--att_sens", type=float,
default=ATT_SENSITIVITY, metavar="",
help="sensitivity parameter for att site detection.")
p.add_argument("-d", "--dump-data", action="store_true",
help="dump all support data to outdir")
p.add_argument("-v", "--verbose",
action="store_true",
help="print progress messages as the program runs")
p.add_argument("-t", "--tmp-dir", type=pathlib.Path, default=TMP_DIR,
help=f"temporary directory to use for file I/O [default: "
f"{TMP_DIR}]")
p.add_argument("-p", "--product-threshold", type=int, default=None,
help="select a phage homolog product lower threshold")
p.add_argument("-l", "--length-threshold", type=int, default=MIN_SIZE,
help=f"select a minimum length for prophages [default: "
f"{MIN_SIZE}]")
p.add_argument("-t", "--tmp-dir",
type=pathlib.Path, default=TMP_DIR, metavar="",
help=f"temporary directory to use for file I/O "
f"[default: {TMP_DIR}]")
p.add_argument("-p", "--products",
type=int, default=None, metavar="",
help="minimum number of phage homologs to report a prophage")
p.add_argument("-l", "--length",
type=int, default=MIN_SIZE, metavar="",
help=f"minimum length to report a prophage "
f"[default: {MIN_SIZE}]")

return p.parse_args()

Expand All @@ -113,15 +122,15 @@ def main():
"""
args = parse_args()

infiles = args.infile # What input file(s)?
fmt = args.input_format # What is the input file format?
infiles = args.infile # What input file(s) did the user give?
model = args.model # Which model was selected?
draw = args.draw # Are we going to draw genome diagrams?
dump = args.dump_data # Are we dumping data into outdir when done?
runmode = args.mode # What runmode are we using?
verbose = args.verbose # Print verbose outputs?
cpus = args.cpus # How many physical cores can we use?
att_sens = args.att_sensitivity # What's the att sensitivity modifier?
min_length = args.length_threshold # Minimum prophage length in bp
att_sens = args.att_sens # What's the att sensitivity modifier?
min_length = args.length # Minimum prophage length in bp

# Get output dir and make sure it's a valid path
outdir = pathlib.Path(args.outdir).resolve()
Expand All @@ -131,9 +140,17 @@ def main():

# Get the temporary directory, refresh it if it exists
tmpdir = pathlib.Path(args.tmp_dir).resolve()
if tmpdir.is_dir():
shutil.rmtree(tmpdir)
tmpdir.mkdir(parents=True)
if not tmpdir.is_dir():
tmpdir.mkdir(parents=True)

# Set up model directories
model_dir = MODEL_DIR.joinpath(model)
bact_ref_fasta = model_dir.joinpath("mmseqs/bacterial_genes.fasta")
bact_ref_values = model_dir.joinpath("mmseqs/bacterial_genes.pbv")
essential_db = model_dir.joinpath("hhsearch/essential")
extended_db = model_dir.joinpath("hhsearch/extended")
blast_db = model_dir.joinpath("blastn/references")
classifier = model_dir.joinpath("classifier.pkl")

# Mark program start time
mark = datetime.now()
Expand All @@ -155,15 +172,20 @@ def main():
if not genome_tmp_dir.is_dir():
genome_tmp_dir.mkdir()

# Automatically check the file format
fmt = sniff_format(infile)

# Parse all contigs of annotation-worthy length
if verbose:
print(f"\nparsing '{str(infile)}'...")
print(f"\nparsing '{str(infile)}' as {fmt}...")

# Parse the file and retain contigs above cutoff length
records = [x for x in SeqIO.parse(infile, fmt) if len(x) >= MIN_LENGTH]

if not records and verbose:
print(f"no {fmt}-formatted records of at least {MIN_LENGTH}bp "
f"found in '{str(infile)}' - skipping it...")
if not records:
if verbose:
print(f"no {fmt}-formatted records of at least {MIN_LENGTH}bp "
f"found in '{str(infile)}' - skipping it...")

shutil.rmtree(genome_tmp_dir) # clean up after ourselves
continue
Expand Down Expand Up @@ -209,16 +231,16 @@ def main():

# Detect conserved bacterial genes for each contig
bacterial_masks = assemble_bacterial_mask(
contigs, BACTERIAL_REF_FASTA, BACTERIAL_REF_VALUES, mmseqs_dir)
contigs, bact_ref_fasta, bact_ref_values, mmseqs_dir)

if verbose:
print("looking for high-probability prophage regions...")

# Predict prophage coordinates for each contig
prophage_predictions = list()
for i, contig in enumerate(contigs):
prediction = predict_prophage_coords(contig, EXTEND_BY,
mask=bacterial_masks[i])
prediction = predict_prophage_coords(
contig, classifier, EXTEND_BY, mask=bacterial_masks[i])

filtered_prediction = []
for pred in prediction:
Expand All @@ -245,22 +267,22 @@ def main():
if verbose:
print("searching for phage gene homologs...")

find_homologs(contigs, prophage_predictions, ESSENTIAL_DB,
find_homologs(contigs, prophage_predictions, essential_db,
hhsearch_dir, cpus)
product_threshold = MIN_PRODUCTS_NORMAL

if runmode == "strict":
if verbose:
print("extending search for phage gene homologs...")
find_homologs(contigs, prophage_predictions, EXTENDED_DB,
find_homologs(contigs, prophage_predictions, extended_db,
hhsearch_dir, cpus, cache_scores=False)
product_threshold = MIN_PRODUCTS_STRICT
else:
for contig in contigs:
contig.fill_hhsearch_scores()

if args.product_threshold is not None:
product_threshold = args.product_threshold
if args.products is not None:
product_threshold = args.products

prophages = load_initial_prophages(contigs, prophage_predictions,
product_threshold,
Expand All @@ -277,7 +299,7 @@ def main():

# Detect attachment sites, where possible, for the predicted prophage
search_space = att_sens * EXTEND_BY
detect_att_sites(prophages, BLASTN_DB, search_space, att_dir)
detect_att_sites(prophages, blast_db, search_space, att_dir)

prophages = [prophage for prophage in prophages
if prophage.length >= min_length]
Expand Down Expand Up @@ -439,6 +461,8 @@ def detect_att_sites(prophages, reference_db_path, search_space,
if search_space > half_len:
search_space = half_len

search_space = int(search_space)

l_seq = str(prophage.seq[:search_space])
r_seq = str(prophage.seq[-1*search_space:])

Expand Down
8 changes: 7 additions & 1 deletion src/depht/classes/prophage_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def fit(self, x, y, plot=False):

ratio = float(len(bacteria_df)) / len(prophage_df)

figs = list()
for i, feature in enumerate(x.columns):
prophage_series = prophage_df.loc[:, feature]
bacteria_series = bacteria_df.loc[:, feature]
Expand All @@ -63,13 +64,15 @@ def fit(self, x, y, plot=False):
color="class", barmode="overlay",
color_discrete_map=colormap,
template="simple_white")
fig.show()
figs.append((feature, fig))

dist = ProbabilityDistribution(prophage_hist, bacteria_hist,
weights=[1, ratio])

self.distributions_[feature] = dist

return figs

def predict_proba(self, x, feature_weights=None):
"""
Uses the known probability distributions for input features
Expand Down Expand Up @@ -134,6 +137,9 @@ def predict(self, x, feature_weights=None, alpha=0.5):

return temp_predict

def __len__(self):
return len(self.distributions_)


class Histogram:
def __init__(self, data, bin_width=None):
Expand Down
14 changes: 0 additions & 14 deletions src/depht/data/blastn/Mycobacteria.nsd

This file was deleted.

Binary file removed src/depht/data/blastn/Mycobacteria.nsi
Binary file not shown.
Binary file removed src/depht/data/blastn/mycobacteria.nhr
Binary file not shown.
Binary file removed src/depht/data/blastn/mycobacteria.nin
Binary file not shown.
Binary file removed src/depht/data/blastn/mycobacteria.nog
Binary file not shown.
Binary file removed src/depht/data/blastn/mycobacteria.nsq
Binary file not shown.
14 changes: 0 additions & 14 deletions src/depht/data/blastn/references.fasta

This file was deleted.

Loading

0 comments on commit 760455d

Please sign in to comment.