Skip to content

Commit

Permalink
Merge pull request #30 from bigict/dev
Browse files Browse the repository at this point in the history
fix: crop protein name length to 31
  • Loading branch information
chungongyu authored Mar 24, 2024
2 parents 708b70c + 747115b commit 6ffa369
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 9 deletions.
10 changes: 10 additions & 0 deletions kclust2db.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@
"$cdhit -i $infile -o $outfile -c $c -n $n -T $ncpu -M 8000")


def write_fseqs(txt, outfile, max_name_len=32): # pylint: disable=redefined-outer-name
with open(outfile, "w") as f:
for line in txt.splitlines():
if line.startswith(">"):
name, *_ = line.split(" ")
if len(name) > max_name_len:
line = f"{line[:max_name_len]} {line[max_name_len:]}"
f.write(f"{line}\n")


def remove_a3m_gap(infile, outfile, seqname_prefix=""): # pylint: disable=redefined-outer-name
''' read a3m/fasta format infile, remove gaps and output to outfile.
return the number of sequences '''
Expand Down
21 changes: 12 additions & 9 deletions msa_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
import logging

from hhpaths import bin_dict
from kclust2db import id2s_dict, kclust2db, remove_a3m_gap
from kclust2db import id2s_dict, kclust2db, remove_a3m_gap, write_fseqs
from utils import exists, make_tmpdir, mkdir_if_not_exist

logger = logging.getLogger(__file__)
Expand Down Expand Up @@ -515,8 +515,8 @@ def trim_eslsfetch(fseqs_file,
if len(seq) < Lpart:
trim_txt += f">{seqname_prefix}{name}\n{seq}\n"
continue
for i in range(0, L, Lpart / 2):
start_pos = Lpart * i / 2
for i in range(0, L, Lpart // 2):
start_pos = Lpart * i // 2
end_pos = start_pos + Lpart
trim_txt += f">{seqname_prefix}{name}_{i}\n{seq[start_pos:end_pos]}\n" # pylint: disable=line-too-long
if end_pos >= len(seq):
Expand Down Expand Up @@ -558,8 +558,9 @@ def run_jackblits(query_fasta, db_list, ncpu, hhblits_prefix, jackblits_prefix):
db = f"{jackblits_prefix}-mydb/mydb"

a3mdir = f"{jackblits_prefix}.fseqs"
with open(a3mdir, "w") as fp:
fp.write(txt)
write_fseqs(a3mdir, txt)
# with open(a3mdir, "w") as fp:
# fp.write(txt)

if txt.count("\n>") > kclust2db_threshold:
### cluster at 30% seqID and build database ###
Expand Down Expand Up @@ -630,8 +631,9 @@ def run_bfd(query_fasta, db_list, ncpu, hhblits_prefix, jackblits_prefix,
db = f"{bfd_prefix}-mydb/mydb"

a3mdir = f"{bfd_prefix}.fseqs"
with open(a3mdir, "w") as fp:
fp.write(txt)
write_fseqs(a3mdir, txt)
# with open(a3mdir, "w") as fp:
# fp.write(txt)

if txt.count("\n>") > kclust2db_threshold:
### cluster at 30% seqID and build database ###
Expand Down Expand Up @@ -802,8 +804,9 @@ def run_hmsblits(query_fasta, sequence, hhblits_prefix, db_list, ncpu, # pylint
mkdir_if_not_exist(db)

a3mdir = f"{hmmsearch_prefix}.fseqs"
with open(a3mdir, "w") as fp:
fp.write(txt)
write_fseqs(a3mdir, txt)
# with open(a3mdir, "w") as fp:
# fp.write(txt)

if txt.count("\n>") > kclust2db_threshold:
### cluster at 30% seqID and build database ###
Expand Down

0 comments on commit 6ffa369

Please sign in to comment.