Skip to content

Commit

Permalink
v1.7.3.3
Browse files Browse the repository at this point in the history
  • Loading branch information
Kinggerm committed Feb 10, 2021
1 parent d43673f commit 711e275
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 10 deletions.
18 changes: 15 additions & 3 deletions GetOrganelleLib/pipe_control_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ def __str__(self):

HEAD_MAXIMUM_LINES = 2147483647

# Inclusive [lowest, highest] character ranges; is_valid_path() returns False
# for any path containing a character that falls inside one of these ranges.
INVALID_PATH_CHAR_RANGES = [
[u'\u4e00', u'\u9fff'] # Chinese characters (U+4E00 to U+9FFF) are not accepted by SPAdes
]


def simple_log(log, output_base, prefix, log_level="NOTSET"):
log_simple = log
Expand Down Expand Up @@ -168,6 +172,14 @@ def executable(test_this):
return True if os.access(test_this, os.X_OK) or getstatusoutput(test_this)[0] != DEAD_CODE else False


def is_valid_path(path_str):
    """
    Check a path string against INVALID_PATH_CHAR_RANGES.

    :param path_str: the path (or file name) to check; it is scanned one
        character at a time
    :return: False if any character lies within one of the inclusive
        [lower, upper] ranges listed in INVALID_PATH_CHAR_RANGES, True
        otherwise (including for an empty string)
    """
    return not any(lower <= character <= upper
                   for character in path_str
                   for lower, upper in INVALID_PATH_CHAR_RANGES)


def run_command(command, print_command=False, check_echo_error=True):
if print_command:
print(command)
Expand Down Expand Up @@ -774,14 +786,14 @@ def zip_file(source, target, verbose_log=False, log_handler=None, remove_source=

def unzip(source, target, line_limit=HEAD_MAXIMUM_LINES, verbose_log=False, log_handler=None):
target_temp = target + ".Temp"
if HEAD_MAXIMUM_LINES == float("inf"):
if line_limit == float("inf"):
try_commands = [
"tar -x -f " + source + " -O > " + target_temp,
"gunzip -c " + source + " > " + target_temp]
else:
try_commands = [
"tar -x -f " + source + " -O | head -n " + str(line_limit) + " > " + target_temp,
"gunzip -c " + source + " | head -n " + str(line_limit) + " > " + target_temp]
"tar -x -f " + source + " -O | head -n " + str(int(line_limit)) + " > " + target_temp,
"gunzip -c " + source + " | head -n " + str(int(line_limit)) + " > " + target_temp]
# re-order try commands
if "tar." not in source:
try_commands = try_commands[1], try_commands[0]
Expand Down
9 changes: 9 additions & 0 deletions GetOrganelleLib/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ def get_versions():


versions = [
{
"number": "1.7.3.3",
"features": [
"1. early termination on invalid path characters for spades",
"2. fix a bug introduced by '--max-reads inf'",
"3. get_organelle_config.py: fix a bug if a new organelle types was added and '--use-local' was used",
],
"time": "2021-02-11 01:00 UTC+8"
},
{
"number": "1.7.3.2",
"features": [
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
[![Anaconda-Server Badge](https://anaconda.org/bioconda/getorganelle/badges/latest_release_date.svg)](https://anaconda.org/bioconda/getorganelle)

[![GitHub release](https://img.shields.io/github/release/Kinggerm/GetOrganelle.svg)](https://GitHub.com/Kinggerm/GetOrganelle/releases/)
[![GitHub version](https://img.shields.io/github/commits-since/Kinggerm/GetOrganelle/1.7.3.1.svg)](https://github.com/Kinggerm/GetOrganelle/commit/master)
[![GitHub version](https://img.shields.io/github/commits-since/Kinggerm/GetOrganelle/1.7.3.3.svg)](https://github.com/Kinggerm/GetOrganelle/commit/master)

This toolkit assembles organelle genomes from genome skimming data.

It achieved the best performance overall both on simulated and real data and was recommended as the default for chloroplast assemblies in a third-party comparison paper ([Freudenthal et al. 2020. Genome Biology](https://doi.org/10.1186/s13059-020-02153-6)).
It achieved the best performance overall both on simulated and real data and was recommended as the default for chloroplast genome assemblies in a third-party comparison paper ([Freudenthal et al. 2020. Genome Biology](https://doi.org/10.1186/s13059-020-02153-6)).

<div id="citation"></div>

Expand Down Expand Up @@ -58,6 +58,13 @@ Download [a simulated _Arabidopsis thaliana_ WGS dataset](https://github.com/Kin
wget https://github.com/Kinggerm/GetOrganelleGallery/raw/master/Test/reads/Arabidopsis_simulated.1.fq.gz
wget https://github.com/Kinggerm/GetOrganelleGallery/raw/master/Test/reads/Arabidopsis_simulated.2.fq.gz

then verify the integrity of downloaded files using `md5sum`:

md5sum Arabidopsis_simulated.*.fq.gz
# 935589bc609397f1bfc9c40f571f0f19 Arabidopsis_simulated.1.fq.gz
# d0f62eed78d2d2c6bed5f5aeaf4a2c11 Arabidopsis_simulated.2.fq.gz
# Please re-download the reads if your md5 values do not match those above

then do the fast plastome assembly (memory: ~600MB, CPU time: ~60s):

get_organelle_from_reads.py -1 Arabidopsis_simulated.1.fq.gz -2 Arabidopsis_simulated.2.fq.gz -t 1 -o Arabidopsis_simulated.plastome -F embplant_pt -R 10
Expand Down Expand Up @@ -135,7 +142,7 @@ or see the detailed illustrations:

get_organelle_from_reads.py --help

To extract the plastome from an existing assembly graph (`*.fastg`/`*.gfa`):
To extract the plastid genome from an existing assembly graph (`*.fastg`/`*.gfa`; e.g. from long-read sequencing assemblies):

get_organelle_from_assembly.py -F embplant_pt -g ONT_assembly_graph.gfa

Expand Down
16 changes: 12 additions & 4 deletions get_organelle_from_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def get_options(description, version):
"7.5E7 (-F embplant_mt/other_pt/anonym); 3E8 (-F animal_mt)")
parser.remove_option("--fast")
parser.add_option("--fast", dest="fast_strategy",
help="=\"-R 10 -t 4 -J 5 -M 7 --max-words 3E7 --larger-auto-ws "
help="=\"-R 10 -t 4 -J 5 -M 7 --max-n-words 3E7 --larger-auto-ws "
"--disentangle-time-limit 360\"")
parser.remove_option("-k")
parser.add_option("-k", dest="spades_kmer", default="21,55,85,115",
Expand Down Expand Up @@ -676,6 +676,14 @@ def _check_default_db(this_sub_organelle, extra_type=""):
log_handler.info("WORKING DIR: " + os.getcwd())
log_handler.info(" ".join(["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n")

if options.run_spades:
for fq_file in [options.fq_file_1, options.fq_file_2] + options.unpaired_fq_files:
assert is_valid_path(os.path.basename(fq_file)), \
"Invalid characters for SPAdes in file name: " + os.path.basename(fq_file)
for fq_file in [options.output_base, options.prefix]:
assert is_valid_path(os.path.realpath(fq_file)), \
"Invalid characters for SPAdes in path: " + os.path.realpath(fq_file)

log_handler = timed_log(log_handler, options.output_base, options.prefix + "get_org.")
if options.word_size is None:
pass
Expand Down Expand Up @@ -1012,7 +1020,7 @@ def estimate_maximum_n_reads_using_mapping(
check_fq = os.path.join(this_check_dir, "check_" + str(f_id + 1))
if not (os.path.exists(check_fq) and resume):
if r_file.endswith(".gz"):
unzip(r_file, check_fq, int(4 * check_num_line), verbose_log, log_handler if verbose_log else None)
unzip(r_file, check_fq, 4 * check_num_line, verbose_log, log_handler if verbose_log else None)
else:
os.system("head -n " + str(int(4 * check_num_line)) + " " + r_file + " > " + check_fq + ".temp")
os.rename(check_fq + ".temp", check_fq)
Expand Down Expand Up @@ -3814,7 +3822,7 @@ def main():
target_fq = os.path.join(out_base, str(file_id + 1) + "-" +
os.path.basename(read_file)) + ".fastq"
if not (os.path.exists(target_fq) and resume):
unzip(read_file, target_fq, int(4 * all_read_nums[file_id]),
unzip(read_file, target_fq, 4 * all_read_nums[file_id],
options.verbose_log, log_handler)
else:
target_fq = os.path.join(out_base, str(file_id + 1) + "-" +
Expand Down Expand Up @@ -3999,7 +4007,7 @@ def main():
files_to_unzip = [os.path.join(out_base, candidate)
for candidate in os.listdir(out_base) if candidate.endswith(".fq.tar.gz")]
for file_to_u in files_to_unzip:
unzip(source=file_to_u, target=file_to_u[:-7])
unzip(source=file_to_u, target=file_to_u[:-7], line_limit=inf)
options.spades_kmer = check_kmers(options.spades_kmer, word_size, max_read_len, log_handler)
log_handler.info("Assembling using SPAdes ...")
if not executable("pigz -h"):
Expand Down

0 comments on commit 711e275

Please sign in to comment.