diff --git a/GetOrganelleLib/pipe_control_func.py b/GetOrganelleLib/pipe_control_func.py index 35a76b3..4b60f91 100755 --- a/GetOrganelleLib/pipe_control_func.py +++ b/GetOrganelleLib/pipe_control_func.py @@ -64,6 +64,10 @@ def __str__(self): HEAD_MAXIMUM_LINES = 2147483647 +INVALID_PATH_CHAR_RANGES = [ + [u'\u4e00', u'\u9fff'] # chinese characters are not accepted by SPAdes +] + def simple_log(log, output_base, prefix, log_level="NOTSET"): log_simple = log @@ -168,6 +172,14 @@ def executable(test_this): return True if os.access(test_this, os.X_OK) or getstatusoutput(test_this)[0] != DEAD_CODE else False +def is_valid_path(path_str): + for char in path_str: + for down_str, up_str in INVALID_PATH_CHAR_RANGES: + if down_str <= char <= up_str: + return False + return True + + def run_command(command, print_command=False, check_echo_error=True): if print_command: print(command) @@ -774,14 +786,14 @@ def zip_file(source, target, verbose_log=False, log_handler=None, remove_source= def unzip(source, target, line_limit=HEAD_MAXIMUM_LINES, verbose_log=False, log_handler=None): target_temp = target + ".Temp" - if HEAD_MAXIMUM_LINES == float("inf"): + if line_limit == float("inf"): try_commands = [ "tar -x -f " + source + " -O > " + target_temp, "gunzip -c " + source + " > " + target_temp] else: try_commands = [ - "tar -x -f " + source + " -O | head -n " + str(line_limit) + " > " + target_temp, - "gunzip -c " + source + " | head -n " + str(line_limit) + " > " + target_temp] + "tar -x -f " + source + " -O | head -n " + str(int(line_limit)) + " > " + target_temp, + "gunzip -c " + source + " | head -n " + str(int(line_limit)) + " > " + target_temp] # re-order try commands if "tar." 
not in source: try_commands = try_commands[1], try_commands[0] diff --git a/GetOrganelleLib/versions.py b/GetOrganelleLib/versions.py index 4d9ac07..8b75b8c 100644 --- a/GetOrganelleLib/versions.py +++ b/GetOrganelleLib/versions.py @@ -5,6 +5,15 @@ def get_versions(): versions = [ + { + "number": "1.7.3.3", + "features": [ + "1. early termination on invalid path characters for spades", + "2. fix a bug introduced by '--max-reads inf'", + "3. get_organelle_config.py: fix a bug if a new organelle types was added and '--use-local' was used", + ], + "time": "2021-02-11 01:00 UTC+8" + }, { "number": "1.7.3.2", "features": [ diff --git a/README.md b/README.md index 4e88cb5..be7c8c8 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,11 @@ [![Anaconda-Server Badge](https://anaconda.org/bioconda/getorganelle/badges/latest_release_date.svg)](https://anaconda.org/bioconda/getorganelle) [![GitHub release](https://img.shields.io/github/release/Kinggerm/GetOrganelle.svg)](https://GitHub.com/Kinggerm/GetOrganelle/releases/) -[![GitHub version](https://img.shields.io/github/commits-since/Kinggerm/GetOrganelle/1.7.3.1.svg)](https://github.com/Kinggerm/GetOrganelle/commit/master) +[![GitHub version](https://img.shields.io/github/commits-since/Kinggerm/GetOrganelle/1.7.3.3.svg)](https://github.com/Kinggerm/GetOrganelle/commit/master) This toolkit assemblies organelle genome from genomic skimming data. -It achieved the best performance overall both on simulated and real data and was recommended as the default for chloroplast assemblies in a third-party comparison paper ([Freudenthal et al. 2020. Genome Biology](https://doi.org/10.1186/s13059-020-02153-6)). +It achieved the best performance overall both on simulated and real data and was recommended as the default for chloroplast genome assemblies in a third-party comparison paper ([Freudenthal et al. 2020. Genome Biology](https://doi.org/10.1186/s13059-020-02153-6)).
@@ -58,6 +58,13 @@ Download [a simulated _Arabidopsis thaliana_ WGS dataset](https://github.com/Kin wget https://github.com/Kinggerm/GetOrganelleGallery/raw/master/Test/reads/Arabidopsis_simulated.1.fq.gz wget https://github.com/Kinggerm/GetOrganelleGallery/raw/master/Test/reads/Arabidopsis_simulated.2.fq.gz +then verify the integrity of downloaded files using `md5sum`: + + md5sum Arabidopsis_simulated.*.fq.gz + # 935589bc609397f1bfc9c40f571f0f19 Arabidopsis_simulated.1.fq.gz + # d0f62eed78d2d2c6bed5f5aeaf4a2c11 Arabidopsis_simulated.2.fq.gz + # Please re-download the reads if your md5 values do not match those above + then do the fast plastome assembly (memory: ~600MB, CPU time: ~60s): get_organelle_from_reads.py -1 Arabidopsis_simulated.1.fq.gz -2 Arabidopsis_simulated.2.fq.gz -t 1 -o Arabidopsis_simulated.plastome -F embplant_pt -R 10 @@ -135,7 +142,7 @@ or see the detailed illustrations: get_organelle_from_reads.py --help -To extract the plastome from an existing assembly graph (`*.fastg`/`*.gfa`): +To extract the plastid genome from an existing assembly graph (`*.fastg`/`*.gfa`; e.g. 
from long-read sequencing assemblies): get_organelle_from_assembly.py -F embplant_pt -g ONT_assembly_graph.gfa diff --git a/get_organelle_from_reads.py b/get_organelle_from_reads.py index 2175393..d158c22 100755 --- a/get_organelle_from_reads.py +++ b/get_organelle_from_reads.py @@ -427,7 +427,7 @@ def get_options(description, version): "7.5E7 (-F embplant_mt/other_pt/anonym); 3E8 (-F animal_mt)") parser.remove_option("--fast") parser.add_option("--fast", dest="fast_strategy", - help="=\"-R 10 -t 4 -J 5 -M 7 --max-words 3E7 --larger-auto-ws " + help="=\"-R 10 -t 4 -J 5 -M 7 --max-n-words 3E7 --larger-auto-ws " "--disentangle-time-limit 360\"") parser.remove_option("-k") parser.add_option("-k", dest="spades_kmer", default="21,55,85,115", @@ -676,6 +676,14 @@ def _check_default_db(this_sub_organelle, extra_type=""): log_handler.info("WORKING DIR: " + os.getcwd()) log_handler.info(" ".join(["\"" + arg + "\"" if " " in arg else arg for arg in sys.argv]) + "\n") + if options.run_spades: + for fq_file in [options.fq_file_1, options.fq_file_2] + options.unpaired_fq_files: + assert is_valid_path(os.path.basename(fq_file)), \ + "Invalid characters for SPAdes in file name: " + os.path.basename(fq_file) + for fq_file in [options.output_base, options.prefix]: + assert is_valid_path(os.path.realpath(fq_file)), \ + "Invalid characters for SPAdes in path: " + os.path.realpath(fq_file) + log_handler = timed_log(log_handler, options.output_base, options.prefix + "get_org.") if options.word_size is None: pass @@ -1012,7 +1020,7 @@ def estimate_maximum_n_reads_using_mapping( check_fq = os.path.join(this_check_dir, "check_" + str(f_id + 1)) if not (os.path.exists(check_fq) and resume): if r_file.endswith(".gz"): - unzip(r_file, check_fq, int(4 * check_num_line), verbose_log, log_handler if verbose_log else None) + unzip(r_file, check_fq, 4 * check_num_line, verbose_log, log_handler if verbose_log else None) else: os.system("head -n " + str(int(4 * check_num_line)) + " " + r_file + " > 
" + check_fq + ".temp") os.rename(check_fq + ".temp", check_fq) @@ -3814,7 +3822,7 @@ def main(): target_fq = os.path.join(out_base, str(file_id + 1) + "-" + os.path.basename(read_file)) + ".fastq" if not (os.path.exists(target_fq) and resume): - unzip(read_file, target_fq, int(4 * all_read_nums[file_id]), + unzip(read_file, target_fq, 4 * all_read_nums[file_id], options.verbose_log, log_handler) else: target_fq = os.path.join(out_base, str(file_id + 1) + "-" + @@ -3999,7 +4007,7 @@ def main(): files_to_unzip = [os.path.join(out_base, candidate) for candidate in os.listdir(out_base) if candidate.endswith(".fq.tar.gz")] for file_to_u in files_to_unzip: - unzip(source=file_to_u, target=file_to_u[:-7]) + unzip(source=file_to_u, target=file_to_u[:-7], line_limit=inf) options.spades_kmer = check_kmers(options.spades_kmer, word_size, max_read_len, log_handler) log_handler.info("Assembling using SPAdes ...") if not executable("pigz -h"):