From 50917bab8cbcaa7a6da2ad3ee1f5e2dad0568335 Mon Sep 17 00:00:00 2001 From: Kinggerm Date: Fri, 16 Apr 2021 15:18:20 +0800 Subject: [PATCH] v1.7.4.1 --- GetOrganelleLib/assembly_parser.py | 4 +- GetOrganelleLib/versions.py | 10 +++ README.md | 80 +++++++++++------------ Utilities/get_organelle_config.py | 101 +++++++++++++++-------------- 4 files changed, 106 insertions(+), 89 deletions(-) diff --git a/GetOrganelleLib/assembly_parser.py b/GetOrganelleLib/assembly_parser.py index 0d5a631..bd6cb3d 100755 --- a/GetOrganelleLib/assembly_parser.py +++ b/GetOrganelleLib/assembly_parser.py @@ -2965,8 +2965,8 @@ def add_gap_nodes_with_spades_res(self, scaffold_fasta, scaffold_paths, min_cov= length=len(new_seq), coverage=new_average_cov, forward_seq=new_seq, - head_connections=OrderedDict([((l_name, l_end), None)]), - tail_connections=OrderedDict([((r_name, r_end), None)])) + head_connections=OrderedDict([((l_name, l_end), ctg_olp)]), + tail_connections=OrderedDict([((r_name, r_end), ctg_olp)])) self.vertex_info[l_name].connections[l_end][(gap_name, False)] = ctg_olp self.vertex_info[r_name].connections[r_end][(gap_name, True)] = ctg_olp gap_added = True diff --git a/GetOrganelleLib/versions.py b/GetOrganelleLib/versions.py index 17189f4..e75559d 100644 --- a/GetOrganelleLib/versions.py +++ b/GetOrganelleLib/versions.py @@ -5,6 +5,15 @@ def get_versions(): versions = [ + { + "number": "1.7.4.1", + "features": [ + "1. get_organelle_config.py: provide guidance for old code and new database incompatibility (reported by Wenxiang Liu@SWFU)", + "2. assembly_parser.py: fix a bug after scaffolding with SPAdes path (introduced in 1.7.4 feature 5; reported by Robin van Velzen@WUR)", + "3. update README.md with improved instruction", + ], + "time": "2021-04-16 14:46 UTC+8" + }, { "number": "1.7.4", "features": [ @@ -18,6 +27,7 @@ def get_versions(): "8. get_organelle_from_reads.py/disentangle_organelle_assembly.py: correct typos", "9. pipe_control_func.py: map_with_bowtie2: warn reads integrity; build_bowtie2_db: rm small index", "10. get_organelle_config.py: verbose log for bowtie2 and blast", + "11. update README.md with a reframed instruction", ], "time": "2021-04-14 17:52 UTC+8" }, diff --git a/README.md b/README.md index c6c106b..d00f46e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Anaconda-Server Badge](https://anaconda.org/bioconda/getorganelle/badges/downloads.svg)](https://anaconda.org/bioconda/getorganelle) [![GitHub release](https://img.shields.io/github/release/Kinggerm/GetOrganelle.svg)](https://GitHub.com/Kinggerm/GetOrganelle/releases/) -[![GitHub version](https://img.shields.io/github/commits-since/Kinggerm/GetOrganelle/1.7.4.svg)](https://github.com/Kinggerm/GetOrganelle/commit/master) +[![GitHub version](https://img.shields.io/github/commits-since/Kinggerm/GetOrganelle/1.7.4.1.svg)](https://github.com/Kinggerm/GetOrganelle/commit/master) This toolkit assemblies organelle genome from genomic skimming data. @@ -94,36 +94,36 @@ But you are still highly recommended to read the following minimal introductions Since v1.6.2, `get_organelle_from_reads.py` will automatically estimate the read data it needs, without user assignment nor data reducing (see flags `--reduce-reads-for-coverage` and `--max-reads`). * Main Options - - Take your input seed (fasta format; if `-s` was not provided, - the default is `GetOrganelleLib/SeedDatabase/*.fasta`) as probe, - the script would recruit target reads in successive rounds (extending process). - The default seed works for most samples, but using a complete organelle genome sequence of a related species as the seed would help the assembly in many cases - (e.g. degraded DNA samples, fastly-evolving in animal/fungal samples). - - The value word size (followed with `-w`), like the kmer in assembly, is crucial to the feasibility and efficiency of this process. - The best word size changes upon data and will be affected by read length, read quality, base coverage, organ DNA percent and other factors. - By default, GetOrganelle would automatically estimate a proper word size based on the data characters. - Although the automatically-estimated word size value does not ensure the best performance nor the best result, - you do not need to adjust this value (`-w`) if a complete/circular organelle genome assembly is produced, - because the circular result generated by GetOrganelle is highly consistent under different options and seeds. - The automatically estimated word size may be screwy in some animal mitogenome data due to inaccurate coverage estimation, - for which you fine-tune it instead. - - The best kmer(s) depend on a wide variety of factors too. - Although more kmer values add the time consuming, you are recommended to use a wide range of kmers to benefit from the power of SPAdes. - Empirically, you should include at least including one small kmer (e.g. `21`) and one large kmer (`105`) for a successful organelle genome assembly. + + * `-w` The value word size, like the kmer in assembly, is crucial to the feasibility and efficiency of this process. + The best word size changes upon data and will be affected by read length, read quality, base coverage, organ DNA percent and other factors. + By default, GetOrganelle would automatically estimate a proper word size based on the data characters. + Although the automatically-estimated word size value does not ensure the best performance nor the best result, + you do not need to adjust this value (`-w`) if a complete/circular organelle genome assembly is produced, + because the circular result generated by GetOrganelle is highly consistent under different options and seeds. + The automatically estimated word size may be screwy in some animal mitogenome data due to inaccurate coverage estimation, + for which you fine-tune it instead. + + * `-k` The best kmer(s) depend on a wide variety of factors too. + Although more kmer values add the time consuming, you are recommended to use a wide range of kmers to benefit from the power of SPAdes. + Empirically, you should include at least including one small kmer (e.g. `21`) and one large kmer (`105`) for a successful organelle genome assembly. + + * `-s` GetOrganelle takes the seed (fasta format; if this was not provided, + the default is `GetOrganelleLib/SeedDatabase/*.fasta`) as probe, + the script would recruit target reads in successive rounds (extending process). + The default seed works for most samples, but using a complete organelle genome sequence of a related species as the seed would help the assembly in many cases + (e.g. degraded DNA samples, fastly-evolving in animal/fungal samples; see more [here](https://github.com/Kinggerm/GetOrganelle/wiki/FAQ#how-to-assemble-a-target-organelle-genome-using-my-own-reference)). * Key Results - The key output files include + The key output files include - * `*.path_sequence.fasta`, each fasta file represents one type of genome structure - * `*.selected_graph.gfa`, the [organelle-only assembly graph](https://github.com/Kinggerm/GetOrganelle/wiki/Terminology) - * `get_org.log.txt`, the log file - * `extended_K*.assembly_graph.fastg`, the raw assembly graph - * `extended_K*.assembly_graph.fastg.extend_embplant_pt-embplant_mt.fastg`, a simplified assembly graph - * `extended_K*.assembly_graph.fastg.extend_embplant_pt-embplant_mt.csv`, a tab-format contig label file for bandage visualization + * `*.path_sequence.fasta`, each fasta file represents one type of genome structure + * `*.selected_graph.gfa`, the [organelle-only assembly graph](https://github.com/Kinggerm/GetOrganelle/wiki/Terminology) + * `get_org.log.txt`, the log file + * `extended_K*.assembly_graph.fastg`, the raw assembly graph + * `extended_K*.assembly_graph.fastg.extend_embplant_pt-embplant_mt.fastg`, a simplified assembly graph + * `extended_K*.assembly_graph.fastg.extend_embplant_pt-embplant_mt.csv`, a tab-format contig label file for bandage visualization You may delete the files other than above if the resulting genome is complete (indicated in the log file and the name of the `*.fasta`). You are expected to obtain the complete organelle genome assembly for most animal/fungal mitogenomes and plant chloroplast genomes @@ -139,22 +139,22 @@ But you are still highly recommended to read the following minimal introductions * Input data & Main Options - The input must be a FASTG or GFA formatted assembly graph file. - - If you input an assembly graph assembled from total DNA sequencing using third-party a de novo assembler (e.g. Velvet), - the assembly graph may includes a great amount of non-target contigs. - You may want to use `--min-depth` and `--max-depth` to greatly reduce the computational burden for target extraction. - - If you input an [organelle-equivalent assembly graph](https://github.com/Kinggerm/GetOrganelle/wiki/Terminology) - (e.g. manually curated and exported using Bandage), you may use `--no-slim`. + * `-g` The input must be a FASTG or GFA formatted assembly graph file. + + * If you input an assembly graph assembled from total DNA sequencing using third-party a de novo assembler (e.g. Velvet), + the assembly graph may includes a great amount of non-target contigs. + You may want to use `--min-depth` and `--max-depth` to greatly reduce the computational burden for target extraction. + + * If you input an [organelle-equivalent assembly graph](https://github.com/Kinggerm/GetOrganelle/wiki/Terminology) + (e.g. manually curated and exported using Bandage), you may use `--no-slim`. * Key Results - The key output files include - - * `*.path_sequence.fasta`, one fasta file represents one type of genome structure - * `*.selected_graph.gfa`, the [organelle-only assembly graph](https://github.com/Kinggerm/GetOrganelle/wiki/Terminology) - * `get_org.log.txt`, the log file + The key output files include + + * `*.path_sequence.fasta`, one fasta file represents one type of genome structure + * `*.selected_graph.gfa`, the [organelle-only assembly graph](https://github.com/Kinggerm/GetOrganelle/wiki/Terminology) + * `get_org.log.txt`, the log file ### GetOrganelle flowchart diff --git a/Utilities/get_organelle_config.py b/Utilities/get_organelle_config.py index 2ee4ec4..3fd496b 100755 --- a/Utilities/get_organelle_config.py +++ b/Utilities/get_organelle_config.py @@ -65,46 +65,47 @@ def get_options(description): parser = ArgumentParser(description=description, usage="get_organelle_config.py -a embplant_pt,embplant_mt") parser.add_argument("-a", "--add", dest="add_organelle_type", - help="Add database for organelle type(s). Followed by any of all/" + - "/".join(ORGANELLE_TYPE_LIST) + " or multiple types joined by comma such as " - "embplant_pt,embplant_mt,fungus_mt.") - parser.add_argument("--use-version", dest="version", default="latest", - help="The version of database to add. Find more versions at github.com/Kinggerm/GetOrganelleDB. " - "Default: %(default)s") + help="Add database for organelle type(s). Followed by any of all/" + + "/".join(ORGANELLE_TYPE_LIST) + " or multiple types joined by comma such as " + "embplant_pt,embplant_mt,fungus_mt.") + parser.add_argument("--use-version", dest="db_version", default="latest", + help="The version of database to add. " + "Find more versions at github.com/Kinggerm/GetOrganelleDB. " + "Default: %(default)s") parser.add_argument("-r", "--rm", dest="rm_organelle_type", - help="Remove local database(s) for organelle type(s). Followed by any of all/" + - "/".join(ORGANELLE_TYPE_LIST) + " or multiple types joined by comma " - "such as embplant_pt,embplant_mt.") + help="Remove local database(s) for organelle type(s). Followed by any of all/" + + "/".join(ORGANELLE_TYPE_LIST) + " or multiple types joined by comma " + "such as embplant_pt,embplant_mt.") parser.add_argument("--update", dest="update", default=False, action="store_true", - help="Update local databases to the latest online version, or the local version " - "if \"--use-local LOCAL_DB_PATH\" provided.") + help="Update local databases to the latest online version, or the local version " + "if \"--use-local LOCAL_DB_PATH\" provided.") parser.add_argument("--config-dir", dest="get_organelle_path", default=None, - help="The directory where the default databases were placed. " - "The default value also can be changed by adding 'export GETORG_PATH=your_favor' " - "to the shell script (e.g. ~/.bash_profile or ~/.bashrc) " - "Default: " + GO_PATH) + help="The directory where the default databases were placed. " + "The default value also can be changed by adding 'export GETORG_PATH=your_favor' " + "to the shell script (e.g. ~/.bash_profile or ~/.bashrc) " + "Default: " + GO_PATH) parser.add_argument("--use-local", dest="use_local", - help="Input a path. This local database path must include subdirectories " - "LabelDatabase and SeedDatabase, under which there is the fasta file(s) named by the " - "organelle type you want add, such as fungus_mt.fasta. ") + help="Input a path. This local database path must include subdirectories " + "LabelDatabase and SeedDatabase, under which there is the fasta file(s) named by the " + "organelle type you want add, such as fungus_mt.fasta. ") parser.add_argument("--clean", dest="clean", default=False, action="store_true", - help="Remove all configured database files (==\"--rm all\").") + help="Remove all configured database files (==\"--rm all\").") parser.add_argument("--list", dest="list_available", default=False, action="store_true", - help="List configured databases checking and exit. ") + help="List configured databases checking and exit. ") parser.add_argument("--check", dest="check", default=False, action="store_true", - help="Check configured database files and exit. ") + help="Check configured database files and exit. ") parser.add_argument("--db-type", dest="db_type", default="both", - help="The database type (seed/label/both). Default: %(default)s") + help="The database type (seed/label/both). Default: %(default)s") parser.add_argument("--which-blast", dest="which_blast", default="", - help="Assign the path to BLAST binary files if not added to the path. " - "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" + SYSTEM_NAME + - "/ncbi-blast\" first, then $PATH") + help="Assign the path to BLAST binary files if not added to the path. " + "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" + SYSTEM_NAME + + "/ncbi-blast\" first, then $PATH") parser.add_argument("--which-bowtie2", dest="which_bowtie2", default="", - help="Assign the path to Bowtie2 binary files if not added to the path. " - "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" + SYSTEM_NAME + - "/bowtie2\" first, then $PATH") + help="Assign the path to Bowtie2 binary files if not added to the path. " + "Default: try \"" + os.path.realpath("GetOrganelleDep") + "/" + SYSTEM_NAME + + "/bowtie2\" first, then $PATH") parser.add_argument("--verbose", dest="verbose", default=False, action="store_true", - help="verbose output to the screen. Default: %(default)s") + help="verbose output to the screen. Default: %(default)s") parser.add_argument("-v", "--version", action="version", version="GetOrganelle v{version}".format(version=get_versions())) options = parser.parse_args() @@ -207,22 +208,28 @@ def get_options(description): if not os.path.isfile(this_fas_f): sys.stdout.write("File " + this_fas_f + " not available!\n") sys.exit() - options.version = "customized" + options.db_version = "customized" sys.stdout.write("Use local database: " + options.use_local + "\n") else: if options.update: - options.version = "latest" - if options.version == "latest": + options.db_version = "latest" + if options.db_version == "latest": remote_quest = get_static_html_context(VERSION_URLS[0], verbose=options.verbose, alternative_url_list=VERSION_URLS[1:]) if remote_quest["status"]: - options.version = remote_quest["content"].strip() + options.db_version = remote_quest["content"].strip() else: sys.stderr.write("Error: " + remote_quest["info"] + "\n") sys.stderr.write("Please check your connection to github/gitee!\n") sys.stdout.write("\nYou can download the database files from www.github.com/Kinggerm/GetOrganelleDB " "and install it from from local (flag --use-local)\n") sys.exit() + if options.db_version not in SEED_DB_HASH or options.db_version not in LABEL_DB_HASH: + sys.stderr.write("GetOrganelle v{} does not support Database v{}\n". + format(get_versions(), options.db_version) + + "Please upgrade GetOrganelle (recommended) " + "or degrade the Database version (not recommended; --use-version)\n") + sys.exit() return options @@ -365,14 +372,14 @@ def main(): fasta_f=target_output, overwrite=False, verbose=options.verbose) else: - if existing_seed_db[sub_o_type]["version"] == options.version: + if existing_seed_db[sub_o_type]["version"] == options.db_version: # sys.stdout.write("The same " + sub_o_type + " Seed Database exists. Skipped.\n") initialize_seed_database(which_bowtie2=options.which_bowtie2, fasta_f=target_output, overwrite=False, verbose=options.verbose) else: - these_urls = [sub_url.format(options.version, sub_o_type) for sub_url in seed_url_temp] - check_sha256 = SEED_DB_HASH[options.version][sub_o_type]["sha256"] + these_urls = [sub_url.format(options.db_version, sub_o_type) for sub_url in seed_url_temp] + check_sha256 = SEED_DB_HASH[options.db_version][sub_o_type]["sha256"] status = download_file_with_progress( remote_url=these_urls[0], output_file=target_output, sha256_v=check_sha256, timeout=time_out, alternative_url_list=these_urls[1:], verbose=options.verbose) @@ -383,7 +390,7 @@ def main(): initialize_seed_database(which_bowtie2=options.which_bowtie2, fasta_f=target_output, overwrite=True, verbose=options.verbose) - existing_seed_db[sub_o_type] = {"version": options.version, "sha256": check_sha256} + existing_seed_db[sub_o_type] = {"version": options.db_version, "sha256": check_sha256} write_version_file(version_dict=existing_seed_db, output_to_file=seed_version_f) if options.db_type in ("label", "both"): @@ -420,14 +427,14 @@ def main(): fasta_f=target_output, overwrite=False, verbose=options.verbose) else: - if existing_seed_db[sub_o_type]["version"] == options.version: + if existing_seed_db[sub_o_type]["version"] == options.db_version: # sys.stdout.write("The same " + sub_o_type + " Seed Database exists. Skipped.\n") initialize_notation_database(which_blast=options.which_blast, fasta_f=target_output, overwrite=False, verbose=options.verbose) else: - these_urls = [sub_url.format(options.version, sub_o_type) for sub_url in label_url_temp] - check_sha256 = LABEL_DB_HASH[options.version][sub_o_type]["sha256"] + these_urls = [sub_url.format(options.db_version, sub_o_type) for sub_url in label_url_temp] + check_sha256 = LABEL_DB_HASH[options.db_version][sub_o_type]["sha256"] status = download_file_with_progress( remote_url=these_urls[0], output_file=target_output, sha256_v=check_sha256, timeout=time_out, alternative_url_list=these_urls[1:], verbose=options.verbose) @@ -437,7 +444,7 @@ def main(): continue initialize_notation_database(which_blast=options.which_blast, fasta_f=target_output, overwrite=True, verbose=options.verbose) - existing_label_db[sub_o_type] = {"version": options.version, "sha256": check_sha256} + existing_label_db[sub_o_type] = {"version": options.db_version, "sha256": check_sha256} write_version_file(version_dict=existing_label_db, output_to_file=label_version_f) # Case 4 @@ -465,8 +472,8 @@ def main(): fasta_f=target_output, overwrite=True, verbose=options.verbose) else: - these_urls = [sub_url.format(options.version, sub_o_type) for sub_url in seed_url_temp] - check_sha256 = SEED_DB_HASH[options.version][sub_o_type]["sha256"] + these_urls = [sub_url.format(options.db_version, sub_o_type) for sub_url in seed_url_temp] + check_sha256 = SEED_DB_HASH[options.db_version][sub_o_type]["sha256"] status = download_file_with_progress( remote_url=these_urls[0], output_file=target_output, sha256_v=check_sha256, timeout=time_out, alternative_url_list=these_urls[1:], verbose=options.verbose) @@ -476,7 +483,7 @@ def main(): initialize_seed_database(which_bowtie2=options.which_bowtie2, fasta_f=target_output, overwrite=True, verbose=options.verbose) - existing_seed_db[sub_o_type] = {"version": options.version, "sha256": check_sha256} + existing_seed_db[sub_o_type] = {"version": options.db_version, "sha256": check_sha256} write_version_file(version_dict=existing_seed_db, output_to_file=seed_version_f) if options.db_type in ("label", "both"): @@ -502,8 +509,8 @@ def main(): initialize_notation_database(which_blast=options.which_blast, fasta_f=target_output, overwrite=True, verbose=options.verbose) else: - these_urls = [sub_url.format(options.version, sub_o_type) for sub_url in label_url_temp] - check_sha256 = LABEL_DB_HASH[options.version][sub_o_type]["sha256"] + these_urls = [sub_url.format(options.db_version, sub_o_type) for sub_url in label_url_temp] + check_sha256 = LABEL_DB_HASH[options.db_version][sub_o_type]["sha256"] status = download_file_with_progress( remote_url=these_urls[0], output_file=target_output, sha256_v=check_sha256, timeout=time_out, alternative_url_list=these_urls[1:], verbose=options.verbose) @@ -512,7 +519,7 @@ def main(): continue initialize_notation_database(which_blast=options.which_blast, fasta_f=target_output, overwrite=True, verbose=options.verbose) - existing_label_db[sub_o_type] = {"version": options.version, "sha256": check_sha256} + existing_label_db[sub_o_type] = {"version": options.db_version, "sha256": check_sha256} write_version_file(version_dict=existing_label_db, output_to_file=label_version_f) sys.stdout.write("\nTotal cost: %.2f s\n" % (time.time() - time_start))