diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 7ee6d6f..65befb2 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -58,7 +58,7 @@ jobs:
pytest test_proteomes.py
pytest test_clustering.py
pytest test_annotation.py
- pytest test_workflow.py
+ pytest test_workflow_uniprot.py
pytest test_precomputed.py
pytest test_eggnog.py
pytest test_report.py
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e84532e..5be4965 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,15 +1,30 @@
# Changelog
-# EsMeCaTa v0.5.4 (2024-11-09)
+# EsMeCaTa v0.6.0 (2025-01-27)
## Add
-* Create database from different output folders of esmecata (`from_runs`).
+* New command `esmecata_create_db` to create database from different output folders of esmecata (`from_runs`).
+* Full release of `esmecata precomputed` associated with the first version of [esmecata precomputed database](https://doi.org/10.5281/zenodo.13354073).
+* New option threshold (`-t`) for `esmecata precomputed`.
+* Add `--gseapyCutOff` option to `gseapy_enrichr`.
+* A check after database creation to detect taxa with few predicted proteins compared to their higher affiliated taxa.
* Check the good format of the gzip file.
+* Header `KEGG_reaction` in annotation_reference from `annotation_uniprot` to avoid issues with `esmecata_create_db`.
## Fix
* Issue with protein IDs from UniParc during annotation (incorrect split on '|').
+* Issue in `get_taxon_obs_name` function.
+* Issues in tests.
+
+## Modify
+
+* Add database version in log.
+* Rename `test_workflow.py` into `test_workflow_uniprot.py`, to better reflect what is done.
+* Update workflow figure.
+* Update readme.
+* Update article_data folder and the associated readme.
# EsMeCaTa v0.5.4 (2024-11-06)
diff --git a/README.md b/README.md
index 4276f97..279c883 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
# EsMeCaTa: *Es*timating *Me*tabolic *Ca*pabilties from *Ta*xonomic affiliations
-EsMeCaTa is a method to estimate metabolic capabilities from a taxonomic affiliation (for example with 16S rRNA sequencing or using a specific taxon name) by using the UniProt Proteomes database. This can be used to (1) estimate protein sequences and functions for an organism with no sequenced genomes, (2) explore the protein diveristy of a taxon.
+EsMeCaTa is a method to estimate metabolic capabilities from a taxonomic affiliation (for example with 16S rRNA sequencing or using a specific taxon name) by using the UniProt Proteomes database. This can be used to (1) estimate protein sequences and functions for an organism with no sequenced genomes, (2) explore the protein diversity of a taxon.
![](pictures/esmecata_10.svg)
@@ -37,6 +37,9 @@ EsMeCaTa is a method to estimate metabolic capabilities from a taxonomic affilia
- [EsMeCaTa report](#esmecata-report)
- [EsMeCaTa gseapy](#esmecata-gseapy)
- [EsMeCaTa create\_db](#esmecata-create_db)
+ - [Troubleshooting](#troubleshooting)
+ - [Issue with incompatible versions of ete3 and UniProt NCBI Taxonomy databases](#issue-with-incompatible-versions-of-ete3-and-uniprot-ncbi-taxonomy-databases)
+ - [Citation](#citation)
- [License](#license)
## Requirements
@@ -73,6 +76,12 @@ To query the precomputed database, it is only required to install EsMeCaTa with
All the required dependencies for the estimation from the precomputed database are performed with python packages.
+The second requirement is the esmecata precomputed database (file size: 4 GB), available at the following [Zenodo archive](https://zenodo.org/records/13354073).
+As this file is quite big, if you just want to test `esmecata precomputed`, you can try one of the following (an example command is given after this list):
+
+- the precomputed database (`buchnera_database.zip`) present in the [test folder](https://github.com/AuReMe/esmecata/tree/main/test). You can use it on the `buchnera_workflow.tsv` input file present in the same test folder.
+- one of the precomputed databases associated with the article and present in this other [Zenodo archive](https://zenodo.org/records/14502342). The associated input files are in this [folder](https://github.com/AuReMe/esmecata/tree/main/article_data).
+
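+For example, to quickly test the precomputed mode with the Buchnera files (assuming the command is run from the `test` folder; the output folder name is arbitrary):
+
+```
+esmecata precomputed -i buchnera_workflow.tsv -d buchnera_database.zip -o output_folder_buchnera
+```
+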
### Core pipeline installation
For the whole workflow, the easiest way to install the dependencies of EsMeCaTa is by using conda (or mamba):
@@ -116,7 +125,7 @@ All dependencies can be installed with following command:
## Input
-EsMeCaTa takes as input a tabulated or an excel file with two columns one with the ID corresponding to the taxonomic affiliation (for example the OTU ID from 16S rRNA sequencing) and a second column with the taxonomic classification separated by ';'. In the following documentation, the first column (named `observation_name`) will be used to identify the label associated with each taxonomic affiliation. An example is located in the test folder ([Example.tsv](https://github.com/ArnaudBelcour/esmecata/blob/master/test/Example.tsv)).
+EsMeCaTa takes as input a tabulated or an Excel file with two columns: one with the ID corresponding to the taxonomic affiliation (for example the OTU ID from 16S rRNA sequencing) and a second column with the taxonomic classification separated by ';'. In the following documentation, the first column (named `observation_name`) will be used to identify the label associated with each taxonomic affiliation. Several examples are available ([buchnera_workflow.tsv](https://github.com/AuReMe/esmecata/blob/main/test/buchnera_workflow.tsv), [toy_example.tsv](https://github.com/AuReMe/esmecata/blob/main/article_data/toy_example/toy_example.tsv), [methanogenic_reactor.tsv](https://github.com/AuReMe/esmecata/blob/main/article_data/methanogenic_reactor/methanogenic_reactor.tsv) or [honeybee_esmecata_metdata.tsv](https://github.com/AuReMe/esmecata/blob/main/article_data/mgnify_validation/honeybee_esmecata_metdata.tsv)).
For example:
@@ -137,13 +146,13 @@ It is possible to use EsMeCaTa with a taxonomic affiliation containing only one
| Cluster_1 | Sphaerochaeta |
| Cluster_2 | Yersinia |
-But this can cause issue. For example, "Cluster_2" is associated with Yersinia but two genus are associated with this name (one mantid (taxId: 444888) and one bacteria (taxId: 629)). EsMeCaTa will not able to differentiate them. But if you give more informations by adding more taxons (for example: 'Bacteria;Gammaproteobacteria;Yersinia'), EsMeCaTa will compare all the taxons of the taxonomic affiliation (here: 2 (Bacteria) and 1236 (Gammaproteobacteria)) to the lineage associated with the two taxIDs (for bacteria Yersinia: [1, 131567, 2, 1224, 1236, 91347, 1903411, 629] and for the mantid one: [1, 131567, 2759, 33154, 33208, 6072, 33213, 33317, 1206794, 88770, 6656, 197563, 197562, 6960, 50557, 85512, 7496, 33340, 33341, 6970, 7504, 7505, 267071, 444888]). In this example, there is 2 matches for the bacterial one (2 and 1236) and 0 for the mantid one. So EsMeCaTa will select the taxId associated with the bacteria (629).
+But this can cause issues. For example, "Cluster_2" is associated with Yersinia, but two genera share this name (one mantid (taxId: 444888) and one bacterium (taxId: 629)), and EsMeCaTa will not be able to differentiate them. If you give more information by adding more taxa (for example: 'Bacteria;Gammaproteobacteria;Yersinia'), EsMeCaTa will compare all the taxa of the taxonomic affiliation (here: 2 (Bacteria) and 1236 (Gammaproteobacteria)) to the lineage associated with the two taxIDs (for the bacterial Yersinia: [1, 131567, 2, 1224, 1236, 91347, 1903411, 629] and for the mantid one: [1, 131567, 2759, 33154, 33208, 6072, 33213, 33317, 1206794, 88770, 6656, 197563, 197562, 6960, 50557, 85512, 7496, 33340, 33341, 6970, 7504, 7505, 267071, 444888]). In this example, there are 2 matches for the bacterial one (2 and 1236) and 0 for the mantid one. So EsMeCaTa will select the taxId associated with the bacterium (629).
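+
+The disambiguation idea can be sketched with ete3 as follows (a minimal, illustrative example, not EsMeCaTa's exact code; it assumes the ete3 NCBI Taxonomy database is already installed locally):
+
+```
+from ete3 import NCBITaxa
+
+ncbi = NCBITaxa()
+
+def pick_taxid(ambiguous_name, parent_taxids):
+    # Keep the candidate taxId whose lineage shares the most ancestors with the affiliation.
+    candidates = ncbi.get_name_translator([ambiguous_name]).get(ambiguous_name, [])
+    return max(candidates,
+               key=lambda taxid: len(set(ncbi.get_lineage(taxid)) & set(parent_taxids)),
+               default=None)
+
+# 2 = Bacteria, 1236 = Gammaproteobacteria -> returns 629 (the bacterial Yersinia)
+print(pick_taxid('Yersinia', [2, 1236]))
+```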
A [jupyter notebook](https://github.com/AuReMe/esmecata/blob/master/tutorials/esmecata_method.ipynb) explains how EsMeCata works.
## EsMeCaTa commands
-Several command line are created after the isntallation:
+Several command-line tools are created after the installation:
- `esmecata`: the main command to perform esmecata workflow from input file or with a precomputed database.
- `esmecata_report`: another command to create HTML report showing different statistics on the predictions.
@@ -181,8 +190,8 @@ Steps proteomes and annotation by UniProt requires an internet connection (for R
### Use the precomputed database
-**WARNING**: Database is in development, it is not available yet.
-But there are several precomputed databases associated with the article datasets available in the Zenodo archive of EsMeCaTa.
+The precomputed database of EsMeCaTa is available at this [Zenodo repository](https://doi.org/10.5281/zenodo.13354073). Warning: this precomputed database is 4 GB in size.
+Several precomputed databases (of smaller size) associated with the article datasets are available in the [Zenodo archive of EsMeCaTa's article](https://zenodo.org/records/14502342). There is also a small precomputed database available for test purposes (covering one organism, `buchnera_database.zip`) in the [test folder](https://github.com/AuReMe/esmecata/tree/main/test).
Using the precomputed database, esmecata searches for input taxon inside the precomputed database to make prediction.
It requires an input file containing the taxonomic affiliations and a precomputed esmecata database.
@@ -190,7 +199,7 @@ For each observation name in the input file, it will returned the associated ann
It will also output the protein sequences for each taxa associated with the observation name.
```
-usage: esmecata precomputed [-h] -i INPUT_FILE -d INPUT_FILE -o OUPUT_DIR [-r RANK_LIMIT] [--update-affiliations]
+usage: esmecata precomputed [-h] -i INPUT_FILE -d INPUT_FILE -o OUPUT_DIR [-r RANK_LIMIT] [--update-affiliations] [-t THRESHOLD_CLUSTERING]
options:
-h, --help show this help message and exit
@@ -205,6 +214,8 @@ options:
information (and a list of rank names).
--update-affiliations
If the taxonomic affiliations were assigned from an outdated taxonomic database, this can lead to taxon not be found in ete3 database. This option tries to udpate the taxonomic affiliations using the lowest taxon name.
+ -t THRESHOLD_CLUSTERING, --threshold THRESHOLD_CLUSTERING
+ Proportion [0 to 1] of proteomes required to occur in a protein cluster for that cluster to be kept in the core proteome assembly. Default is 0.5.
```
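+
+The new `-t/--threshold` option can be used to require protein clusters shared by a larger proportion of proteomes, for example (illustrative file names; the default is 0.5, and the value must be superior or equal to the threshold used when the precomputed database was built):
+
+```
+esmecata precomputed -i input_taxonomic_affiliations.tsv -d esmecata_database.zip -o output_folder -t 0.8
+```
+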
Two options can be used to limit the rank used when searching for proteomes and to update the taxonomic affiliations from the input file.
@@ -217,7 +228,7 @@ esmecata precomputed -i input_taxonomic_affiliations.tsv -d esmecata_database.zi
### Classical run of EsMeCaTa
-Otherwise, it is possible to run the whole workflow of EsMeCaTa but it will take times as it will search and download proteomes from UniProt, clsuter protein sequences with mmseqs2 and then annotate them with eggnog-mapper.
+Otherwise, it is possible to run the whole workflow of EsMeCaTa but it will take time as it will search and download proteomes from UniProt, cluster protein sequences with mmseqs2 and then annotate them with eggnog-mapper.
These different steps are presented in the following section.
@@ -268,18 +279,18 @@ options:
For each taxon in each taxonomic affiliations EsMeCaTa will use ete3 to find the corresponding taxon ID. Then it will search for proteomes associated with these taxon ID in the Uniprot Proteomes database.
-If there is more than 100 proteomes, esmecata will apply a specific method:
+If there are more than 100 proteomes, esmecata applies a subsampling procedure:
* (1) use the taxon ID associated with each proteomes to create a taxonomic tree with ete3.
-* (2) from the root of the tree (the input taxon), esmecata will find the direct deescendant (sub-taxons).
+* (2) from the root of the tree (the input taxon), esmecata will find the direct descendants (sub-taxa).
* (3) then esmecata will compute the number of proteomes associated with each sub-taxon.
* (4) the corresponding proportions will be used to select randomly a number of proteomes corresponding to the proportion.
For example: for the taxon Clostridiales, 645 proteomes are found. Using the organism taxon ID associated with the 645 proteomes we found that there is 17 direct sub-taxons. Then for each sub-taxon we compute the percentage of proportion of proteomes given by the sub-taxon to the taxon Clostridiales.
-There is 198 proteomes associated with the sub-taxon Clostridiaceae, the percentage will be computed as follow: 198 / 645 = 30% (if a percentage is superior to 1 it will be round down and if the percentage is lower than 1 it will be round up to keep all the low proportion sub-taxons). We will use this 30% to select randomly 30 proteomes amongst the 198 proteomes of Clostridiaceae. This is done for all the other sub-taxons, so we get a number of proteomes around 100 (here it will be 102). Due to the different rounds (up or down) the total number of proteomes will not be equal to exactly 100 but it will be around it. The number of proteomes leading to this behavior is set to 99 by default but the user can modify it with the `-l/--limit-proteomes` option.
+There are 198 proteomes associated with the sub-taxon Clostridiaceae, so the percentage is computed as follows: 198 / 645 = 30% (if a percentage is superior to 1 it will be rounded down and if it is lower than 1 it will be rounded up, to keep all the low-proportion sub-taxa). We will use this 30% to randomly select 30 proteomes among the 198 proteomes of Clostridiaceae. This is done for all the other sub-taxa, so we get a number of proteomes around 100 (here it will be 102). Due to the different roundings (up or down) the total number of proteomes will not be exactly 100 but it will be around it. The number of proteomes triggering this behaviour is set to 99 by default but the user can modify it with the `-l/--limit-proteomes` option.
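+
+A minimal, illustrative sketch of this proportional subsampling (not EsMeCaTa's actual implementation; `subtaxon_proteomes` is assumed to map each direct sub-taxon to its list of proteome IDs):
+
+```
+import math
+import random
+
+def subsample_proteomes(subtaxon_proteomes):
+    # Total number of proteomes found for the input taxon.
+    total = sum(len(proteomes) for proteomes in subtaxon_proteomes.values())
+    selected = {}
+    for subtaxon, proteomes in subtaxon_proteomes.items():
+        percentage = 100 * len(proteomes) / total
+        # Round down percentages above 1, round up those below 1,
+        # so low-proportion sub-taxa keep at least one proteome.
+        nb_to_keep = math.floor(percentage) if percentage > 1 else math.ceil(percentage)
+        selected[subtaxon] = random.sample(proteomes, min(nb_to_keep, len(proteomes)))
+    return selected
+```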
`esmecata check` options:
@@ -289,7 +300,7 @@ It is possible to avoid using REST queries for esmecata and instead use SPARQL q
* `-b/--busco`: filter proteomes using BUSCO score (default is 0.8)
-It is possible to filter proteomes according to to their BUSCO score (from Uniprot documentation: `The Benchmarking Universal Single-Copy Ortholog (BUSCO) assessment tool is used, for eukaryotic and bacterial proteomes, to provide quantitative measures of UniProt proteome data completeness in terms of expected gene content.`). It is a percentage between 0 and 1 showing the quality of the proteomes that esmecata will download. By default esmecata uses a BUSCO score of 0.80, it will only download proteomes with a BUSCO score of at least 80%.
+It is possible to filter proteomes according to their BUSCO score (from UniProt documentation: `The Benchmarking Universal Single-Copy Ortholog (BUSCO) assessment tool is used, for eukaryotic and bacterial proteomes, to provide quantitative measures of UniProt proteome data completeness in terms of expected gene content.`). It is a score between 0 and 1 showing the quality of the proteomes that esmecata will download. By default, esmecata uses a BUSCO score of 0.80: it will only download proteomes with a BUSCO score of at least 80%.
* `--ignore-taxadb-update`: ignore need to udpate ete3 taxaDB
@@ -297,7 +308,7 @@ If you have an old version of the ete3 NCBI taxonomy database, you can use this
* `--all-proteomes`: download all proteomes (reference and non-reference)
-By default, esmecata will try to downlaod the reference proteomes associated with a taxon. But if you want to download all the proteomes associated with a taxon (either if they are non reference proteome) you can use this option. Without this option non-reference proteoems can also be used if no reference proteomes are found.
+By default, esmecata will try to download the reference proteomes associated with a taxon. But if you want to download all the proteomes associated with a taxon (even if they are non-reference proteomes) you can use this option. Without this option, non-reference proteomes can also be used if no reference proteomes are found.
* `-l/--limit-proteomes`: choose the number of proteomes that will lead to the used of the selection of a subset of proteomes
@@ -307,7 +318,7 @@ To avoid working on too many proteomes, esmecata works on subset of proteomes wh
To avoid working on too little proteomes, it is possible to give an int to this option.
With this int, esmecata will select only taxon associated to at least this number of proteomes.
-For example if you use `--minimal-nb-proteomes 10`, and the lowest taxon in the taxonomic affiliation is associated with 3 proteomes, it will be ignored and a taxon with a higer taxonomic rank will be used.
+For example if you use `--minimal-nb-proteomes 10`, and the lowest taxon in the taxonomic affiliation is associated with 3 proteomes, it will be ignored and a taxon with a higher taxonomic rank will be used.
* `-r/--rank-limit`: This option limits the rank used when searching for proteomes. All the ranks superior to the given rank will be ignored. For example, if 'family' is given, only taxon ranks inferior or equal to family will be kept.
@@ -507,7 +518,7 @@ options:
--bioservices Use bioservices instead of esmecata functions for protein annotation.
````
-For each of the protein clusters kept after the clustering, esmecata will look for the annotation (GO terms, EC number, function, gene name, Interpro) in Uniprot.
+For each of the protein clusters kept after the clustering, esmecata will look for the annotation (GO terms, EC number, function, gene name, Interpro) in UniProt.
By default, esmecata will look at the annotations of each proteins from a cluster and keeps only annotation occurring in all the protein of a cluster (threshold 1 of option -p).
It is like selecting the intersection of the annotation of the cluster. This can be changed with the option `-p` and giving a float between 0 and 1.
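+
+A minimal, illustrative sketch of this `-p` threshold (not EsMeCaTa's actual code), assuming `cluster_annotations` is a list with one set of annotations per protein of the cluster:
+
+```
+def keep_annotations(cluster_annotations, p=1.0):
+    # Keep annotations present in at least a fraction p of the cluster's proteins.
+    nb_proteins = len(cluster_annotations)
+    all_annotations = set().union(*cluster_annotations)
+    return {annotation for annotation in all_annotations
+            if sum(annotation in annotations for annotations in cluster_annotations) / nb_proteins >= p}
+
+# p=1 keeps the intersection of the annotations; lower values keep annotations shared by only part of the cluster.
+print(keep_annotations([{'GO:0003677', '2.7.7.6'}, {'GO:0003677'}], p=1.0))  # {'GO:0003677'}
+```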
@@ -539,10 +550,10 @@ With this option, esmecata will extract the [expression information](https://www
* `--annotation-files`: use UniProt txt files instead of queyring Uniprot servers.
-As the `annotation step` needs a high numbers of queries to UniProt servers when working with hundreds or thousands of taxonomic affliations, it can failed due to issues with the query.
+As the `annotation step` needs a high number of queries to UniProt servers when working with hundreds or thousands of taxonomic affiliations, it can fail due to issues with the query.
A workaround (for example on a cluster), is to use the UniProt flat files containing the protein annotations.
Warning, the TrEMBL file takes a lot of space (around 150G compressed for the version 2022_05 andd 700G uncompressed).
-One of the downside of this option is that it needs lof of memory to handle indexing the TrEMBL file (around 32G using Biopython indexing) and it takes several hours to parse it.
+One of the downsides of this option is that it needs a lot of memory to handle indexing the TrEMBL file (around 32G using Biopython indexing) and it takes several hours to parse it.
But for dataset with thousands of taxonomic affiliations, this can be compensated by the fact that queyring the indexed files is more stable than querying a server.
For this option, you should give the path to the two annotation files (both the Swiss-Prot and the TrEMBL files) separated by `,`such as: `--annotation-files /db/uniprot/UniProt_2022_05/flat/uniprot_sprot.dat,/db/uniprot/UniProt_2022_05/flat/uniprot_trembl.dat`.
The names of the files must contained: `uniprot_sprot` and `uniprot_trembl` to be able to differentiate them.
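+
+As an illustration, this kind of flat-file indexing can be done with Biopython (a hedged sketch; the file path and accession below are hypothetical):
+
+```
+from Bio import SeqIO
+
+# Build an index of file offsets for random access (records are parsed on demand).
+sprot_index = SeqIO.index('uniprot_sprot.dat', 'swiss')
+if 'P12345' in sprot_index:  # hypothetical accession
+    print(sprot_index['P12345'].description)
+```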
@@ -635,7 +646,7 @@ options:
--annotation-files ANNOTATION_FILES
Use UniProt annotation files (uniprot_trembl.txt and uniprot_sprot.txt) to avoid querying UniProt REST API. Need both paths to these files separated by a ",".
--update-affiliations
- If the taxonomic affiliations were assigned from an outdated taxonomic database, this can lead to taxon not be found in ete3 database. This option tries to udpate the taxonomic affiliations using the lowest taxon name.
+ If the taxonomic affiliations were assigned from an outdated taxonomic database, this can lead to taxon not be found in ete3 database. This option tries to update the taxonomic affiliations using the lowest taxon name.
--bioservices Use bioservices instead of esmecata functions for protein annotation.
````
@@ -710,9 +721,9 @@ output_folder
├── taxonomy_diff.tsv
````
-The `cluster_founds` contains one tsv file per taxon name used by EsMeCaTa. So multiple `observation_name` can be represented by a similar taxon name to avoid redundancy and limit the disk space used. These files contain the clustered proteins The first column contains the representative proteins of a cluster and the following columns correspond to the other proteins of the same cluster. The first protein occurs two time: one as the representative member o.f the cluster and a second time as a member of the cluster.
+The `cluster_founds` folder contains one tsv file per taxon name used by EsMeCaTa. So multiple `observation_name` can be represented by a similar taxon name to avoid redundancy and limit the disk space used. These files contain the clustered proteins. The first column contains the representative proteins of a cluster and the following columns correspond to the other proteins of the same cluster. The first protein occurs two times: once as the representative member of the cluster and a second time as a member of the cluster.
-The `computed_threshold` folder contains the ratio of proteomes represented in a cluster compared to the total number of proteomes associated with a taxon. If the ratio is equal to 1, it means that all the proteomes are represented by a protein in the cluster, 0.5 means that half of the proteoems are represented in the cluster. This score is used when giving the `-t` argument.
+The `computed_threshold` folder contains the ratio of proteomes represented in a cluster compared to the total number of proteomes associated with a taxon. If the ratio is equal to 1, it means that all the proteomes are represented by a protein in the cluster, 0.5 means that half of the proteomes are represented in the cluster. This score is used when giving the `-t` argument.
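+
+As an illustration, such a file can be filtered to mimic the `-t` selection (a hedged sketch; the file name is hypothetical and the column names are those written by `esmecata precomputed`, they may differ slightly for other steps):
+
+```
+import pandas as pd
+
+df = pd.read_csv('computed_threshold/taxon_name.tsv', sep='\t')
+# Keep protein clusters found in at least half of the proteomes of the taxon.
+kept_clusters = df[df['cluster_ratio'] >= 0.5]['representative_protein'].tolist()
+```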
The `mmseqs_tmp` folder contains the intermediary files of mmseqs2 for each taxon name. To save disk space, it is recommended to delete it with the option `--remove-tmp`.
@@ -752,7 +763,7 @@ output_folder
│ └── Cluster_1.pf
│ └── ...
│ └── taxon_id.tsv
-├── dataset_annotation_observation_name.tsv
+├── function_table.tsv
├── esmecata_annotation.log
├── esmecata_metadata_annotation.json
├── stat_number_annotation.tsv
@@ -766,7 +777,7 @@ The `merge_fasta` folder contains merged protein sequences of the clustering ste
The `pathologic` folder contains one sub-folder for each `observation_name` in which there is one PathoLogic file. There is also a `taxon_id.tsv` file which corresponds to a modified version of `proteome_tax_id.tsv` with only the `observation_name` and the `taxon_id`. This folder can be used as input to [mpwt](https://github.com/AuReMe/mpwt) to reconstruct draft metabolic networks using Pathway Tools PathoLogic.
-The file `dataset_annotation_observation_name.tsv` contains the EC numbers and GO Terms present in each observation name.
+The file `function_table.tsv` contains the EC numbers and GO Terms present in each observation name.
The file `esmecata_annotation.log` contains the log associated with the command.
@@ -795,13 +806,13 @@ output_folder
├── uniref_annotation (if --uniref option)
│ └── Cluster_1.tsv
│ └── ...
-├── dataset_annotation_observation_name.tsv
+├── function_table.tsv
├── esmecata_annotation.log
├── esmecata_metadata_annotation.json
├── stat_number_annotation.tsv
````
-The `annotation` folder contains a tabulated file for each taxon name (that can be associated with multiple `observation_name`). It contains the annotation retrieved with Uniprot (protein_name, review, GO Terms, EC numbers, Interpros, Rhea IDs and gene name) associated with all the proteins in a proteome or associated with an `observation_name`.
+The `annotation` folder contains a tabulated file for each taxon name (that can be associated with multiple `observation_name`). It contains the annotation retrieved with UniProt (protein_name, review, GO Terms, EC numbers, InterPros, Rhea IDs and gene name) associated with all the proteins in a proteome or associated with an `observation_name`.
The `annotation_reference` contains annotation only for the representative proteins, but the annotation of the other proteins of the same cluster can be propagated to the reference protein if the `-p` was used.
@@ -811,7 +822,7 @@ The `pathologic` contains one sub-folder for each `observation_name` in which th
The file `esmecata_annotation.log` contains the log associated with the command.
-The `esmecata_metadata_annotation.json` serves the same purpose as the one used in `esmecata proteomes` to retrieve metadata about Uniprot release at the time of the query. It also gets the metadata associated with the command used with esmecata and the dependencies.
+The `esmecata_metadata_annotation.json` serves the same purpose as the one used in `esmecata proteomes` to retrieve metadata about UniProt release at the time of the query. It also gets the metadata associated with the command used with esmecata and the dependencies.
The `uniref_annotation` contains the annotation from the representative protein of the UniRef cluster associated with the proteins of a taxon (if the `--uniref` option was used).
@@ -877,7 +888,7 @@ output_folder
│ └── Cluster_1.pf
│ └── ...
│ └── taxon_id.tsv
- ├── dataset_annotation_observation_name.tsv
+ ├── function_table.tsv
├── esmecata_annotation.log
├── esmecata_metadata_annotation.json
├── stat_number_annotation.tsv
@@ -890,7 +901,7 @@ The files in the folders `0_proteomes`, `1_clustering` and `2_annotation` are th
The file `esmecata_workflow.log` contains the log associated with the command.
-The `esmecata_metadata_workflow.json` retrieves metadata about Uniprot release at the time of the query, the command used and its duration.
+The `esmecata_metadata_workflow.json` retrieves metadata about UniProt release at the time of the query, the command used and its duration.
`stat_number_workflow.tsv` is a tabulated file containing the number of proteomes, shared proteins, GO Terms and EC numbers found for each observation name.
@@ -898,7 +909,7 @@ The `esmecata_metadata_workflow.json` retrieves metadata about Uniprot release a
### EsMeCaTa precomputed
-The output of `esmecata precomputed` is similar to the output of `esmecata workflow` but with fewer results as the database does not ocntain all the files created by esmecata:
+The output of `esmecata precomputed` is similar to the output of `esmecata workflow` but with fewer results as the database does not contain all the files created by esmecata:
````
output_folder
@@ -928,7 +939,7 @@ output_folder
│ └── Cluster_1.pf
│ └── ...
│ └── taxon_id.tsv
- ├── dataset_annotation_observation_name.tsv
+ ├── function_table.tsv
├── esmecata_metadata_annotation.json
├── stat_number_annotation.tsv
├── esmecata_precomputed.log
@@ -1020,6 +1031,11 @@ Or by using one input file with:
`esmecata_gseapy gseapy_enrichr -f esmecata_annotation_output_folder -o output_folder --grouping selected --taxa-list manually_selected_groups.tsv`
+Additional arguments can be given to tune the gseapy or orsum options, such as (an example command is shown after this list):
+
+- `--gseapyCutOff` to set the adjusted p-value cut-off for gseapy enrichr terms (by default it is 0.05).
+- `--orsumMinTermSize` to set the MinTermSize of orsum (the minimum size of the terms to be processed).
+
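+For example (illustrative values for the two options):
+
+`esmecata_gseapy gseapy_enrichr -f esmecata_annotation_output_folder -o output_folder --grouping selected --taxa-list manually_selected_groups.tsv --gseapyCutOff 0.01 --orsumMinTermSize 10`
+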
## EsMeCaTa create_db
Create precomputed database from esmecata output folders or merge already present precomputed databases.
@@ -1056,7 +1072,22 @@ To merge several precomputed databases, you can use the following command:
`esmecata_create_db from_workflow -i esmecata_database_1.zip,esmecata_database_2.zip,esmecata_database_3.zip -o output_folder`
+## Troubleshooting
+
+### Issue with incompatible versions of ete3 and UniProt NCBI Taxonomy databases
+
+A common issue encountered when using EsMeCaTa is that the NCBI Taxonomy database present in the ete3 package (and used to parse the input taxonomic affiliations) is different from the one used by UniProt. This can lead to several issues at different levels of EsMeCaTa. A possible solution is to update the NCBI Taxonomy database of ete3 with the following command:
+
+```
+python3 -c "from ete3 import NCBITaxa; ncbi = NCBITaxa(); ncbi.update_taxonomy_database()"
+```
+
+## Citation
+
+If you have used esmecata, please cite its preprint:
+
+Arnaud Belcour, Pauline Hamon-Giraud, Alice Mataigne, Baptiste Ruiz, Yann Le Cunff, Jeanne Got, Lorraine Awhangbo, Mégane Lebreton, Clémence Frioux, Simon Dittami, Patrick Dabert, Anne Siegel, Samuel Blanquart. Estimating consensus proteomes and metabolic functions from taxonomic affiliations. bioRxiv 2022.03.16.484574; doi: https://doi.org/10.1101/2022.03.16.484574
+
## License
This software is licensed under the GNU GPL-3.0-or-later, see the [LICENSE](https://github.com/AuReMe/esmecata/blob/main/LICENSE) file for details.
-
diff --git a/article_data/README.md b/article_data/README.md
index ae1bb6e..44f9c33 100644
--- a/article_data/README.md
+++ b/article_data/README.md
@@ -3,22 +3,26 @@
## Table of contents
- [Input files from the article](#input-files-from-the-article)
- [Table of contents](#table-of-contents)
- - [Experiments](#experiments)
- - [Manually selected taxa](#manually-selected-taxa)
- - [MGnify validation](#mgnify-validation)
- - [Biogas reactor](#biogas-reactor)
- - [Algae symbionts](#algae-symbionts)
- - [Old Experiments](#old-experiments)
+ - [Experiments (preprint of 2025)](#experiments-preprint-of-2025)
+ - [Datasets](#datasets)
+ - [Manually selected taxa](#manually-selected-taxa)
+ - [MGnify validation](#mgnify-validation)
+ - [Methanogenic reactor](#methanogenic-reactor)
+ - [Algae symbionts](#algae-symbionts)
+ - [Reproduce experiments](#reproduce-experiments)
+ - [Old Experiments (preprint of 2022)](#old-experiments-preprint-of-2022)
- [Taxonomic affiliations from Gammaproteobacteria and Alveolata](#taxonomic-affiliations-from-gammaproteobacteria-and-alveolata)
- [Taxonomic affiliations from 16S and rpoB](#taxonomic-affiliations-from-16s-and-rpob)
-## Experiments
+## Experiments (preprint of 2025)
-### Manually selected taxa
+### Datasets
+
+#### Manually selected taxa
A folder containing an input file for esmecata containing 13 manually selected taxonomic affiliations from Gammaproteobacteria and Alveolata.
-### MGnify validation
+#### MGnify validation
4 input files associated with dataset from MGnify:
- [honeybee gut v1.0](https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/honeybee-gut/v1.0/)
@@ -29,15 +33,50 @@ A folder containing an input file for esmecata containing 13 manually selected t
For each dataset, it contains all the metadata associated with the genomes of the dataset. The columns `observation_name` and `taxonomic_affiliation` are added so the file can be used by EsMeCaTa.
The column `Completness` is used to filter the genomes (for the article by using the threshold of at least 90% of Completness).
-### Biogas reactor
+#### Methanogenic reactor
-An OTU table from a biogas reactor experiment containing the taxonomic assignment and the abundance of each OTU in different samples (corresponding to time of measurements of the biogas reactor).
+An OTU table from a methanogenic reactor experiment containing the taxonomic assignment and the abundance of each OTU in different samples (corresponding to measurement time points of the reactor).
-### Algae symbionts
+#### Algae symbionts
Microbiotes from Metagenomes experiment ([Burgunter et al. 2020](https://doi.org/10.3389/fmars.2020.00085) and [KleinJan et al. 2023](https://doi.org/10.1111/mec.16766)). 35 MAGs were selected from KleinJan et al. 2023 with more than 90% completion.
-## Old Experiments
+### Reproduce experiments
+
+The experiments made in the article of EsMeCaTa (a preprint is available on [bioRxiv](https://doi.org/10.1101/2022.03.16.484574)) are available in this [Zenodo archive](https://zenodo.org/records/14502342).
+
+Furthermore, this archive contains several precomputed databases that can be used to try to reproduce these experiments.
+
+To run these experiments, you will have to install esmecata with: `pip install esmecata`
+
+To do so, download one of the precomputed databases associated with the dataset you want to run (such as `precomputed_db_honeybee.zip`). Then you can use the `esmecata precomputed` command with this database on an input file. For example:
+
+```
+esmecata precomputed -i honeybee_esmecata_metdata.tsv -d precomputed_db_honeybee.zip -o output_folder_honeybee
+```
+
+For a better chance to reproduce the results, it is recommended to use the NCBI Taxonomy database associated with the dataset. The NCBI Taxonomy database can be downloaded from the Zenodo archive (file `ncbi_taxonomy_database.zip`). To know which version to use, refer to the following table:
+
+| dataset | UniProt | NCBI Taxonomy |
+|---------------------------|---------|---------------|
+| Toy example | 2023_04 | 09-2023 |
+| E. siliculosus microbiota | 2023_02 | 04-2023 |
+| Honeybee gut | 2023_05 | 12-2023 |
+| Human Oral | 2023_05 | 12-2023 |
+| Marine | 2023_05 | 12-2023 |
+| Pig Gut | 2023_05 | 12-2023 |
+| Methanogenic reactor | 2024_01 | 01-2024 |
+
+You can update the NCBI Taxonomy database used by ete3 with the following command (here we use `taxdmp_2024-01.tar.gz`, associated with the methanogenic reactor, as an example):
+
+```
+python3 -c "from ete3 import NCBITaxa; ncbi = NCBITaxa(); ncbi.update_taxonomy_database('taxdmp_2024-01.tar.gz')"
+```
+
+This will create an output folder containing the predictions for the organisms of the community.
+
+## Old Experiments (preprint of 2022)
In the [first preprint](https://www.biorxiv.org/content/10.1101/2022.03.16.484574v1) two experiments were performed:
diff --git a/article_data/biogas_reactor/biogas_reactor.tsv b/article_data/methanogenic_reactor/methanogenic_reactor.tsv
similarity index 100%
rename from article_data/biogas_reactor/biogas_reactor.tsv
rename to article_data/methanogenic_reactor/methanogenic_reactor.tsv
diff --git a/esmecata/__init__.py b/esmecata/__init__.py
index ecf33ad..dee6a55 100644
--- a/esmecata/__init__.py
+++ b/esmecata/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -13,4 +13,4 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see
-__version__ = '0.5.5'
+__version__ = '0.6.0'
diff --git a/esmecata/__main__.py b/esmecata/__main__.py
index 36c5cd0..9558c08 100644
--- a/esmecata/__main__.py
+++ b/esmecata/__main__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -139,7 +139,7 @@ def main():
'--update-affiliations',
dest='update_affiliations',
help='''If the taxonomic affiliations were assigned from an outdated taxonomic database, this can lead to taxon not be found in ete3 database. \
- This option tries to udpate the taxonomic affiliations using the lowest taxon name.''',
+ This option tries to update the taxonomic affiliations using the lowest taxon name.''',
required=False,
action='store_true',
default=None)
@@ -386,7 +386,8 @@ def main():
help='Use precomputed database to create estimated data for the run.',
parents=[
parent_parser_i_taxon, parent_parser_d, parent_parser_o,
- parent_parser_rank_limit, parent_parser_update_affiliation
+ parent_parser_rank_limit, parent_parser_update_affiliation,
+ parent_parser_thr
],
allow_abbrev=False)
@@ -460,7 +461,8 @@ def main():
args.linclust, args.minimal_number_proteomes, args.update_affiliations,
args.option_bioservices, args.eggnog_tmp_dir, args.no_dbmem)
elif args.cmd == 'precomputed':
- precomputed_parse_affiliation(args.input, args.database, args.output, args.rank_limit, args.update_affiliations)
+ precomputed_parse_affiliation(args.input, args.database, args.output, args.rank_limit, args.update_affiliations,
+ args.threshold_clustering)
logger.info("--- Total runtime %.2f seconds ---" % (time.time() - start_time))
logger.warning(f'--- Logs written in {log_file_path} ---')
diff --git a/esmecata/__main_create_database__.py b/esmecata/__main_create_database__.py
index fa89cc9..f455d3e 100644
--- a/esmecata/__main_create_database__.py
+++ b/esmecata/__main_create_database__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/__main_gseapy__.py b/esmecata/__main_gseapy__.py
index 2a60084..c2cda67 100644
--- a/esmecata/__main_gseapy__.py
+++ b/esmecata/__main_gseapy__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -119,6 +119,16 @@ def main():
metavar='INT',
default=None)
+ parent_parser_gseapyCutOff = argparse.ArgumentParser(add_help=False)
+ parent_parser_gseapyCutOff.add_argument(
+ '--gseapyCutOff',
+ dest='gseapyCutOff',
+ required=False,
+ type=float,
+ help='Adjust-Pval cutoff for gseapy enrichr, default: 0.05 (--cut-off argument of gseapy).',
+ metavar='FLOAT',
+ default=0.05)
+
# subparsers
subparsers = parser.add_subparsers(
title='subcommands',
@@ -130,7 +140,7 @@ def main():
help='Extract enriched functions from groups (either chosen from tax_rank or manually selected) using gseapy enrichr and orsum.',
parents=[
parent_parser_f, parent_parser_o, parent_parser_grouping, parent_parser_t, parent_parser_taxa_list,
- parent_parser_e, parent_parser_g, parent_parser_orsumMinTermSize
+ parent_parser_e, parent_parser_g, parent_parser_orsumMinTermSize, parent_parser_gseapyCutOff
],
allow_abbrev=False)
@@ -158,7 +168,8 @@ def main():
if args.cmd == 'gseapy_enrichr':
taxon_rank_annotation_enrichment(args.input_folder, args.output, args.grouping, taxon_rank=args.taxon_rank, taxa_lists_file=args.taxa_list,
- enzyme_data_file=args.enzyme_file, go_basic_obo_file=args.go_file, orsum_minterm_size=args.orsumMinTermSize)
+ enzyme_data_file=args.enzyme_file, go_basic_obo_file=args.go_file, orsum_minterm_size=args.orsumMinTermSize,
+ selected_adjust_pvalue_cutoff=args.gseapyCutOff)
logger.info("--- Total runtime %.2f seconds ---" % (time.time() - start_time))
logger.warning(f'--- Logs written in {log_file_path} ---')
diff --git a/esmecata/__main_report__.py b/esmecata/__main_report__.py
index 8424817..3919f5f 100644
--- a/esmecata/__main_report__.py
+++ b/esmecata/__main_report__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/core/annotation.py b/esmecata/core/annotation.py
index c8596c5..a0e5516 100644
--- a/esmecata/core/annotation.py
+++ b/esmecata/core/annotation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -1029,22 +1029,23 @@ def write_annotation_reference(protein_annotations, reference_proteins, annotati
with open(annotation_reference_file, 'w') as output_tsv:
csvwriter = csv.writer(output_tsv, delimiter='\t')
if expression_output_dict:
- csvwriter.writerow(['protein_cluster', 'cluster_members', 'protein_name', 'gene_name', 'GO', 'EC', 'Induction', 'Tissue_Specificity', 'Disruption_Phenotype'])
+ csvwriter.writerow(['protein_cluster', 'cluster_members', 'protein_name', 'gene_name', 'GO', 'EC', 'KEGG_reaction', 'Induction', 'Tissue_Specificity', 'Disruption_Phenotype'])
else:
- csvwriter.writerow(['protein_cluster', 'cluster_members', 'protein_name', 'gene_name', 'GO', 'EC'])
+ csvwriter.writerow(['protein_cluster', 'cluster_members', 'protein_name', 'gene_name', 'GO', 'EC', 'KEGG_reaction'])
for protein in protein_annotations:
protein_name = protein_annotations[protein][0]
gene_name = protein_annotations[protein][3]
cluster_members = ','.join(reference_proteins[protein])
gos = ','.join(sorted(list(protein_annotations[protein][1])))
ecs = ','.join(sorted(list(protein_annotations[protein][2])))
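+ # The KEGG_reaction column is currently written empty; the header is added for compatibility with esmecata_create_db.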
+ kegg_reaction = ''
if expression_output_dict:
induction = expression_output_dict[protein][0]
tissue_specificity = expression_output_dict[protein][1]
disruption = expression_output_dict[protein][2]
- csvwriter.writerow([protein, cluster_members, protein_name, gene_name, gos, ecs, induction, tissue_specificity, disruption])
+ csvwriter.writerow([protein, cluster_members, protein_name, gene_name, gos, ecs, kegg_reaction, induction, tissue_specificity, disruption])
else:
- csvwriter.writerow([protein, cluster_members, protein_name, gene_name, gos, ecs])
+ csvwriter.writerow([protein, cluster_members, protein_name, gene_name, gos, ecs, kegg_reaction])
def create_pathologic(base_filename, annotated_protein_to_keeps, reference_proteins, pathologic_output_file):
diff --git a/esmecata/core/clustering.py b/esmecata/core/clustering.py
index c681085..3fd525c 100644
--- a/esmecata/core/clustering.py
+++ b/esmecata/core/clustering.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/core/eggnog.py b/esmecata/core/eggnog.py
index c5453d2..0f40e73 100644
--- a/esmecata/core/eggnog.py
+++ b/esmecata/core/eggnog.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/core/precomputed.py b/esmecata/core/precomputed.py
index f38cab5..88a1e0d 100644
--- a/esmecata/core/precomputed.py
+++ b/esmecata/core/precomputed.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2024-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -17,6 +17,7 @@
import datetime
import json
import logging
+import io
import os
import pandas as pd
import shutil
@@ -28,6 +29,8 @@
from ete3 import __version__ as ete3_version
from ete3 import NCBITaxa
+from Bio import SeqIO
+from Bio import __version__ as biopython_version
from esmecata.utils import is_valid_dir
from esmecata.core.proteomes import associate_taxon_to_taxon_id, disambiguate_taxon, filter_rank_limit, create_comp_taxonomy_file
@@ -97,7 +100,8 @@ def find_proteomes_tax_ids_in_precomputed_database(json_taxonomic_affiliations,
return association_taxon_database, observation_name_not_founds
-def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_folder, rank_limit=None, update_affiliations=None):
+def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_folder, rank_limit=None, update_affiliations=None,
+ clust_threshold=0.5):
"""From a tsv file with taxonomic affiliations find the associated proteomes and download them.
Args:
@@ -105,6 +109,7 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
output_folder (str): pathname to the output folder.
rank_limit (str): rank limit to filter the affiliations (keep this rank and all inferior ranks).
update_affiliations (str): option to update taxonomic affiliations.
+ clust_threshold (float): threshold to select protein clusters according to the proportion of proteomes represented in the cluster (must be equal or superior to the one used in the creation of the precomputed db).
"""
starttime = time.time()
logger.info('|EsMeCaTa|precomputed| Reading input file.')
@@ -134,7 +139,7 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
# Metadata of the script.
options = {'input_file': input_file, 'output_folder': output_folder, 'database_taxon_file_path': database_taxon_file_path,
- 'rank_limit': rank_limit, 'update_affiliations': update_affiliations}
+ 'rank_limit': rank_limit, 'update_affiliations': update_affiliations, 'clust_threshold': clust_threshold}
options['tool_dependencies'] = {}
options['tool_dependencies']['python_package'] = {}
@@ -142,6 +147,7 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
options['tool_dependencies']['python_package']['esmecata'] = esmecata_version
options['tool_dependencies']['python_package']['ete3'] = ete3_version
options['tool_dependencies']['python_package']['pandas'] = pd.__version__
+ options['tool_dependencies']['python_package']['biopython'] = biopython_version
esmecata_metadata = {}
date = datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S')
@@ -178,6 +184,18 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
for json_key in json_data:
if json_key.endswith('_proteomes'):
proteomes_data_json = json_data[json_key]
+ if json_key.endswith('_clustering'):
+ clustering_data_json = json_data[json_key]
+
+ precomputed_db_version = json_data['database_version']
+ logger.critical('|EsMeCaTa|precomputed| EsMeCaTa is using precomputed database version {0}.'.format(precomputed_db_version))
+
+ # Check compatibility between user threshold and database threshold.
+ database_clustering_threshold = clustering_data_json['tool_options']['clust_threshold']
+ if clust_threshold < database_clustering_threshold:
+ logger.critical('|EsMeCaTa|precomputed| Selected threshold (-t) too low ({0}) compared to the database threshold ({1}); select a value superior or equal to the one of the database.'.format(clust_threshold, database_clustering_threshold))
+ sys.exit(1)
+
esmecata_metadata['precomputed_database']['esmecata_query_system'] = proteomes_data_json['esmecata_query_system']
esmecata_metadata['precomputed_database']['uniprot_release'] = proteomes_data_json['uniprot_release']
esmecata_metadata['precomputed_database']['access_time'] = proteomes_data_json['access_time']
@@ -185,6 +203,7 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
esmecata_metadata['precomputed_database']['swissprot_release_date'] = proteomes_data_json['swissprot_release_date']
esmecata_metadata['precomputed_database']['trembl_release_number'] = proteomes_data_json['trembl_release_number']
esmecata_metadata['precomputed_database']['trembl_release_date'] = proteomes_data_json['trembl_release_date']
+ esmecata_metadata['precomputed_database']['esmecata_precomputed_db_version'] = precomputed_db_version
ncbi = NCBITaxa()
@@ -214,7 +233,7 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
for observation_name in association_taxon_database:
tax_id = association_taxon_database[observation_name][1]
if tax_id != 'not_found':
- tax_name = association_taxon_database[observation_name][1]
+ tax_name = association_taxon_database[observation_name][0]
tax_id_name = proteomes_tax_id_names[tax_id]
tax_rank = taxon_data[tax_id][2]
proteome = taxon_data[tax_id][3]
@@ -266,28 +285,48 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
# For each line of the input files that has a match in the database, recreate an imitation of esmecata output folder.
for tax_id in tax_id_obs_names:
- taxi_id_name = proteomes_tax_id_names[tax_id]
-
- # Create a consensus proteoems file.
- clustering_consensus_file = os.path.join(taxi_id_name, taxi_id_name+'.faa')
- output_path_consensus_file = os.path.join(reference_proteins_consensus_fasta_folder, taxi_id_name+'.faa')
- if not os.path.exists(output_path_consensus_file):
- with archive.open(clustering_consensus_file) as zf, open(output_path_consensus_file, 'wb') as f:
- shutil.copyfileobj(zf, f)
+ tax_id_name = proteomes_tax_id_names[tax_id]
# Read annotation file
- annotation_file = os.path.join(taxi_id_name, taxi_id_name+'.tsv')
+ annotation_file = os.path.join(tax_id_name, tax_id_name+'.tsv')
with archive.open(annotation_file) as zf:
df_annotation = pd.read_csv(zf, sep='\t')
+ # Filter according to selected clust_threshold.
+ df_annotation = df_annotation[df_annotation['cluster_ratio'] >= clust_threshold]
+
# Create a computed threhsold file.
- output_computed_threshold_file = os.path.join(computed_threshold_folder, taxi_id_name+'.tsv')
+ output_computed_threshold_file = os.path.join(computed_threshold_folder, tax_id_name+'.tsv')
df_annotation[['representative_protein', 'cluster_ratio', 'proteomes']].to_csv(output_computed_threshold_file, sep='\t', index=None)
+ kept_protein_ids = set(df_annotation['representative_protein'].tolist())
for observation_name in tax_id_obs_names[tax_id]:
# Create an annotaiton_reference file for the observation name.
output_path_annotation_file = os.path.join(annotation_reference_output_folder, observation_name+'.tsv')
- df_annotation.to_csv(output_path_annotation_file, sep='\t', index=None)
+ selected_df_annotation = df_annotation[['representative_protein', 'cluster_members', 'gene_name', 'GO', 'EC', 'KEGG_reaction']]
+ selected_df_annotation.columns = ['protein_cluster', 'cluster_members', 'gene_name', 'GO', 'EC', 'KEGG_reaction']
+ selected_df_annotation.to_csv(output_path_annotation_file, sep='\t', index=None)
+
+ # Create a consensus proteomes file.
+ clustering_consensus_file = os.path.join(tax_id_name, tax_id_name+'.faa')
+ output_path_consensus_file = os.path.join(reference_proteins_consensus_fasta_folder, tax_id_name+'.faa')
+ if not os.path.exists(output_path_consensus_file):
+ records = []
+ with archive.open(clustering_consensus_file) as zf:
+ open_zf_text = io.TextIOWrapper(zf)
+ for record in SeqIO.parse(open_zf_text, 'fasta'):
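+ # UniProt FASTA headers follow 'db|ACCESSION|ENTRY_NAME'; keep only the accession so it can be compared with kept_protein_ids.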
+ if '|' in record.id:
+ protein_id = record.id.split('|')[1]
+ else:
+ protein_id = record.id
+ if protein_id in kept_protein_ids:
+ records.append(record)
+
+ if len(records) > 0:
+ logger.critical('|EsMeCaTa|precomputed| {0} protein clusters kept for taxon {1} using threshold {2}.'.format(len(records), tax_id_name, clust_threshold))
+ SeqIO.write(records, output_path_consensus_file, 'fasta')
+ else:
+ logger.critical('|EsMeCaTa|precomputed| 0 protein clusters kept for taxon {0} using threshold {1}, it will not have predictions.'.format(tax_id_name, clust_threshold))
archive.close()
@@ -301,23 +340,26 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
csvreader = csv.reader(open_clustering_file_path, delimiter='\t')
next(csvreader)
cluster_0 = []
- cluster_0_5 = []
+ selected_threshold_cluster = []
cluster_0_95 = []
for line in csvreader:
protein_cluster_threshold = float(line[1])
if protein_cluster_threshold >= 0.95:
cluster_0_95.append(line[0])
- if protein_cluster_threshold >= 0.5:
- cluster_0_5.append(line[0])
+ if protein_cluster_threshold >= clust_threshold:
+ selected_threshold_cluster.append(line[0])
if protein_cluster_threshold >= 0:
cluster_0.append(line[0])
- tax_name_clustering_numbers[clustering_file.replace('.tsv', '')] = [cluster_0, cluster_0_5, cluster_0_95]
+ if len(selected_threshold_cluster) > 0:
+ tax_name_clustering_numbers[clustering_file.replace('.tsv', '')] = [cluster_0, selected_threshold_cluster, cluster_0_95]
proteomes_taxa_id_names = get_proteomes_tax_id_name(clustering_proteome_tax_id_file)
+
clustering_numbers = {}
for observation_name in proteomes_taxa_id_names:
tax_name = proteomes_taxa_id_names[observation_name]
- clustering_numbers[observation_name] = tax_name_clustering_numbers[tax_name]
+ if tax_name in tax_name_clustering_numbers:
+ clustering_numbers[observation_name] = tax_name_clustering_numbers[tax_name]
clustering_stat_file = os.path.join(clustering_output_folder, 'stat_number_clustering.tsv')
with open(clustering_stat_file, 'w') as stat_file_open:
@@ -325,9 +367,9 @@ def precomputed_parse_affiliation(input_file, database_taxon_file_path, output_f
csvwriter.writerow(['observation_name', 'Number_protein_clusters_panproteome', 'Number_protein_clusters_kept', 'Number_protein_clusters_coreproteome'])
for observation_name in clustering_numbers:
cluster_0 = len(clustering_numbers[observation_name][0])
- cluster_0_5 = len(clustering_numbers[observation_name][1])
+ selected_threshold_cluster = len(clustering_numbers[observation_name][1])
cluster_0_95 = len(clustering_numbers[observation_name][2])
- csvwriter.writerow([observation_name, cluster_0, cluster_0_5, cluster_0_95])
+ csvwriter.writerow([observation_name, cluster_0, selected_threshold_cluster, cluster_0_95])
annotation_stat_file = os.path.join(annotation_output_folder, 'stat_number_annotation.tsv')
compute_stat_annotation(annotation_reference_output_folder, annotation_stat_file)
diff --git a/esmecata/core/proteomes.py b/esmecata/core/proteomes.py
index b064036..490127f 100644
--- a/esmecata/core/proteomes.py
+++ b/esmecata/core/proteomes.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -1260,7 +1260,7 @@ def get_taxon_obs_name(proteome_tax_id_file, selected_taxon_rank='family'):
found_taxon_id = None
if tax_rank == selected_taxon_rank:
- found_taxon_id = tax_id
+ found_taxon_id = int(tax_id)
else:
tax_id_lineages = ncbi.get_lineage(tax_id)
for tax_id in tax_id_lineages:
@@ -1451,8 +1451,9 @@ def download_proteome_file(proteome, output_proteome_file, empty_proteomes, opti
except requests.exceptions.ChunkedEncodingError as error:
logger.critical('|EsMeCaTa|proteomes| Error with proteome file %s, will be considered as empty.', proteome)
logger.critical(error)
- with open(output_proteome_file, 'wb') as f:
- f.write(b'')
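+ # Write an empty but valid gzip file as a placeholder for the failed download.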
+ with gzip.open(output_proteome_file, 'wb') as output_file:
+ output_file.write(b'')
+
else:
import bioservices
@@ -1462,7 +1463,7 @@ def download_proteome_file(proteome, output_proteome_file, empty_proteomes, opti
with open(output_proteome_file, 'wb') as f:
f.write(data_fasta)
# Check if downloaded file is empty, if yes, try with UniParc.
- if os.path.getsize(output_proteome_file) <= 20:
+ if os.path.getsize(output_proteome_file) <= 200:
time.sleep(1)
logger.info('|EsMeCaTa|proteomes| Proteome file %s seems to be empty, it seems that there is an issue with this proteome on UniProt. Try Uniparc.', proteome)
if option_bioservices is None:
@@ -1474,8 +1475,9 @@ def download_proteome_file(proteome, output_proteome_file, empty_proteomes, opti
except requests.exceptions.ChunkedEncodingError as error:
logger.critical('|EsMeCaTa|proteomes| Error with proteome file %s, will be considered as empty.', proteome)
logger.critical(error)
- with open(output_proteome_file, 'wb') as f:
- f.write(b'')
+ with gzip.open(output_proteome_file, 'wb') as output_file:
+ output_file.write(b'')
+
else:
import bioservices
uniprot_bioservices = bioservices.UniProt()
@@ -1483,7 +1485,7 @@ def download_proteome_file(proteome, output_proteome_file, empty_proteomes, opti
frmt='fasta', compress=True, progress=False)
with open(output_proteome_file, 'wb') as f:
f.write(data_fasta)
- if os.path.getsize(output_proteome_file) <= 20:
+ if os.path.getsize(output_proteome_file) <= 200:
logger.info('|EsMeCaTa|proteomes| Proteome file %s is still empty even after using UniParc.', proteome)
empty_proteomes.append(proteome)
else:
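
Two changes are made in proteomes.py: on a failed download the placeholder is now written with `gzip.open`, so it is a valid (empty) gzip member instead of a zero-byte file that would break later gzip reads, and the empty-file threshold is raised from 20 to 200 bytes, since even a compressed FASTA with no usable sequences carries some gzip framing. A hedged sketch of this fallback, where `download_fasta` is a hypothetical stand-in for the UniProt REST or bioservices call:

```python
# Sketch of the download fallback, assuming a hypothetical download_fasta(proteome) callable
# that returns gzip-compressed FASTA bytes or raises on network errors.
import gzip
import os


def fetch_proteome(proteome, output_proteome_file, empty_proteomes, download_fasta):
    try:
        data_fasta = download_fasta(proteome)
        with open(output_proteome_file, 'wb') as output_file:
            output_file.write(data_fasta)
    except Exception:
        # Write a valid but empty gzip file so downstream gzip.open() calls do not fail
        # on an invalid header.
        with gzip.open(output_proteome_file, 'wb') as output_file:
            output_file.write(b'')

    # A gzip file with no sequences is still a few dozen bytes of framing, so a ~200-byte
    # threshold is used to flag proteomes that are effectively empty.
    if os.path.getsize(output_proteome_file) <= 200:
        empty_proteomes.append(proteome)
```
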
diff --git a/esmecata/core/workflow.py b/esmecata/core/workflow.py
index bc87573..ad76357 100644
--- a/esmecata/core/workflow.py
+++ b/esmecata/core/workflow.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/gseapy/gseapy_orsum.py b/esmecata/gseapy/gseapy_orsum.py
index ea19530..370806e 100755
--- a/esmecata/gseapy/gseapy_orsum.py
+++ b/esmecata/gseapy/gseapy_orsum.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2024 Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
+# Copyright (C) 2023-2025 Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -161,7 +161,8 @@ def extract_organisms_selected(taxa_lists_file):
def taxon_rank_annotation_enrichment(annotation_folder, output_folder, grouping="tax_rank",
taxon_rank='phylum', taxa_lists_file=None,
- enzyme_data_file=None, go_basic_obo_file=None, orsum_minterm_size=None):
+ enzyme_data_file=None, go_basic_obo_file=None, orsum_minterm_size=None,
+ selected_adjust_pvalue_cutoff=0.05):
""" Run an enrichment analysis on taxon from annotation results of esmecata using gseapy.
Then filter this list with orsum.
@@ -174,6 +175,7 @@ def taxon_rank_annotation_enrichment(annotation_folder, output_folder, grouping=
enzyme_data_file (str): path to expasy enzyme.dat file, if not given, download it
go_basic_obo_file (str): path to Gene Ontology go-basic.obo file, if not given, download it
orsum_minterm_size (int): option minTermSize of orsum
+        selected_adjust_pvalue_cutoff (float): adjusted p-value cutoff for gseapy enrichr, default: 0.05
"""
starttime = time.time()
logger.info('|EsMeCaTa|gseapy_enrichr| Begin enrichment analysis.')
@@ -264,14 +266,14 @@ def taxon_rank_annotation_enrichment(annotation_folder, output_folder, grouping=
# Try to run gseapy enrichr, if no enriched results, continue.
try:
gseapy.enrichr(gene_list=organisms, gene_sets=annotation_sets, background=None,
- outdir=os.path.join(output_dir, tax_name))
+ outdir=os.path.join(output_dir, tax_name), cutoff=selected_adjust_pvalue_cutoff)
except ValueError as error:
- logger.info('|EsMeCaTa|gseapy_enrichr| No enrichred functions with p-value cutoff < 0.05 for {0}.'.format(tax_name))
+            logger.info('|EsMeCaTa|gseapy_enrichr| No enriched functions with adjusted p-value cutoff < {0} for {1}.'.format(selected_adjust_pvalue_cutoff, tax_name))
continue
if os.path.exists(os.path.join(output_dir, tax_name, 'gs_ind_0.human.enrichr.reports.pdf')):
- # If enriched results, extract the ones with an adjusted p-value inferior to 0.05 to output folder.
+ # If enriched results, extract the ones with an adjusted p-value inferior to selected_adjust_pvalue_cutoff to output folder.
df = pd.read_csv(os.path.join(output_dir, tax_name, 'gs_ind_0.human.enrichr.reports.txt'), sep='\t')
- df = df[df['Adjusted P-value'] < 0.05]
+ df = df[df['Adjusted P-value'] < selected_adjust_pvalue_cutoff]
enriched_elements[tax_name] = df.set_index('Term')['Adjusted P-value'].to_dict()
df.sort_values('Adjusted P-value', inplace=True)
@@ -287,6 +289,12 @@ def taxon_rank_annotation_enrichment(annotation_folder, output_folder, grouping=
for org in enriched_elements:
csvwriter.writerow([org, *[enriched_elements[org][element] if element in enriched_elements[org] else 'NA' for element in all_elments]])
+    # Check that there are files in the orsum input folder.
+ orsum_input_folder_files = os.listdir(orsum_input_folder)
+ if len(orsum_input_folder_files) == 0:
+        logger.critical('|EsMeCaTa|gseapy_enrichr| No enriched files from gseapy enrichr to use as input for orsum. It seems that no enriched terms were found.')
+ sys.exit(1)
+
# Run orsum to filter list of enriched annotations.
logger.info('|EsMeCaTa|gseapy_enrichr| Launch orsum visualisation.')
orsum_output_folder = os.path.join(output_folder, 'orsum_output_folder')
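
The new `selected_adjust_pvalue_cutoff` parameter is forwarded to `gseapy.enrichr` through its `cutoff` argument and reused when filtering the enrichment report, replacing the hard-coded 0.05. A condensed sketch of that pattern (`organisms`, `annotation_sets` and `output_dir` stand for values built earlier in the function):

```python
# Sketch of running gseapy enrichr with a configurable adjusted p-value cutoff and
# filtering the report on the same value. Placeholders: organisms, annotation_sets, output_dir.
import os

import gseapy
import pandas as pd


def run_enrichment(organisms, annotation_sets, output_dir, tax_name, adjust_pvalue_cutoff=0.05):
    try:
        gseapy.enrichr(gene_list=organisms, gene_sets=annotation_sets, background=None,
                       outdir=os.path.join(output_dir, tax_name), cutoff=adjust_pvalue_cutoff)
    except ValueError:
        # gseapy raises ValueError when no term passes the cutoff (handled in the patch above).
        return None

    report_file = os.path.join(output_dir, tax_name, 'gs_ind_0.human.enrichr.reports.txt')
    if not os.path.exists(report_file):
        return None
    df = pd.read_csv(report_file, sep='\t')
    # Keep only terms whose adjusted p-value is below the selected cutoff.
    df = df[df['Adjusted P-value'] < adjust_pvalue_cutoff]
    return df.set_index('Term')['Adjusted P-value'].to_dict()
```
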
diff --git a/esmecata/precomputed/create_database.py b/esmecata/precomputed/create_database.py
index 86e10c8..8a17fe6 100644
--- a/esmecata/precomputed/create_database.py
+++ b/esmecata/precomputed/create_database.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2024 Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
+# Copyright (C) 2024-2025 Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -17,6 +17,7 @@
import pandas as pd
import json
import csv
+import logging
import zipfile
from esmecata.core.eggnog import get_proteomes_tax_id_name
@@ -24,6 +25,10 @@
from multiprocessing import Pool
from collections import Counter
+from ete3 import NCBITaxa
+
+logger = logging.getLogger(__name__)
+
def get_proteomes_tax_id_name(proteomes_tax_id_file_path):
""" Extract tax_name + tax_id associated with observation name.
@@ -135,106 +140,17 @@ def copy_file(annotation_file, proteomes_taxa_names, computed_threshold_folder,
shutil.copyfile(consensus_sequence_file_path, taxon_consensus_sequence_file_path)
-def concat_tsv_file(input_tsv_file_path, tsv_file_concat_path, consensus_sequence_folder, annotation_reference_folder):
- """ Concat tsv file for database by checking that fasta and tsv files exist.
-
- Args:
- input_tsv_file_path (str): path to input tsv file.
- tsv_file_concat_path (str): path to tsv file containing concatenation.
- consensus_sequence_folder (str): path to consensus folder.
- annotation_reference_folder (str): path to annotation reference folder.
- """
- if not os.path.exists(tsv_file_concat_path):
- df_proteomes_tax_id_file_path = pd.read_csv(input_tsv_file_path, sep='\t')
- # Check that corresponding files exist.
- existing_files = [os.path.exists(consensus_sequence_folder, tax_id_name+'.faa') and os.path.exists(annotation_reference_folder, tax_id_name+'.tsv')
- for tax_id_name in df_proteomes_tax_id_file_path['tax_id_name']]
- df_proteomes_tax_id_file_path = df_proteomes_tax_id_file_path[existing_files]
- df_proteomes_tax_id_file_path.to_csv(tsv_file_concat_path, sep='\t', index=False)
- else:
- df_proteomes_tax_id_file_path = pd.read_csv(input_tsv_file_path, sep='\t')
- # Check that corresponding files exist.
- existing_files = [os.path.exists(consensus_sequence_folder, tax_id_name, tax_id_name+'.faa') and os.path.exists(annotation_reference_folder, tax_id_name, tax_id_name+'.tsv')
- for tax_id_name in df_proteomes_tax_id_file_path['tax_id_name']]
- df_proteomes_tax_id_file_path = df_proteomes_tax_id_file_path[existing_files]
- df_tsv_file_concat = pd.read_csv(tsv_file_concat_path, sep='\t')
- df_concat = pd.concat([df_proteomes_tax_id_file_path, df_tsv_file_concat])
- df_concat.to_csv(tsv_file_concat_path, sep='\t', index=False)
-
-
-def create_database(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_database_folder, nb_core=1):
- """ Create esmecata database from a run of esmecata.
-
- Args:
- esmecata_proteomes_folder (str): path to esmecata proteomes folder.
- esmecata_clustering_folder (str): path to esmecata clustering folder.
- esmecata_annotation_folder (str): path to esmecata anntoation folder.
- output_database_folder (str): path to output folder containing zip database of esmecata.
- nb_core (int): number of core to use when creating database.
- """
- if not os.path.exists(output_database_folder):
- os.mkdir(output_database_folder)
-
- consensus_sequence_folder = os.path.join(esmecata_clustering_folder, 'reference_proteins_consensus_fasta')
- computed_threshold_folder = os.path.join(esmecata_clustering_folder, 'computed_threshold')
-
- annotation_reference_folder = os.path.join(esmecata_annotation_folder, 'annotation_reference')
-
- # Create/merge proteome_tax_id file for each taxon level.
- proteomes_tax_id_file_path = os.path.join(esmecata_clustering_folder, 'proteome_tax_id.tsv')
- proteomes_tax_id_file_database_path = os.path.join(output_database_folder, 'proteome_tax_id.tsv')
- concat_tsv_file(proteomes_tax_id_file_path, proteomes_tax_id_file_database_path, consensus_sequence_folder, annotation_reference_folder)
-
- # Create/merge stat_proteome file for each taxon level.
- stat_number_proteome_file_path = os.path.join(esmecata_proteomes_folder, 'stat_number_proteome.tsv')
- stat_number_proteome_database_path = os.path.join(output_database_folder, 'stat_number_proteome.tsv')
- concat_tsv_file(stat_number_proteome_file_path, stat_number_proteome_database_path)
-
- # Create/merge stat_number_clustering file for each taxon level.
- stat_number_clustering_file_path = os.path.join(esmecata_clustering_folder, 'stat_number_clustering.tsv')
- stat_number_clustering_database_path = os.path.join(output_database_folder, 'stat_number_clustering.tsv')
- concat_tsv_file(stat_number_clustering_file_path, stat_number_clustering_database_path)
-
- # Create/merge stat_proteome file for each taxon level.
- stat_number_annotation_file_path = os.path.join(esmecata_annotation_folder, 'stat_number_annotation.tsv')
- stat_number_annotation_database_path = os.path.join(output_database_folder, 'stat_number_annotation.tsv')
- concat_tsv_file(stat_number_annotation_file_path, stat_number_annotation_database_path)
-
- proteomes_taxa_names = get_proteomes_tax_id_name(proteomes_tax_id_file_path)
-
- # Use multiprocessing to copy fasta and annotation files.
- database_copy_pool = Pool(nb_core)
-
- similar_tax_id_names = []
- for observation_name in proteomes_taxa_names:
- similar_tax_id_names.append(proteomes_taxa_names[observation_name])
- similar_tax_id_names = Counter(similar_tax_id_names)
-
- multiprocessing_data = []
- for annotation_file in os.listdir(annotation_reference_folder):
- observation_name = os.path.splitext(annotation_file)[0]
- taxon_id_name = proteomes_taxa_names[observation_name]
- # If more than 1 observation name associated with a similar tax_id_name, do not use multiprocessing as it leads error.
- if similar_tax_id_names[taxon_id_name] > 1:
- copy_file(annotation_file, proteomes_taxa_names, computed_threshold_folder, annotation_reference_folder, consensus_sequence_folder, output_database_folder)
- else:
- multiprocessing_data.append([annotation_file, proteomes_taxa_names, computed_threshold_folder, annotation_reference_folder, consensus_sequence_folder, output_database_folder])
- database_copy_pool.starmap(copy_file, multiprocessing_data)
-
- database_copy_pool.close()
- database_copy_pool.join()
-
-
-def create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_database_folder, nb_core=1):
+def create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_folder, nb_core=1):
""" Create esmecata database from a run of esmecata.
Args:
esmecata_proteomes_folder (str): path to esmecata proteomes folder.
esmecata_clustering_folder (str): path to esmecata clustering folder.
esmecata_annotation_folder (str): path to esmecata anntoation folder.
- output_database_folder (str): path to output folder containing zip database of esmecata.
+ output_folder (str): path to output folder.
nb_core (int): number of core to use when creating database.
"""
+ output_database_folder = os.path.join(output_folder, 'database_folder')
if not os.path.exists(output_database_folder):
os.mkdir(output_database_folder)
@@ -242,6 +158,19 @@ def create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_fold
proteomes_tax_id_file_path = os.path.join(esmecata_clustering_folder, 'proteome_tax_id.tsv')
df_proteomes_tax_id = pd.read_csv(proteomes_tax_id_file_path, sep='\t')
df_proteomes_tax_id.set_index('observation_name', inplace=True)
+ obs_name_proteomes = df_proteomes_tax_id['proteome'].to_dict()
+
+    # Get dict mapping tax IDs to observation names.
+ obs_name_tax_ids = df_proteomes_tax_id['tax_id'].to_dict()
+ tax_id_obs_names = {}
+ for observation_name in obs_name_tax_ids:
+ tax_id = obs_name_tax_ids[observation_name]
+ if tax_id not in tax_id_obs_names:
+ tax_id_obs_names[tax_id] = [observation_name]
+ else:
+ tax_id_obs_names[tax_id].append(observation_name)
+
+ esmecata_proteomes_downloaded_folder = os.path.join(esmecata_proteomes_folder, 'proteomes')
# Create/merge stat_proteome file for each taxon level.
stat_number_proteome_file_path = os.path.join(esmecata_proteomes_folder, 'stat_number_proteome.tsv')
@@ -253,6 +182,10 @@ def create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_fold
df_stat_number_clustering = pd.read_csv(stat_number_clustering_file_path, sep='\t')
df_stat_number_clustering.set_index('observation_name', inplace=True)
+ df_stat_join = df_proteomes_tax_id.join(df_stat_number_clustering)
+ df_stat_join.set_index('tax_id', inplace=True)
+ tax_id_protein_clusters = df_stat_join['Number_protein_clusters_kept'].to_dict()
+
# Create/merge stat_proteome file for each taxon level.
stat_number_annotation_file_path = os.path.join(esmecata_annotation_folder, 'stat_number_annotation.tsv')
df_stat_number_annotation = pd.read_csv(stat_number_annotation_file_path, sep='\t')
@@ -266,6 +199,49 @@ def create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_fold
annotation_reference_folder = os.path.join(esmecata_annotation_folder, 'annotation_reference')
+ ncbi = NCBITaxa()
+ database_tree = ncbi.get_topology([org_tax_id for org_tax_id in df_proteomes_tax_id['tax_id']])
+ tax_id_issues = {}
+    # Parse all descendants of the root to search for child tax IDs with fewer protein clusters than their parents.
+    # This identifies taxa with potential errors.
+ for descendant in database_tree.iter_descendants():
+ descendant_childrens = descendant.children
+ parent_tax_id = int(descendant.name)
+ if parent_tax_id in tax_id_protein_clusters:
+ parent_protein_nb = tax_id_protein_clusters[parent_tax_id]
+ else:
+ parent_protein_nb = None
+ if descendant_childrens != []:
+ for children in descendant.children:
+ children_tax_id = int(children.name)
+ if children_tax_id in tax_id_protein_clusters:
+ children_protein_nb = tax_id_protein_clusters[children_tax_id]
+ else:
+ children_protein_nb = None
+ if children_protein_nb is not None and parent_protein_nb is not None:
+                    # Compute the percentage of protein clusters in the child compared to its parent.
+                    percent_protein_nb = (children_protein_nb / parent_protein_nb) * 100
+                    # If the child has less than 20% of the protein clusters of its parent, add it to the issues.
+ if percent_protein_nb < 20:
+ tax_id_issues[children_tax_id] = (children_protein_nb, parent_tax_id, parent_protein_nb)
+
+ predictions_with_issues = []
+ issues_tax_id_folder = os.path.join(output_folder, 'issues_tax_id')
+ if not os.path.exists(issues_tax_id_folder):
+ os.mkdir(issues_tax_id_folder)
+ for tax_id in tax_id_issues:
+ children_protein_nb = tax_id_issues[tax_id][0]
+ parent_tax_id = tax_id_issues[tax_id][1]
+ parent_protein_nb = tax_id_issues[tax_id][2]
+ for observation_name in tax_id_obs_names[tax_id]:
+ observation_name_issue_folder = os.path.join(issues_tax_id_folder, observation_name)
+ if not os.path.exists(observation_name_issue_folder):
+ os.mkdir(observation_name_issue_folder)
+ for proteome in obs_name_proteomes[observation_name].split(','):
+ shutil.copyfile(os.path.join(esmecata_proteomes_downloaded_folder, proteome+'.faa.gz'), os.path.join(observation_name_issue_folder, proteome+'.faa.gz'))
+                logger.info(f'|EsMeCaTa|create_db| {observation_name} has less than 20% of the protein clusters ({children_protein_nb}) of its parent taxon ({parent_tax_id}, {parent_protein_nb} protein clusters).')
+ predictions_with_issues.append(observation_name)
+
# Use multiprocessing to copy fasta and annotation files.
database_copy_pool = Pool(nb_core)
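
The consistency check introduced above builds the NCBI topology of all database tax IDs with ete3 and flags any child taxon whose number of kept protein clusters is below 20% of its parent's, which usually indicates a problematic prediction. A condensed sketch of the traversal, assuming `tax_id_protein_clusters` maps integer tax IDs to kept protein-cluster counts:

```python
# Sketch of the parent/child protein-cluster comparison on the NCBI taxonomy topology.
# tax_id_protein_clusters: dict mapping an NCBI tax ID (int) to its number of kept protein clusters.
from ete3 import NCBITaxa


def find_suspicious_taxa(tax_ids, tax_id_protein_clusters, min_percent=20):
    ncbi = NCBITaxa()
    tree = ncbi.get_topology(tax_ids)
    tax_id_issues = {}
    for node in tree.iter_descendants():
        parent_protein_nb = tax_id_protein_clusters.get(int(node.name))
        if not parent_protein_nb:
            # No cluster count for this parent (or zero clusters): nothing to compare against.
            continue
        for child in node.children:
            child_protein_nb = tax_id_protein_clusters.get(int(child.name))
            if child_protein_nb is None:
                continue
            # Flag children that keep fewer than min_percent % of their parent's protein clusters.
            if (child_protein_nb / parent_protein_nb) * 100 < min_percent:
                tax_id_issues[int(child.name)] = (child_protein_nb, int(node.name), parent_protein_nb)
    return tax_id_issues
```
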
@@ -282,20 +258,24 @@ def create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_fold
multiprocessing_data = []
for annotation_file in os.listdir(annotation_reference_folder):
observation_name = os.path.splitext(annotation_file)[0]
- taxon_id_name = proteomes_taxa_names[observation_name]
- reference_proteins_consensus_fasta_file = os.path.join(consensus_sequence_folder, taxon_id_name+'.faa')
- if taxon_id_name not in already_processed_tax_id and os.path.exists(reference_proteins_consensus_fasta_file):
- proteome_tax_id_data.append([taxon_id_name, *df_proteomes_tax_id.loc[observation_name].to_list()])
- stat_number_proteome_data.append([taxon_id_name, *df_stat_number_proteome.loc[observation_name].to_list()])
- stat_number_clustering_data.append([taxon_id_name, *df_stat_number_clustering.loc[observation_name].to_list()])
- stat_number_annotation_data.append([taxon_id_name, *df_stat_number_annotation.loc[observation_name].to_list()])
-
- already_processed_tax_id.append(taxon_id_name)
- # If more than 1 observation name associated with a similar tax_id_name, do not use multiprocessing as it leads error.
- if similar_tax_id_names[taxon_id_name] > 1:
- copy_file(annotation_file, proteomes_taxa_names, computed_threshold_folder, annotation_reference_folder, consensus_sequence_folder, output_database_folder)
+ if observation_name in predictions_with_issues:
+            logger.info(f'|EsMeCaTa|create_db| {observation_name} will not be added to the database as it has too few protein clusters compared to its parent taxon.')
else:
- multiprocessing_data.append([annotation_file, proteomes_taxa_names, computed_threshold_folder, annotation_reference_folder, consensus_sequence_folder, output_database_folder])
+ taxon_id_name = proteomes_taxa_names[observation_name]
+ reference_proteins_consensus_fasta_file = os.path.join(consensus_sequence_folder, taxon_id_name+'.faa')
+ if taxon_id_name not in already_processed_tax_id and os.path.exists(reference_proteins_consensus_fasta_file):
+ proteome_tax_id_data.append([taxon_id_name, *df_proteomes_tax_id.loc[observation_name].to_list()])
+ stat_number_proteome_data.append([taxon_id_name, *df_stat_number_proteome.loc[observation_name].to_list()])
+ stat_number_clustering_data.append([taxon_id_name, *df_stat_number_clustering.loc[observation_name].to_list()])
+ stat_number_annotation_data.append([taxon_id_name, *df_stat_number_annotation.loc[observation_name].to_list()])
+
+ already_processed_tax_id.append(taxon_id_name)
+                # If more than 1 observation name is associated with the same tax_id_name, do not use multiprocessing as it leads to errors.
+ if similar_tax_id_names[taxon_id_name] > 1:
+ copy_file(annotation_file, proteomes_taxa_names, computed_threshold_folder, annotation_reference_folder, consensus_sequence_folder, output_database_folder)
+ else:
+ multiprocessing_data.append([annotation_file, proteomes_taxa_names, computed_threshold_folder, annotation_reference_folder, consensus_sequence_folder, output_database_folder])
+
database_copy_pool.starmap(copy_file, multiprocessing_data)
database_copy_pool.close()
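
The restructured loop first skips the observation names flagged above, then keeps the existing rule of copying files sequentially whenever several observation names share the same `tax_id_name` (to avoid concurrent writes to the same destination) and through a multiprocessing pool otherwise. A simplified sketch of that dispatch, where `copy_file` stands for the module's copy helper and `copy_args` bundles its remaining arguments:

```python
# Sketch of dispatching file copies: sequential for shared tax_id_names, parallel otherwise.
# copy_file and copy_args are placeholders for the module's helper and its extra arguments.
import os
from collections import Counter
from multiprocessing import Pool


def dispatch_copies(annotation_files, proteomes_taxa_names, copy_file, copy_args, nb_core=1):
    # Count how many observation names map to each tax_id_name.
    similar_tax_id_names = Counter(proteomes_taxa_names.values())
    multiprocessing_data = []
    for annotation_file in annotation_files:
        observation_name = os.path.splitext(annotation_file)[0]
        taxon_id_name = proteomes_taxa_names[observation_name]
        if similar_tax_id_names[taxon_id_name] > 1:
            # Several observation names target the same taxon files: copy sequentially to
            # avoid concurrent writes to the same destination.
            copy_file(annotation_file, *copy_args)
        else:
            multiprocessing_data.append((annotation_file, *copy_args))
    with Pool(nb_core) as copy_pool:
        copy_pool.starmap(copy_file, multiprocessing_data)
```
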
@@ -333,7 +313,7 @@ def create_database_from_esmecata_run(esmecata_proteomes_folder, esmecata_cluste
os.mkdir(output_database_folder)
create_json(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_database_folder, database_version)
- create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_database_folder, nb_core)
+ create_database_from_run(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_folder, nb_core)
compress_database_file = os.path.join(output_folder, 'esmecata_database')
if not os.path.exists(compress_database_file):
@@ -357,38 +337,13 @@ def create_database_from_esmecata_workflow_run(esmecata_workflow_folder, output_
create_database_from_esmecata_run(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_folder, database_version, nb_core)
-def create_database_from_multiple_esmecata_runs(esmecata_proteomes_folders, esmecata_clustering_folders, esmecata_annotation_folders, output_folder, database_version, cpu_number=1):
- """ Extract data from esmecata runs and create an esmecata database from these.
+def merge_db_files(list_db_files, output_folder):
+    """ From a list of zip files of precomputed databases of esmecata, merge them into one database.
Args:
- esmecata_proteomes_folders (list): list of path to esmecata proteomes folder.
- esmecata_clustering_folders (list): list of path to esmecata clustering folder.
- esmecata_annotation_folders (list): list of path to esmecata anntoation folder.
+        list_db_files (list): list of paths to precomputed zip files of esmecata.
output_folder (str): path to output folder containing zip database of esmecata.
- nb_core (int): number of core to use when creating database.
"""
- if not os.path.exists(output_folder):
- os.mkdir(output_folder)
-
- output_database_folder = os.path.join(output_folder, 'database_folder')
- if not os.path.exists(output_database_folder):
- os.mkdir(output_database_folder)
-
- for index, esmecata_proteomes_folder in enumerate(esmecata_proteomes_folders):
- taxon_level = os.path.basename(esmecata_proteomes_folder).split('_')[0]
- esmecata_clustering_folder = esmecata_clustering_folders[index]
- esmecata_annotation_folder = esmecata_annotation_folders[index]
- create_json(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_database_folder, database_version, taxon_level)
- create_database(esmecata_proteomes_folder, esmecata_clustering_folder, esmecata_annotation_folder, output_database_folder, cpu_number)
-
- compress_database_file = os.path.join(output_folder, 'esmecata_database')
- if not os.path.exists(compress_database_file):
- os.mkdir(compress_database_file)
- shutil.make_archive(compress_database_file, 'zip', output_database_folder)
- shutil.rmtree(output_database_folder)
-
-
-def merge_db_files(list_db_files, output_folder):
stat_number_proteome_df = pd.DataFrame(columns=['observation_name', 'Number_proteomes', 'Input_taxon_Name', 'Taxon_rank', 'EsMeCaTa_used_taxon', 'EsMeCaTa_used_rank', 'only_reference_proteome_used'])
stat_number_clustering_df = pd.DataFrame(columns=['observation_name', 'Number_protein_clusters_panproteome', 'Number_protein_clusters_kept', 'Number_protein_clusters_coreproteome'])
stat_number_annotation_df = pd.DataFrame(columns=['observation_name', 'Number_go_terms', 'Number_ecs'])
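
`merge_db_files` combines several precomputed zip databases into one. Reading the per-database statistics tables directly from the archives can be done with `zipfile` and `pandas`; the sketch below assumes each archive stores the TSV files at its root, which may differ from the actual database layout:

```python
# Sketch: read one statistics TSV from each precomputed zip database and concatenate them.
# The file name and its location at the archive root are assumptions for illustration.
import zipfile

import pandas as pd


def concat_stats_from_zips(list_db_files, stat_file_name='stat_number_annotation.tsv'):
    frames = []
    for db_file in list_db_files:
        with zipfile.ZipFile(db_file) as archive:
            with archive.open(stat_file_name) as stat_file:
                frames.append(pd.read_csv(stat_file, sep='\t'))
    return pd.concat(frames, ignore_index=True)
```
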
diff --git a/esmecata/precomputed/create_input_precomputation.py b/esmecata/precomputed/create_input_precomputation.py
index 8fdee4e..4cd7873 100644
--- a/esmecata/precomputed/create_input_precomputation.py
+++ b/esmecata/precomputed/create_input_precomputation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2024 Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
+# Copyright (C) 2024-2025 Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
diff --git a/esmecata/report/ec_sunburst_per_model.py b/esmecata/report/ec_sunburst_per_model.py
index 30e35f9..a60c384 100755
--- a/esmecata/report/ec_sunburst_per_model.py
+++ b/esmecata/report/ec_sunburst_per_model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2024 Alice Mataigne - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2023-2025 Alice Mataigne - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/report/esmecata2taxontology.py b/esmecata/report/esmecata2taxontology.py
index 98614d0..7fad632 100644
--- a/esmecata/report/esmecata2taxontology.py
+++ b/esmecata/report/esmecata2taxontology.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2024 Alice Mataigne and Pauline Hamon-Giraud - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2023-2025 Alice Mataigne and Pauline Hamon-Giraud - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/report/esmecata_compression.py b/esmecata/report/esmecata_compression.py
index f6348f7..87eacab 100644
--- a/esmecata/report/esmecata_compression.py
+++ b/esmecata/report/esmecata_compression.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2024 Alice Mataigne and Pauline Hamon-Giraud - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2023-2025 Alice Mataigne and Pauline Hamon-Giraud - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/report/report_creation.py b/esmecata/report/report_creation.py
index ebc5b52..5edc9c9 100755
--- a/esmecata/report/report_creation.py
+++ b/esmecata/report/report_creation.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-# Copyright (C) 2023-2024 Pauline Hamon-Giraud and Alice Mataigne - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2023-2025 Pauline Hamon-Giraud and Alice Mataigne - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/report/stats_workflow_figures.py b/esmecata/report/stats_workflow_figures.py
index 795655c..e317df3 100755
--- a/esmecata/report/stats_workflow_figures.py
+++ b/esmecata/report/stats_workflow_figures.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Copyright (C) 2023-2024 Pauline Hamon-Giraud and Alice Mataigne - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2023-2025 Pauline Hamon-Giraud and Alice Mataigne - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/report/workflow_create_report.py b/esmecata/report/workflow_create_report.py
index 9fc6e69..bf82c38 100755
--- a/esmecata/report/workflow_create_report.py
+++ b/esmecata/report/workflow_create_report.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2024 Alice Mataigne and Pauline Hamon-Giraud - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2023-2025 Alice Mataigne and Pauline Hamon-Giraud - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Arnaud Belcour - Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/esmecata/utils.py b/esmecata/utils.py
index 4061d7f..7d94d7a 100644
--- a/esmecata/utils.py
+++ b/esmecata/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
+# Copyright (C) 2021-2025 Arnaud Belcour - Inria, Univ Rennes, CNRS, IRISA Dyliss
# Univ. Grenoble Alpes, Inria, Microcosme
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
diff --git a/pictures/esmecata.svg b/pictures/esmecata.svg
deleted file mode 100644
index 406d2cc..0000000
--- a/pictures/esmecata.svg
+++ /dev/null
@@ -1,8662 +0,0 @@
-
-
-
-
diff --git a/pictures/esmecata_10.svg b/pictures/esmecata_10.svg
index e1e28aa..5cab1ef 100644
--- a/pictures/esmecata_10.svg
+++ b/pictures/esmecata_10.svg
@@ -8,7 +8,7 @@
version="1.1"
id="svg5"
sodipodi:docname="esmecata_10.svg"
- inkscape:version="1.3.2 (1:1.3.2+202311252150+091e20ef0f)"
+ inkscape:version="1.4 (1:1.4+202410151213+e7c3feb100)"
xml:space="preserve"
inkscape:export-filename="esmecata_10.pdf"
inkscape:export-xdpi="96"
@@ -27,9 +27,9 @@
inkscape:pagecheckerboard="true"
inkscape:document-units="mm"
showgrid="false"
- inkscape:zoom="0.29457855"
- inkscape:cx="1322.228"
- inkscape:cy="1772.0231"
+ inkscape:zoom="0.20829849"
+ inkscape:cx="2726.8561"
+ inkscape:cy="2157.9609"
inkscape:window-width="2490"
inkscape:window-height="1376"
inkscape:window-x="70"
@@ -401,8 +401,8 @@
inkscape:stockid="Arrow2Lend"
inkscape:isstock="true"
viewBox="0 0 12.705841 9.5264135"
- markerWidth="12.705841"
- markerHeight="9.5264139"
+ markerWidth="12.70584106"
+ markerHeight="9.52641355"
preserveAspectRatio="xMidYMid">GenusFamilyPhylumOrder List of Taxonomic List of Taxonomicaffiliations TA TA 3 affiliations (TA)Search on UniProt Search for proteomes to find in UniProtproteomesKeep lowesttaxon rankKeep lowest taxon rankwith proteomeswith enough proteomesClustering withCluster with MMSeqs2 Filtering according Filtering according to proteomes to proteomesrepresentationrepresentativeness sequencesRules: - at least 5 proteomes - proteomes wit BUSCO - proteomes with BUSCOscore >= 80 %Annotation with Annotate witheggNOG-mapperGenusFamilyPhylumOrder TA 2GenusFamilyPhylumOrder TA 1eggNOG-mapperGenusFamilyPhylumOrderTA tableProtein cluster 2Protein cluster 3Protein cluster 1Step 1 applied on TA1 Step 1 Step 2 applied on TA 1 Step 2Step 3 applied on TA 1 Step 3Output TSV file TSV TSV fileTA 1TA 2TA 1TA 3 file5 proteomesFamilyOutput TSV file
+ id="path13" />GenusFamilyPhylumOrderTAGenusFamilyPhylumOrderTA
diff --git a/test/buchnera_database.zip b/test/buchnera_database.zip
index 5d9ccbc..fe772b4 100644
Binary files a/test/buchnera_database.zip and b/test/buchnera_database.zip differ
diff --git a/test/test_annotation.py b/test/test_annotation.py
index 2d92ecd..17bdb20 100644
--- a/test/test_annotation.py
+++ b/test/test_annotation.py
@@ -12,16 +12,8 @@
[],
'secA']}
-UP000119554_ANOTATIONS = {'O91464': ['Genome polyprotein', 'UniProtKB reviewed (Swiss-Prot)',
- ['GO:0039618', 'GO:0044162', 'GO:0006508', 'GO:0003723', 'GO:0039694', 'GO:0016020', 'GO:0019062',
- 'GO:0003968', 'GO:0016887', 'GO:0005524', 'GO:0044178', 'GO:0046718', 'GO:0003724', 'GO:0005198',
- 'GO:0039522', 'GO:0006351', 'GO:0004197', 'GO:0034220', 'GO:0015267'],
- ['2.7.7.48', '3.6.4.13'],
- ['IPR000199', 'IPR000605', 'IPR001205', 'IPR001676', 'IPR004004',
- 'IPR007053', 'IPR007094', 'IPR009003', 'IPR014759', 'IPR027417',
- 'IPR029053', 'IPR033703', 'IPR043128', 'IPR043502', 'IPR043504'],
- ['13065', '21248'],
- '']}
+UP000119554_ANOTATIONS = {'O91464': ['Genome polyprotein', 'UniProtKB reviewed (Swiss-Prot)', ['GO:0003723', 'GO:0003724', 'GO:0003968', 'GO:0004197', 'GO:0005198', 'GO:0005524', 'GO:0006351', 'GO:0006508', 'GO:0015267', 'GO:0016020', 'GO:0016887', 'GO:0018144', 'GO:0019062', 'GO:0034220', 'GO:0039522', 'GO:0039618', 'GO:0039694', 'GO:0044162', 'GO:0044178', 'GO:0046718'], ['3.6.4.13', '2.7.7.48'], ['IPR000199', 'IPR000605', 'IPR001205', 'IPR001676', 'IPR004004', 'IPR007053', 'IPR007094', 'IPR009003', 'IPR014759', 'IPR027417', 'IPR029053', 'IPR033703', 'IPR043128', 'IPR043502', 'IPR043504'], ['13065', '21248'], '']}
+
TREMBL_ANNOTATIONS = {'A0A1B2H8S9': ['Siroheme synthase', False,
['GO:0009236', 'GO:0043115', 'GO:0032259', 'GO:0019354', 'GO:0004851', 'GO:0051266', 'GO:0051287'],
diff --git a/test/test_gseapy.py b/test/test_gseapy.py
index 8c3f7c6..ed4e93a 100644
--- a/test/test_gseapy.py
+++ b/test/test_gseapy.py
@@ -42,6 +42,42 @@ def test_taxon_rank_annotation_enrichment_selected_cli():
shutil.rmtree(output_folder)
+def test_taxon_rank_annotation_enrichment_selected_cutoff():
+ input_folder = 'annotation_output'
+ output_folder = 'output_folder'
+ grouping = 'selected'
+ taxa_list_file = 'taxa_list.tsv'
+ expected_terms = ['1.1.1.86', '2.8.4.1']
+ if not os.path.exists(output_folder):
+ os.mkdir(output_folder)
+ taxon_rank_annotation_enrichment(input_folder, output_folder, grouping, taxa_lists_file=taxa_list_file, orsum_minterm_size=4, selected_adjust_pvalue_cutoff=0.1)
+
+ result_file = os.path.join(output_folder, 'orsum_output_folder', 'filteredResult-Summary.tsv')
+ df = pd.read_csv(result_file, sep='\t')
+ representing_terms = df['Representing term id']
+
+ assert sorted(representing_terms) == sorted(expected_terms)
+ shutil.rmtree(output_folder)
+
+
+def test_taxon_rank_annotation_enrichment_selected_cutoff_cli():
+ input_folder = 'annotation_output'
+ output_folder = 'output_folder'
+ grouping = 'selected'
+ taxa_list_file = 'taxa_list.tsv'
+ expected_terms = ['1.1.1.86', '2.8.4.1']
+ if not os.path.exists(output_folder):
+ os.mkdir(output_folder)
+ subprocess.call(['esmecata_gseapy', 'gseapy_enrichr', '-f', input_folder, '-o', output_folder, '--grouping', grouping, '--taxa-list', taxa_list_file, '--orsumMinTermSize', '4', '--gseapyCutOff', '0.1'])
+
+ result_file = os.path.join(output_folder, 'orsum_output_folder', 'filteredResult-Summary.tsv')
+ df = pd.read_csv(result_file, sep='\t')
+ representing_terms = df['Representing term id']
+
+ assert sorted(representing_terms) == sorted(expected_terms)
+ shutil.rmtree(output_folder)
+
+
def test_taxon_rank_annotation_enrichment_tax_rank():
input_folder = 'annotation_output'
output_folder = 'output_folder'
@@ -74,4 +110,3 @@ def test_taxon_rank_annotation_enrichment_tax_rank_cli():
assert sorted(representing_terms) == sorted(expected_terms)
shutil.rmtree(output_folder)
-
\ No newline at end of file
diff --git a/test/test_proteomes.py b/test/test_proteomes.py
index 880c132..6a30163 100644
--- a/test/test_proteomes.py
+++ b/test/test_proteomes.py
@@ -13,6 +13,7 @@
update_taxonomy
TAXONOMIES = {'id_1': 'cellular organisms;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Yersiniaceae;Yersinia;species not found'}
+EXPECTED_PROTEOMES_DATA = [['UP000000815', 94.54545454545455, 'full', '632', True, [['Chromosome', 'Yersinia pestis CO92 complete genome'], ['Plasmid pCD1', 'Yersinia pestis CO92 plasmid pCD1'], ['Plasmid pMT1', 'Yersinia pestis CO92 plasmid pMT1'], ['Plasmid pPCP1', 'Yersinia pestis CO92 plasmid pPCP1']]], ['UP000255169', 98.86363636363636, 'full', '29486', True, [['Unassembled WGS sequence', 'Yersinia ruckeri']]], ['UP000000642', 100.0, 'full', '393305', False, [['Chromosome', 'Yersinia enterocolitica subsp. enterocolitica 8081 complete genome'], ['Plasmid pYVe8081', 'Yersinia enterocolitica subsp. enterocolitica 8081 plasmid pYVe8081 complete genome']]], ['UP000001011', 100.0, 'full', '273123', False, [['Chromosome', 'Yersinia pseudotuberculosis IP32953 genome'], ['Plasmid pYV', 'Yersinia pseudotuberculosis IP32953 pYV plasmid'], ['Plasmid pYptb32953', 'Yersinia pseudotuberculosis IP32953 cryptic plasmid']]], ['UP000001019', 99.31818181818181, 'full', '632', False, [['Chromosome', 'Yersinia pestis biovar Microtus str. 91001'], ['Plasmid pCD1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pCD1'], ['Plasmid pCRY', 'Yersinia pestis biovar Microtus str. 91001 plasmid pCRY'], ['Plasmid pMT1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pMT1'], ['Plasmid pPCP1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1']]], ['UP000001971', 99.0909090909091, 'full', '360102', False, [['Chromosome', 'Yersinia pestis Antiqua'], ['Plasmid pMT', 'Yersinia pestis Antiqua plasmid pMT'], ['Plasmid pPCP', 'Yersinia pestis Antiqua plasmid pPCP'], ['Plasmid pCD', 'Yersinia pestis Antiqua plasmid pCD']]], ['UP000002412', 99.54545454545455, 'full', '349747', False, [['Chromosome', 'Yersinia pseudotuberculosis IP 31758'], ['Plasmid plasmid_59kb', 'Yersinia pseudotuberculosis IP 31758 plasmid plasmid_59kb'], ['Plasmid plasmid_153kb', 'Yersinia pseudotuberculosis IP 31758 plasmid plasmid_153kb']]], ['UP000002490', 98.18181818181819, 'full', '632', False, [['Chromosome', 'Yersinia pestis KIM10+'], ['Plasmid pMT-1', 'Yersinia pestis KIM10+ plasmid pMT-1']]], ['UP000008084', 98.18181818181819, 'full', '930944', False, [['Chromosome', 'Yersinia enterocolitica subsp. palearctica Y11'], ['Plasmid pYV03', 'Yersinia enterocolitica subsp. 
palearctica Y11 plasmid pYVO3 complete sequence']]], ['UP000008936', 99.54545454545455, 'full', '377628', False, [['Chromosome', 'Yersinia pestis Nepal516'], ['Plasmid pMT', 'Yersinia pestis Nepal516 plasmid pMT'], ['Plasmid pPCP', 'Yersinia pestis Nepal516 plasmid pPCP']]], ['UP000038204', 99.0909090909091, 'full', '367190', False, [['Unassembled WGS sequence', 'Yersinia similis']]], ['UP000038750', 99.0909090909091, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000040088', 98.63636363636363, 'full', '263819', False, [['Unassembled WGS sequence', 'Yersinia aleksiciae']]], ['UP000040841', 98.86363636363636, 'full', '33060', False, [['Unassembled WGS sequence', 'Yersinia mollaretii']]], ['UP000041356', 99.0909090909091, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000041595', 98.63636363636363, 'full', '29483', False, [['Unassembled WGS sequence', 'Yersinia aldovae']]], ['UP000041882', 99.0909090909091, 'full', '2890319', False, [['Unassembled WGS sequence', 'Yersinia thracica']]], ['UP000042054', 98.86363636363636, 'full', '29485', False, [['Unassembled WGS sequence', 'Yersinia rohdei']]], ['UP000043316', 98.63636363636363, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000045824', 99.0909090909091, 'full', '28152', False, [['Unassembled WGS sequence', 'Yersinia kristensenii']]], ['UP000045840', 98.18181818181819, 'full', '1288385', False, [['Unassembled WGS sequence', 'Yersinia pekkanenii']]], ['UP000046784', 98.86363636363636, 'full', '29484', False, [['Unassembled WGS sequence', 'Yersinia frederiksenii']]], ['UP000048687', 98.86363636363636, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000048841', 99.0909090909091, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000196440', 99.54545454545455, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000220513', 100.0, 'full', '28152', False, [['Unassembled WGS sequence', 'Yersinia kristensenii']]], ['UP000229378', 99.54545454545455, 'full', '634', False, [['Unassembled WGS sequence', 'Yersinia bercovieri']]], ['UP000230961', 99.0909090909091, 'full', '2339259', False, [['Plasmid p2_50K', 'Yersinia enterocolitica LC20 plasmid plasmid2_50K'], ['Plasmid p1_80K', 'Yersinia enterocolitica LC20 plasmid plasmid1_80K'], ['Chromosome', 'Yersinia enterocolitica LC20']]], ['UP000254835', 99.0909090909091, 'full', '29484', False, [['Unassembled WGS sequence', 'Yersinia frederiksenii']]], ['UP000255087', 97.72727272727273, 'full', '633', False, [['Unassembled WGS sequence', 'Yersinia pseudotuberculosis']]], ['UP000265864', 100.0, 'full', '1604335', False, [['Chromosome', 'Yersinia rochesterensis strain ATCC BAA-2637 chromosome.'], ['Plasmid pUnnamed1', 'Yersinia rochesterensis strain ATCC BAA-2637 plasmid pUnnamed1.'], ['Plasmid pUnnamed2', 'Yersinia rochesterensis strain ATCC BAA-2637 plasmid pUnnamed2.']]], ['UP000326807', 100.0, 'full', '1234662', False, [['Plasmid pmt', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pMT'], ['Plasmid pckf', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pCKF'], ['Plasmid ppcp', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pPCP'], ['Chromosome', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 chromosome'], ['Plasmid pcd', 'Yersinia pestis subsp. pestis bv. 
Medievalis strain SCPM-O-B-6530 plasmid pCD']]], ['UP000464402', 93.86363636363636, 'full', '2607663', False, [['Chromosome', 'Yersinia canariae strain NCTC 14382 chromosome']]], ['UP000595309', 100.0, 'full', '630', False, [['Chromosome', 'Yersinia enterocolitica strain FDAARGOS_1082 chromosome']]], ['UP000698240', 100.0, 'full', '419257', False, [['Unassembled WGS sequence', 'Yersinia massiliensis']]], ['UP000712947', 100.0, 'full', '33060', False, [['Unassembled WGS sequence', 'Yersinia mollaretii']]], ['UP001146905', 96.81818181818181, 'full', '385964', False, [['Unassembled WGS sequence', 'Yersinia pestis subsp. pestis']]], ['UP001182355', 99.77272727272727, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000004430', 94.0909090909091, 'full', '373665', False, [['Plasmid pIP1202', 'Yersinia pestis biovar Orientalis str. IP275 plasmid pIP1202'], ['Unassembled WGS sequence', 'Yersinia pestis biovar Orientalis str. IP275']]], ['UP000251879', 98.4090909090909, 'full', '632', False, [['Unassembled WGS sequence', 'Yersinia pestis']]], ['UP001167864', 99.77272727272727, 'full', '685706', False, [['Unassembled WGS sequence', 'Yersinia nurmii']]]]
def test_taxonomic_affiliation_to_taxon_id_offline():
@@ -146,7 +147,7 @@ def test_update_taxonomy_yersinia_offline():
outdated_taxonomic_affiliation = 'Bacteria;Yersinia'
new_taxonomic_affiliation = update_taxonomy('test', outdated_taxonomic_affiliation)
- expected_taconomic_affiliation = 'root;cellular organisms;Bacteria;Pseudomonadota;Gammaproteobacteria;Enterobacterales;Yersiniaceae;Yersinia'
+ expected_taconomic_affiliation = 'root;cellular organisms;Bacteria;Pseudomonadati;Pseudomonadota;Gammaproteobacteria;Enterobacterales;Yersiniaceae;Yersinia'
assert new_taxonomic_affiliation == expected_taconomic_affiliation
@@ -154,22 +155,20 @@ def test_update_taxonomy_yersinia_offline():
def test_rest_query_proteomes_online():
expected_proteoems = ['UP000255169', 'UP000000815']
expected_organism_ids = {'632': ['UP000000815'], '29486': ['UP000255169']}
- expected_proteome_data = [['UP000000815', 94.54545454545455, 'full', '632', True, [['Chromosome', 'Yersinia pestis CO92 complete genome'], ['Plasmid pCD1', 'Yersinia pestis CO92 plasmid pCD1'], ['Plasmid pMT1', 'Yersinia pestis CO92 plasmid pMT1'], ['Plasmid pPCP1', 'Yersinia pestis CO92 plasmid pPCP1']]], ['UP000255169', 98.86363636363636, 'full', '29486', True, [['Unassembled WGS sequence', 'Yersinia ruckeri']]], ['UP000000642', 100.0, 'full', '393305', False, [['Chromosome', 'Yersinia enterocolitica subsp. enterocolitica 8081 complete genome'], ['Plasmid pYVe8081', 'Yersinia enterocolitica subsp. enterocolitica 8081 plasmid pYVe8081 complete genome']]], ['UP000001011', 100.0, 'full', '273123', False, [['Chromosome', 'Yersinia pseudotuberculosis IP32953 genome'], ['Plasmid pYV', 'Yersinia pseudotuberculosis IP32953 pYV plasmid'], ['Plasmid pYptb32953', 'Yersinia pseudotuberculosis IP32953 cryptic plasmid']]], ['UP000001019', 99.31818181818181, 'full', '632', False, [['Chromosome', 'Yersinia pestis biovar Microtus str. 91001'], ['Plasmid pCD1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pCD1'], ['Plasmid pCRY', 'Yersinia pestis biovar Microtus str. 91001 plasmid pCRY'], ['Plasmid pMT1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pMT1'], ['Plasmid pPCP1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1']]], ['UP000001971', 99.0909090909091, 'full', '360102', False, [['Chromosome', 'Yersinia pestis Antiqua'], ['Plasmid pMT', 'Yersinia pestis Antiqua plasmid pMT'], ['Plasmid pPCP', 'Yersinia pestis Antiqua plasmid pPCP'], ['Plasmid pCD', 'Yersinia pestis Antiqua plasmid pCD']]], ['UP000002412', 99.54545454545455, 'full', '349747', False, [['Chromosome', 'Yersinia pseudotuberculosis IP 31758'], ['Plasmid plasmid_59kb', 'Yersinia pseudotuberculosis IP 31758 plasmid plasmid_59kb'], ['Plasmid plasmid_153kb', 'Yersinia pseudotuberculosis IP 31758 plasmid plasmid_153kb']]], ['UP000002490', 98.18181818181819, 'full', '632', False, [['Chromosome', 'Yersinia pestis KIM10+'], ['Plasmid pMT-1', 'Yersinia pestis KIM10+ plasmid pMT-1']]], ['UP000008084', 98.18181818181819, 'full', '930944', False, [['Chromosome', 'Yersinia enterocolitica subsp. palearctica Y11'], ['Plasmid pYV03', 'Yersinia enterocolitica subsp. 
palearctica Y11 plasmid pYVO3 complete sequence']]], ['UP000008936', 99.54545454545455, 'full', '377628', False, [['Chromosome', 'Yersinia pestis Nepal516'], ['Plasmid pMT', 'Yersinia pestis Nepal516 plasmid pMT'], ['Plasmid pPCP', 'Yersinia pestis Nepal516 plasmid pPCP']]], ['UP000038204', 99.0909090909091, 'full', '367190', False, [['Unassembled WGS sequence', 'Yersinia similis']]], ['UP000038750', 99.0909090909091, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000040088', 98.63636363636363, 'full', '263819', False, [['Unassembled WGS sequence', 'Yersinia aleksiciae']]], ['UP000040841', 98.86363636363636, 'full', '33060', False, [['Unassembled WGS sequence', 'Yersinia mollaretii']]], ['UP000041356', 99.0909090909091, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000041595', 98.63636363636363, 'full', '29483', False, [['Unassembled WGS sequence', 'Yersinia aldovae']]], ['UP000041882', 99.0909090909091, 'full', '2890319', False, [['Unassembled WGS sequence', 'Yersinia thracica']]], ['UP000042054', 98.86363636363636, 'full', '29485', False, [['Unassembled WGS sequence', 'Yersinia rohdei']]], ['UP000043316', 98.63636363636363, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000045824', 99.0909090909091, 'full', '28152', False, [['Unassembled WGS sequence', 'Yersinia kristensenii']]], ['UP000045840', 98.18181818181819, 'full', '1288385', False, [['Unassembled WGS sequence', 'Yersinia pekkanenii']]], ['UP000046784', 98.86363636363636, 'full', '29484', False, [['Unassembled WGS sequence', 'Yersinia frederiksenii']]], ['UP000048687', 98.86363636363636, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000048841', 99.0909090909091, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000196440', 99.54545454545455, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000220513', 100.0, 'full', '28152', False, [['Unassembled WGS sequence', 'Yersinia kristensenii']]], ['UP000229378', 99.54545454545455, 'full', '634', False, [['Unassembled WGS sequence', 'Yersinia bercovieri']]], ['UP000230961', 99.0909090909091, 'full', '2339259', False, [['Plasmid p2_50K', 'Yersinia enterocolitica LC20 plasmid plasmid2_50K'], ['Plasmid p1_80K', 'Yersinia enterocolitica LC20 plasmid plasmid1_80K'], ['Chromosome', 'Yersinia enterocolitica LC20']]], ['UP000254835', 99.0909090909091, 'full', '29484', False, [['Unassembled WGS sequence', 'Yersinia frederiksenii']]], ['UP000255087', 97.72727272727273, 'full', '633', False, [['Unassembled WGS sequence', 'Yersinia pseudotuberculosis']]], ['UP000265864', 100.0, 'full', '1604335', False, [['Chromosome', 'Yersinia rochesterensis strain ATCC BAA-2637 chromosome.'], ['Plasmid pUnnamed1', 'Yersinia rochesterensis strain ATCC BAA-2637 plasmid pUnnamed1.'], ['Plasmid pUnnamed2', 'Yersinia rochesterensis strain ATCC BAA-2637 plasmid pUnnamed2.']]], ['UP000464402', 93.86363636363636, 'full', '2607663', False, [['Chromosome', 'Yersinia canariae strain NCTC 14382 chromosome']]], ['UP000595309', 100.0, 'full', '630', False, [['Chromosome', 'Yersinia enterocolitica strain FDAARGOS_1082 chromosome']]], ['UP000698240', 100.0, 'full', '419257', False, [['Unassembled WGS sequence', 'Yersinia massiliensis']]], ['UP000712947', 100.0, 'full', '33060', False, [['Unassembled WGS sequence', 'Yersinia mollaretii']]], ['UP001146905', 96.81818181818181, 'full', '385964', False, [['Unassembled 
WGS sequence', 'Yersinia pestis subsp. pestis']]], ['UP001182355', 99.77272727272727, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000033938', 99.0909090909091, 'full', '632', False, [['Plasmid pMT1', 'Yersinia pestis strain Strain 125 B plague Bombay plasmid pMT1'], ['Unassembled WGS sequence', 'Yersinia pestis']]], ['UP000055342', 99.54545454545455, 'full', '1345705', False, [['Plasmid pMT', 'Yersinia pestis 1522 plasmid pMT'], ['Chromosome', 'Yersinia pestis 1522'], ['Plasmid pCD', 'Yersinia pestis 1522 plasmid pCD']]], ['UP000196736', 93.4090909090909, 'full', '935293', False, [['Unassembled WGS sequence', 'Yersinia entomophaga']]], ['UP000326807', 100.0, 'full', '1234662', False, [['Plasmid pmt', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pMT'], ['Plasmid pckf', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pCKF'], ['Plasmid ppcp', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pPCP'], ['Chromosome', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 chromosome'], ['Plasmid pcd', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pCD']]]]
-
+
proteomes, organism_ids, proteomes_data = rest_query_proteomes('test', 629, 'Yersinia', 0.8, all_proteomes=None)
time.sleep(1)
assert set(expected_proteoems) == set(proteomes)
for organism in expected_organism_ids:
assert set(expected_organism_ids[organism]).issubset(set(organism_ids[organism]))
- for data in expected_proteome_data:
+ for data in EXPECTED_PROTEOMES_DATA:
assert data in proteomes_data
def test_rest_query_proteomes_bioservices_online():
expected_proteoems = ['UP000255169', 'UP000000815']
expected_organism_ids = {'632': ['UP000000815'], '29486': ['UP000255169']}
- expected_proteome_data = [['UP000000815', 94.54545454545455, 'full', '632', True, [['Chromosome', 'Yersinia pestis CO92 complete genome'], ['Plasmid pCD1', 'Yersinia pestis CO92 plasmid pCD1'], ['Plasmid pMT1', 'Yersinia pestis CO92 plasmid pMT1'], ['Plasmid pPCP1', 'Yersinia pestis CO92 plasmid pPCP1']]], ['UP000255169', 98.86363636363636, 'full', '29486', True, [['Unassembled WGS sequence', 'Yersinia ruckeri']]], ['UP000000642', 100.0, 'full', '393305', False, [['Chromosome', 'Yersinia enterocolitica subsp. enterocolitica 8081 complete genome'], ['Plasmid pYVe8081', 'Yersinia enterocolitica subsp. enterocolitica 8081 plasmid pYVe8081 complete genome']]], ['UP000001011', 100.0, 'full', '273123', False, [['Chromosome', 'Yersinia pseudotuberculosis IP32953 genome'], ['Plasmid pYV', 'Yersinia pseudotuberculosis IP32953 pYV plasmid'], ['Plasmid pYptb32953', 'Yersinia pseudotuberculosis IP32953 cryptic plasmid']]], ['UP000001019', 99.31818181818181, 'full', '632', False, [['Chromosome', 'Yersinia pestis biovar Microtus str. 91001'], ['Plasmid pCD1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pCD1'], ['Plasmid pCRY', 'Yersinia pestis biovar Microtus str. 91001 plasmid pCRY'], ['Plasmid pMT1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pMT1'], ['Plasmid pPCP1', 'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1']]], ['UP000001971', 99.0909090909091, 'full', '360102', False, [['Chromosome', 'Yersinia pestis Antiqua'], ['Plasmid pMT', 'Yersinia pestis Antiqua plasmid pMT'], ['Plasmid pPCP', 'Yersinia pestis Antiqua plasmid pPCP'], ['Plasmid pCD', 'Yersinia pestis Antiqua plasmid pCD']]], ['UP000002412', 99.54545454545455, 'full', '349747', False, [['Chromosome', 'Yersinia pseudotuberculosis IP 31758'], ['Plasmid plasmid_59kb', 'Yersinia pseudotuberculosis IP 31758 plasmid plasmid_59kb'], ['Plasmid plasmid_153kb', 'Yersinia pseudotuberculosis IP 31758 plasmid plasmid_153kb']]], ['UP000002490', 98.18181818181819, 'full', '632', False, [['Chromosome', 'Yersinia pestis KIM10+'], ['Plasmid pMT-1', 'Yersinia pestis KIM10+ plasmid pMT-1']]], ['UP000008084', 98.18181818181819, 'full', '930944', False, [['Chromosome', 'Yersinia enterocolitica subsp. palearctica Y11'], ['Plasmid pYV03', 'Yersinia enterocolitica subsp. 
palearctica Y11 plasmid pYVO3 complete sequence']]], ['UP000008936', 99.54545454545455, 'full', '377628', False, [['Chromosome', 'Yersinia pestis Nepal516'], ['Plasmid pMT', 'Yersinia pestis Nepal516 plasmid pMT'], ['Plasmid pPCP', 'Yersinia pestis Nepal516 plasmid pPCP']]], ['UP000038204', 99.0909090909091, 'full', '367190', False, [['Unassembled WGS sequence', 'Yersinia similis']]], ['UP000038750', 99.0909090909091, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000040088', 98.63636363636363, 'full', '263819', False, [['Unassembled WGS sequence', 'Yersinia aleksiciae']]], ['UP000040841', 98.86363636363636, 'full', '33060', False, [['Unassembled WGS sequence', 'Yersinia mollaretii']]], ['UP000041356', 99.0909090909091, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000041595', 98.63636363636363, 'full', '29483', False, [['Unassembled WGS sequence', 'Yersinia aldovae']]], ['UP000041882', 99.0909090909091, 'full', '2890319', False, [['Unassembled WGS sequence', 'Yersinia thracica']]], ['UP000042054', 98.86363636363636, 'full', '29485', False, [['Unassembled WGS sequence', 'Yersinia rohdei']]], ['UP000043316', 98.63636363636363, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000045824', 99.0909090909091, 'full', '28152', False, [['Unassembled WGS sequence', 'Yersinia kristensenii']]], ['UP000045840', 98.18181818181819, 'full', '1288385', False, [['Unassembled WGS sequence', 'Yersinia pekkanenii']]], ['UP000046784', 98.86363636363636, 'full', '29484', False, [['Unassembled WGS sequence', 'Yersinia frederiksenii']]], ['UP000048687', 98.86363636363636, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000048841', 99.0909090909091, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000196440', 99.54545454545455, 'full', '631', False, [['Unassembled WGS sequence', 'Yersinia intermedia']]], ['UP000220513', 100.0, 'full', '28152', False, [['Unassembled WGS sequence', 'Yersinia kristensenii']]], ['UP000229378', 99.54545454545455, 'full', '634', False, [['Unassembled WGS sequence', 'Yersinia bercovieri']]], ['UP000230961', 99.0909090909091, 'full', '2339259', False, [['Plasmid p2_50K', 'Yersinia enterocolitica LC20 plasmid plasmid2_50K'], ['Plasmid p1_80K', 'Yersinia enterocolitica LC20 plasmid plasmid1_80K'], ['Chromosome', 'Yersinia enterocolitica LC20']]], ['UP000254835', 99.0909090909091, 'full', '29484', False, [['Unassembled WGS sequence', 'Yersinia frederiksenii']]], ['UP000255087', 97.72727272727273, 'full', '633', False, [['Unassembled WGS sequence', 'Yersinia pseudotuberculosis']]], ['UP000265864', 100.0, 'full', '1604335', False, [['Chromosome', 'Yersinia rochesterensis strain ATCC BAA-2637 chromosome.'], ['Plasmid pUnnamed1', 'Yersinia rochesterensis strain ATCC BAA-2637 plasmid pUnnamed1.'], ['Plasmid pUnnamed2', 'Yersinia rochesterensis strain ATCC BAA-2637 plasmid pUnnamed2.']]], ['UP000464402', 93.86363636363636, 'full', '2607663', False, [['Chromosome', 'Yersinia canariae strain NCTC 14382 chromosome']]], ['UP000595309', 100.0, 'full', '630', False, [['Chromosome', 'Yersinia enterocolitica strain FDAARGOS_1082 chromosome']]], ['UP000698240', 100.0, 'full', '419257', False, [['Unassembled WGS sequence', 'Yersinia massiliensis']]], ['UP000712947', 100.0, 'full', '33060', False, [['Unassembled WGS sequence', 'Yersinia mollaretii']]], ['UP001146905', 96.81818181818181, 'full', '385964', False, [['Unassembled 
WGS sequence', 'Yersinia pestis subsp. pestis']]], ['UP001182355', 99.77272727272727, 'full', '630', False, [['Unassembled WGS sequence', 'Yersinia enterocolitica']]], ['UP000033938', 99.0909090909091, 'full', '632', False, [['Plasmid pMT1', 'Yersinia pestis strain Strain 125 B plague Bombay plasmid pMT1'], ['Unassembled WGS sequence', 'Yersinia pestis']]], ['UP000055342', 99.54545454545455, 'full', '1345705', False, [['Plasmid pMT', 'Yersinia pestis 1522 plasmid pMT'], ['Chromosome', 'Yersinia pestis 1522'], ['Plasmid pCD', 'Yersinia pestis 1522 plasmid pCD']]], ['UP000196736', 93.4090909090909, 'full', '935293', False, [['Unassembled WGS sequence', 'Yersinia entomophaga']]], ['UP000326807', 100.0, 'full', '1234662', False, [['Plasmid pmt', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pMT'], ['Plasmid pckf', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pCKF'], ['Plasmid ppcp', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pPCP'], ['Chromosome', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 chromosome'], ['Plasmid pcd', 'Yersinia pestis subsp. pestis bv. Medievalis strain SCPM-O-B-6530 plasmid pCD']]]]
proteomes, organism_ids, proteomes_data = rest_query_proteomes('test', 629, 'Yersinia', 0.8, all_proteomes=None, option_bioservices=True)
time.sleep(1)
@@ -177,7 +176,7 @@ def test_rest_query_proteomes_bioservices_online():
assert set(expected_proteoems) == set(proteomes)
for organism in expected_organism_ids:
assert set(expected_organism_ids[organism]).issubset(set(organism_ids[organism]))
- for data in expected_proteome_data:
+ for data in EXPECTED_PROTEOMES_DATA:
assert data in proteomes_data
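The hunk above makes the test iterate over the module-level constant EXPECTED_PROTEOMES_DATA instead of a local expected_proteome_data list: the large Yersinia fixture reproduced earlier in this diff is defined once at the top of test/test_proteomes.py and shared between tests. A minimal sketch of that pattern; the fixture content is abbreviated and the check_expected_proteomes helper name is illustrative only, not part of the actual test suite:

# Module-level fixture: the full list of expected proteome entries is defined once
# and reused by several online tests (content abbreviated here).
EXPECTED_PROTEOMES_DATA = [
    ['UP000008936', 99.54545454545455, 'full', '377628', False,
     [['Chromosome', 'Yersinia pestis Nepal516'],
      ['Plasmid pMT', 'Yersinia pestis Nepal516 plasmid pMT'],
      ['Plasmid pPCP', 'Yersinia pestis Nepal516 plasmid pPCP']]],
    # ... remaining entries elided ...
]

def check_expected_proteomes(proteomes_data):
    # Every expected entry must appear verbatim in the data returned by the REST query.
    for data in EXPECTED_PROTEOMES_DATA:
        assert data in proteomes_data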
@@ -208,6 +207,7 @@ def test_find_proteome_rest_all_proteomes_online():
['UP000267007', 77.8984350047908, 'full', '6265', False, [['Unassembled WGS sequence', 'Toxocara canis']]]]
proteomes, organism_ids, proteomes_data = rest_query_proteomes('test', 33256, 'Ascaridoidea', 0.8, all_proteomes=True)
+
time.sleep(1)
assert set(expected_proteoems) == set(proteomes)
@@ -220,60 +220,7 @@ def test_find_proteome_rest_all_proteomes_online():
def test_find_non_reference_proteome_sparql_online():
expected_proteoems = ['UP000829720']
expected_organism_ids = {'1534307': ['UP000829720']}
- expected_proteome_data = [['UP000824540', 67.91208791208791, 'full', '121402', True, [['Unassembled WGS sequence', 'Albula glossodonta'], ['Unassembled WGS sequence', 'Albula glossodonta']]],
- ['UP000829720', 85.71428571428571, 'full', '1534307', True,
- [['Chromosome 25', 'Albula goreensis ecotype Florida chromosome 25, whole genome shotgun sequence.'],
- ['Chromosome 25', 'Albula goreensis ecotype Florida chromosome 25, whole genome shotgun sequence.'],
- ['Chromosome 20', 'Albula goreensis ecotype Florida chromosome 20, whole genome shotgun sequence.'],
- ['Chromosome 20', 'Albula goreensis ecotype Florida chromosome 20, whole genome shotgun sequence.'],
- ['Chromosome 12', 'Albula goreensis ecotype Florida chromosome 12, whole genome shotgun sequence.'],
- ['Chromosome 12', 'Albula goreensis ecotype Florida chromosome 12, whole genome shotgun sequence.'],
- ['Chromosome 11', 'Albula goreensis ecotype Florida chromosome 11, whole genome shotgun sequence.'],
- ['Chromosome 11', 'Albula goreensis ecotype Florida chromosome 11, whole genome shotgun sequence.'],
- ['Chromosome 21', 'Albula goreensis ecotype Florida chromosome 21, whole genome shotgun sequence.'],
- ['Chromosome 21', 'Albula goreensis ecotype Florida chromosome 21, whole genome shotgun sequence.'],
- ['Chromosome 17', 'Albula goreensis ecotype Florida chromosome 17, whole genome shotgun sequence.'],
- ['Chromosome 17', 'Albula goreensis ecotype Florida chromosome 17, whole genome shotgun sequence.'],
- ['Chromosome 15', 'Albula goreensis ecotype Florida chromosome 15, whole genome shotgun sequence.'],
- ['Chromosome 15', 'Albula goreensis ecotype Florida chromosome 15, whole genome shotgun sequence.'],
- ['Unassembled WGS sequence', 'Albula goreensis'], ['Unassembled WGS sequence', 'Albula goreensis'],
- ['Chromosome 16', 'Albula goreensis ecotype Florida chromosome 16, whole genome shotgun sequence.'],
- ['Chromosome 16', 'Albula goreensis ecotype Florida chromosome 16, whole genome shotgun sequence.'],
- ['Chromosome 19', 'Albula goreensis ecotype Florida chromosome 19, whole genome shotgun sequence.'],
- ['Chromosome 19', 'Albula goreensis ecotype Florida chromosome 19, whole genome shotgun sequence.'],
- ['Chromosome 2', 'Albula goreensis ecotype Florida chromosome 2, whole genome shotgun sequence.'],
- ['Chromosome 2', 'Albula goreensis ecotype Florida chromosome 2, whole genome shotgun sequence.'],
- ['Chromosome 13', 'Albula goreensis ecotype Florida chromosome 13, whole genome shotgun sequence.'],
- ['Chromosome 13', 'Albula goreensis ecotype Florida chromosome 13, whole genome shotgun sequence.'],
- ['Chromosome 6', 'Albula goreensis ecotype Florida chromosome 6, whole genome shotgun sequence.'],
- ['Chromosome 6', 'Albula goreensis ecotype Florida chromosome 6, whole genome shotgun sequence.'],
- ['Chromosome 22', 'Albula goreensis ecotype Florida chromosome 22, whole genome shotgun sequence.'],
- ['Chromosome 22', 'Albula goreensis ecotype Florida chromosome 22, whole genome shotgun sequence.'],
- ['Chromosome 9', 'Albula goreensis ecotype Florida chromosome 9, whole genome shotgun sequence.'],
- ['Chromosome 9', 'Albula goreensis ecotype Florida chromosome 9, whole genome shotgun sequence.'],
- ['Chromosome 14', 'Albula goreensis ecotype Florida chromosome 14, whole genome shotgun sequence.'],
- ['Chromosome 14', 'Albula goreensis ecotype Florida chromosome 14, whole genome shotgun sequence.'],
- ['Chromosome 7', 'Albula goreensis ecotype Florida chromosome 7, whole genome shotgun sequence.'],
- ['Chromosome 7', 'Albula goreensis ecotype Florida chromosome 7, whole genome shotgun sequence.'],
- ['Chromosome 23', 'Albula goreensis ecotype Florida chromosome 23, whole genome shotgun sequence.'],
- ['Chromosome 23', 'Albula goreensis ecotype Florida chromosome 23, whole genome shotgun sequence.'],
- ['Chromosome 1', 'Albula goreensis ecotype Florida chromosome 1, whole genome shotgun sequence.'],
- ['Chromosome 1', 'Albula goreensis ecotype Florida chromosome 1, whole genome shotgun sequence.'],
- ['Chromosome 18', 'Albula goreensis ecotype Florida chromosome 18, whole genome shotgun sequence.'],
- ['Chromosome 18', 'Albula goreensis ecotype Florida chromosome 18, whole genome shotgun sequence.'],
- ['Chromosome 4', 'Albula goreensis ecotype Florida chromosome 4, whole genome shotgun sequence.'],
- ['Chromosome 4', 'Albula goreensis ecotype Florida chromosome 4, whole genome shotgun sequence.'],
- ['Chromosome 5', 'Albula goreensis ecotype Florida chromosome 5, whole genome shotgun sequence.'],
- ['Chromosome 5', 'Albula goreensis ecotype Florida chromosome 5, whole genome shotgun sequence.'],
- ['Chromosome 10', 'Albula goreensis ecotype Florida chromosome 10, whole genome shotgun sequence.'],
- ['Chromosome 10', 'Albula goreensis ecotype Florida chromosome 10, whole genome shotgun sequence.'],
- ['Chromosome 3', 'Albula goreensis ecotype Florida chromosome 3, whole genome shotgun sequence.'],
- ['Chromosome 3', 'Albula goreensis ecotype Florida chromosome 3, whole genome shotgun sequence.'],
- ['Chromosome 8', 'Albula goreensis ecotype Florida chromosome 8, whole genome shotgun sequence.'],
- ['Chromosome 8', 'Albula goreensis ecotype Florida chromosome 8, whole genome shotgun sequence.'],
- ['Chromosome 24', 'Albula goreensis ecotype Florida chromosome 24, whole genome shotgun sequence.'],
- ['Chromosome 24', 'Albula goreensis ecotype Florida chromosome 24, whole genome shotgun sequence.']]]]
-
+ expected_proteome_data = [['UP000829720', 85.71428571428571, 'full', '1534307', True, [['Chromosome 20', 'Albula goreensis ecotype Florida chromosome 20, whole genome shotgun sequence.'], ['Chromosome 20', 'Albula goreensis ecotype Florida chromosome 20, whole genome shotgun sequence.'], ['Chromosome 14', 'Albula goreensis ecotype Florida chromosome 14, whole genome shotgun sequence.'], ['Chromosome 14', 'Albula goreensis ecotype Florida chromosome 14, whole genome shotgun sequence.'], ['Chromosome 9', 'Albula goreensis ecotype Florida chromosome 9, whole genome shotgun sequence.'], ['Chromosome 9', 'Albula goreensis ecotype Florida chromosome 9, whole genome shotgun sequence.'], ['Chromosome 11', 'Albula goreensis ecotype Florida chromosome 11, whole genome shotgun sequence.'], ['Chromosome 11', 'Albula goreensis ecotype Florida chromosome 11, whole genome shotgun sequence.'], ['Chromosome 24', 'Albula goreensis ecotype Florida chromosome 24, whole genome shotgun sequence.'], ['Chromosome 24', 'Albula goreensis ecotype Florida chromosome 24, whole genome shotgun sequence.'], ['Chromosome 13', 'Albula goreensis ecotype Florida chromosome 13, whole genome shotgun sequence.'], ['Chromosome 13', 'Albula goreensis ecotype Florida chromosome 13, whole genome shotgun sequence.'], ['Chromosome 25', 'Albula goreensis ecotype Florida chromosome 25, whole genome shotgun sequence.'], ['Chromosome 25', 'Albula goreensis ecotype Florida chromosome 25, whole genome shotgun sequence.'], ['Chromosome 1', 'Albula goreensis ecotype Florida chromosome 1, whole genome shotgun sequence.'], ['Chromosome 1', 'Albula goreensis ecotype Florida chromosome 1, whole genome shotgun sequence.'], ['Chromosome 16', 'Albula goreensis ecotype Florida chromosome 16, whole genome shotgun sequence.'], ['Chromosome 16', 'Albula goreensis ecotype Florida chromosome 16, whole genome shotgun sequence.'], ['Chromosome 17', 'Albula goreensis ecotype Florida chromosome 17, whole genome shotgun sequence.'], ['Chromosome 17', 'Albula goreensis ecotype Florida chromosome 17, whole genome shotgun sequence.'], ['Chromosome 10', 'Albula goreensis ecotype Florida chromosome 10, whole genome shotgun sequence.'], ['Chromosome 10', 'Albula goreensis ecotype Florida chromosome 10, whole genome shotgun sequence.'], ['Chromosome 19', 'Albula goreensis ecotype Florida chromosome 19, whole genome shotgun sequence.'], ['Chromosome 19', 'Albula goreensis ecotype Florida chromosome 19, whole genome shotgun sequence.'], ['Chromosome 12', 'Albula goreensis ecotype Florida chromosome 12, whole genome shotgun sequence.'], ['Chromosome 12', 'Albula goreensis ecotype Florida chromosome 12, whole genome shotgun sequence.'], ['Chromosome 15', 'Albula goreensis ecotype Florida chromosome 15, whole genome shotgun sequence.'], ['Chromosome 15', 'Albula goreensis ecotype Florida chromosome 15, whole genome shotgun sequence.'], ['Chromosome 23', 'Albula goreensis ecotype Florida chromosome 23, whole genome shotgun sequence.'], ['Chromosome 23', 'Albula goreensis ecotype Florida chromosome 23, whole genome shotgun sequence.'], ['Unassembled WGS sequence', 'Albula goreensis'], ['Unassembled WGS sequence', 'Albula goreensis'], ['Chromosome 18', 'Albula goreensis ecotype Florida chromosome 18, whole genome shotgun sequence.'], ['Chromosome 18', 'Albula goreensis ecotype Florida chromosome 18, whole genome shotgun sequence.'], ['Chromosome 3', 'Albula goreensis ecotype Florida chromosome 3, whole genome shotgun sequence.'], ['Chromosome 3', 'Albula goreensis ecotype Florida chromosome 3, whole genome shotgun sequence.'], ['Chromosome 22', 'Albula goreensis ecotype Florida chromosome 22, whole genome shotgun sequence.'], ['Chromosome 22', 'Albula goreensis ecotype Florida chromosome 22, whole genome shotgun sequence.'], ['Chromosome 6', 'Albula goreensis ecotype Florida chromosome 6, whole genome shotgun sequence.'], ['Chromosome 6', 'Albula goreensis ecotype Florida chromosome 6, whole genome shotgun sequence.'], ['Chromosome 4', 'Albula goreensis ecotype Florida chromosome 4, whole genome shotgun sequence.'], ['Chromosome 4', 'Albula goreensis ecotype Florida chromosome 4, whole genome shotgun sequence.'], ['Chromosome 5', 'Albula goreensis ecotype Florida chromosome 5, whole genome shotgun sequence.'], ['Chromosome 5', 'Albula goreensis ecotype Florida chromosome 5, whole genome shotgun sequence.'], ['Chromosome 21', 'Albula goreensis ecotype Florida chromosome 21, whole genome shotgun sequence.'], ['Chromosome 21', 'Albula goreensis ecotype Florida chromosome 21, whole genome shotgun sequence.'], ['Chromosome 2', 'Albula goreensis ecotype Florida chromosome 2, whole genome shotgun sequence.'], ['Chromosome 2', 'Albula goreensis ecotype Florida chromosome 2, whole genome shotgun sequence.'], ['Chromosome 8', 'Albula goreensis ecotype Florida chromosome 8, whole genome shotgun sequence.'], ['Chromosome 8', 'Albula goreensis ecotype Florida chromosome 8, whole genome shotgun sequence.'], ['Chromosome 7', 'Albula goreensis ecotype Florida chromosome 7, whole genome shotgun sequence.'], ['Chromosome 7', 'Albula goreensis ecotype Florida chromosome 7, whole genome shotgun sequence.']]], ['UP000824540', 67.91208791208791, 'full', '121402', True, [['Unassembled WGS sequence', 'Albula glossodonta'], ['Unassembled WGS sequence', 'Albula glossodonta']]]]
proteomes, organism_ids, proteomes_data = sparql_query_proteomes('test', 54906, 'Albuliformes', 80, all_proteomes=None)
@@ -282,8 +229,10 @@ def test_find_non_reference_proteome_sparql_online():
assert set(expected_proteoems) == set(proteomes)
for organism in expected_organism_ids:
assert set(expected_organism_ids[organism]).issubset(set(organism_ids[organism]))
- for data in expected_proteome_data:
- assert data in proteomes_data
+ for data in proteomes_data:
+ if data[0] == 'UP000829720':
+ assert data[0:5] == expected_proteome_data[0][0:5]
+ assert sorted(data[5]) == sorted(expected_proteome_data[0][5])
def test_find_proteome_sparql_all_proteomes_online():
@@ -299,7 +248,9 @@ def test_find_proteome_sparql_all_proteomes_online():
for organism in expected_organism_ids:
assert set(expected_organism_ids[organism]).issubset(set(organism_ids[organism]))
for data in expected_proteome_data:
- assert data in proteomes_data
+ if data[0] == 'UP000036680':
+ assert data[0:5] == expected_proteome_data[0][0:5]
+ assert sorted(data[5]) == sorted(expected_proteome_data[0][5])
def test_find_proteomes_tax_ids_online():
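The two hunks above replace exact membership checks (assert data in proteomes_data) with an order-insensitive comparison: the proteome accession, BUSCO score, completeness level, taxon ID and reference flag (fields 0 to 4) are compared directly, while the chromosome/plasmid component list at index 5 is compared after sorting, because SPARQL can return components in a different order between runs. A minimal sketch of that comparison pattern, using a hypothetical helper name (assert_proteome_entry_matches) that does not exist in the test suite:

def assert_proteome_entry_matches(expected_entry, proteomes_data):
    # Find the returned entry with the same proteome accession (e.g. 'UP000829720').
    matching = [data for data in proteomes_data if data[0] == expected_entry[0]]
    assert matching, '{0} not found in query results'.format(expected_entry[0])
    data = matching[0]
    # Scalar fields (accession, score, completeness, taxon ID, reference flag) must match exactly.
    assert data[0:5] == expected_entry[0:5]
    # Components (chromosomes and plasmids) may come back in any order, so compare them sorted.
    assert sorted(data[5]) == sorted(expected_entry[5])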
diff --git a/test/test_workflow.py b/test/test_workflow_uniprot.py
similarity index 97%
rename from test/test_workflow.py
rename to test/test_workflow_uniprot.py
index 8018080..99e30cc 100644
--- a/test/test_workflow.py
+++ b/test/test_workflow_uniprot.py
@@ -8,7 +8,7 @@
from esmecata.report.workflow_create_report import run_create_workflow_report
RESULTS = {
- 'Cluster_1': {'proteomes': 2, 'protein_clusters': 603, 'GOs': 856, 'ECs': 313}
+ 'Cluster_1': {'proteomes': 2, 'protein_clusters': 603, 'GOs': 902, 'ECs': 313}
}
def test_workflow_online():