From 7488fb316c7f6710c3fd5df989797cf32a823193 Mon Sep 17 00:00:00 2001 From: Olivier Filangi Date: Thu, 24 Oct 2024 15:04:13 +0200 Subject: [PATCH] fix nb total abstract information --- llm_semantic_annotator/core.py | 13 ++++++++++--- .../misc/scientific_abstract_rdf_annotator.py | 2 +- llm_semantic_annotator/misc/stats.py | 11 +++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/llm_semantic_annotator/core.py b/llm_semantic_annotator/core.py index 0a5ce48..12a84f6 100644 --- a/llm_semantic_annotator/core.py +++ b/llm_semantic_annotator/core.py @@ -57,6 +57,9 @@ def main_populate_ncbi_taxon_tag_embeddings(config_all): def main_populate_abstract_embeddings(config_all): get_abstract_manager(config_all).manage_abstracts() +def get_doi_file(config_all): + return config_all['retention_dir']+"/total_doi.txt" + def main_compute_tag_chunk_similarities(config_all): """Fonction principale pour calculer la similarité entre tous les tags et chunks.""" tags_pth_files = get_owl_tag_manager(config_all).get_files_tags_embeddings() @@ -92,7 +95,7 @@ def main_compute_tag_chunk_similarities(config_all): ### ----------------------- keep_tag_embeddings = {} results_complete_similarities = {} - + total_doi = 0 for abstracts_pth_file in abstracts_pth_files: json_f = str(os.path.splitext(abstracts_pth_file)[0])+"_scores.json" if os.path.exists(json_f) : @@ -101,6 +104,7 @@ def main_compute_tag_chunk_similarities(config_all): chunk_embeddings = mem.load_filepth(abstracts_pth_file) for doi,res in mem.compare_tags_with_chunks(tag_embeddings, chunk_embeddings).items(): + total_doi+=1 if doi not in results_complete_similarities: results_complete_similarities[doi] = res for tag in res.keys(): @@ -119,6 +123,9 @@ def main_compute_tag_chunk_similarities(config_all): with open(json_f, "w") as fichier: json.dump(results_complete_similarities, fichier) + + with open(get_doi_file(config_all), "w") as fichier: + fichier.write(str(total_doi)) def get_scores_files(retention_dir): scores_files = [] @@ -170,14 +177,14 @@ def get_results_complete_similarities_and_tags_embedding(config_all): return results_complete_similarities,tag_embeddings def main_display_summary(config_all): - + doi_file = get_doi_file(config_all) results_complete_similarities,tag_embeddings = get_results_complete_similarities_and_tags_embedding(config_all) retention_dir = config_all['retention_dir'] if len(results_complete_similarities)>0: display_best_similarity_abstract_tag(results_complete_similarities,tag_embeddings,retention_dir) display_ontologies_summary(results_complete_similarities,tag_embeddings,retention_dir) - display_ontologies_distribution(results_complete_similarities,tag_embeddings) + display_ontologies_distribution(results_complete_similarities,tag_embeddings,doi_file) else: print("No results found") diff --git a/llm_semantic_annotator/misc/scientific_abstract_rdf_annotator.py b/llm_semantic_annotator/misc/scientific_abstract_rdf_annotator.py index dfa3fab..6dd9b4a 100644 --- a/llm_semantic_annotator/misc/scientific_abstract_rdf_annotator.py +++ b/llm_semantic_annotator/misc/scientific_abstract_rdf_annotator.py @@ -46,7 +46,7 @@ def create_rdf_graph(results_complete_similarities, abstracts_processed = len(results_complete_similarities) for doi, complete_similarities in results_complete_similarities.items(): - doi_uri = URIRef(urllib.parse.quote(f"https://doi.org/{doi}")) + doi_uri = URIRef("https://doi.org/"+urllib.parse.quote(doi)) for tag, similarity in complete_similarities.items(): tag_uri = URIRef(tag) annotation_node = BNode() diff --git a/llm_semantic_annotator/misc/stats.py b/llm_semantic_annotator/misc/stats.py index 67b8599..23c155a 100644 --- a/llm_semantic_annotator/misc/stats.py +++ b/llm_semantic_annotator/misc/stats.py @@ -4,9 +4,13 @@ from rich.panel import Panel from rich.text import Text -def display_ontologies_distribution(data, keep_tag_embeddings): +def display_ontologies_distribution(data, keep_tag_embeddings,total_doi_file): console = Console() + nb_doi = 0 + with open(total_doi_file, 'r') as file: + nb_doi = int(file.read()) + # Extract key prefixes ontologies = [] labels = [] @@ -21,14 +25,13 @@ def display_ontologies_distribution(data, keep_tag_embeddings): distributionLabels = Counter(labels) # General statistics - nb_abstracts = len(data) - nb_annotated = sum(1 for item in data.values() if len(item) > 0) + nb_annotated = len(data) total_labels = sum(distributionOntologies.values()) # Display general statistics console.print(Panel( f"[bold cyan]General Statistics[/bold cyan]\n" - f"Total number of abstracts: [green]{nb_abstracts}[/green]\n" + f"Total number of abstracts: [green]{nb_doi}[/green]\n" f"Number of annotated abstracts: [green]{nb_annotated}[/green]\n" f"Total number of labels used: [green]{total_labels}[/green]", title="Summary",