diff --git a/config/planteome-demo.json b/config/planteome-demo.json index 261c1ff..97151d1 100644 --- a/config/planteome-demo.json +++ b/config/planteome-demo.json @@ -1,6 +1,6 @@ { "encodeur" : "sentence-transformers/all-MiniLM-L6-v2", - "threshold_similarity_tag_chunk" : 0.65, + "threshold_similarity_tag_chunk" : 0.70, "threshold_similarity_tag" : 0.80, "batch_size" : 32, diff --git a/llm_semantic_annotator/tag/owl_tag_manager.py b/llm_semantic_annotator/tag/owl_tag_manager.py index 4d6c33f..6f3e14e 100644 --- a/llm_semantic_annotator/tag/owl_tag_manager.py +++ b/llm_semantic_annotator/tag/owl_tag_manager.py @@ -69,8 +69,7 @@ def get_ontologies(self,list_ontologies): return list_ontologies - def remove_prefix_tags(self,prefix_tag,text): - escaped_prefix = re.escape(prefix_tag.upper()) + def remove_prefix_tags(self,text): pattern = r'\([A-Z]+:\d+\)' v = re.sub(pattern, '', text) @@ -108,8 +107,16 @@ def build_corpus( len_properties = len(ontology_config['properties']) var_properties = ' '.join([ f"?prop{i}" for i in range(len_properties) ]) - query_properties = ' '.join([ f"?term {ontology_config['properties'][i]} ?prop{i} .\n" for i in range(len_properties) ]) - filter_properties = ' '.join([ f"FILTER(LANG(?prop{i}) = 'en' || LANG(?prop{i}) = '') .\n" for i in range(len_properties) ]) + + query_properties = "" + for i in range(len_properties): + query_properties += "OPTIONAL { "+f""" + ?term {ontology_config['properties'][i]} ?prop{i} . + FILTER(LANG(?prop{i}) = 'en' || LANG(?prop{i}) = '') . + """ + "}\n" + + filter_prefix = f"FILTER(STRSTARTS(STR(?term), '{ontology_config['prefix']}' )) .\n" + constraints_query = "" if 'constraints' in ontology_config: @@ -119,9 +126,9 @@ def build_corpus( query_base = """ SELECT ?term ?labelLeaf """+var_properties+""" WHERE { ?term """+ontology_config['label']+""" ?labelLeaf . + """+filter_prefix+""" FILTER(LANG(?labelLeaf) = "en" || LANG(?labelLeaf) = "") . """+query_properties+""" - """+filter_properties+""" """+constraints_query+""" } """ @@ -133,8 +140,11 @@ def build_corpus( nb_record=0 print(f"Ontology {ontology} NB RECORDS:{len(results)}") for row in tqdm(results): - - descriptionLeaf = '\n'.join([ row.get(prop.replace('?',''), '') for prop in var_properties.split(' ') ]) + #print(row) + descriptionLeaf = '\n'.join([ + str(row.get(prop.replace('?',''), '')) for prop in var_properties.split(' ') + ]) + #print("----") labelLeaf = row.labelLeaf descriptionLeaf = descriptionLeaf.strip() @@ -149,7 +159,7 @@ def build_corpus( 'ontology' : ontology, 'term': str(row.term), 'rdfs_label': labelLeaf, - 'description' : self.remove_prefix_tags(ontology,descriptionLeaf), + 'description' : self.remove_prefix_tags(descriptionLeaf), 'group': ontology_group_name })