From f026c2b4e06d48e53a64319d4e9b4d477e9effe5 Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Fri, 8 Nov 2024 00:55:33 +0000 Subject: [PATCH 1/3] simplified curie logic --- .../linker/src/main/java/LinkerPass2.java | 73 +++++-------------- .../annotators/ShortFormAnnotator.java | 39 +--------- 2 files changed, 22 insertions(+), 90 deletions(-) diff --git a/dataload/linker/src/main/java/LinkerPass2.java b/dataload/linker/src/main/java/LinkerPass2.java index 6c511429f..68dc92576 100644 --- a/dataload/linker/src/main/java/LinkerPass2.java +++ b/dataload/linker/src/main/java/LinkerPass2.java @@ -144,6 +144,8 @@ private static void writeEntityArray(JsonReader jsonReader, JsonWriter jsonWrite Set stringsInEntity = new HashSet(); String entityIri = null; + EntityDefinitionSet defOfThisEntity = pass1Result.iriToDefinitions.get(entityIri); + while(jsonReader.peek() != JsonToken.END_OBJECT) { String name = jsonReader.nextName(); @@ -153,17 +155,26 @@ private static void writeEntityArray(JsonReader jsonReader, JsonWriter jsonWrite if(name.equals("iri")) { entityIri = jsonReader.nextString(); jsonWriter.value(entityIri); - } else if (name.equalsIgnoreCase("curie")) { - processCurieObject(jsonReader, jsonWriter, pass1Result, entityIri); - } else if (name.equalsIgnoreCase("shortForm")) { - processShortFormObject(jsonReader, jsonWriter, pass1Result, entityIri); - } else { - CopyJsonGatheringStrings.copyJsonGatheringStrings(jsonReader, jsonWriter, stringsInEntity); + continue; } - } + if(name == "curie") { + if(defOfThisEntity.definingDefinitions.size() > 0) { + // always use the defining ontology's curie, as the defining + // ontology knows the base URI and we might not + // + com.google.gson.internal.Streams.write( + defOfThisEntity.definingDefinitions.iterator().next().curie, + jsonWriter); + continue; + } else { + // fallback to using the curie we already have + } + } + + CopyJsonGatheringStrings.copyJsonGatheringStrings(jsonReader, jsonWriter, stringsInEntity); + } - EntityDefinitionSet defOfThisEntity = pass1Result.iriToDefinitions.get(entityIri); if(defOfThisEntity != null) { jsonWriter.name(IS_DEFINING_ONTOLOGY.getText()); @@ -476,50 +487,4 @@ private static void processShortFormObject(JsonReader jsonReader, JsonWriter jso jsonWriter.name("value").value(shortFormObject.get("value").getAsString()); jsonWriter.endObject(); } - - private static void processCurieObject(JsonReader jsonReader, JsonWriter jsonWriter, LinkerPass1.LinkerPass1Result pass1Result, String entityIri) throws IOException { - jsonReader.beginObject(); - JsonObject curieObject = new JsonObject(); - - while (jsonReader.peek() != JsonToken.END_OBJECT) { - String curieFieldName = jsonReader.nextName(); - if (curieFieldName.equals("type")) { - JsonArray typeArray = new JsonArray(); - jsonReader.beginArray(); - while (jsonReader.peek() != JsonToken.END_ARRAY) { - typeArray.add(jsonReader.nextString()); - } - jsonReader.endArray(); - curieObject.add("type", typeArray); - } else if (curieFieldName.equals("value")) { - String curieValue = jsonReader.nextString(); - // Modify the value attribute - curieValue = getProcessedCurieValue(pass1Result, entityIri); - curieObject.addProperty("value", curieValue); - } - } - jsonReader.endObject(); - - // Write the modified curie object - jsonWriter.beginObject(); - jsonWriter.name("type"); - jsonWriter.beginArray(); - for (JsonElement typeElement : curieObject.getAsJsonArray("type")) { - jsonWriter.value(typeElement.getAsString()); - } - jsonWriter.endArray(); - jsonWriter.name("value").value(curieObject.get("value").getAsString()); - jsonWriter.endObject(); - } - - private static String getProcessedCurieValue(LinkerPass1.LinkerPass1Result pass1Result, String entityIri) { - var def = pass1Result.iriToDefinitions.get(entityIri); - if (def.definitions.iterator().hasNext()) { - JsonObject defCurieObject = def.definitions.iterator().next().curie.getAsJsonObject(); - if (defCurieObject.has("value")) { - return defCurieObject.get("value").getAsString(); - } - } - return ""; - } } diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java index fa4f2002f..60c2400fd 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java @@ -36,36 +36,7 @@ public static void annotateShortForms(OntologyGraph graph) { preferredPrefix = graph.config.get("id").toString().toUpperCase(); } - String shortForm = extractShortForm(graph, ontologyBaseUris, preferredPrefix, c.uri); - - /* - CURIEs are formed by following rules: - If there is only one underscore "_" AND the characters before the underscore are PreferredPrefix then replace the underscore with colon ":" - If there is only one underscore "_" AND the characters after the underscore are numbers then replace the underscore with colon ":" - If there is only one underscore "_" and the characters after the underscore are not just numbers then just keep the curie same as shortform - If there are multiple underscore but has only digits after the last underscore then the code replaces the last underscore with a colon - */ - String curie; - // Pattern for: single underscore, prefix matches preferredPrefix - String preferredPrefixPattern = "^(?:" + Pattern.quote(preferredPrefix) + ")_([^_]+)$"; - // Pattern for: single underscore, suffix is all digits - String singleUnderscoreDigitsPattern = "^[^_]+_(\\d+)$"; - // Pattern for: multiple underscores, suffix is all digits - String multipleUnderscoresDigitsPattern = "^(.*)_(\\d+)$"; - if (shortForm.matches(preferredPrefixPattern)) { - curie = shortForm.replaceFirst("_", ":"); - } else if (shortForm.matches(singleUnderscoreDigitsPattern)) { - curie = shortForm.replaceFirst("_", ":"); - } else if (shortForm.matches(multipleUnderscoresDigitsPattern)) { - // Multiple underscores, suffix is digits - // Replace the last underscore with a colon - curie = shortForm.replaceFirst("_(?=\\d+$)", ":"); - } else { - // No transformation needed - curie = shortForm; - } - - c.properties.addProperty("shortForm", PropertyValueLiteral.fromString(shortForm)); + String curie = extractCurie(graph, ontologyBaseUris, preferredPrefix, c.uri); c.properties.addProperty("curie", PropertyValueLiteral.fromString(curie)); } } @@ -75,20 +46,16 @@ public static void annotateShortForms(OntologyGraph graph) { } - private static String extractShortForm(OntologyGraph graph, Set ontologyBaseUris, String preferredPrefix, + private static String extractCurie(OntologyGraph graph, Set ontologyBaseUris, String preferredPrefix, String uri) { if (uri.startsWith("urn:")) { return uri.substring(4); } - // if(uri.startsWith("http://purl.obolibrary.org/obo/")) { - // return uri.substring("http://purl.obolibrary.org/obo/".length()); - // } - for (String baseUri : ontologyBaseUris) { if (uri.startsWith(baseUri) && preferredPrefix != null) { - return preferredPrefix + "_" + uri.substring(baseUri.length()); + return preferredPrefix + ":" + uri.substring(baseUri.length()); } } From c58e8736d4b221081d7bafa16d56c9856dda14af Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Fri, 8 Nov 2024 01:05:19 +0000 Subject: [PATCH 2/3] move shortform to the linker --- .../linker/src/main/java/LinkerPass2.java | 62 ++++++------------- .../uk/ac/ebi/rdf2json/OntologyGraph.java | 4 +- ...FormAnnotator.java => CurieAnnotator.java} | 8 +-- .../rdf2json/annotators/LabelAnnotator.java | 2 +- 4 files changed, 25 insertions(+), 51 deletions(-) rename dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/{ShortFormAnnotator.java => CurieAnnotator.java} (86%) diff --git a/dataload/linker/src/main/java/LinkerPass2.java b/dataload/linker/src/main/java/LinkerPass2.java index 68dc92576..6c2878ed4 100644 --- a/dataload/linker/src/main/java/LinkerPass2.java +++ b/dataload/linker/src/main/java/LinkerPass2.java @@ -146,6 +146,14 @@ private static void writeEntityArray(JsonReader jsonReader, JsonWriter jsonWrite EntityDefinitionSet defOfThisEntity = pass1Result.iriToDefinitions.get(entityIri); + String curie = null; + if(defOfThisEntity.definingDefinitions.size() > 0) { + // always use the defining ontology's curie, as the defining + // ontology knows the base URI and we might not + // + curie = defOfThisEntity.definingDefinitions.iterator().next().curie.getAsString(); + } + while(jsonReader.peek() != JsonToken.END_OBJECT) { String name = jsonReader.nextName(); @@ -159,22 +167,23 @@ private static void writeEntityArray(JsonReader jsonReader, JsonWriter jsonWrite } if(name == "curie") { - if(defOfThisEntity.definingDefinitions.size() > 0) { - // always use the defining ontology's curie, as the defining - // ontology knows the base URI and we might not - // - com.google.gson.internal.Streams.write( - defOfThisEntity.definingDefinitions.iterator().next().curie, - jsonWriter); - continue; + if(curie != null) { + // use the defining ontology curie + jsonWriter.value(curie); } else { - // fallback to using the curie we already have + // fallthrough to using the curie from rdf2json + curie = jsonReader.nextString(); + jsonWriter.value(curie); } + continue; } CopyJsonGatheringStrings.copyJsonGatheringStrings(jsonReader, jsonWriter, stringsInEntity); } + jsonWriter.name("shortForm"); + jsonWriter.value(curie.replaceFirst(":", "_")); + if(defOfThisEntity != null) { jsonWriter.name(IS_DEFINING_ONTOLOGY.getText()); @@ -452,39 +461,4 @@ private static class CurieMapResult { public String url; public String source; } - - private static void processShortFormObject(JsonReader jsonReader, JsonWriter jsonWriter, LinkerPass1.LinkerPass1Result pass1Result, String entityIri) throws IOException { - jsonReader.beginObject(); - JsonObject shortFormObject = new JsonObject(); - - while (jsonReader.peek() != JsonToken.END_OBJECT) { - String shortFormFieldName = jsonReader.nextName(); - if (shortFormFieldName.equals("type")) { - JsonArray typeArray = new JsonArray(); - jsonReader.beginArray(); - while (jsonReader.peek() != JsonToken.END_ARRAY) { - typeArray.add(jsonReader.nextString()); - } - jsonReader.endArray(); - shortFormObject.add("type", typeArray); - } else if (shortFormFieldName.equals("value")) { - String shortFormValue = jsonReader.nextString(); - // Modify the value attribute - shortFormValue = getProcessedCurieValue(pass1Result, entityIri).replace(":", "_"); - shortFormObject.addProperty("value", shortFormValue); - } - } - jsonReader.endObject(); - - // Write the modified short form object - jsonWriter.beginObject(); - jsonWriter.name("type"); - jsonWriter.beginArray(); - for (JsonElement typeElement : shortFormObject.getAsJsonArray("type")) { - jsonWriter.value(typeElement.getAsString()); - } - jsonWriter.endArray(); - jsonWriter.name("value").value(shortFormObject.get("value").getAsString()); - jsonWriter.endObject(); - } } diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java index 659583938..3595acf98 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java @@ -262,14 +262,14 @@ private String urlToFilename(String url) { HierarchicalParentsAnnotator.annotateHierarchicalParents(this); // must run after RelatedAnnotator AncestorsAnnotator.annotateAncestors(this); HierarchyMetricsAnnotator.annotateHierarchyMetrics(this); // must run after HierarchicalParentsAnnotator - ShortFormAnnotator.annotateShortForms(this); + CurieAnnotator.annotateCuries(this); DefinitionAnnotator.annotateDefinitions(this); SynonymAnnotator.annotateSynonyms(this); ReifiedPropertyAnnotator.annotateReifiedProperties(this); OntologyMetadataAnnotator.annotateOntologyMetadata(this); HierarchyFlagsAnnotator.annotateHierarchyFlags(this); // must run after DirectParentsAnnotator and HierarchicalParentsAnnotator IsObsoleteAnnotator.annotateIsObsolete(this); - LabelAnnotator.annotateLabels(this); // must run after ShortFormAnnotator + LabelAnnotator.annotateLabels(this); // must run after CurieAnnotator ConfigurablePropertyAnnotator.annotateConfigurableProperties(this); PreferredRootsAnnotator.annotatePreferredRoots(this); DisjointWithAnnotator.annotateDisjointWith(this); diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/CurieAnnotator.java similarity index 86% rename from dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java rename to dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/CurieAnnotator.java index 60c2400fd..926adf93a 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/CurieAnnotator.java @@ -10,10 +10,10 @@ import uk.ac.ebi.rdf2json.annotators.helpers.OntologyBaseUris; import uk.ac.ebi.rdf2json.properties.PropertyValueLiteral; -public class ShortFormAnnotator { - private static final Logger logger = LoggerFactory.getLogger(ShortFormAnnotator.class); +public class CurieAnnotator { + private static final Logger logger = LoggerFactory.getLogger(CurieAnnotator.class); - public static void annotateShortForms(OntologyGraph graph) { + public static void annotateCuries(OntologyGraph graph) { long startTime3 = System.nanoTime(); @@ -41,7 +41,7 @@ public static void annotateShortForms(OntologyGraph graph) { } } long endTime3 = System.nanoTime(); - logger.info("annotate short forms: {}", ((endTime3 - startTime3) / 1000 / 1000 / 1000)); + logger.info("annotate curies: {}", ((endTime3 - startTime3) / 1000 / 1000 / 1000)); } diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/LabelAnnotator.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/LabelAnnotator.java index 9373f76ba..61eacb6ef 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/LabelAnnotator.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/LabelAnnotator.java @@ -37,7 +37,7 @@ public static Set getLabelProperties(OntologyGraph graph) { } public static void annotateLabels(OntologyGraph graph) { - collateProperties(graph, "label", getLabelProperties(graph), List.of("shortForm")); + collateProperties(graph, "label", getLabelProperties(graph), List.of("curie")); } private static void collateProperties(OntologyGraph graph, String destProp, Collection sourceProps, Collection fallbackProps) { From 758b2db88e754842264454e7129d2e622fd0809c Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Fri, 8 Nov 2024 01:35:12 +0000 Subject: [PATCH 3/3] skip json values when using defining ontology curie --- dataload/linker/src/main/java/LinkerPass2.java | 1 + 1 file changed, 1 insertion(+) diff --git a/dataload/linker/src/main/java/LinkerPass2.java b/dataload/linker/src/main/java/LinkerPass2.java index 6c2878ed4..698e1f561 100644 --- a/dataload/linker/src/main/java/LinkerPass2.java +++ b/dataload/linker/src/main/java/LinkerPass2.java @@ -169,6 +169,7 @@ private static void writeEntityArray(JsonReader jsonReader, JsonWriter jsonWrite if(name == "curie") { if(curie != null) { // use the defining ontology curie + jsonReader.skipValue(); jsonWriter.value(curie); } else { // fallthrough to using the curie from rdf2json