diff --git a/examples/entities/example0.json b/examples/entities/example0.json index 7ef10bb..1523708 100644 --- a/examples/entities/example0.json +++ b/examples/entities/example0.json @@ -1,30 +1,35 @@ { - "id": "EXP-1888-01-09-a-i0035", + "ci_id": "EXP-1888-01-09-a-i0035", "ts": "2019-10-17T11:49:50Z", - "sys_id": "bert-fr", + "model_id": "bert-fr", "nes": [ { "type": "org.adm", "surface": "Société suisse du Grutli", "lOffset": 32, "rOffset": 56, - "id": "EXP-1888-01-09-a-i0035:32:56:org.adm:bert-fr", - "name": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + "name": "Société suisse du Grutli", "wkd_id": "Q683672", "wkpedia_pagename": "Société_du_Grütli", - "confidence": "medium" + "wkpedia_url": "https://fr.wikipedia.org/wiki/Soci%C3%A9t%C3%A9_du_Gr%C3%BCtli", + "confidence_ner": 50.45, + "confidence_nel": 50.45, + "id": "EXP-1888-01-09-a-i0035:32:56:org.adm:bert-fr" }, { "type": "pers.ind", - "surface": "Bovat, mécanicien à Echallens", + "surface": "Mr. Bovat, mécanicien à Echallens", "lOffset": 156, "rOffset": 178, - "id": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr", - "name": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr", - "function": "mécanicien à Echallens", "wkd_id": "NIL", "wkpedia_pagename": "NIL", - "confidence": "high" + "wkpedia_url": "N/A", + "confidence_ner": 50.45, + "confidence_nel": 50.45, + "name": "Bovat", + "title": "Mr.", + "function": "mécanicien à Echallens", + "id": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr" } ] } diff --git a/examples/entities/example1.json b/examples/entities/example1.json index cccf567..cdf00c6 100644 --- a/examples/entities/example1.json +++ b/examples/entities/example1.json @@ -1,30 +1,34 @@ { - "id": "EXP-1928-05-15-a-i0009", + "ci_id": "EXP-1928-05-15-a-i0009", "ts": "2019-10-17T11:49:50Z", - "sys_id": "bert-fr", + "model_id": "bert-fr", "nes": [ { + "id": "EXP-1928-05-15-a-i0009:50:99:pers.ind:bert-fr", "type": "pers.ind", "surface": "M. 
Wou, ancien ministre du gouvernement cantonais", "lOffset": 50, "rOffset": 99, - "id": "EXP-1928-05-15-a-i0009:50:99:loc.adm.nat:bert-fr", "name": "Wou", "title": "M.", "function": "ancien ministre du gouvernement cantonais", "wkd_id": "NIL", "wkpedia_pagename": "NIL", - "confidence": "medium" + "wkpedia_url": "N/A", + "confidence_ner": 50.45, + "confidence_nel": 50.45 }, { + "id": "EXP-1928-05-15-a-i0009:32:42:loc.adm.nat:bert-fr", "type": "loc.adm.nat", "surface": "Etats-Unis", "lOffset": 32, "rOffset": 42, - "id": "EXP-1888-01-09-a-i0035:32:42:loc.adm.nat:bert-fr", "wkd_id": "Q30", "wkpedia_pagename": "États-Unis", - "confidence": "medium" + "wkpedia_url": "https://fr.wikipedia.org/wiki/%C3%89tats-Unis", + "confidence_ner": 50.45, + "confidence_nel": 50.45 } ] } \ No newline at end of file diff --git a/examples/entities/example2.json b/examples/entities/example2.json index 3f640b6..20ca320 100644 --- a/examples/entities/example2.json +++ b/examples/entities/example2.json @@ -1,29 +1,33 @@ { - "id": "EXP-1968-02-23-a-i0262", + "ci_id": "EXP-1968-02-23-a-i0262", "ts": "2019-10-17T11:49:50Z", - "sys_id": "bert-fr", + "model_id": "bert-fr", "nes": [ { "type": "pers.ind", "surface": "championne de France de ski nautique, Sylvie Maurial", "lOffset": 50, "rOffset": 102, - "id": "EXP-1968-02-23-a-i0262:50:102:loc.adm.nat:bert-fr", + "confidence_ner": 50.45, + "confidence_nel": 50.45, + "wkd_id": "Q20993704", + "wkpedia_pagename": "Sylvie_Maurial", + "wkpedia_url": "https://fr.wikipedia.org/wiki/Sylvie_Maurial", "name": "Sylvie Maurial", "function": "championne de France de ski nautique", - "wkd_id": "Q20993704", - "wkpedia_pagename": "Sylvie_Maurial" + "id": "EXP-1968-02-23-a-i0262:50:102:pers.ind:bert-fr" }, { "type": "loc.adm.nat", "surface": "France", "lOffset": 64, "rOffset": 70, - "id": "EXP-1888-01-09-a-i0035:64:70:loc.adm.nat:bert-fr", - "nested": true, - "wkd_id": "Q20993704", + "wkd_id": "Q142", "wkpedia_pagename": "France", - "confidence": "medium" + "wkpedia_url": 
"https://fr.wikipedia.org/wiki/France", + "confidence_ner": 50.45, + "confidence_nel": 50.45, + "id": "EXP-1888-01-09-a-i0035:64:70:loc.adm.nat:bert-fr" } ] } diff --git a/json/entities/entities-backup.schema.json b/json/entities/entities-backup.schema.json new file mode 100644 index 0000000..7b7515c --- /dev/null +++ b/json/entities/entities-backup.schema.json @@ -0,0 +1,144 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://impresso.github.io/impresso-schemas/json/entities.schema.json", + "title": "Named Entity JSON Schema", + "description": "Definition of the output representation of entity processing, before indexing. Named entity mentions are expressed as offline annotations with character offsets relative to content items. Essentially, the NE output is a list of JSON documents (in json line format), where each document corresponds to a content item that has a list of NE mentions (no output for CI with no mentions). The tagset corresponds to impresso-HIPE.", + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "impresso content item id." + }, + "ts": { + "type": "string", + "description": "Timestamp of creation of the JSON file (e.g. '2018-09-18T08:00:08Z')" + }, + "sys_id": { + "type": "string", + "description": "An alias for the system or model that produced this output (preferably short, but still understandable), used for transparency and traceability. Should be unique and thus include elements that distinguish one model from another, such as a base name, a version, the language, e.g. bert-xxxx-xxxx-fr." 
+ }, + "nes": { + "type": "array", + "description": "The list of named entity mentions identified in the document", + "minItems": 1, + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description":"NE type", + "enum": [ + "building", + "loc", + "loc.add", + "loc.add.elec", + "loc.add.phys", + "loc.adm", + "loc.adm.nat", + "loc.adm.reg", + "loc.adm.sup", + "loc.adm.town", + "loc.admin.sup", + "loc.fac", + "loc.oro", + "loc.phys", + "loc.phys.astro", + "loc.phys.geo", + "loc.phys.hydro", + "loc.unk", + "org", + "org.adm", + "org.ent", + "org.ent.pressagency", + "per", + "per.author", + "pers", + "pers.coll", + "pers.ind", + "pers.ind.articleauthor", + "prod", + "prod.doctr", + "prod.media", + "street", + "time", + "time.date.abs" + ] + }, + "surface": { + "type": "string", + "description":"The (string) surface of the named entity mention, as it appears in the text" + }, + "name": { + "type": "string", + "description":"In case of a person mention, the entity component of type name." + }, + "lOffset": { + "type": "integer", + "description":"The left character offset of the named entity with respect to the content item, as in the rebuilt format." + }, + "rOffset": { + "type": "integer", + "description":"The right character offset of the named entity with respect to the content item, as in the rebuilt format." + }, + "firstname": { + "type": "string", + "description":"In case of a person mention and if available, the first name." + }, + "surname": { + "type": "string", + "description":"In case of a person mention and if available, the surname." + }, + "title": { + "type": "string", + "description":"In case of a person mention, the entity component of type 'title'." + }, + "function": { + "type": "string", + "description":"In case of a person mention, the entity component of type 'function'." + }, + "demonym": { + "type": "string", + "description":"In case of a person mention, the entity component of type 'demonym'." 
+ }, + "nested": { + "type": "boolean", + "description":"In case of a nested mention, this property should be set to true. Can be ignored if not." + }, + "wkd_id": { + "type": "string", + "description":"If exists, wikidata QID" + }, + "wkpedia_pagename": { + "type": "string", + "description":"If exists, wikipedia page name or, if not possible, wikipedia URL, in the language the NE recognition is made (e.g. page name 'Etats-Unis' if EL performed against French wikipedia, and 'United_States' is against English Wikipedia. " + }, + "confidence": { + "type": "string", + "enum": [ + "low", + "medium", + "high" + ] + }, + "id": { + "type": "string", + "description":"The id of the named entity mention composed of the following set of values concatenated with a colon (':') : content item id + loffset + roffset + type + sys_id (e.g. 'LLE-1989-04-04-a-i0195:56:69:person:bert-xxxx-xxxx-fr'." + } + }, + "required": [ + "type", + "surface", + "lOffset", + "rOffset", + "id" + ] + } + } + }, + "required": [ + "id", + "ts", + "sys_id", + "nes" + ] +} \ No newline at end of file diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 7b7515c..f11b0d1 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -5,43 +5,52 @@ "description": "Definition of the output representation of entity processing, before indexing. Named entity mentions are expressed as offline annotations with character offsets relative to content items. Essentially, the NE output is a list of JSON documents (in json line format), where each document corresponds to a content item that has a list of NE mentions (no output for CI with no mentions). The tagset corresponds to impresso-HIPE.", "type": "object", "properties": { - "id": { + "ci_id": { "type": "string", - "description": "impresso content item id." + "description": "Impresso content item ID." + }, + "ci_type": + { + "type": "string", + "description": "Impresso content item type." 
}, "ts": { "type": "string", - "description": "Timestamp of creation of the JSON file (e.g. '2018-09-18T08:00:08Z')" + "description": "Timestamp of creation of the JSON file (e.g. '2024-05-26T09:48:01Z')." }, - "sys_id": { + "model_id": { "type": "string", - "description": "An alias for the system or model that produced this output (preferably short, but still understandable), used for transparency and traceability. Should be unique and thus include elements that distinguish one model from another, such as a base name, a version, the language, e.g. bert-xxxx-xxxx-fr." + "description": "An alias for the system or model that produced this output, used for transparency and traceability. It should include distinguishing elements like a base name, version, and language." }, "nes": { + "id": { + "type": "string", + "description": "The unique identifier of the named entity mention: [Document ID]:[Left Offset]:[Right Offset]:[Entity Type]:[NER Model]|[NEL Model]" + }, "type": "array", - "description": "The list of named entity mentions identified in the document", - "minItems": 1, + "description": "The list of named entity mentions identified in the document.", + "minItems": 0, "items": { "type": "object", "properties": { "type": { "type": "string", - "description":"NE type", + "description": "NE type (coarse-grained and fine-grained).", "enum": [ - "building", + "comp.demonym", + "comp.function", + "comp.name", + "comp.qualifier", + "comp.title", "loc", - "loc.add", "loc.add.elec", "loc.add.phys", - "loc.adm", "loc.adm.nat", "loc.adm.reg", "loc.adm.sup", "loc.adm.town", - "loc.admin.sup", "loc.fac", "loc.oro", - "loc.phys", "loc.phys.astro", "loc.phys.geo", "loc.phys.hydro", @@ -50,8 +59,6 @@ "org.adm", "org.ent", "org.ent.pressagency", - "per", - "per.author", "pers", "pers.coll", "pers.ind", @@ -59,78 +66,95 @@ "prod", "prod.doctr", "prod.media", - "street", "time", - "time.date.abs" + "time.date.abs", + "time.hour.abs", + "org.ent.pressagency.Reuters", + 
"org.ent.pressagency.Stefani", + "org.ent.pressagency.Extel", + "org.ent.pressagency.Havas", + "org.ent.pressagency.Xinhua", + "org.ent.pressagency.Domei", + "org.ent.pressagency.Belga", + "org.ent.pressagency.CTK", + "org.ent.pressagency.ANSA", + "org.ent.pressagency.DNB", + "pers.ind.articleauthor", + "org.ent.pressagency.Wolff", + "org.ent.pressagency.unk", + "org.ent.pressagency.UP-UPI", + "org.ent.pressagency.ATS-SDA", + "org.ent.pressagency.DPA", + "org.ent.pressagency.AFP", + "pers.ind.articleauthor", + "org.ent.pressagency.Kipa", + "org.ent.pressagency.ag", + "org.ent.pressagency.Extel", + "org.ent.pressagency.ATS-SDA", + "org.ent.pressagency.Havas", + "org.ent.pressagency.Reuters", + "org.ent.pressagency.Xinhua", + "org.ent.pressagency.AP", + "org.ent.pressagency.APA", + "org.ent.pressagency.ANSA", + "org.ent.pressagency.DDP-DAPD", + "org.ent.pressagency.TASS", + "org.ent.pressagency.Europapress", + "org.ent.pressagency.SPK-SMP", + "unk" ] }, "surface": { "type": "string", - "description":"The (string) surface of the named entity mention, as it appears in the text" - }, - "name": { - "type": "string", - "description":"In case of a person mention, the entity component of type name." + "description": "The surface form of the named entity mention, as it appears in the text." }, "lOffset": { "type": "integer", - "description":"The left character offset of the named entity with respect to the content item, as in the rebuilt format." + "description": "The left character offset of the named entity with respect to the content item." }, "rOffset": { "type": "integer", - "description":"The right character offset of the named entity with respect to the content item, as in the rebuilt format." - }, - "firstname": { - "type": "string", - "description":"In case of a person mention and if available, the first name." + "description": "The right character offset of the named entity with respect to the content item." 
}, - "surname": { - "type": "string", - "description":"In case of a person mention and if available, the surname." + "confidence_ner": { + "type": "number", + "description": "Confidence score of the Named Entity Recognition process." }, - "title": { - "type": "string", - "description":"In case of a person mention, the entity component of type 'title'." + "confidence_nel": { + "type": "number", + "description": "Confidence score of the Named Entity Linking process." }, - "function": { + "wkd_id": { "type": "string", - "description":"In case of a person mention, the entity component of type 'function'." + "description": "Wikidata QID if available." }, - "demonym": { + "wkpedia_pagename": { "type": "string", - "description":"In case of a person mention, the entity component of type 'demonym'." + "description": "Wikipedia page name, i.e. the last part of the wikipedia URL (e.g. United_States)" }, - "nested": { - "type": "boolean", - "description":"In case of a nested mention, this property should be set to true. Can be ignored if not." - }, - "wkd_id": { + "wkpedia_url": { "type": "string", - "description":"If exists, wikidata QID" + "description": "Wikipedia page URL, e.g. https://en.wikipedia.org/wiki/United_States" }, - "wkpedia_pagename": { + "name": { "type": "string", - "description":"If exists, wikipedia page name or, if not possible, wikipedia URL, in the language the NE recognition is made (e.g. page name 'Etats-Unis' if EL performed against French wikipedia, and 'United_States' is against English Wikipedia. " + "description":"In case of a person mention, the entity component of type 'name', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750)." }, - "confidence": { + "title": { "type": "string", - "enum": [ - "low", - "medium", - "high" - ] + "description":"In case of a person mention, the entity component of type 'title', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). 
" }, - "id": { + "function": { "type": "string", - "description":"The id of the named entity mention composed of the following set of values concatenated with a colon (':') : content item id + loffset + roffset + type + sys_id (e.g. 'LLE-1989-04-04-a-i0195:56:69:person:bert-xxxx-xxxx-fr'." + "description":"In case of a person mention, the entity component of type 'function', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). " } }, "required": [ "type", "surface", "lOffset", "rOffset", "id" ] } } @@ -138,7 +162,7 @@ "required": [ - "id", + "ci_id", "ts", - "sys_id", + "model_id", "nes" ] } \ No newline at end of file