From ff22dc3a041a7c3da9dd9f5cbdeef13bcb14fd4e Mon Sep 17 00:00:00 2001 From: csae8092 Date: Sun, 28 Jan 2024 09:06:35 +0100 Subject: [PATCH 1/6] closes #78 --- issue__78_rm_anno_uris.ipynb | 56 ++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 issue__78_rm_anno_uris.ipynb diff --git a/issue__78_rm_anno_uris.ipynb b/issue__78_rm_anno_uris.ipynb new file mode 100644 index 0000000..af01701 --- /dev/null +++ b/issue__78_rm_anno_uris.ipynb @@ -0,0 +1,56 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8cf560f5", + "metadata": {}, + "source": [ + "executed 2024-01-28" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fe08400", + "metadata": {}, + "outputs": [], + "source": [ + "uris = Uri.objects.filter(domain=\"anno\")\n", + "uris.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08522f35", + "metadata": {}, + "outputs": [], + "source": [ + "for x in uris:\n", + " x.delete()\n", + "Uri.objects.filter(domain=\"anno\").count()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Django Shell-Plus", + "language": "python", + "name": "django_extensions" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5a1941022e0b3dd4293c987972e8e07b84b7c1b6 Mon Sep 17 00:00:00 2001 From: csae8092 Date: Sun, 28 Jan 2024 09:07:54 +0100 Subject: [PATCH 2/6] bumped acdh-tei-pyutils --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c1bcbaa..21ce1e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ acdh-django-browsing acdh_geonames_utils acdh-id-reconciler>=0.2,<1 -acdh-tei-pyutils>=0.34,<1 +acdh-tei-pyutils>=1.1,<2 acdh-wikidata-pyutils==1.0 apis-override-select2js==0.1 Django>4.1,<6 From 66d466c2282c69b525d5c1f8cd32cf034b109f39 Mon Sep 17 00:00:00 2001 From: csae8092 Date: Sun, 28 Jan 2024 18:36:16 +0100 Subject: [PATCH 3/6] starting with #17 --- issue__17_dataimports.ipynb | 122 ++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 issue__17_dataimports.ipynb diff --git a/issue__17_dataimports.ipynb b/issue__17_dataimports.ipynb new file mode 100644 index 0000000..bf73998 --- /dev/null +++ b/issue__17_dataimports.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d7335a42", + "metadata": {}, + "outputs": [], + "source": [ + "from acdh_tei_pyutils.tei import TeiReader\n", + "from tqdm import tqdm\n", + "from icecream import ic\n", + "from normdata.utils import import_from_normdata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd68245d", + "metadata": {}, + "outputs": [], + "source": [ + "source_file = \"https://github.com/hermann-bahr/bahr-textverzeichnis-data/raw/main/data/indices/listperson.xml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "570aec42", + "metadata": {}, + "outputs": [], + "source": [ + "doc = TeiReader(source_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "070e76d4", + "metadata": {}, + "outputs": [], + "source": [ + "nsmap = doc.nsmap\n", + "bahr_url = \"https://hermanbahrtextverzeichnis/\"\n", + "bahr_domain = \"hermanbahrtextverzeichnis\"\n", + "bahr_col, _ = Collection.objects.get_or_create(name=\"Bahr Textverzeichnis\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6d54fa4", + "metadata": {}, + "outputs": [], + "source": [ + "no_gnd = set()\n", + "no_wikidata = set()\n", + "not_created = set()\n", + "for x in tqdm(doc.any_xpath(\".//tei:person\")):\n", + " entity = False\n", + " hbtv_uri = x.xpath(\".//tei:idno[@type='HB-tv']/text()\", namespaces=nsmap)[0]\n", + " hbtv_url = f\"{bahr_url}{hbtv_uri}\"\n", + " try:\n", + " gnd = x.xpath(\".//tei:idno[@type='gnd']/text()\", namespaces=nsmap)[0]\n", + " entity = import_from_normdata(gnd, 'person')\n", + " except IndexError:\n", + " no_gnd.add(hbtv_uri)\n", + " try:\n", + " wikidata = x.xpath(\".//tei:idno[@type='wikidata']/text()\", namespaces=nsmap)[0]\n", + " entity = import_from_normdata(gnd, 'person')\n", + " except IndexError:\n", + " no_wikidata.add(hbtv_uri)\n", + " not_created.add(hbtv_uri)\n", + " if entity:\n", + " entity.collection.add(bahr_col)\n", + " uri, _ = Uri.objects.get_or_create(uri=hbtv_url, domain=bahr_domain)\n", + " uri.entity = entity\n", + " uri.save()\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08ac5108", + "metadata": {}, + "outputs": [], + "source": [ + "len(not_created)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78f67f70", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Django Shell-Plus", + "language": "python", + "name": "django_extensions" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5c220cfa2e4386d5c580435412cfa376c68d6e93 Mon Sep 17 00:00:00 2001 From: csae8092 Date: Mon, 29 Jan 2024 10:29:23 +0100 Subject: [PATCH 4/6] wip #17 --- issue__17_dataimports.ipynb | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/issue__17_dataimports.ipynb b/issue__17_dataimports.ipynb index bf73998..6e51210 100644 --- a/issue__17_dataimports.ipynb +++ b/issue__17_dataimports.ipynb @@ -67,7 +67,8 @@ " no_gnd.add(hbtv_uri)\n", " try:\n", " wikidata = x.xpath(\".//tei:idno[@type='wikidata']/text()\", namespaces=nsmap)[0]\n", - " entity = import_from_normdata(gnd, 'person')\n", + " wikidata_url = f\"http://www.wikidata.org/entity/{wikidata}\"\n", + " entity = import_from_normdata(wikidata_url, 'person')\n", " except IndexError:\n", " no_wikidata.add(hbtv_uri)\n", " not_created.add(hbtv_uri)\n", @@ -95,6 +96,26 @@ "id": "78f67f70", "metadata": {}, "outputs": [], + "source": [ + "len(doc.any_xpath(\".//tei:person\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa7a2300", + "metadata": {}, + "outputs": [], + "source": [ + "len(doc.any_xpath(\".//tei:person\")) - len(not_created)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40ff7ca2", + "metadata": {}, + "outputs": [], "source": [] } ], From 122b79323e34f5c0cd4b5f5524a2de3bf19aaa18 Mon Sep 17 00:00:00 2001 From: csae8092 Date: Mon, 29 Jan 2024 11:54:15 +0100 Subject: [PATCH 5/6] done with listperson.xml #17 --- issue__17_dataimports.ipynb | 61 +++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/issue__17_dataimports.ipynb b/issue__17_dataimports.ipynb index 6e51210..b02171d 100644 --- a/issue__17_dataimports.ipynb +++ b/issue__17_dataimports.ipynb @@ -56,7 +56,7 @@ "no_gnd = set()\n", "no_wikidata = set()\n", "not_created = set()\n", - "for x in tqdm(doc.any_xpath(\".//tei:person\")):\n", + "# for x in tqdm(doc.any_xpath(\".//tei:person\")):\n", " entity = False\n", " hbtv_uri = x.xpath(\".//tei:idno[@type='HB-tv']/text()\", namespaces=nsmap)[0]\n", " hbtv_url = f\"{bahr_url}{hbtv_uri}\"\n", @@ -87,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "len(not_created)" + "print(\"no import entities without normdata records\")" ] }, { @@ -107,7 +107,45 @@ "metadata": {}, "outputs": [], "source": [ - "len(doc.any_xpath(\".//tei:person\")) - len(not_created)" + "for x in tqdm(doc.any_xpath(\".//tei:person\")):\n", + " if len(x.xpath(\".//tei:idno\", namespaces=nsmap)) == 1:\n", + " hbtv_uri = x.xpath(\".//tei:idno[@type='HB-tv']/text()\", namespaces=nsmap)[0]\n", + " hbtv_url = f\"{bahr_url}{hbtv_uri}\"\n", + " uri, _ = Uri.objects.get_or_create(uri=hbtv_url, domain=bahr_domain)\n", + " if uri.entity:\n", + " continue\n", + " else:\n", + " try:\n", + " name = x.xpath(\"./tei:persName[1]/tei:surname[1]/text()\", namespaces=nsmap)[0]\n", + " except IndexError:\n", + " name = \"\"\n", + " try:\n", + " first_name = x.xpath(\"./tei:persName[1]/tei:forename[1]/text()\", namespaces=nsmap)[0]\n", + " except IndexError:\n", + " first_name = \"\"\n", + " try:\n", + " start_date_written = x.xpath(\"./tei:birth/tei:date/text()\", namespaces=nsmap)[0]\n", + " except IndexError:\n", + " start_date_written = \"\"\n", + " try:\n", + " end_date_written = x.xpath(\"./tei:death/tei:date/text()\", namespaces=nsmap)[0]\n", + " except IndexError:\n", + " end_date_written = \"\"\n", + " try:\n", + " gender = x.xpath(\"./tei:sex\", namespaces=nsmap)[0]\n", + " gender = gender.attrib[\"value\"]\n", + " except IndexError:\n", + " gender = \"\"\n", + " entity = Person.objects.create(\n", + " name=name,\n", + " first_name=first_name,\n", + " start_date_written=start_date_written,\n", + " end_date_written=end_date_written,\n", + " gender=gender\n", + " )\n", + " entity.collection.add(bahr_col)\n", + " uri.entity = entity\n", + " uri.save()" ] }, { @@ -116,6 +154,23 @@ "id": "40ff7ca2", "metadata": {}, "outputs": [], + "source": [ + "for x in tqdm(doc.any_xpath(\".//tei:person[./tei:occupation]\")):\n", + " hbtv_uri = x.xpath(\".//tei:idno[@type='HB-tv']/text()\", namespaces=nsmap)[0]\n", + " hbtv_url = f\"{bahr_url}{hbtv_uri}\"\n", + " uri, _ = Uri.objects.get_or_create(uri=hbtv_url, domain=bahr_domain)\n", + " entity = uri.entity.get_child_entity()\n", + " for o in x.xpath(\"./tei:occupation/text()\", namespaces=nsmap):\n", + " profession, _ = ProfessionType.objects.get_or_create(name=o)\n", + " entity.profession.add(profession)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77e897d5", + "metadata": {}, + "outputs": [], "source": [] } ], From 29dec224299e887b9e76635f2187114db5c79c82 Mon Sep 17 00:00:00 2001 From: csae8092 Date: Mon, 29 Jan 2024 12:36:19 +0100 Subject: [PATCH 6/6] closes #125 and closes #124 --- .../autocomplete_light_registry.py | 249 ------------------ apis_core/apis_relations/forms2.py | 7 +- apis_core/apis_relations/models.py | 1 + apis_core/apis_vocabularies/admin.py | 2 +- 4 files changed, 3 insertions(+), 256 deletions(-) delete mode 100644 apis_core/apis_relations/autocomplete_light_registry.py diff --git a/apis_core/apis_relations/autocomplete_light_registry.py b/apis_core/apis_relations/autocomplete_light_registry.py deleted file mode 100644 index 1155ebb..0000000 --- a/apis_core/apis_relations/autocomplete_light_registry.py +++ /dev/null @@ -1,249 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -import operator -import re -from functools import reduce - -import autocomplete_light.shortcuts as al -import dateutil.parser -import requests -from django.db.models import Q - -from apis_core.apis_entities.models import Event, Institution, Person, Place, Work -from apis_core.apis_metainfo.models import Uri -from apis_core.apis_relations.models import PersonInstitution, PersonPerson, PersonPlace -from apis_core.default_settings.NER_settings import autocomp_settings as ac_settings - - -class StanbolAutocompleteBase(al.AutocompleteListTemplate): - autocomplete_template = "apis_templates/autocomplete/stanbol.html" - - widget_attrs = {} - - attrs = { - "data-autocomplete-minimum-characters": 3, - "placeholder": "Start typing to get suggestions", - "class": "autocomplete-ort-uri form-control", - } - - @staticmethod - def parse_stanbol_object(obj, key, *args): - if len(args) > 0: - lst1 = args[0] - else: - lst1 = None - if obj[1] == "GNDDate": - if lst1 is not None: - try: - return dateutil.parser.parse(lst1[key][0]["value"]) - except: - return lst1[key][0]["value"] - else: - return obj[0] - elif obj[1] == "String": - if lst1 is not None: - return lst1[key][0]["value"] - else: - return obj[0] - elif obj[1] == "gndLong": - if lst1 is not None: - try: - return re.search( - "Point \( [+-]([0-9\.]+) [+-]([0-9\.]+)", lst1[key][0]["value"] - ).group(1) - except: - print("extract fails") - return None - else: - print("no match") - - def choices_for_request(self): - ac_type = self.autocomplete_type - ac_type_model = self.autocomplete_type_model - choices = [] - headers = {"Content-Type": "application/json"} - q = self.request.GET.get("q") - for m in ac_type_model: - arg_list = [] - for mm in m[1]: - arg_list.append(Q(**{mm + "__icontains": q})) - res = m[0].objects.filter(reduce(operator.or_, arg_list)).distinct() - for r in res: - f = dict() - f["ac_type"] = "_".join(ac_type) - f["name"] = r - f["score"] = None - try: - f["uri"] = Uri.objects.filter(entity=r)[0].uri - except: - continue - f["source"] = "db" - if ac_type[0] == "Place": - if r.lng != None and r.lat != None: - f["long"] = str(r.lng) - f["lat"] = str(r.lat) - f["descr"] = m[2][0].format(*[getattr(r, s) for s in m[2][1]]) - choices.append(f) - - for o in ac_type: - for y in ac_settings[o]: - ldpath = "" - for d in y["fields"].keys(): - ldpath += "{} = <{}>;\n".format(d, y["fields"][d][0]) - data = {"limit": 20, "name": q, "ldpath": ldpath} - try: - r = requests.get(y["url"], params=data, headers=headers) - if r.status_code != 200: - choices.append({"name": "Connection to Stanbol failed"}) - continue - res = r.json() - except: - choices.append({"name": "Connection to Stanbol failed"}) - continue - for x in res["results"]: - f = dict() - name = x["name"][0]["value"] - score = str(x[ac_settings["score"]][0]["value"]) - id = x[ac_settings["uri"]] - f["ac_type"] = "_".join(ac_type) - f["name"] = name - f["score"] = score - f["uri"] = id - f["source"] = y["source"] - for field in y["fields"].keys(): - if field in x.keys(): - f[field] = self.parse_stanbol_object( - y["fields"][field], field, x - ) - else: - f[field] = None - choices.append(f) - return choices - - -class AddRelationBaseAutocomplete(al.AutocompleteListTemplate): - autocomplete_template = "apis_templates/autocomplete/AddRelation.html" - - widget_attrs = {} - - attrs = { - "data-autocomplete-minimum-characters": 3, - "placeholder": "Start typing to get suggestions", - "class": "autocomplete-add-relation form-control", - } - - def choices_for_request(self): - q = self.request.GET.get("q", None) - instance_pk = self.request.GET.get("instance_pk", None) - choices = [] - model_name = self.model2.__name__.lower() - if instance_pk and q: - instance = self.model2.objects.get(pk=instance_pk) - else: - return choices - for rel in self.relations: - if "related_" + model_name in dir(rel): - dd = rel.objects.filter( - **{ - "related_" + model_name: instance, - "relation_type__name__icontains": q, - } - ).exclude(annotation__isnull=False) - choices.extend(dd) - elif "related_" + model_name + "A" in dir(rel): - choices.extend( - rel.objects.filter( - Q( - **{ - "related_" + model_name + "A": instance, - "relation_type__name__icontains": q, - } - ) - | Q( - **{ - "related_" + model_name + "B": instance, - "relation_type__name__icontains": q, - } - ) - ) - .distinct() - .exclude(annotation__isnull=False) - ) - return choices - - -class PlaceAutocomplete(StanbolAutocompleteBase): - autocomplete_type = [ - "Place", - ] - autocomplete_type_model = [ - (Place, ["name", "label__label"], ("Status: {}", ["status"])), - ] - - -class InstitutionAutocomplete(StanbolAutocompleteBase): - autocomplete_type = [ - "Institution", - ] - autocomplete_type_model = [ - ( - Institution, - ["name", "label__label"], - ("Status: {}, Gründungsdatum: {}", ["status", "start_date_written"]), - ), - ] - - -class PersonAutocomplete(StanbolAutocompleteBase): - autocomplete_type = [ - "Person", - ] - autocomplete_type_model = [ - ( - Person, - ["name", "first_name", "label__label"], - ( - "Geburtsdatum: {}, Sterbedatum: {}", - ["start_date_written", "end_date_written"], - ), - ) - ] - - -class EventAutocomplete(StanbolAutocompleteBase): - autocomplete_type = [ - "Event", - ] - autocomplete_type_model = [ - ( - Event, - ["name", "label__label"], - ("Start date: {}, Status: {}", ["start_date", "status"]), - ), - ] - - -class WorkAutocomplete(StanbolAutocompleteBase): - autocomplete_type = [ - "Work", - ] - autocomplete_type_model = [ - ( - Work, - ["name", "label__label"], - ("Start date: {}, Status: {}", ["start_date", "status"]), - ), - ] - - -class AddRelationPersonHighlighterAutocomplete(AddRelationBaseAutocomplete): - relations = [PersonPerson, PersonPlace, PersonInstitution] - model2 = Person - - -al.register(PlaceAutocomplete) -al.register(InstitutionAutocomplete) -al.register(PersonAutocomplete) -al.register(EventAutocomplete) -al.register(WorkAutocomplete) -al.register(AddRelationPersonHighlighterAutocomplete) diff --git a/apis_core/apis_relations/forms2.py b/apis_core/apis_relations/forms2.py index c5d099c..7582b8c 100644 --- a/apis_core/apis_relations/forms2.py +++ b/apis_core/apis_relations/forms2.py @@ -15,15 +15,12 @@ from apis_core.apis_entities.fields import ListSelect2 from apis_core.apis_entities.models import AbstractEntity -# from dal.autocomplete import ListSelect2 from apis_core.apis_metainfo.models import TempEntityClass, Uri from apis_core.apis_relations.models import AbstractRelation from apis_core.helper_functions import DateParser from .tables import get_generic_relations_table -# from dal.autocomplete import ListSelect2 - def validate_target_autocomplete(value): try: @@ -130,7 +127,7 @@ def __init__(self, siteID=None, highlighter=False, *args, **kwargs): """ attrs = { "data-placeholder": "Type to get suggestions", - "data-minimum-input-length": getattr(settings, "APIS_MIN_CHAR", 3), + "data-minimum-input-length": 0, "data-html": True, "style": "width: 100%", } @@ -262,7 +259,6 @@ def __init__(self, siteID=None, highlighter=False, *args, **kwargs): self.fields["relation_type"] = autocomplete.Select2ListCreateChoiceField( label="Relation type", widget=ListSelect2( - # url='/vocabularies/autocomplete/{}{}relation/reverse'.format(lst_src_target[0].lower(), lst_src_target[1].lower()), url=reverse( "apis:apis_vocabularies:generic_vocabularies_autocomplete", args=[ @@ -282,7 +278,6 @@ def __init__(self, siteID=None, highlighter=False, *args, **kwargs): self.fields["target"] = autocomplete.Select2ListCreateChoiceField( label=lst_src_target[0], widget=ListSelect2( - # url='/entities/autocomplete/{}'.format(lst_src_target[0].lower()), url=reverse( "apis:apis_entities:generic_entities_autocomplete", args=[lst_src_target[0].lower()], diff --git a/apis_core/apis_relations/models.py b/apis_core/apis_relations/models.py index 3f29c9b..851ff09 100644 --- a/apis_core/apis_relations/models.py +++ b/apis_core/apis_relations/models.py @@ -22,6 +22,7 @@ class AbstractRelation(TempEntityClass): class Meta: abstract = True default_manager_name = "objects" + ordering = ["start_date", "id"] def save(self, *args, **kwargs): if ( diff --git a/apis_core/apis_vocabularies/admin.py b/apis_core/apis_vocabularies/admin.py index ef8073a..65b5b63 100644 --- a/apis_core/apis_vocabularies/admin.py +++ b/apis_core/apis_vocabularies/admin.py @@ -33,7 +33,7 @@ def save_model(self, request, obj, form, change): def formfield_for_foreignkey(self, db_field, request, **kwargs): attrs = { "data-placeholder": "Type to get suggestions", - "data-minimum-input-length": getattr(settings, "APIS_MIN_CHAR", 3), + "data-minimum-input-length": 0, "data-html": True, } c_name = db_field.model.__name__