Skip to content

Commit

Permalink
Merge pull request #41 from arthur-schnitzler/95-scrape-wikidata-uris…
Browse files Browse the repository at this point in the history
…-for-geonames

mint wikidata for geonames
  • Loading branch information
csae8092 authored Jan 14, 2024
2 parents 795f1f1 + dc0f3f0 commit 8ed2788
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 37 deletions.
3 changes: 3 additions & 0 deletions apis_core/apis_entities/list_view_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from apis_core.apis_entities.models import Event
from apis_core.apis_entities.base_filter import MyBaseFilter
from apis_core.apis_metainfo.models import Collection
from apis_core.apis_vocabularies.models import (
EventEventRelation,
EventType,
Expand Down Expand Up @@ -87,6 +88,7 @@ class EventListFilter(MyBaseFilter):
url="/apis/vocabularies/autocomplete/eventtype/normal/",
),
)
collection = django_filters.ModelChoiceFilter(queryset=Collection.objects.all())

def related_work_filter(self, qs, name, value):
rels = get_child_classes(
Expand Down Expand Up @@ -142,6 +144,7 @@ def __init__(self, *args, **kwargs):
"name",
"kind",
"year_of_creation",
"collection",
css_id="more",
),
AccordionGroup(
Expand Down
3 changes: 3 additions & 0 deletions apis_core/apis_entities/list_view_institution.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from dal import autocomplete

from apis_core.apis_entities.models import Institution
from apis_core.apis_metainfo.models import Collection
from apis_core.apis_entities.base_filter import MyBaseFilter
from apis_core.apis_vocabularies.models import (
InstitutionEventRelation,
Expand Down Expand Up @@ -93,6 +94,7 @@ class InstitutionListFilter(MyBaseFilter):
url="/apis/vocabularies/autocomplete/institutiontype/normal/",
),
)
collection = django_filters.ModelChoiceFilter(queryset=Collection.objects.all())

def related_event_filter(self, qs, name, value):
rels = get_child_classes(
Expand Down Expand Up @@ -161,6 +163,7 @@ def __init__(self, *args, **kwargs):
"name",
"kind",
"year_of_creation",
"collection",
css_id="more",
),
AccordionGroup(
Expand Down
3 changes: 3 additions & 0 deletions apis_core/apis_entities/list_view_person.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from apis_core.apis_entities.models import Person
from apis_core.apis_entities.base_filter import MyBaseFilter
from apis_core.apis_metainfo.models import Collection
from apis_core.apis_vocabularies.models import (
PersonInstitutionRelation,
PersonPersonRelation,
Expand Down Expand Up @@ -99,6 +100,7 @@ class PersonListFilter(MyBaseFilter):
help_text="Name einer Institution und die Art des Beziehung, z.B. 'Znanie' und 'besitzt'",
method="related_institution_filter",
)
collection = django_filters.ModelChoiceFilter(queryset=Collection.objects.all())

def related_work_filter(self, qs, name, value):
rels = get_child_classes(
Expand Down Expand Up @@ -163,6 +165,7 @@ def __init__(self, *args, **kwargs):
"gender",
"birth_year",
"death_year",
"collection",
css_id="more",
),
AccordionGroup(
Expand Down
3 changes: 3 additions & 0 deletions apis_core/apis_entities/list_view_place.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from apis_core.apis_entities.models import Place
from apis_core.apis_entities.base_filter import MyBaseFilter
from apis_core.apis_metainfo.models import Collection
from apis_core.apis_vocabularies.models import (
PersonPlaceRelation,
PlaceType,
Expand Down Expand Up @@ -65,6 +66,7 @@ class PlaceListFilter(MyBaseFilter):
url="/apis/vocabularies/autocomplete/placetype/normal/",
),
)
collection = django_filters.ModelChoiceFilter(queryset=Collection.objects.all())

def related_work_filter(self, qs, name, value):
rels = get_child_classes(
Expand Down Expand Up @@ -112,6 +114,7 @@ def __init__(self, *args, **kwargs):
"Beziehungen",
"related_with_person",
"related_with_work",
"collection",
css_id="admin_search",
),
)
Expand Down
3 changes: 3 additions & 0 deletions apis_core/apis_entities/list_view_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from apis_core.apis_entities.models import Work
from apis_core.apis_entities.base_filter import MyBaseFilter
from apis_core.apis_metainfo.models import Collection
from apis_core.apis_vocabularies.models import (
InstitutionWorkRelation,
PersonWorkRelation,
Expand Down Expand Up @@ -76,6 +77,7 @@ class WorkListFilter(MyBaseFilter):
url="/apis/vocabularies/autocomplete/worktype/normal/",
),
)
collection = django_filters.ModelChoiceFilter(queryset=Collection.objects.all())

def related_work_filter(self, qs, name, value):
rels = get_child_classes(
Expand Down Expand Up @@ -117,6 +119,7 @@ def __init__(self, *args, **kwargs):
"name",
"kind",
"year_of_creation",
"collection",
css_id="more",
),
AccordionGroup(
Expand Down
81 changes: 44 additions & 37 deletions dumper/management/commands/wikidata_minter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import time
from datetime import datetime

from acdh_id_reconciler import gnd_to_wikidata
from acdh_id_reconciler import gnd_to_wikidata, geonames_to_gnd
from AcdhArcheAssets.uri_norm_rules import get_normalized_uri
from django.conf import settings
from django.core.management.base import BaseCommand
Expand All @@ -16,43 +16,50 @@ class Command(BaseCommand):
help = "mint WikiData IDs for GND-URIs"

def handle(self, *args, **kwargs):
    """Mint Wikidata URIs for entities that only carry a GND or GeoNames URI.

    The URI types "d-nb.info" and "geonames" are processed one after the
    other. For each type, at most ``LIMIT`` matching URIs (ordered by id)
    are looked up. A successful lookup attaches the normalized Wikidata URI
    to the entity; a failed lookup adds the entity to the
    "No WikiData-ID found" collection, which excludes it from later runs.
    One report (script name, start time, end time) is written per URI type.
    """
    LIMIT = 100
    USER_AGENT_PMB = "pmb (https://pmb.acdh.oeaw.ac.at)"
    col, _ = Collection.objects.get_or_create(name="No WikiData-ID found")

    def pending_uris(uri_type):
        # URIs of the given type whose entity has no Wikidata URI yet and
        # has not been put into the "No WikiData-ID found" collection.
        # Used for both the initial and the final count so that the two
        # numbers are computed the same way.
        ents = (
            TempEntityClass.objects.filter(uri__uri__icontains=uri_type)
            .exclude(uri__uri__icontains="wikidata")
            .exclude(collection=col)
        )
        return Uri.objects.filter(entity__in=ents).filter(
            uri__icontains=uri_type
        )

    for uri_type in ("d-nb.info", "geonames"):
        print(f"processing URIS with type: {uri_type}")
        start_time = datetime.now().strftime(settings.PMB_TIME_PATTERN)
        uris_to_process = pending_uris(uri_type)
        print(
            f"All in all {uris_to_process.count()} {uri_type}-Entities without Wikidata"
        )

        # Materialize the batch so the progress bar total is the real batch
        # size, which may be smaller than LIMIT.
        batch = list(uris_to_process.order_by("id")[:LIMIT])
        for x in tqdm(batch):
            # Pause between lookups (presumably to go easy on the remote
            # query service).
            time.sleep(1)
            ent = x.entity
            try:
                if uri_type == "d-nb.info":
                    results = gnd_to_wikidata(x.uri, USER_AGENT_PMB)
                else:
                    # NOTE(review): the name suggests this lookup may also
                    # require a GND match; confirm whether a plain
                    # GeoNames -> Wikidata lookup is what is intended here.
                    results = geonames_to_gnd(x.uri, USER_AGENT_PMB)
            except Exception as e:
                # NOTE(review): any failure, including transient ones,
                # flags the entity and excludes it from future runs.
                print(x, ent.id, e)
                ent.collection.add(col)
                continue
            wd_url = get_normalized_uri(results["wikidata"])
            wd_uri, _ = Uri.objects.get_or_create(uri=wd_url)
            wd_uri.entity = ent
            wd_uri.domain = "wikidata"
            wd_uri.save()

        msg = f"{pending_uris(uri_type).count()} left"
        print(msg)
        end_time = datetime.now().strftime(settings.PMB_TIME_PATTERN)
        report = [os.path.basename(__file__), start_time, end_time]
        write_report(report)

0 comments on commit 8ed2788

Please sign in to comment.