Merge pull request #69 from arthur-schnitzler/main

updates

csae8092 authored Feb 15, 2024
2 parents 4755bdc + bcd4299, commit a20cbba

Showing 10 changed files with 134 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Create Coverage Report
        run: coverage xml
      - name: "Upload coverage to Codecov"
-       uses: codecov/codecov-action@v3
+       uses: codecov/codecov-action@v4
        with:
          token: ${{secrets.CODECOV_TOKEN}}
          file: ./coverage.xml
1 change: 1 addition & 0 deletions .gitignore
@@ -180,3 +180,4 @@ hansi.csv
 media/duplicated_*.csv
 Untitled.ipynb
 listevent.xml
+relations.csv
4 changes: 2 additions & 2 deletions apis_core/apis_entities/list_view_work.py
@@ -49,9 +49,9 @@

 class WorkListFilter(MyBaseFilter):
     name = django_filters.CharFilter(
-        lookup_expr="icontains",
+        method="name_label_filter",
         label="Werktitel",
-        help_text="eingegebene Zeichenkette muss im Titel enthalten sein",
+        help_text="eingegebene Zeichenkette muss im Titel oder in einem der Labels enthalten sein",
     )
     references = django_filters.CharFilter(
         lookup_expr="icontains",
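The filter now delegates to a custom `name_label_filter` method, which this hunk references but does not show. A minimal sketch of what such a method could look like, assuming a `Q`-object lookup over the name field and an attached-labels relation (both field paths are assumptions, not part of this commit):

```python
import django_filters
from django.db.models import Q


class MyBaseFilter(django_filters.FilterSet):
    def name_label_filter(self, queryset, name, value):
        # Hypothetical implementation: match the search string against
        # the work's title *or* any of its labels, as the updated
        # help_text promises ("... im Titel oder in einem der Labels ...").
        # The field paths are assumptions for illustration.
        return queryset.filter(
            Q(name__icontains=value) | Q(label__label__icontains=value)
        ).distinct()
```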
@@ -1,6 +1,7 @@
 import os
 import pandas as pd
 import recordlinkage
+from recordlinkage.compare import Geographic

 from typing import Any
 from django.conf import settings
@@ -18,11 +19,13 @@ def handle(self, *args: Any, **options: Any) -> str | None:
         props = [
             "id",
             "name",
+            "lat",
+            "lng"
         ]
         df = pd.DataFrame(
             Place.objects.values_list(*props),
             columns=props,
-        ).astype("str")
+        ).astype("str").fillna("nix")
         df["custom_index"] = df["id"].astype(str) + " " + df["name"]
         df.set_index("custom_index", inplace=True)
         indexer = recordlinkage.Index()
@@ -31,8 +34,10 @@ def handle(self, *args: Any, **options: Any) -> str | None:
         len(candidate_links)
         compare_cl = recordlinkage.Compare()
         compare_cl.exact("name", "name", label="name")
+        compare_cl.exact("lat", "lat", label="lat")
+        compare_cl.exact("lng", "lng", label="lng")
         features = compare_cl.compute(candidate_links, df)
-        matches = features[features.sum(axis=1) > 0]
+        matches = features[features.sum(axis=1) > 2]
         save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_places.csv")
         matches.to_csv(save_path)
         print(f"found {len(matches)} potential duplicates")
13 changes: 12 additions & 1 deletion apis_core/apis_relations/forms2.py
@@ -73,8 +73,19 @@ def save(self, site_instance, instance=None, commit=True):
         target = AbstractEntity.get_entity_class_of_name(self.rel_accessor[0])
         t1 = target.get_or_create_uri(cd["target"])
         setattr(x, self.rel_accessor[2], t1)
+        params = {
+            self.rel_accessor[3]: site_instance,
+            self.rel_accessor[2]: t1,
+            "start_date_written": cd["start_date_written"],
+            "end_date_written": cd["end_date_written"],
+            "relation_type_id": cd["relation_type"],
+        }
         if commit:
-            x.save()
+            qs = x.__class__.objects.filter(**params)
+            if qs.count() > 0:
+                pass
+            else:
+                x.save()
         return x

     def get_text_id(self):
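The new guard keeps `save()` from creating a second, identical relation: when a relation with the same endpoints, relation type, and written dates already exists, the write is silently skipped. A hypothetical restatement of that check in a more compact form, as a fragment that assumes `x`, `params`, and `commit` from the hunk above:

```python
if commit:
    # Only persist the relation when no identical one exists yet;
    # .exists() expresses the same guard without the count query.
    if not x.__class__.objects.filter(**params).exists():
        x.save()
return x
```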
Empty file.
72 changes: 72 additions & 0 deletions apis_core/apis_relations/management/commands/dump_relations.py
@@ -0,0 +1,72 @@
+import os
+import pandas as pd
+import recordlinkage
+
+from datetime import datetime
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+from icecream import ic
+from tqdm import tqdm
+from typing import Any
+from apis_core.apis_relations.models import AbstractRelation
+from dumper.utils import upload_files_to_owncloud, write_report
+
+
+class Command(BaseCommand):
+    help = "Dumps all relations into a csv"
+
+    def handle(self, *args: Any, **options: Any) -> str | None:
+        start_time = datetime.now().strftime(settings.PMB_TIME_PATTERN)
+        print("dumping all relations into a csv")
+
+        data = []
+        issues = []
+        for x in AbstractRelation.get_all_relation_classes():
+            print(x.__name__)
+            for y in tqdm(x.objects.all()):
+                try:
+                    item = y.get_web_object()
+                    item["relation_pk"] = y.id
+                    data.append(item)
+                except AttributeError:
+                    issues.append(y)
+        df = pd.DataFrame(data)
+        print("lets find and delete duplicated relations")
+        df.set_index("relation_pk", inplace=True, drop=False)
+        save_path = os.path.join(settings.MEDIA_ROOT, "relations.csv")
+        print(f"serialized {len(df)} relations")
+        df.to_csv(save_path, index=False)
+
+        df = pd.read_csv(save_path).fillna("nodate")
+        df.set_index("relation_pk", inplace=True, drop=False)
+        indexer = recordlinkage.Index()
+        indexer.block(["relation_type", "source_id", "target_id", "start_date", "end_date"])
+        duplicates = indexer.index(df)
+        print(f"deleting {len(duplicates)} duplicated relations")
+
+        deleted = []
+        for double in duplicates:
+            for x in AbstractRelation.get_all_relation_classes():
+                try:
+                    item = x.objects.get(id=double[1])
+                except:  # noqa
+                    continue
+                deleted.append(item.id)
+                item.delete()
+                break
+        print(deleted)
+        df.drop(deleted)
+        save_path = os.path.join(settings.MEDIA_ROOT, "relations.csv")
+        df.to_csv(save_path, index=False)
+        end_time = datetime.now().strftime(settings.PMB_TIME_PATTERN)
+        report = [os.path.basename(__file__), start_time, end_time]
+        write_report(report)
+        print(f"serialized {len(df)} relations")
+        files = list()
+        files.append(save_path)
+        try:
+            upload_files_to_owncloud(files)
+        except Exception as e:
+            ic(e)
+        return "done"
25 changes: 7 additions & 18 deletions apis_core/apis_relations/models.py
@@ -1,6 +1,7 @@
 import inspect
 import sys

+from icecream import ic
 from apis_core.apis_entities.models import Person
 from apis_core.apis_metainfo.models import TempEntityClass

@@ -45,28 +46,16 @@ def __str__(self):
         )

     def get_web_object(self):
-        namea = self.get_related_entity_instancea().name
-        nameb = self.get_related_entity_instanceb().name
-
-        if self.get_related_entity_classa() == Person:
-            namea += ", "
-            if self.get_related_entity_instancea().first_name is None:
-                namea += "-"
-            else:
-                namea += self.get_related_entity_instancea().first_name
-
-        if self.get_related_entity_classb() == Person:
-            nameb += ", "
-            if self.get_related_entity_instanceb().first_name is None:
-                nameb += "-"
-            else:
-                nameb += self.get_related_entity_instanceb().first_name
+        namea = self.get_related_entity_instancea()
+        nameb = self.get_related_entity_instanceb()

         result = {
             "relation_pk": self.pk,
             "relation_type": self.relation_type.name,
-            self.get_related_entity_field_namea(): namea,
-            self.get_related_entity_field_nameb(): nameb,
+            "source": namea,
+            "target": nameb,
+            "source_id": namea.id,
+            "target_id": nameb.id,
             "start_date": self.start_date_written,
             "end_date": self.end_date_written,
         }
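`get_web_object` now returns the related entity instances together with stable `source_id`/`target_id` fields, which are exactly the columns `dump_relations` blocks on. A hypothetical example of one serialized row (all values are invented; the instances end up as their `str()` representations once the DataFrame is written to CSV):

```python
# Invented example of what one row in relations.csv could look like.
row = {
    "relation_pk": 4711,
    "relation_type": "steht in Korrespondenz mit",
    "source": "Schnitzler, Arthur",  # str() of entity instance a
    "target": "Bahr, Hermann",       # str() of entity instance b
    "source_id": 2121,
    "target_id": 2003,
    "start_date": "1891",
    "end_date": "1931",
}
```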
3 changes: 3 additions & 0 deletions crontab
@@ -5,6 +5,9 @@
 1 3 * * * root cd /opt/app && /usr/local/bin/python3 manage.py wikipedia_minter >> /var/log/cron.log 2>&1
 30 3 * * * root cd /opt/app && /usr/local/bin/python3 manage.py wikidata_minter >> /var/log/cron.log 2>&1
 1 4 * * * root cd /opt/app && /usr/local/bin/python3 manage.py dump_entities >> /var/log/cron.log 2>&1
+30 5 * * * root cd /opt/app && /usr/local/bin/python3 manage.py dump_relations >> /var/log/cron.log 2>&1
+1 6 * * * root cd /opt/app && /usr/local/bin/python3 manage.py add_gn_feature_codes >> /var/log/cron.log 2>&1
+1 7 * * * root cd /opt/app && /usr/local/bin/python3 manage.py fetch_images >> /var/log/cron.log 2>&1
 30 7 * * * root cd /opt/app && /usr/local/bin/python3 manage.py find_duplicted_persons >> /var/log/cron.log 2>&1
 50 7 * * * root cd /opt/app && /usr/local/bin/python3 manage.py find_duplicted_places >> /var/log/cron.log 2>&1
 #
78 changes: 29 additions & 49 deletions issue__80_professions.ipynb
@@ -8,9 +8,17 @@
     "# run 2024-01-19 on production server"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "014098af",
+   "metadata": {},
+   "source": [
+    "## run again 2024-02-15"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "2fe08400",
    "metadata": {},
    "outputs": [],
@@ -21,52 +29,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "2b4ee9e3",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "200\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "df = gsheet_to_df(\"1MnS_eJbPNLzXp4YkS5I9Xkhv2GVPbYtrFiQjJla5rJE\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "f47bcd68",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "1518it [01:10, 27.09it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1513\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "1567it [01:12, 21.47it/s]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "for i, row in tqdm(df.iterrows()):\n",
+    "for i, row in tqdm(df.iterrows(), total=len(df)):\n",
     "    try:\n",
     "        pl = ProfessionType.objects.get(id=row[\"id\"])\n",
     "    except:\n",
@@ -79,21 +57,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "f820cd7f",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "1567it [01:34, 16.66it/s] \n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "delete_us = []\n",
-    "for i, row in tqdm(df.iterrows()):\n",
+    "for i, row in tqdm(df.iterrows(), total=len(df)):\n",
     "    if isinstance(row[\"duplicates\"], str):\n",
     "        pl = ProfessionType.objects.get(id=row[\"id\"])\n",
    "        ids = [int(x) for x in row[\"duplicates\"].split('|')]\n",
@@ -112,12 +82,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "fe06fca6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "for x in delete_us:\n",
+    "for x in tqdm(delete_us, total=len(delete_us)):\n",
     "    try:\n",
     "        x.delete()\n",
     "    except:\n",
@@ -130,6 +100,16 @@
    "id": "85c6f39d",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "delete_us"
+   ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab5120e2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
  ],
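Besides clearing the captured outputs, the notebook edits give every `tqdm` loop an explicit `total`, so progress bars over `df.iterrows()` show a percentage instead of a bare counter. A condensed sketch of the notebook's flow as a plain script; it assumes `gsheet_to_df` and `ProfessionType` from the cells above, and the merge logic that the diff truncates is indicated only by a comment:

```python
from tqdm import tqdm

df = gsheet_to_df("1MnS_eJbPNLzXp4YkS5I9Xkhv2GVPbYtrFiQjJla5rJE")

delete_us = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    if isinstance(row["duplicates"], str):
        pl = ProfessionType.objects.get(id=row["id"])
        ids = [int(x) for x in row["duplicates"].split("|")]
        # ... truncated in the diff: presumably re-attach objects from the
        # duplicates to pl and queue the duplicate ProfessionTypes here.

for x in tqdm(delete_us, total=len(delete_us)):
    try:
        x.delete()
    except Exception:
        pass  # already deleted or protected; skip
```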
