Merge pull request #69 from arthur-schnitzler/main

updates

csae8092 authored Feb 15, 2024
2 parents 4755bdc + bcd4299, commit a20cbba

Showing 10 changed files with 134 additions and 73 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Create Coverage Report
        run: coverage xml
      - name: "Upload coverage to Codecov"
-       uses: codecov/codecov-action@v3
+       uses: codecov/codecov-action@v4
        with:
          token: ${{secrets.CODECOV_TOKEN}}
          file: ./coverage.xml
1 change: 1 addition & 0 deletions .gitignore
@@ -180,3 +180,4 @@ hansi.csv
 media/duplicated_*.csv
 Untitled.ipynb
 listevent.xml
+relations.csv
4 changes: 2 additions & 2 deletions apis_core/apis_entities/list_view_work.py
@@ -49,9 +49,9 @@

 class WorkListFilter(MyBaseFilter):
     name = django_filters.CharFilter(
-        lookup_expr="icontains",
+        method="name_label_filter",
         label="Werktitel",
-        help_text="eingegebene Zeichenkette muss im Titel enthalten sein",
+        help_text="eingegebene Zeichenkette muss im Titel oder in einem der Labels enthalten sein",
     )
     references = django_filters.CharFilter(
         lookup_expr="icontains",
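The filter now delegates to a custom `name_label_filter` method, which this hunk references but does not show. A minimal sketch of what such a method could look like, assuming a `Q`-object lookup over the name field and an attached-labels relation (both field paths are assumptions, not part of this commit):

```python
import django_filters
from django.db.models import Q


class MyBaseFilter(django_filters.FilterSet):
    def name_label_filter(self, queryset, name, value):
        # Hypothetical implementation: match the search string against
        # the work's title *or* any of its labels, as the updated
        # help_text promises ("... im Titel oder in einem der Labels ...").
        # The field paths are assumptions for illustration.
        return queryset.filter(
            Q(name__icontains=value) | Q(label__label__icontains=value)
        ).distinct()
```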
@@ -1,6 +1,7 @@
 import os
 import pandas as pd
 import recordlinkage
+from recordlinkage.compare import Geographic

 from typing import Any
 from django.conf import settings
@@ -18,11 +19,13 @@ def handle(self, *args: Any, **options: Any) -> str | None:
         props = [
             "id",
             "name",
+            "lat",
+            "lng"
         ]
         df = pd.DataFrame(
             Place.objects.values_list(*props),
             columns=props,
-        ).astype("str")
+        ).astype("str").fillna("nix")
         df["custom_index"] = df["id"].astype(str) + " " + df["name"]
         df.set_index("custom_index", inplace=True)
         indexer = recordlinkage.Index()
@@ -31,8 +34,10 @@ def handle(self, *args: Any, **options: Any) -> str | None:
         len(candidate_links)
         compare_cl = recordlinkage.Compare()
         compare_cl.exact("name", "name", label="name")
+        compare_cl.exact("lat", "lat", label="lat")
+        compare_cl.exact("lng", "lng", label="lng")
         features = compare_cl.compute(candidate_links, df)
-        matches = features[features.sum(axis=1) > 0]
+        matches = features[features.sum(axis=1) > 2]
         save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_places.csv")
         matches.to_csv(save_path)
         print(f"found {len(matches)} potential duplicates")
13 changes: 12 additions & 1 deletion apis_core/apis_relations/forms2.py
@@ -73,8 +73,19 @@ def save(self, site_instance, instance=None, commit=True):
         target = AbstractEntity.get_entity_class_of_name(self.rel_accessor[0])
         t1 = target.get_or_create_uri(cd["target"])
         setattr(x, self.rel_accessor[2], t1)
+        params = {
+            self.rel_accessor[3]: site_instance,
+            self.rel_accessor[2]: t1,
+            "start_date_written": cd["start_date_written"],
+            "end_date_written": cd["end_date_written"],
+            "relation_type_id": cd["relation_type"],
+        }
         if commit:
-            x.save()
+            qs = x.__class__.objects.filter(**params)
+            if qs.count() > 0:
+                pass
+            else:
+                x.save()
         return x

     def get_text_id(self):
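The new guard keeps `save()` from creating a second, identical relation: when a relation with the same endpoints, relation type, and written dates already exists, the write is silently skipped. A hypothetical restatement of that check in a more compact form, as a fragment that assumes `x`, `params`, and `commit` from the hunk above:

```python
if commit:
    # Only persist the relation when no identical one exists yet;
    # .exists() expresses the same guard without the count query.
    if not x.__class__.objects.filter(**params).exists():
        x.save()
return x
```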
Empty file.
72 changes: 72 additions & 0 deletions apis_core/apis_relations/management/commands/dump_relations.py
@@ -0,0 +1,72 @@
+import os
+import pandas as pd
+import recordlinkage
+
+from datetime import datetime
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+from icecream import ic
+from tqdm import tqdm
+from typing import Any
+from apis_core.apis_relations.models import AbstractRelation
+from dumper.utils import upload_files_to_owncloud, write_report
+
+
+class Command(BaseCommand):
+    help = "Dumps all relations into a csv"
+
+    def handle(self, *args: Any, **options: Any) -> str | None:
+        start_time = datetime.now().strftime(settings.PMB_TIME_PATTERN)
+        print("dumping all relations into a csv")
+
+        data = []
+        issues = []
+        for x in AbstractRelation.get_all_relation_classes():
+            print(x.__name__)
+            for y in tqdm(x.objects.all()):
+                try:
+                    item = y.get_web_object()
+                    item["relation_pk"] = y.id
+                    data.append(item)
+                except AttributeError:
+                    issues.append(y)
+        df = pd.DataFrame(data)
+        print("lets find and delete duplicated relations")
+        df.set_index("relation_pk", inplace=True, drop=False)
+        save_path = os.path.join(settings.MEDIA_ROOT, "relations.csv")
+        print(f"serialized {len(df)} relations")
+        df.to_csv(save_path, index=False)
+
+        df = pd.read_csv(save_path).fillna("nodate")
+        df.set_index("relation_pk", inplace=True, drop=False)
+        indexer = recordlinkage.Index()
+        indexer.block(["relation_type", "source_id", "target_id", "start_date", "end_date"])
+        duplicates = indexer.index(df)
+        print(f"deleting {len(duplicates)} duplicated relations")
+
+        deleted = []
+        for double in duplicates:
+            for x in AbstractRelation.get_all_relation_classes():
+                try:
+                    item = x.objects.get(id=double[1])
+                except:  # noqa
+                    continue
+                deleted.append(item.id)
+                item.delete()
+                break
+        print(deleted)
+        df.drop(deleted)
+        save_path = os.path.join(settings.MEDIA_ROOT, "relations.csv")
+        df.to_csv(save_path, index=False)
+        end_time = datetime.now().strftime(settings.PMB_TIME_PATTERN)
+        report = [os.path.basename(__file__), start_time, end_time]
+        write_report(report)
+        print(f"serialized {len(df)} relations")
+        files = list()
+        files.append(save_path)
+        try:
+            upload_files_to_owncloud(files)
+        except Exception as e:
+            ic(e)
+        return "done"
25 changes: 7 additions & 18 deletions apis_core/apis_relations/models.py
@@ -1,6 +1,7 @@
 import inspect
 import sys

+from icecream import ic
 from apis_core.apis_entities.models import Person
 from apis_core.apis_metainfo.models import TempEntityClass

@@ -45,28 +46,16 @@ def __str__(self):
         )

     def get_web_object(self):
-        namea = self.get_related_entity_instancea().name
-        nameb = self.get_related_entity_instanceb().name
-
-        if self.get_related_entity_classa() == Person:
-            namea += ", "
-            if self.get_related_entity_instancea().first_name is None:
-                namea += "-"
-            else:
-                namea += self.get_related_entity_instancea().first_name
-
-        if self.get_related_entity_classb() == Person:
-            nameb += ", "
-            if self.get_related_entity_instanceb().first_name is None:
-                nameb += "-"
-            else:
-                nameb += self.get_related_entity_instanceb().first_name
+        namea = self.get_related_entity_instancea()
+        nameb = self.get_related_entity_instanceb()

         result = {
             "relation_pk": self.pk,
             "relation_type": self.relation_type.name,
-            self.get_related_entity_field_namea(): namea,
-            self.get_related_entity_field_nameb(): nameb,
+            "source": namea,
+            "target": nameb,
+            "source_id": namea.id,
+            "target_id": nameb.id,
             "start_date": self.start_date_written,
             "end_date": self.end_date_written,
         }
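`get_web_object` now returns the related entity instances together with stable `source_id`/`target_id` fields, which are exactly the columns `dump_relations` blocks on. A hypothetical example of one serialized row (all values are invented; the instances end up as their `str()` representations once the DataFrame is written to CSV):

```python
# Invented example of what one row in relations.csv could look like.
row = {
    "relation_pk": 4711,
    "relation_type": "steht in Korrespondenz mit",
    "source": "Schnitzler, Arthur",  # str() of entity instance a
    "target": "Bahr, Hermann",       # str() of entity instance b
    "source_id": 2121,
    "target_id": 2003,
    "start_date": "1891",
    "end_date": "1931",
}
```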
3 changes: 3 additions & 0 deletions crontab
@@ -5,6 +5,9 @@
 1 3 * * * root cd /opt/app && /usr/local/bin/python3 manage.py wikipedia_minter >> /var/log/cron.log 2>&1
 30 3 * * * root cd /opt/app && /usr/local/bin/python3 manage.py wikidata_minter >> /var/log/cron.log 2>&1
 1 4 * * * root cd /opt/app && /usr/local/bin/python3 manage.py dump_entities >> /var/log/cron.log 2>&1
+30 5 * * * root cd /opt/app && /usr/local/bin/python3 manage.py dump_relations >> /var/log/cron.log 2>&1
+1 6 * * * root cd /opt/app && /usr/local/bin/python3 manage.py add_gn_feature_codes >> /var/log/cron.log 2>&1
+1 7 * * * root cd /opt/app && /usr/local/bin/python3 manage.py fetch_images >> /var/log/cron.log 2>&1
 30 7 * * * root cd /opt/app && /usr/local/bin/python3 manage.py find_duplicted_persons >> /var/log/cron.log 2>&1
 50 7 * * * root cd /opt/app && /usr/local/bin/python3 manage.py find_duplicted_places >> /var/log/cron.log 2>&1
 #
78 changes: 29 additions & 49 deletions issue__80_professions.ipynb
@@ -8,9 +8,17 @@
     "# run 2024-01-19 on production server"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "014098af",
+   "metadata": {},
+   "source": [
+    "## run again 2024-02-15"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "2fe08400",
    "metadata": {},
    "outputs": [],
@@ -21,52 +29,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "2b4ee9e3",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "200\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "df = gsheet_to_df(\"1MnS_eJbPNLzXp4YkS5I9Xkhv2GVPbYtrFiQjJla5rJE\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "f47bcd68",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "1518it [01:10, 27.09it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1513\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "1567it [01:12, 21.47it/s]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "for i, row in tqdm(df.iterrows()):\n",
+    "for i, row in tqdm(df.iterrows(), total=len(df)):\n",
     "    try:\n",
     "        pl = ProfessionType.objects.get(id=row[\"id\"])\n",
     "    except:\n",
@@ -79,21 +57,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "f820cd7f",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "1567it [01:34, 16.66it/s] \n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "delete_us = []\n",
-    "for i, row in tqdm(df.iterrows()):\n",
+    "for i, row in tqdm(df.iterrows(), total=len(df)):\n",
     "    if isinstance(row[\"duplicates\"], str):\n",
     "        pl = ProfessionType.objects.get(id=row[\"id\"])\n",
    "        ids = [int(x) for x in row[\"duplicates\"].split('|')]\n",
@@ -112,12 +82,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "fe06fca6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "for x in delete_us:\n",
+    "for x in tqdm(delete_us, total=len(delete_us)):\n",
     "    try:\n",
     "        x.delete()\n",
     "    except:\n",
@@ -130,6 +100,16 @@
    "id": "85c6f39d",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "delete_us"
+   ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab5120e2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
  ],
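Besides clearing the captured outputs, the notebook edits give every `tqdm` loop an explicit `total`, so progress bars over `df.iterrows()` show a percentage instead of a bare counter. A condensed sketch of the notebook's flow as a plain script; it assumes `gsheet_to_df` and `ProfessionType` from the cells above, and the merge logic that the diff truncates is indicated only by a comment:

```python
from tqdm import tqdm

df = gsheet_to_df("1MnS_eJbPNLzXp4YkS5I9Xkhv2GVPbYtrFiQjJla5rJE")

delete_us = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    if isinstance(row["duplicates"], str):
        pl = ProfessionType.objects.get(id=row["id"])
        ids = [int(x) for x in row["duplicates"].split("|")]
        # ... truncated in the diff: presumably re-attach objects from the
        # duplicates to pl and queue the duplicate ProfessionTypes here.

for x in tqdm(delete_us, total=len(delete_us)):
    try:
        x.delete()
    except Exception:
        pass  # already deleted or protected; skip
```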
