Skip to content

Commit

Permalink
entity: better replace_identified_by
Browse files Browse the repository at this point in the history
* Corrects handling of rero only and not found identifiers.

Co-Authored-by: Peter Weber <[email protected]>
  • Loading branch information
rerowep committed Oct 25, 2023
1 parent d62590f commit 057647d
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 28 deletions.
16 changes: 10 additions & 6 deletions rero_ils/modules/entities/remote_entities/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,15 @@ def replace_identified_by_cli(field, dry_run, verbose, log_dir):
fg='green'
)
if verbose:
if replace_identified_by.not_found:
if replace_identified_by._error_count(
replace_identified_by.not_found):
click.secho('Not found:', fg='yellow')
for pid, data in replace_identified_by.not_found.items():
click.echo(f'\t{pid}: {data}')
if replace_identified_by.rero_only:
for etype, values in replace_identified_by.not_found.items():
for pid, data in values.items():
click.echo(f'\t{etype} {pid}: {data}')
if replace_identified_by._error_count(
replace_identified_by.rero_only):
click.secho('RERO only:', fg='yellow')
for pid, data in replace_identified_by.rero_only.items():
click.echo(f'\t{pid}: {data}')
for etype, values in replace_identified_by.rero_only.items():
for pid, data in values.items():
click.echo(f'\t{pid}: {data}')
38 changes: 24 additions & 14 deletions rero_ils/modules/entities/remote_entities/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import requests
from flask import current_app
from sqlalchemy.orm.exc import NoResultFound

from rero_ils.modules.documents.api import Document, DocumentsSearch
from rero_ils.modules.utils import get_mef_url, get_timestamp, \
Expand Down Expand Up @@ -157,13 +158,15 @@ def _do_entity(self, entity, doc_pid):
"""
changed = False
doc_entity_type = entity['entity']['type']
self.not_found.setdefault(doc_entity_type, {})
self.rero_only.setdefault(doc_entity_type, {})
if mef_type := self.entity_types.get(doc_entity_type):
source_pid = entity['entity']['identifiedBy']['value']
source = entity['entity']['identifiedBy']['type'].lower()
identifier = f'{source}:{source_pid}'
if (
identifier in self.not_found or
identifier in self.rero_only
identifier in self.not_found[doc_entity_type] or
identifier in self.rero_only[doc_entity_type]
):
# MEF was not found previously. Do not try it again.
return None
Expand Down Expand Up @@ -200,16 +203,16 @@ def _do_entity(self, entity, doc_pid):
f'{doc_entity_type} != {mef_entity_type} '
f': "{authorized_access_point}"'
)
self.rero_only[identifier] = info
self.rero_only[doc_entity_type][identifier] = info
self.logger.warning(
f'Type differ:{doc_pid} '
f'{self.field} - ({mef_type}) {identifier} {info}'
)
else:
authorized_access_point = mef_data.get(
source, {}).get('authorized_access_point')
info = f'{doc_entity_type}: {authorized_access_point}'
self.rero_only[identifier] = info
info = f'{authorized_access_point}'
self.rero_only[doc_entity_type][identifier] = info
self.logger.info(
f'No other source found for document:{doc_pid} '
f'{self.field} - ({mef_type}|{doc_entity_type}) '
Expand All @@ -218,12 +221,14 @@ def _do_entity(self, entity, doc_pid):
else:
authorized_access_point = entity[
'entity']['authorized_access_point']
info = f'{doc_entity_type}: {authorized_access_point}'
self.not_found[identifier] = info
info = f'{authorized_access_point}'
self.not_found[doc_entity_type][identifier] = info
self.logger.info(
f'No MEF found for document:{doc_pid} '
f' - ({mef_type}) {identifier} "{info}"'
)
self.not_found = {k: v for k, v in self.not_found.items() if v}
self.rero_only = {k: v for k, v in self.rero_only.items() if v}
return changed

def _replace_entities_in_document(self, doc_id):
Expand All @@ -232,7 +237,7 @@ def _replace_entities_in_document(self, doc_id):
:param doc_id: (string) document id
"""
changed = False
with contextlib.suppress(Exception):
with contextlib.suppress(NoResultFound):
doc = Document.get_record(doc_id)
entities_to_update = filter(
lambda c: c.get('entity', {}).get('identifiedBy'),
Expand All @@ -248,6 +253,10 @@ def _replace_entities_in_document(self, doc_id):
if changed:
return doc

def _error_count(self, counter_dict):
"""Summ of error count."""
return sum(len(values) for values in counter_dict.values())

def run(self):
"""Replace identifiedBy with $ref."""
self.changed = 0
Expand All @@ -256,16 +265,17 @@ def run(self):
self.logger.info(
f'Found {self.field} identifiedBy: {self.count()}')
query = self.query \
.params(preserve_order=True) \
.sort({'_created': {'order': 'asc'}}) \
.source(['pid', self.field])
.params(preserve_order=True) \
.sort({'_created': {'order': 'asc'}}) \
.source(['pid', self.field])
for hit in list(query.scan()):
if doc := self._replace_entities_in_document(hit.meta.id):
self.changed += 1
if not self.dry_run:
doc.update(data=doc, dbcommit=True, reindex=True)
self.set_timestamp()
return self.changed, len(self.not_found), len(self.rero_only)
return self.changed, self._error_count(self.not_found), \
self._error_count(self.rero_only)

def get_timestamp(self):
"""Get time stamp."""
Expand All @@ -287,8 +297,8 @@ def set_timestamp(self):
# rero only: entity was found but has only `rero` as source.
data[self.field] = {
'changed': self.changed,
'not found': len(self.not_found),
'rero only': len(self.rero_only),
'not found': self._error_count(self.not_found),
'rero only': self._error_count(self.rero_only),
'time': datetime.now(timezone.utc),
}
set_timestamp(self.timestamp_name, **data)
27 changes: 19 additions & 8 deletions tests/ui/entities/remote_entities/test_remote_entities_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,13 @@ def test_replace_identified_by(
assert not_found == 2
assert rero_only == 0
assert replace_identified_by.not_found == {
'gnd:1161956409': 'bf:Organisation: Convegno internazionale '
'di italianistica Craiova',
'rero:A003633163': 'bf:Person: Nebehay, Christian Michael'
'bf:Organisation': {
'gnd:1161956409': 'Convegno internazionale '
'di italianistica Craiova'
},
'bf:Person': {
'rero:A003633163': 'Nebehay, Christian Michael'
}
}
replace_identified_by.set_timestamp()
data = replace_identified_by.get_timestamp()
Expand All @@ -404,7 +408,9 @@ def test_replace_identified_by(
assert not_found == 0
assert rero_only == 1
assert replace_identified_by.rero_only == {
'rero:A003633163': 'bf:Person: Nebehay, Christian Michael'
'bf:Person': {
'rero:A003633163': 'Nebehay, Christian Michael'
}
}
# with MEF response for concepts in subjects
replace_identified_by = ReplaceIdentifiedBy(
Expand Down Expand Up @@ -434,10 +440,15 @@ def test_replace_identified_by(
assert not_found == 0
assert rero_only == 3
assert dict(sorted(replace_identified_by.rero_only.items())) == {
'rero:A009963344':
'bf:Person: Athenagoras (patriarche oecuménique ; 1)',
'rero:A021039750': 'bf:Topic: Bases de données déductives',
'rero:A009975209': 'bf:Place: Europe occidentale'
'bf:Person': {
'rero:A009963344': 'Athenagoras (patriarche oecuménique ; 1)'
},
'bf:Topic': {
'rero:A021039750': 'Bases de données déductives'
},
'bf:Place': {
'rero:A009975209': 'Europe occidentale'
}
}


Expand Down

0 comments on commit 057647d

Please sign in to comment.