diff --git a/README.md b/README.md index 67e1630..6c64b43 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,8 @@ To do so, some simple modifications should be made to the process' code: - Example instantiation: ```python + from impresso_commons.versioning.data_manifest import DataManifest + manifest = DataManifest( data_stage="passim", # DataStage.PASSIM also accepted s3_output_bucket="32-passim-rebuilt-final/passim", # includes partition within bucket diff --git a/impresso_commons/versioning/helpers.py b/impresso_commons/versioning/helpers.py index 70f0ece..bc6d4f1 100644 --- a/impresso_commons/versioning/helpers.py +++ b/impresso_commons/versioning/helpers.py @@ -927,7 +927,15 @@ def compute_stats_in_entities_bag( "content_items_out": 1, "ne_mentions": len(ci["nes"]), "ne_entities": sorted( - list(set([m["wkd_id"] for m in ci["nes"] if m["wkd_id"] != "NIL"])) + list( + set( + [ + m["wkd_id"] + for m in ci["nes"] + if "wkd_id" in m and m["wkd_id"] not in ["NIL", None] + ] + ) + ) ), # sorted list to ensure all are the same } )