Skip to content

Commit

Permalink
Actually fix nes problem + incorporate changes for Maud
Browse files Browse the repository at this point in the history
  • Loading branch information
piconti committed Aug 6, 2024
1 parent 75f5bb2 commit 3d89136
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 10 deletions.
20 changes: 11 additions & 9 deletions impresso_commons/versioning/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,8 +938,7 @@ def compute_stats_in_entities_bag(
)
), # sorted list to ensure all are the same
}
)
.to_dataframe(
).to_dataframe(
meta={
"np_id": str,
"year": str,
Expand All @@ -949,9 +948,14 @@ def compute_stats_in_entities_bag(
"ne_entities": object,
}
)
.explode("ne_entities")
.persist()
# .explode("ne_entities")
# .persist()
)

count_df["ne_entities"] = count_df["ne_entities"].apply(
lambda x: x if isinstance(x, list) else [x]
)
count_df = count_df.explode("ne_entities").persist()

# accumulate the counts for all values collected
aggregated_df = (
Expand Down Expand Up @@ -1179,7 +1183,7 @@ def manifest_summary(mnf_json: dict[str, Any], extended_summary: bool = False) -


def filter_new_or_modified_media(
rebuilt_mft_path: str, previous_mft_path_str: str
rebuilt_mft_json: dict[str, Any], previous_mft_json: dict[str, Any]
) -> dict[str, Any]:
"""
Compares two manifests to determine new or modified media items.
Expand All @@ -1188,8 +1192,8 @@ def filter_new_or_modified_media(
compared to the previous process need to be ingested or processed.
Args:
rebuilt_mft_path (str): Path of the rebuilt manifest (new).
previous_mft_path_str (str): Path of the previous process manifest.
rebuilt_mft_json (dict[str, Any]): json of the rebuilt manifest (new).
previous_mft_json (dict[str, Any]): json of the previous process manifest.
Returns:
dict[str, Any]: A manifest identical to 'rebuilt_mft_json' but only with
Expand All @@ -1201,8 +1205,6 @@ def filter_new_or_modified_media(
{'media_title': 'modified_media_item_2', 'last_modif_date':
'2024-04-03T12:00:00Z', etc.}]
"""
rebuilt_mft_json = read_manifest_from_s3_path(rebuilt_mft_path)
previous_mft_json = read_manifest_from_s3_path(previous_mft_path_str)
filtered_manifest = copy.deepcopy(rebuilt_mft_json)

# Extract last modification date of each media item of the previous process
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ notebook>=7.0.3
notebook_shim>=0.2.3
numpy>=1.25.2
oauthlib>=3.2.2
opencv-python>=4.8.0.76
opencv-python>=4.9.0
overrides>=7.4.0
packaging>=23.1
pandas>=2.1.0
Expand Down

0 comments on commit 3d89136

Please sign in to comment.