Skip to content

Commit

Permalink
Do the same replacement to non-versioning modules
Browse files (browse the repository at this point in the history)
  • Loading branch information
piconti committed May 31, 2024
1 parent f0737df commit 6a330fb
Show file tree
Hide file tree
Showing 10 changed files with 62 additions and 52 deletions.
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/_build/doctrees/io.doctree
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/_build/html/io.html
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ <h1>Input/Output<a class="headerlink" href="#input-output" title="Link to this h

<dl class="py function">
<dt class="sig sig-object py" id="impresso_commons.path.path_s3.list_files">
<span class="sig-prename descclassname"><span class="pre">impresso_commons.path.path_s3.</span></span><span class="sig-name descname"><span class="pre">list_files</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'issues'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspapers_filter</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">]</span></span></span></span><a 
class="headerlink" href="#impresso_commons.path.path_s3.list_files" title="Link to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">impresso_commons.path.path_s3.</span></span><span class="sig-name descname"><span class="pre">list_files</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'issues'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspapers_filter</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">,</span></span><span class="w"> 
</span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.path.path_s3.list_files" title="Link to this definition"></a></dt>
<dd><p>List the canonical files located in a given S3 bucket.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
Expand Down
2 changes: 0 additions & 2 deletions impresso_commons/path/path_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
import os
import logging
from datetime import date, datetime
from smart_open import s3_iter_bucket
from collections import namedtuple
import re
import json

logger = logging.getLogger(__name__)

Expand Down
8 changes: 4 additions & 4 deletions impresso_commons/path/path_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import warnings
from datetime import date
from collections import namedtuple
from typing import Union
from typing import Optional, Union

from dask.diagnostics import ProgressBar
import dask.bag as db
Expand Down Expand Up @@ -360,8 +360,8 @@ def list_newspapers(
def list_files(
bucket_name: str,
file_type: str = "issues",
newspapers_filter: Union[list[str], None] = None,
) -> tuple[Union[list[str], None, list[str], None]]:
newspapers_filter: Optional[list[str]] = None,
) -> tuple[Optional[list[str]], Optional[list[str]]]:
"""List the canonical files located in a given S3 bucket.
Note:
Expand Down Expand Up @@ -419,7 +419,7 @@ def fetch_files(
bucket_name: str,
compute: bool = True,
file_type: str = "issues",
newspapers_filter: Union [list[str], None] = None,
newspapers_filter: Optional[list[str]] = None,
) -> tuple[Union[db.core.Bag, list[str], None], Union[db.core.Bag, list[str], None]]:
"""Fetch issue and/or page canonical JSON files from an s3 bucket.
Expand Down
9 changes: 6 additions & 3 deletions impresso_commons/text/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
import logging
import os
from typing import Any, Union
from typing import Any, Optional, Union

from impresso_commons.utils.s3 import (
IMPRESSO_STORAGEOPT,
Expand Down Expand Up @@ -235,7 +235,7 @@ def text_apply_breaks(fulltext, breaks):
return text


def get_iiif_and_coords(ci: dict[str, Any]) -> tuple[Union[str, None], Union[str, None]]:
def get_iiif_and_coords(ci: dict[str, Any]) -> tuple[Optional[str], Optional[str]]:
"""Fetch the iiif link and image coordinates from CI metadata.
Adapts to the various cases currently present in the canonical data, see
Expand Down Expand Up @@ -307,7 +307,10 @@ def reconstruct_iiif_link(content_item: dict[str, Any]) -> str:


def insert_whitespace(
token: str, next_t: Union[str, None], prev_t: Union[str,None], lang: Union[str,None]
token: str,
next_t: Optional[str],
prev_t: Optional[str],
lang: Optional[str],
) -> bool:
"""Determine whether a whitespace should be inserted after a token.
Expand Down
46 changes: 26 additions & 20 deletions impresso_commons/utils/daskutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@
--config-file=<cf> json configuration dict specifying various arguments
"""

import os
import logging
import docopt
import os

from dask.diagnostics import ProgressBar
import dask.bag as db
import numpy as np

from impresso_commons.utils import init_logger
from impresso_commons.utils import Timer, user_confirmation
from impresso_commons.utils import Timer
from impresso_commons.path.path_s3 import s3_filter_archives
from impresso_commons.utils.s3 import get_bucket, read_jsonlines, readtext_jsonlines
from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT
Expand All @@ -40,12 +40,14 @@ def partitioner(bag, path, nbpart):
items.to_textfiles(path)


def create_even_partitions(bucket,
config_newspapers,
output_dir,
local_fs=False,
keep_full=False,
nb_partition=500):
def create_even_partitions(
bucket,
config_newspapers,
output_dir,
local_fs=False,
keep_full=False,
nb_partition=500,
):
"""Convert yearly bz2 archives to even bz2 archives, i.e. partitions.
Enables efficient (distributed) processing, bypassing the size discrepancies of newspaper archives.
Expand All @@ -68,7 +70,7 @@ def create_even_partitions(bucket,
os.makedirs(output_dir, exist_ok=True)
path = os.path.join(output_dir, "*.jsonl.bz2")
else:
path = f'{output_dir}/*.jsonl.gz'
path = f"{output_dir}/*.jsonl.gz"
logger.info(f"Will write partitions to {path}")

# collect (yearly) keys & load in bag
Expand All @@ -77,12 +79,16 @@ def create_even_partitions(bucket,

# read and filter lines (1 elem = list of lines, or articles, from a key)
if keep_full is False:
bag_items = bag_bz2_keys.map(readtext_jsonlines, bucket_name=bucket.name).flatten()
bag_items = bag_bz2_keys.map(
readtext_jsonlines, bucket_name=bucket.name
).flatten()
else:
bag_items = bag_bz2_keys.map(read_jsonlines, bucket_name=bucket.name).flatten()

# repartition evenly
grouped_items = bag_items.groupby(lambda x: np.random.randint(1000), npartitions=nb_partition)
grouped_items = bag_items.groupby(
lambda x: np.random.randint(1000), npartitions=nb_partition
)
items = grouped_items.map(lambda x: x[1]).flatten()

# write partitions
Expand All @@ -91,9 +97,7 @@ def create_even_partitions(bucket,
# if local_fs:
# items.to_textfiles(path)
# else:
items.to_textfiles(path,
storage_options=IMPRESSO_STORAGEOPT,
compute=True)
items.to_textfiles(path, storage_options=IMPRESSO_STORAGEOPT, compute=True)

logger.info(f"Partitioning done in {t.stop()}.")

Expand All @@ -116,12 +120,14 @@ def main(args):
logger.info(f"Retrieved bucket: {bucket.name}")

if args["partition"] is True:
create_even_partitions(bucket,
config.newspapers,
config.output_dir,
local_fs=config.local_fs,
keep_full=config.keep_full,
nb_partition=int(nb_partitions))
create_even_partitions(
bucket,
config.newspapers,
config.output_dir,
local_fs=config.local_fs,
keep_full=config.keep_full,
nb_partition=int(nb_partitions),
)


if __name__ == "__main__":
Expand Down
5 changes: 2 additions & 3 deletions impresso_commons/utils/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
import json
import warnings
import bz2
from typing import Union
import boto3
from smart_open.s3 import iter_bucket
from smart_open import open as s_open
from dotenv import load_dotenv
from typing import Union
import botocore

from impresso_commons.utils import _get_cores
Expand Down Expand Up @@ -518,9 +518,8 @@ def get_s3_object_size(bucket_name, key):
try:
# Get the object metadata to retrieve its size
response = s3_client.head_object(Bucket=bucket_name, Key=key)
size = response['ContentLength']
size = response["ContentLength"]
return int(size)
except botocore.exceptions.ClientError as err:
logger.error(f"Error: {err} for {key} in {bucket_name}")
return None

38 changes: 21 additions & 17 deletions impresso_commons/utils/uima.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@

import os
import json
from typing import Dict, Tuple, List
from typing import Dict, List

from dask import bag as db
from cassis import load_cas_from_xmi, load_typesystem, Cas
from cassis import load_typesystem, Cas


from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT
from impresso_commons.classes import ContentItem
from impresso_commons.images.olive_boxes import get_iiif_url

IMPRESSO_IIIF_ENDPOINT = 'https://dhlabsrv17.epfl.ch/iiif_impresso/'
IMPRESSO_IIIF_ENDPOINT = "https://dhlabsrv17.epfl.ch/iiif_impresso/"
# IMPRESSO_IIIF_ENDPOINT = 'http://pub.cl.uzh.ch/service/iiif_impresso'


Expand Down Expand Up @@ -45,9 +45,9 @@ def compute_image_links(
if len(tokens) == 0:
continue

page_id = tokens[0]['page_id']
page_id = tokens[0]["page_id"]

if 'hy1' in tokens[0] and len(tokens) > 1:
if "hy1" in tokens[0] and len(tokens) > 1:
first_token = tokens[1]
else:
first_token = tokens[0]
Expand All @@ -58,20 +58,22 @@ def compute_image_links(
next_offset = ci.lines[line_n + 1]
next_line_tokens = ci.get_coordinates(start_offset, next_offset)

if len(next_line_tokens) > 0 and 'hy1' in next_line_tokens[0]:
if len(next_line_tokens) > 0 and "hy1" in next_line_tokens[0]:
last_token = next_line_tokens[0]
else:
last_token = tokens[-1]

# compute box coordinates of line
x1, y1, w1, h1 = first_token['coords']
x2, y2, w2, h2 = last_token['coords']
x1, y1, w1, h1 = first_token["coords"]
x2, y2, w2, h2 = last_token["coords"]
x3, y3, w3, h3 = x1, y1 - padding, w2 + (x2 - x1), h1 + padding
box = " ".join([str(coord) for coord in [x3, y3, w3, h3]])
if iiif_links is None:
iiif_link = get_iiif_url(page_id, box, IMPRESSO_IIIF_ENDPOINT, pct)
else:
iiif_link = get_iiif_url(page_id, box, iiif_manifest_uri=iiif_links[page_id], pct=pct)
iiif_link = get_iiif_url(
page_id, box, iiif_manifest_uri=iiif_links[page_id], pct=pct
)
image_links.append((iiif_link, start, end))

return image_links
Expand All @@ -81,13 +83,13 @@ def get_iiif_links(contentitems: List[ContentItem], canonical_bucket: str):
"""Retrieves from S3 IIIF links for a set of canonical pages where the input content items are found."""

# derive the IDs of all issues involved
issue_ids = set(["-".join(ci.id.split('-')[:-1]) for ci in contentitems])
issue_ids = set(["-".join(ci.id.split("-")[:-1]) for ci in contentitems])

# reconstruct S3 links to canonical pages
page_files = [
os.path.join(
canonical_bucket,
issue_id.split('-')[0],
issue_id.split("-")[0],
"pages",
f"{issue_id.split('-')[0]}-{issue_id.split('-')[1]}",
f"{issue_id}-pages.jsonl.bz2",
Expand All @@ -98,14 +100,16 @@ def get_iiif_links(contentitems: List[ContentItem], canonical_bucket: str):
iiif_links = (
db.read_text(page_files, storage_options=IMPRESSO_STORAGEOPT)
.map(json.loads)
.map(lambda x: (x['id'], x['iiif']))
.map(lambda x: (x["id"], x["iiif"]))
.compute()
)

return {page_id: iiif_link for page_id, iiif_link in iiif_links}


def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False) -> str:
def rebuilt2xmi(
ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False
) -> str:
"""
Converts a rebuilt ContentItem into Apache UIMA/XMI format.
Expand All @@ -126,10 +130,10 @@ def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=

cas = Cas(typesystem=typesystem)
cas.sofa_string = ci.fulltext
cas.sofa_mime = 'text/plain'
cas.sofa_mime = "text/plain"

sentType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
imgLinkType = 'webanno.custom.ImpressoImages'
sentType = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
imgLinkType = "webanno.custom.ImpressoImages"
Sentence = typesystem.get_type(sentType)
ImageLink = typesystem.get_type(imgLinkType)

Expand All @@ -147,6 +151,6 @@ def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=
for iiif_link, start, end in iiif_links:
cas.add_annotation(ImageLink(begin=start, end=end, link=iiif_link))

outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
outfile_path = os.path.join(output_dir, f"{ci.id}.xmi")
cas.to_xmi(outfile_path, pretty_print=True)
return outfile_path
4 changes: 2 additions & 2 deletions impresso_commons/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import logging
import pathlib
from contextlib import ExitStack
from typing import Any, Union
from typing import Any, Optional
import jsonschema
import importlib_resources

Expand Down Expand Up @@ -50,7 +50,7 @@ def get_pkg_resource(


def init_logger(
level: int = logging.INFO, file: Union[str, None] = None
level: int = logging.INFO, file: Optional[str] = None
) -> logging.RootLogger:
"""Initialises the root logger.
Expand Down

0 comments on commit 6a330fb

Please sign in to comment.