diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
index f3dea9c..d999cbb 100644
Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ
diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree
index 51411b5..1590498 100644
Binary files a/docs/_build/doctrees/index.doctree and b/docs/_build/doctrees/index.doctree differ
diff --git a/docs/_build/doctrees/io.doctree b/docs/_build/doctrees/io.doctree
index d2c1131..221f894 100644
Binary files a/docs/_build/doctrees/io.doctree and b/docs/_build/doctrees/io.doctree differ
diff --git a/docs/_build/doctrees/rebuild.doctree b/docs/_build/doctrees/rebuild.doctree
index 5f217df..dc4e806 100644
Binary files a/docs/_build/doctrees/rebuild.doctree and b/docs/_build/doctrees/rebuild.doctree differ
diff --git a/docs/_build/doctrees/utils.doctree b/docs/_build/doctrees/utils.doctree
index 7eb571d..dc5ac2d 100644
Binary files a/docs/_build/doctrees/utils.doctree and b/docs/_build/doctrees/utils.doctree differ
diff --git a/docs/_build/doctrees/versioning.doctree b/docs/_build/doctrees/versioning.doctree
new file mode 100644
index 0000000..f4fc961
Binary files /dev/null and b/docs/_build/doctrees/versioning.doctree differ
diff --git a/docs/_build/html/_sources/index.rst.txt b/docs/_build/html/_sources/index.rst.txt
index 4eda377..65a983f 100644
--- a/docs/_build/html/_sources/index.rst.txt
+++ b/docs/_build/html/_sources/index.rst.txt
@@ -17,4 +17,5 @@ Python module with bits of code (objects, functions) highly reusable within impr
    rebuild
    utils
    images
+   versioning
 
diff --git a/docs/_build/html/_sources/versioning.rst.txt b/docs/_build/html/_sources/versioning.rst.txt
new file mode 100644
index 0000000..03a69ea
--- /dev/null
+++ b/docs/_build/html/_sources/versioning.rst.txt
@@ -0,0 +1,43 @@
+Data Versioning
+================================
+
+The `versioning` package of `impresso_commons` contains several modules and scripts with classes and functions that allow versioning Impresso's data at various stages of the processing pipeline.
+
+The main goal of this approach is to version the data and track information at every stage to:
+1. **Ensure data consistency and ease of debugging:** Data elements should be consistent across stages, and inconsistencies/differences should be justifiable through the identification of data leakage points.
+2. **Allow partial updates:** It should be possible to (re)run all or part of the processes on subsets of the data, knowing which version of the data was used at each step. This can be necessary when new media collections arrive, or when an existing collection has been patched.
+3. **Ensure transparency:** Citation of the various data stages and datasets should be straightforward; users should know when using the interface exactly what versions they are using, and should be able to consult the precise statistics related to them.
+
+
+Data Statistics and NewspaperStatistics
+------------------------------------------
+
+.. automodule:: impresso_commons.versioning.data_statistics
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Data Manifest
+--------------------------------------------
+
+.. automodule:: impresso_commons.versioning.data_manifest
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Versioning Helpers
+--------------------------------------------
+
+.. automodule:: impresso_commons.versioning.helpers
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Manifest Computing Script
+--------------------------------------------
+
+.. automodule:: impresso_commons.versioning.compute_manifest
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html
index 0b5dfd5..8523987 100644
--- a/docs/_build/html/genindex.html
+++ b/docs/_build/html/genindex.html
@@ -45,6 +45,7 @@
 <li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
 <li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -81,21 +82,42 @@ <h1 id="index">Index</h1>
  | <a href="#E"><strong>E</strong></a>
  | <a href="#F"><strong>F</strong></a>
  | <a href="#G"><strong>G</strong></a>
+ | <a href="#H"><strong>H</strong></a>
  | <a href="#I"><strong>I</strong></a>
  | <a href="#J"><strong>J</strong></a>
  | <a href="#L"><strong>L</strong></a>
  | <a href="#M"><strong>M</strong></a>
+ | <a href="#N"><strong>N</strong></a>
+ | <a href="#O"><strong>O</strong></a>
  | <a href="#P"><strong>P</strong></a>
  | <a href="#R"><strong>R</strong></a>
  | <a href="#S"><strong>S</strong></a>
  | <a href="#T"><strong>T</strong></a>
  | <a href="#U"><strong>U</strong></a>
+ | <a href="#V"><strong>V</strong></a>
+ | <a href="#W"><strong>W</strong></a>
  
 </div>
 <h2 id="A">A</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id">add_by_ci_id() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year">add_by_title_year() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year">add_count_list_by_title_year() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.add_counts">add_counts() (impresso_commons.versioning.data_statistics.DataStatistics method)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.agg">agg() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title">aggregate_stats_for_title() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
       <li><a href="utils.html#impresso_commons.utils.s3.alternative_read_text">alternative_read_text() (in module impresso_commons.utils.s3)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.append_to_notes">append_to_notes() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -108,6 +130,8 @@ <h2 id="B">B</h2>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="images.html#impresso_commons.images.img_utils.BoxStrategy">BoxStrategy (class in impresso_commons.images.img_utils)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.utils.bytes_to">bytes_to() (in module impresso_commons.utils.utils)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -115,6 +139,8 @@ <h2 id="B">B</h2>
 <h2 id="C">C</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.CANONICAL">CANONICAL (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
       <li><a href="io.html#impresso_commons.path.path_fs.canonical_path">canonical_path() (in module impresso_commons.path.path_fs)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.config_loader.Base.check_bucket">check_bucket() (impresso_commons.utils.config_loader.Base method)</a>
@@ -124,26 +150,64 @@ <h2 id="C">C</h2>
       <li><a href="utils.html#impresso_commons.utils.config_loader.Base.check_params">check_params() (impresso_commons.utils.config_loader.Base method)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.utils.chunk">chunk() (in module impresso_commons.utils.utils)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.helpers.chunk">(in module impresso_commons.versioning.helpers)</a>
 </li>
+      </ul></li>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.cleanup">cleanup() (in module impresso_commons.text.rebuilder)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.clone_git_repo">clone_git_repo() (in module impresso_commons.versioning.helpers)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.img_utils.compose">compose() (in module impresso_commons.images.img_utils)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.compress">compress() (in module impresso_commons.text.rebuilder)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.compute">compute() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.olive_boxes.compute_box">compute_box() (in module impresso_commons.images.olive_boxes)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.uima.compute_image_links">compute_image_links() (in module impresso_commons.utils.uima)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.olive_boxes.compute_scale_factor">compute_scale_factor() (in module impresso_commons.images.olive_boxes)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.compute_manifest.compute_stats_for_stage">compute_stats_for_stage() (in module impresso_commons.versioning.compute_manifest)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.compute_stats_in_canonical_bag">compute_stats_in_canonical_bag() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.compute_stats_in_entities_bag">compute_stats_in_entities_bag() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.compute_stats_in_langident_bag">compute_stats_in_langident_bag() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag">compute_stats_in_rebuilt_bag() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag">compute_stats_in_solr_text_bag() (in module impresso_commons.versioning.helpers)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_fs.ContentItem">ContentItem (in module impresso_commons.path.path_fs)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.olive_boxes.convert_box">convert_box() (in module impresso_commons.images.olive_boxes)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.count_keys">count_keys (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys">(impresso_commons.versioning.data_statistics.NewspaperStatistics attribute)</a>
+</li>
+      </ul></li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.counts">counts (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.counts">(impresso_commons.versioning.data_statistics.NewspaperStatistics attribute)</a>
+</li>
+      </ul></li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.counts_for_canonical_issue">counts_for_canonical_issue() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.counts_for_rebuilt">counts_for_rebuilt() (in module impresso_commons.versioning.helpers)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.daskutils.create_even_partitions">create_even_partitions() (in module impresso_commons.utils.daskutils)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.compute_manifest.create_manifest">create_manifest() (in module impresso_commons.versioning.compute_manifest)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -151,12 +215,20 @@ <h2 id="C">C</h2>
 <h2 id="D">D</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
-      <li><a href="io.html#impresso_commons.path.path_fs.IssueDir.date">date (impresso_commons.path.path_fs.IssueDir attribute)</a>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest">DataManifest (class in impresso_commons.versioning.data_manifest)</a>
 </li>
-      <li><a href="io.html#impresso_commons.path.path_fs.detect_canonical_issues">detect_canonical_issues() (in module impresso_commons.path.path_fs)</a>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage">DataStage (class in impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics">DataStatistics (class in impresso_commons.versioning.data_statistics)</a>
+</li>
+      <li><a href="io.html#impresso_commons.path.path_fs.IssueDir.date">date (impresso_commons.path.path_fs.IssueDir attribute)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title">define_update_info_for_title() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="io.html#impresso_commons.path.path_fs.detect_canonical_issues">detect_canonical_issues() (in module impresso_commons.path.path_fs)</a>
+</li>
       <li><a href="io.html#impresso_commons.path.path_fs.detect_issues">detect_issues() (in module impresso_commons.path.path_fs)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_fs.detect_journal_issues">detect_journal_issues() (in module impresso_commons.path.path_fs)</a>
@@ -168,6 +240,24 @@ <h2 id="E">E</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="io.html#impresso_commons.path.path_fs.IssueDir.edition">edition (impresso_commons.path.path_fs.IssueDir attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.element">element (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.element">(impresso_commons.versioning.data_statistics.NewspaperStatistics attribute)</a>
+</li>
+      </ul></li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.EMBEDDINGS">EMBEDDINGS (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.ENTITIES">ENTITIES (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.EVENIZED">EVENIZED (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.compute_manifest.extract_np_key">extract_np_key() (in module impresso_commons.versioning.compute_manifest)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.extract_version">extract_version() (in module impresso_commons.versioning.helpers)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -176,9 +266,15 @@ <h2 id="F">F</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="io.html#impresso_commons.path.path_s3.fetch_files">fetch_files() (in module impresso_commons.path.path_s3)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.filter_new_or_modified_media">filter_new_or_modified_media() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.finalize">finalize() (in module impresso_commons.versioning.helpers)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.find_s3_data_manifest_path">find_s3_data_manifest_path() (in module impresso_commons.versioning.helpers)</a>
+</li>
       <li><a href="utils.html#impresso_commons.utils.s3.fixed_s3fs_glob">fixed_s3fs_glob() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.config_loader.Base.from_json">from_json() (impresso_commons.utils.config_loader.Base class method)</a>
@@ -189,11 +285,19 @@ <h2 id="F">F</h2>
 <h2 id="G">G</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict">generate_media_dict() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
       <li><a href="utils.html#impresso_commons.utils.s3.get_boto3_bucket">get_boto3_bucket() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.get_bucket">get_bucket() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.get_bucket_boto3">get_bucket_boto3() (in module impresso_commons.utils.s3)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.get_count_keys">get_count_keys() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.compute_manifest.get_files_to_consider">get_files_to_consider() (in module impresso_commons.versioning.compute_manifest)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.get_head_commit_url">get_head_commit_url() (in module impresso_commons.versioning.helpers)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.helpers.get_iiif_and_coords">get_iiif_and_coords() (in module impresso_commons.text.helpers)</a>
 </li>
@@ -209,10 +313,14 @@ <h2 id="G">G</h2>
 </li>
       <li><a href="images.html#impresso_commons.images.img_utils.get_jpg">get_jpg() (in module impresso_commons.images.img_utils)</a>
 </li>
-      <li><a href="utils.html#impresso_commons.utils.s3.get_or_create_bucket">get_or_create_bucket() (in module impresso_commons.utils.s3)</a>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.get_media_item_years">get_media_item_years() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.get_media_titles">get_media_titles() (in module impresso_commons.versioning.helpers)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="utils.html#impresso_commons.utils.s3.get_or_create_bucket">get_or_create_bucket() (in module impresso_commons.utils.s3)</a>
+</li>
       <li><a href="images.html#impresso_commons.images.img_utils.get_page_folders">get_page_folders() (in module impresso_commons.images.img_utils)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.utils.get_pkg_resource">get_pkg_resource() (in module impresso_commons.utils.utils)</a>
@@ -222,6 +330,8 @@ <h2 id="G">G</h2>
       <li><a href="utils.html#impresso_commons.utils.s3.get_s3_client">get_s3_client() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.get_s3_connection">get_s3_connection() (in module impresso_commons.utils.s3)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.s3.get_s3_object_size">get_s3_object_size() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.get_s3_resource">get_s3_resource() (in module impresso_commons.utils.s3)</a>
 </li>
@@ -234,6 +344,26 @@ <h2 id="G">G</h2>
       <li><a href="utils.html#impresso_commons.utils.s3.get_storage_options">get_storage_options() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.img_utils.get_tif">get_tif() (in module impresso_commons.images.img_utils)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.git_commit_push">git_commit_push() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.granularity">granularity (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity">(impresso_commons.versioning.data_statistics.NewspaperStatistics attribute)</a>
+</li>
+      </ul></li>
+  </ul></td>
+</tr></table>
+
+<h2 id="H">H</h2>
+<table style="width: 100%" class="indextable genindextable"><tr>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key">has_title_year_key() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.has_value">has_value() (impresso_commons.versioning.helpers.DataStage class method)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -292,8 +422,6 @@ <h2 id="I">I</h2>
         <li><a href="rebuild.html#module-impresso_commons.text.rebuilder">module</a>
 </li>
       </ul></li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li>
     impresso_commons.utils.config_loader
 
@@ -322,18 +450,60 @@ <h2 id="I">I</h2>
         <li><a href="utils.html#module-impresso_commons.utils.uima">module</a>
 </li>
       </ul></li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li>
     impresso_commons.utils.utils
 
       <ul>
         <li><a href="utils.html#module-impresso_commons.utils.utils">module</a>
+</li>
+      </ul></li>
+      <li>
+    impresso_commons.versioning.compute_manifest
+
+      <ul>
+        <li><a href="versioning.html#module-impresso_commons.versioning.compute_manifest">module</a>
+</li>
+      </ul></li>
+      <li>
+    impresso_commons.versioning.data_manifest
+
+      <ul>
+        <li><a href="versioning.html#module-impresso_commons.versioning.data_manifest">module</a>
+</li>
+      </ul></li>
+      <li>
+    impresso_commons.versioning.data_statistics
+
+      <ul>
+        <li><a href="versioning.html#module-impresso_commons.versioning.data_statistics">module</a>
+</li>
+      </ul></li>
+      <li>
+    impresso_commons.versioning.helpers
+
+      <ul>
+        <li><a href="versioning.html#module-impresso_commons.versioning.helpers">module</a>
 </li>
       </ul></li>
       <li><a href="io.html#impresso_commons.path.path_s3.impresso_iter_bucket">impresso_iter_bucket() (in module impresso_commons.path.path_s3)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.increment_version">increment_version() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.init_counts">init_counts() (impresso_commons.versioning.data_statistics.DataStatistics method)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.utils.init_logger">init_logger() (in module impresso_commons.utils.utils)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.init_logging">init_logging() (in module impresso_commons.text.rebuilder)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.init_media_info">init_media_info() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict">init_yearly_count_dict() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.helpers.insert_whitespace">insert_whitespace() (in module impresso_commons.text.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.is_git_repo">is_git_repo() (in module impresso_commons.versioning.helpers)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_fs.IssueDir">IssueDir (class in impresso_commons.path.path_fs)</a>
 
@@ -361,10 +531,14 @@ <h2 id="J">J</h2>
 <h2 id="L">L</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
-      <li><a href="io.html#impresso_commons.path.path_s3.list_files">list_files() (in module impresso_commons.path.path_s3)</a>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.LANGIDENT">LANGIDENT (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING">LINGUISTIC_PROCESSING (impresso_commons.versioning.helpers.DataStage attribute)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="io.html#impresso_commons.path.path_s3.list_files">list_files() (in module impresso_commons.path.path_s3)</a>
+</li>
       <li><a href="io.html#impresso_commons.path.path_s3.list_newspapers">list_newspapers() (in module impresso_commons.path.path_s3)</a>
 </li>
   </ul></td>
@@ -379,8 +553,14 @@ <h2 id="M">M</h2>
         <li><a href="utils.html#impresso_commons.utils.config_loader.main">(in module impresso_commons.utils.config_loader)</a>
 </li>
         <li><a href="utils.html#impresso_commons.utils.daskutils.main">(in module impresso_commons.utils.daskutils)</a>
+</li>
+        <li><a href="versioning.html#impresso_commons.versioning.compute_manifest.main">(in module impresso_commons.versioning.compute_manifest)</a>
 </li>
       </ul></li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.manifest_summary">manifest_summary() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.media_list_from_mft_json">media_list_from_mft_json() (in module impresso_commons.versioning.helpers)</a>
+</li>
       <li>
     module
 
@@ -408,9 +588,47 @@ <h2 id="M">M</h2>
         <li><a href="utils.html#module-impresso_commons.utils.uima">impresso_commons.utils.uima</a>
 </li>
         <li><a href="utils.html#module-impresso_commons.utils.utils">impresso_commons.utils.utils</a>
+</li>
+        <li><a href="versioning.html#module-impresso_commons.versioning.compute_manifest">impresso_commons.versioning.compute_manifest</a>
+</li>
+        <li><a href="versioning.html#module-impresso_commons.versioning.data_manifest">impresso_commons.versioning.data_manifest</a>
+</li>
+        <li><a href="versioning.html#module-impresso_commons.versioning.data_statistics">impresso_commons.versioning.data_statistics</a>
+</li>
+        <li><a href="versioning.html#module-impresso_commons.versioning.helpers">impresso_commons.versioning.helpers</a>
 </li>
       </ul></li>
   </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.MYSQL_CIS">MYSQL_CIS (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+  </ul></td>
+</tr></table>
+
+<h2 id="N">N</h2>
+<table style="width: 100%" class="indextable genindextable"><tr>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.new_media">new_media() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics">NewspaperStatistics (class in impresso_commons.versioning.data_statistics)</a>
+</li>
+  </ul></td>
+</tr></table>
+
+<h2 id="O">O</h2>
+<table style="width: 100%" class="indextable genindextable"><tr>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.OCRQA">OCRQA (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path">output_mft_s3_path (impresso_commons.versioning.data_manifest.DataManifest property)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.overall_stats">overall_stats() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+  </ul></td>
 </tr></table>
 
 <h2 id="P">P</h2>
@@ -424,11 +642,13 @@ <h2 id="P">P</h2>
 </li>
       <li><a href="utils.html#impresso_commons.utils.utils.parse_json">parse_json() (in module impresso_commons.utils.utils)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="utils.html#impresso_commons.utils.daskutils.partitioner">partitioner() (in module impresso_commons.utils.daskutils)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.config_loader.PartitionerConfig">PartitionerConfig (class in impresso_commons.utils.config_loader)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.PASSIM">PASSIM (impresso_commons.versioning.helpers.DataStage attribute)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_fs.IssueDir.path">path (impresso_commons.path.path_fs.IssueDir attribute)</a>
 </li>
@@ -436,6 +656,14 @@ <h2 id="P">P</h2>
 </li>
       <li><a href="images.html#impresso_commons.images.img_utils.BoxStrategy.png_uniq">png_uniq (impresso_commons.images.img_utils.BoxStrategy attribute)</a>
 </li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys">possible_count_keys (impresso_commons.versioning.data_statistics.NewspaperStatistics attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.pretty_print">pretty_print() (impresso_commons.versioning.data_statistics.DataStatistics method)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print">(impresso_commons.versioning.data_statistics.NewspaperStatistics method)</a>
+</li>
+      </ul></li>
   </ul></td>
 </tr></table>
 
@@ -447,6 +675,10 @@ <h2 id="R">R</h2>
       <li><a href="rebuild.html#impresso_commons.text.helpers.read_issue_pages">read_issue_pages() (in module impresso_commons.text.helpers)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.read_jsonlines">read_jsonlines() (in module impresso_commons.utils.s3)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.read_manifest_from_s3">read_manifest_from_s3() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.read_manifest_from_s3_path">read_manifest_from_s3_path() (in module impresso_commons.versioning.helpers)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.helpers.read_page">read_page() (in module impresso_commons.text.helpers)</a>
 </li>
@@ -456,21 +688,29 @@ <h2 id="R">R</h2>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.rebuild_for_passim">rebuild_for_passim() (in module impresso_commons.text.rebuilder)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.rebuild_for_solr">rebuild_for_solr() (in module impresso_commons.text.rebuilder)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.rebuild_issues">rebuild_issues() (in module impresso_commons.text.rebuilder)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.rebuild_text">rebuild_text() (in module impresso_commons.text.rebuilder)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.rebuild_text_passim">rebuild_text_passim() (in module impresso_commons.text.rebuilder)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.REBUILT">REBUILT (impresso_commons.versioning.helpers.DataStage attribute)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.uima.rebuilt2xmi">rebuilt2xmi() (in module impresso_commons.utils.uima)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.helpers.reconstruct_iiif_link">reconstruct_iiif_link() (in module impresso_commons.text.helpers)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.helpers.rejoin_articles">rejoin_articles() (in module impresso_commons.text.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.remove_media_in_manifest">remove_media_in_manifest() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id">replace_by_ci_id() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year">replace_by_title_year() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.img_utils.run_cmd">run_cmd() (in module impresso_commons.images.img_utils)</a>
 </li>
@@ -486,14 +726,32 @@ <h2 id="S">S</h2>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.s3_get_pages">s3_get_pages() (in module impresso_commons.utils.s3)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="io.html#impresso_commons.path.path_s3.s3_iter_bucket">s3_iter_bucket() (in module impresso_commons.path.path_s3)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_s3.s3ContentItem">s3ContentItem (class in impresso_commons.path.path_s3)</a>
 </li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.same_counts">same_counts() (impresso_commons.versioning.data_statistics.DataStatistics method)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts">(impresso_commons.versioning.data_statistics.NewspaperStatistics method)</a>
+</li>
+      </ul></li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="io.html#impresso_commons.path.path_fs.select_issues">select_issues() (in module impresso_commons.path.path_fs)</a>
 </li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.SOLR_EMBS">SOLR_EMBS (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES">SOLR_ENTITIES (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.SOLR_TEXT">SOLR_TEXT (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.stage">stage (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>
+
+      <ul>
+        <li><a href="versioning.html#impresso_commons.versioning.data_statistics.NewspaperStatistics.stage">(impresso_commons.versioning.data_statistics.NewspaperStatistics attribute)</a>
+</li>
+      </ul></li>
   </ul></td>
 </tr></table>
 
@@ -503,12 +761,18 @@ <h2 id="T">T</h2>
       <li><a href="images.html#impresso_commons.images.olive_boxes.test">test() (in module impresso_commons.images.olive_boxes)</a>
 </li>
       <li><a href="rebuild.html#impresso_commons.text.helpers.text_apply_breaks">text_apply_breaks() (in module impresso_commons.text.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.TEXT_REUSE">TEXT_REUSE (impresso_commons.versioning.helpers.DataStage attribute)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="images.html#impresso_commons.images.img_utils.BoxStrategy.tif">tif (impresso_commons.images.img_utils.BoxStrategy attribute)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.title_level_stats">title_level_stats() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.config_loader.Base.to_dict">to_dict() (impresso_commons.utils.config_loader.Base method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.TOPICS">TOPICS (impresso_commons.versioning.helpers.DataStage attribute)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -516,6 +780,8 @@ <h2 id="T">T</h2>
 <h2 id="U">U</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.update_media_stats">update_media_stats() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
       <li><a href="rebuild.html#impresso_commons.text.rebuilder.upload">upload() (in module impresso_commons.text.rebuilder)</a>
 
       <ul>
@@ -529,6 +795,40 @@ <h2 id="U">U</h2>
   </ul></td>
 </tr></table>
 
+<h2 id="V">V</h2>
+<table style="width: 100%" class="indextable genindextable"><tr>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="utils.html#impresso_commons.utils.utils.validate_against_schema">validate_against_schema() (in module impresso_commons.utils.utils)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest">validate_and_export_manifest() (impresso_commons.versioning.data_manifest.DataManifest method)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.compute_manifest.validate_config">validate_config() (in module impresso_commons.versioning.compute_manifest)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.validate_granularity">validate_granularity() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.validate_stage">validate_stage() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.validate_version">validate_version() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.version_as_list">version_as_list() (in module impresso_commons.versioning.helpers)</a>
+</li>
+  </ul></td>
+</tr></table>
+
+<h2 id="W">W</h2>
+<table style="width: 100%" class="indextable genindextable"><tr>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.write_and_push_to_git">write_and_push_to_git() (in module impresso_commons.versioning.helpers)</a>
+</li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="versioning.html#impresso_commons.versioning.helpers.write_dump_to_fs">write_dump_to_fs() (in module impresso_commons.versioning.helpers)</a>
+</li>
+  </ul></td>
+</tr></table>
+
 
 
            </div>
diff --git a/docs/_build/html/images.html b/docs/_build/html/images.html
index fa178f9..0196e80 100644
--- a/docs/_build/html/images.html
+++ b/docs/_build/html/images.html
@@ -19,6 +19,7 @@
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
     <link rel="search" title="Search" href="search.html" />
+    <link rel="next" title="Data Versioning" href="versioning.html" />
     <link rel="prev" title="Utilities" href="utils.html" /> 
 </head>
 
@@ -77,6 +78,7 @@
 </li>
 </ul>
 </li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -336,6 +338,7 @@ <h4>Case 4: one jpg only<a class="headerlink" href="#case-4-one-jpg-only" title=
           </div>
           <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
         <a href="utils.html" class="btn btn-neutral float-left" title="Utilities" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
+        <a href="versioning.html" class="btn btn-neutral float-right" title="Data Versioning" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
     </div>
 
   <hr/>
diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html
index 3f2de08..be27cbe 100644
--- a/docs/_build/html/index.html
+++ b/docs/_build/html/index.html
@@ -47,6 +47,7 @@
 <li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
 <li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -104,6 +105,13 @@ <h1>Welcome to Impresso PyCommons’s documentation!<a class="headerlink" href="
 <li class="toctree-l2"><a class="reference internal" href="images.html#module-impresso_commons.images.olive_boxes">Olive Boxes</a></li>
 </ul>
 </li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="versioning.html#module-impresso_commons.versioning.data_statistics">Data Statistics and NewspaperStatistics</a></li>
+<li class="toctree-l2"><a class="reference internal" href="versioning.html#module-impresso_commons.versioning.data_manifest">Data Manifest</a></li>
+<li class="toctree-l2"><a class="reference internal" href="versioning.html#module-impresso_commons.versioning.helpers">Versioning Helpers</a></li>
+<li class="toctree-l2"><a class="reference internal" href="versioning.html#module-impresso_commons.versioning.compute_manifest">Manifest Computing Script</a></li>
+</ul>
+</li>
 </ul>
 </div>
 </section>
diff --git a/docs/_build/html/io.html b/docs/_build/html/io.html
index 46566db..73b68ba 100644
--- a/docs/_build/html/io.html
+++ b/docs/_build/html/io.html
@@ -86,6 +86,7 @@
 <li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
 <li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -391,7 +392,7 @@ <h1>Input/Output<a class="headerlink" href="#input-output" title="Link to this h
 
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.path.path_s3.list_files">
-<span class="sig-prename descclassname"><span class="pre">impresso_commons.path.path_s3.</span></span><span class="sig-name descname"><span class="pre">list_files</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'issues'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspapers_filter</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">,</span></span><span class="w"> 
</span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.path.path_s3.list_files" title="Link to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.path.path_s3.</span></span><span class="sig-name descname"><span class="pre">list_files</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'issues'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspapers_filter</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">]</span></span></span></span><a 
class="headerlink" href="#impresso_commons.path.path_s3.list_files" title="Link to this definition"></a></dt>
 <dd><p>List the canonical files located in a given S3 bucket.</p>
 <div class="admonition note">
 <p class="admonition-title">Note</p>
diff --git a/docs/_build/html/objects.inv b/docs/_build/html/objects.inv
index 5bfc238..166a721 100644
Binary files a/docs/_build/html/objects.inv and b/docs/_build/html/objects.inv differ
diff --git a/docs/_build/html/py-modindex.html b/docs/_build/html/py-modindex.html
index 136773c..be06ca2 100644
--- a/docs/_build/html/py-modindex.html
+++ b/docs/_build/html/py-modindex.html
@@ -48,6 +48,7 @@
 <li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
 <li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -150,6 +151,26 @@ <h1>Python Module Index</h1>
        <td>&#160;&#160;&#160;
        <a href="utils.html#module-impresso_commons.utils.utils"><code class="xref">impresso_commons.utils.utils</code></a></td><td>
        <em></em></td></tr>
+     <tr class="cg-1">
+       <td></td>
+       <td>&#160;&#160;&#160;
+       <a href="versioning.html#module-impresso_commons.versioning.compute_manifest"><code class="xref">impresso_commons.versioning.compute_manifest</code></a></td><td>
+       <em></em></td></tr>
+     <tr class="cg-1">
+       <td></td>
+       <td>&#160;&#160;&#160;
+       <a href="versioning.html#module-impresso_commons.versioning.data_manifest"><code class="xref">impresso_commons.versioning.data_manifest</code></a></td><td>
+       <em></em></td></tr>
+     <tr class="cg-1">
+       <td></td>
+       <td>&#160;&#160;&#160;
+       <a href="versioning.html#module-impresso_commons.versioning.data_statistics"><code class="xref">impresso_commons.versioning.data_statistics</code></a></td><td>
+       <em></em></td></tr>
+     <tr class="cg-1">
+       <td></td>
+       <td>&#160;&#160;&#160;
+       <a href="versioning.html#module-impresso_commons.versioning.helpers"><code class="xref">impresso_commons.versioning.helpers</code></a></td><td>
+       <em></em></td></tr>
    </table>
 
 
diff --git a/docs/_build/html/rebuild.html b/docs/_build/html/rebuild.html
index 29dc2fb..7491e7e 100644
--- a/docs/_build/html/rebuild.html
+++ b/docs/_build/html/rebuild.html
@@ -76,6 +76,7 @@
 </li>
 <li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -112,7 +113,7 @@ <h2>Rebuild functions<a class="headerlink" href="#rebuild-functions" title="Link
 For EPFL members, this script can be scaled by running it using Runai,
 as documented on <a class="reference external" href="https://github.com/impresso/impresso-infrastructure/blob/main/howtos/runai.md">https://github.com/impresso/impresso-infrastructure/blob/main/howtos/runai.md</a>.</p>
 <dl class="simple">
-<dt>Usage:</dt><dd><p>rebuilder.py rebuild_articles –input-bucket=&lt;b&gt; –log-file=&lt;f&gt; –output-dir=&lt;od&gt; –filter-config=&lt;fc&gt; [–format=&lt;fo&gt; –scheduler=&lt;sch&gt; –output-bucket=&lt;ob&gt; –verbose –clear –languages=&lt;lgs&gt; –nworkers=&lt;nw&gt;]</p>
+<dt>Usage:</dt><dd><p>rebuilder.py rebuild_articles –input-bucket=&lt;b&gt; –log-file=&lt;f&gt; –output-dir=&lt;od&gt; –filter-config=&lt;fc&gt; [–format=&lt;fo&gt; –scheduler=&lt;sch&gt; –output-bucket=&lt;ob&gt; –verbose –clear –languages=&lt;lgs&gt; –nworkers=&lt;nw&gt; –git-repo=&lt;gr&gt; –temp-dir=&lt;tp&gt; –prev-manifest=&lt;pm&gt;]</p>
 </dd>
 </dl>
 <p>Options:</p>
@@ -139,10 +140,22 @@ <h2>Rebuild functions<a class="headerlink" href="#rebuild-functions" title="Link
 <dd><p>Remove output directory before and after rebuilding</p>
 </dd>
 <dt><kbd><span class="option">--format=<var>&lt;fo&gt;</var></span></kbd></dt>
-<dd><p>stuff</p>
+<dd><p>Rebuilt format to use (can be “solr” or “passim”)</p>
+</dd>
+<dt><kbd><span class="option">--languages=<var>&lt;lgs&gt;</var></span></kbd></dt>
+<dd><p>Languages to filter the articles to rebuild on.</p>
 </dd>
 <dt><kbd><span class="option">--nworkers=<var>&lt;nw&gt;</var></span></kbd></dt>
-<dd><p>number of workers for (local) dask client</p>
+<dd><p>number of workers for (local) Dask client.</p>
+</dd>
+<dt><kbd><span class="option">--git-repo=<var>&lt;gr&gt;</var></span></kbd></dt>
+<dd><p>Local path to the “impresso-text-acquisition” git directory (including it).</p>
+</dd>
+<dt><kbd><span class="option">--temp-dir=<var>&lt;tp&gt;</var></span></kbd></dt>
+<dd><p>Temporary directory in which to clone the impresso-data-release git repository.</p>
+</dd>
+<dt><kbd><span class="option">--prev-manifest=<var>&lt;pm&gt;</var></span></kbd></dt>
+<dd><p>Optional S3 path to the previous manifest to use for the manifest generation.</p>
 </dd>
 </dl>
 <dl class="py function">
@@ -208,7 +221,7 @@ <h2>Rebuild functions<a class="headerlink" href="#rebuild-functions" title="Link
 
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.text.rebuilder.main">
-<span class="sig-prename descclassname"><span class="pre">impresso_commons.text.rebuilder.</span></span><span class="sig-name descname"><span class="pre">main</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.text.rebuilder.main" title="Link to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.text.rebuilder.</span></span><span class="sig-name descname"><span class="pre">main</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.text.rebuilder.main" title="Link to this definition"></a></dt>
 <dd></dd></dl>
 
 <dl class="py function">
@@ -252,7 +265,7 @@ <h2>Rebuild functions<a class="headerlink" href="#rebuild-functions" title="Link
 
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.text.rebuilder.rebuild_issues">
-<span class="sig-prename descclassname"><span class="pre">impresso_commons.text.rebuilder.</span></span><span class="sig-name descname"><span class="pre">rebuild_issues</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">issues</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_bucket</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_dir</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dask_client</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">format</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'solr'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">filter_language</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.text.rebuilder.rebuild_issues" title="Link to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.text.rebuilder.</span></span><span class="sig-name descname"><span class="pre">rebuild_issues</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">issues</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_bucket</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_dir</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dask_client</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">_format</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'solr'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">filter_language</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.text.rebuilder.rebuild_issues" title="Link to this definition"></a></dt>
 <dd><p>Rebuild a set of newspaper issues into a given format.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
diff --git a/docs/_build/html/search.html b/docs/_build/html/search.html
index 8460542..7258c61 100644
--- a/docs/_build/html/search.html
+++ b/docs/_build/html/search.html
@@ -48,6 +48,7 @@
 <li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
 <li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js
index dd3eefe..c2c84e0 100644
--- a/docs/_build/html/searchindex.js
+++ b/docs/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["images", "index", "io", "rebuild", "utils"], "filenames": ["images.rst", "index.rst", "io.rst", "rebuild.rst", "utils.rst"], "titles": ["Image handling", "Welcome to Impresso PyCommons\u2019s documentation!", "Input/Output", "Text Rebuild", "Utilities"], "terms": {"class": [0, 2, 4], "impresso_common": [0, 2, 3, 4], "img_util": 0, "boxstrategi": 0, "valu": [0, 2, 4], "name": [0, 2, 3, 4], "none": [0, 2, 3, 4], "modul": [0, 1], "qualnam": 0, "type": [0, 2, 3, 4], "start": [0, 4], "boundari": 0, "base": [0, 2, 3, 4], "enum": 0, "jpg_highest": 0, "jpg_uniq": 0, "png_highest": 0, "png_uniq": 0, "compos": 0, "path_img_on": 0, "path_img_two": 0, "path_img_thre": 0, "get_img_from_arch": 0, "archiv": [0, 3, 4], "path_check": 0, "ext_check": 0, "name_check": 0, "get_imgdimens": 0, "image_data": 0, "return": [0, 2, 3, 4], "height": 0, "width": 0, "get_jpg": 0, "page_digit": 0, "get_page_fold": 0, "get_png": 0, "get_tif": 0, "run_cmd": 0, "cmd": 0, "execut": 0, "shell": 0, "result": [0, 2, 4], "stdout": 0, "stderr": 0, "function": [0, 1], "support": [0, 2], "re": 0, "comput": [0, 2, 4], "coordin": [0, 3, 4], "olive_box": 0, "compute_box": 0, "scale_factor": 0, "input_box": 0, "iiif": [0, 3, 4], "rel": 0, "paramet": [0, 2, 3, 4], "float": 0, "ratio": 0, "between": 0, "differ": [0, 3, 4], "dimens": 0, "str": [0, 2, 3, 4], "string": [0, 2, 3, 4], "separ": [0, 2, 4], "space": 0, "new": [0, 4], "compute_scale_factor": 0, "img_source_path": 0, "img_dest_path": 0, "x": 0, "scale": [0, 3], "factor": 0, "bewteen": 0, "full": 0, "path": [0, 2, 3, 4], "sourc": [0, 4], "destin": 0, "convert_box": 0, "convert": [0, 4], "y": 0, "w": 0, "h": 0, "upper": 0, "left": 0, "lower": 0, "right": 0, "get_iiif_url": 0, "page_id": 0, "http": [0, 2, 3, 4], "dhlabsrv17": [0, 4], "epfl": [0, 3, 4], "ch": [0, 4], "iiif_impresso": [0, 4], "iiif_manifest_uri": 0, "pct": [0, 4], "bool": [0, 2, 3, 4], "fals": [0, 2, 4], "impresso": [0, 2, 3, 4], "url": [0, 4], "given": [0, 2, 
3, 4], "page": [0, 2, 3, 4], "id": [0, 2, 3, 4], "e": [0, 2, 3, 4], "g": [0, 2, 3, 4], "exp": 0, "1930": 0, "06": 0, "10": [0, 2, 4], "p0001": [0, 2, 4], "blank": [0, 2], "get_scale_factor": 0, "issue_dir_path": 0, "page_xml": 0, "box_strategi": 0, "img_source_nam": 0, "context": [0, 4], "strategi": 0, "choos": 0, "issu": [0, 2, 3, 4], "zipfil": 0, "zip": 0, "byte": 0, "xml": 0, "handler": 0, "found": [0, 4], "info": [0, 3], "txt": [0, 2], "from": [0, 1, 3, 4], "jp2": 0, "folder": 0, "hopefulli": 0, "correct": 0, "librari": [0, 4], "take": [0, 2, 4], "best": 0, "avail": 0, "highest": 0, "were": 0, "accord": [0, 2], "an": [0, 2, 3, 4], "which": [0, 3, 4], "we": 0, "have": [0, 3, 4], "identifi": 0, "among": 0, "format": [0, 3, 4], "coverag": 0, "i": [0, 1, 3, 4], "devis": 0, "The": [0, 2, 3, 4], "present": [0, 2, 3], "file": [0, 1], "wa": [0, 3], "dest": 0, "can": [0, 3, 4], "therefor": 0, "us": [0, 2, 3, 4], "need": [0, 3, 4], "read": [0, 2, 3, 4], "normal": 0, "In": [0, 2, 3], "thi": [0, 3, 4], "acquir": 0, "It": [0, 3, 4], "look": 0, "also": 0, "took": 0, "ocr": [0, 4], "possibl": [0, 2, 4], "reli": 0, "resolut": 0, "indic": 0, "should": [0, 3, 4], "same": 0, "our": [0, 2, 3, 4], "n": [0, 4], "b": [0, 2, 3, 4], "heigth": 0, "do": 0, "correspond": [0, 2], "usual": 0, "discrep": [0, 4], "tag": 0, "images_resolut": 0, "hand": 0, "page_width": 0, "other": 0, "seem": [0, 2, 4], "ignor": 0, "current": [0, 3], "here": [0, 4], "ar": [0, 2, 3, 4], "equal": 0, "To": 0, "check": [0, 2, 4], "happen": 0, "choic": 0, "acquisit": [0, 3], "addit": [0, 3], "see": [0, 2, 3], "ones": 0, "danger": 0, "anoth": 0, "did": 0, "provid": 0, "thing": 0, "fit": 0, "like": 0, "test": 0, "move": 0, "proper": [0, 4], "unit": 0, "python": 1, "bit": 1, "code": [1, 2], "object": [1, 2, 3, 4], "highli": 1, "reusabl": [1, 4], "within": [1, 2, 4], "input": [1, 3, 4], "output": [1, 3, 4], "gener": [1, 4], "o": [1, 4], "system": 1, "s3": [1, 3], "text": [1, 4], "rebuild": 1, "helper": 1, "config": [1, 
2], "exampl": [1, 2, 4], "util": [1, 3], "basic": [1, 3], "dask": [1, 2, 3], "apach": 1, "uima": 1, "xmi": 1, "loader": 1, "imag": [1, 2, 3], "handl": [1, 4], "oliv": 1, "box": [1, 4], "id2issuedir": 2, "todo": 2, "document": [2, 3], "parse_canonical_filenam": 2, "filenam": [2, 4], "pars": 2, "canon": [2, 3, 4], "its": [2, 3], "compon": 2, "tupl": [2, 3, 4], "gdl": [2, 3, 4], "1950": 2, "01": 2, "02": 2, "i0002": 2, "2": [2, 3, 4], "": [2, 3, 4], "directori": [2, 3, 4], "structur": 2, "path_f": 2, "contentitem": [2, 4], "alia": 2, "item": [2, 3, 4], "issuedir": [2, 3, 4], "journal": 2, "date": [2, 4], "edit": 2, "field": 2, "number": [2, 3, 4], "1": [2, 3, 4], "0": [2, 3, 4], "3": 2, "canonical_path": 2, "dir": [2, 3], "extens": [2, 4], "path_typ": 2, "creat": [2, 3, 4], "repres": 2, "newspap": [2, 3, 4], "onli": [2, 4], "build": 2, "check_filenam": 2, "file_basenam": 2, "whether": [2, 3, 4], "compli": 2, "convent": 2, "1900": [2, 3], "detect_canonical_issu": 2, "base_dir": 2, "detect": [2, 4], "import": [2, 3], "nb": [2, 3, 4], "invalid": 2, "skip": 2, "warn": [2, 4], "messag": 2, "log": [2, 3, 4], "root": [2, 3], "list": [2, 3, 4], "consid": [2, 4], "acronym": 2, "instanc": [2, 4], "detect_issu": 2, "journal_filt": 2, "exclud": 2, "basestr": 2, "set": [2, 3, 4], "filter": [2, 3, 4], "posit": 2, "neg": 2, "boolean": [2, 3, 4], "detect_journal_issu": 2, "get_issueshortpath": 2, "short": [2, 4], "version": [2, 4], "pair_issu": 2, "issue_list1": 2, "issue_list2": 2, "associ": 2, "pair": 2, "origin": 2, "repositori": [2, 4], "arrai": 2, "contain": [2, 3, 4], "issue1": 2, "issue2": 2, "select_issu": 2, "config_dict": [2, 4], "inp_dir": 2, "configur": [2, 3, 4], "select": 2, "md": [2, 3], "explan": 2, "usag": [2, 3, 4], "config_fil": 2, "isfil": 2, "open": [2, 3], "r": 2, "f": [2, 3, 4], "json": [2, 3, 4], "load": [2, 4], "els": [2, 3], "dict": [2, 3, 4], "dit": 2, "where": [2, 3, 4], "get": [2, 3, 4], "path_s3": 2, "issuedirectori": 2, "fetch_fil": 2, "bucket_nam": [2, 
3, 4], "true": [2, 4], "file_typ": 2, "newspapers_filt": 2, "bag": [2, 4], "fetch": [2, 3], "bucket": [2, 3, 4], "If": [2, 3, 4], "content": [2, 3, 4], "all": [2, 3, 4], "specifi": [2, 3, 4], "remain": 2, "distribut": [2, 4], "both": 2, "alwai": 2, "first": [2, 3], "element": [2, 4], "second": 2, "henc": 2, "entri": 2, "undesir": 2, "adapt": [2, 3], "github": [2, 3], "com": [2, 3], "data": [2, 3, 4], "sanitycheck": 2, "tree": 2, "master": 2, "sanity_check": 2, "s3_data": 2, "py": [2, 3, 4], "form": 2, "option": [2, 3, 4], "default": [2, 3, 4], "rais": [2, 4], "notimplementederror": 2, "one": [2, 3, 4], "db": [2, 4], "core": [2, 4], "impresso_iter_bucket": 2, "item_typ": 2, "prefix": [2, 4], "filter_config": 2, "partition_s": 2, "15": [2, 4], "iter": 2, "over": 2, "possibli": 2, "either": 2, "valid": 2, "individu": 2, "articl": [2, 3, 4], "param": [2, 3, 4], "kei": [2, 3, 4], "exclus": 2, "year": [2, 3, 4], "interv": 2, "1960": 2, "jdg": 2, "1890": 2, "last": [2, 4], "partit": [2, 4], "size": [2, 4], "list_fil": 2, "locat": 2, "list_newspap": 2, "s3_client": [2, 3], "botocor": 2, "client": [2, 3, 4], "page_s": 2, "int": [2, 3, 4], "10000": 2, "25": 2, "000": 2, "maximum": 2, "pages": 2, "switchengin": 2, "implement": 2, "ceph": 2, "copi": 2, "get_s3_client": [2, 4], "pagin": 2, "alias": 2, "read_s3_issu": 2, "input_bucket": [2, 3], "s3contentitem": 2, "key_nam": [2, 4], "doc_typ": 2, "rebuilt_vers": 2, "canonical_vers": 2, "s3_filter_arch": 2, "suffix": 2, "jsonl": [2, 4], "bz2": [2, 4], "k": 2, "v": 2, "time": 2, "rubric": 2, "1970": 2, "empti": 2, "mean": 2, "1798": 2, "1999": [2, 3], "each": [2, 4], "10th": 2, "sequenc": 2, "key_suffix": 2, "end": 2, "s3_iter_bucket": 2, "get_bucket": [2, 4], "mybucket": 2, "begin": 2, "how": 2, "A": [3, 4], "transform": 3, "purpos": 3, "cli": 3, "For": 3, "member": 3, "script": 3, "run": 3, "runai": 3, "infrastructur": 3, "blob": 3, "main": [3, 4], "howto": 3, "rebuild_articl": 3, "od": 3, "fc": 3, "fo": 3, "schedul": 3, "sch": 
3, "ob": 3, "verbos": [3, 4], "clear": 3, "languag": 3, "lg": 3, "nworker": 3, "nw": 3, "rebuilt": [3, 4], "upload": [3, 4], "otherwis": [3, 4], "tell": 3, "exist": [3, 4], "ll": 3, "level": 3, "debug": 3, "remov": 3, "befor": 3, "after": [3, 4], "stuff": 3, "worker": [3, 4], "local": [3, 4], "cleanup": 3, "upload_success": 3, "filepath": 3, "ha": 3, "been": 3, "successfulli": 3, "success": [3, 4], "compress": 3, "json_fil": [3, 4], "output_dir": [3, 4], "merg": 3, "line": [3, 4], "singl": 3, "signatur": 3, "write": [3, 4], "sort": 3, "serial": 3, "rytp": 3, "sort_kei": 3, "expect": 3, "concaten": 3, "init_log": 3, "initialis": 3, "logger": 3, "desir": [3, 4], "rootlogg": 3, "duplic": 3, "init_logg": 3, "could": 3, "work": 3, "properli": 3, "so": 3, "keep": [3, 4], "rebuild_for_passim": 3, "content_item": 3, "ani": [3, 4], "passim": 3, "metadata": [3, 4], "built": 3, "rebuild_for_solr": 3, "thought": 3, "especi": 3, "ingest": 3, "solr": 3, "index": 3, "follow": [3, 4], "schema": 3, "rebuild_issu": 3, "dask_client": 3, "filter_languag": 3, "outp_dir": 3, "store": [3, 4], "rebuild_text": 3, "append": 3, "conform": 3, "being": 3, "previou": 3, "fulltext": 3, "offset": [3, 4], "token": 3, "region": 3, "rebuild_text_passim": 3, "group": 3, "get_iiif_and_coord": 3, "ci": [3, 4], "link": [3, 4], "variou": [3, 4], "case": [3, 4], "117": 3, "retriev": [3, 4], "inform": [3, 4], "part": 3, "miss": 3, "insert_whitespac": 3, "next_t": 3, "prev_t": 3, "lang": 3, "determin": 3, "whitespac": 3, "insert": 3, "pages_to_articl": 3, "belong": 3, "read_issu": 3, "inject": 3, "s3_version": 3, "boto3": [3, 4], "resourc": [3, 4], "factori": [3, 4], "serviceresourc": [3, 4], "connect": [3, 4], "storag": 3, "represent": 3, "read_issue_pag": 3, "issue_json": 3, "parallel": [3, 4], "read_pag": 3, "page_kei": 3, "reconstruct_iiif_link": 3, "construct": 3, "api": 3, "endpoint": [3, 4], "process": [3, 4], "some": 3, "inconsist": 3, "variat": 3, "more": [3, 4], "detail": [3, 4], "area": 3, 
"rejoin_articl": 3, "text_apply_break": 3, "break": [3, 4], "appli": 3, "visual": 3, "charact": 3, "paragraph": 3, "etc": 3, "chunk": [3, 4], "cluster": 3, "chunksiz": 4, "yield": 4, "get_pkg_resourc": 4, "file_manag": 4, "exitstack": 4, "packag": 4, "posixpath": 4, "manag": 4, "instanti": 4, "prior": 4, "call": 4, "close": 4, "onc": 4, "longer": 4, "contextlib": 4, "pathlib": 4, "parse_json": 4, "drive": 4, "boto": 4, "kept": 4, "until": 4, "third": 4, "parti": 4, "lib": 4, "depend": 4, "solv": 4, "alternative_read_text": 4, "s3_kei": 4, "s3_credenti": 4, "reason": 4, "bug": 4, "read_text": 4, "1000": 4, "filenotfounderror": 4, "fixed_s3fs_glob": 4, "boto3_bucket": 4, "benoit": 4, "pyimag": 4, "custom": 4, "glob": 4, "s3f": 4, "unabl": 4, "than": 4, "switch": 4, "get_boto3_bucket": 4, "request": 4, "ask": 4, "doe": 4, "turn": 4, "newli": 4, "testb": 4, "depreci": 4, "pleas": 4, "priorit": 4, "get_or_create_bucket": 4, "instead": 4, "yet": 4, "get_bucket_boto3": 4, "host_url": 4, "zhdk": 4, "cloud": 4, "get_s3_connect": 4, "host": 4, "assum": 4, "two": 4, "environ": 4, "variabl": 4, "se_access_kei": 4, "se_secret_kei": 4, "get_s3_resourc": 4, "relat": 4, "get_s3_vers": 4, "modifi": 4, "datetim": 4, "get_s3_versions_cli": 4, "get_storage_opt": 4, "read_jsonlin": 4, "point": 4, "extract": 4, "doc": 4, "per": 4, "from_sequ": 4, "s3r": 4, "print": 4, "count": 4, "map": 4, "pluck": 4, "ft": 4, "without": 4, "readtext_jsonlin": 4, "limit": 4, "textual": 4, "leav": 4, "out": 4, "serv": 4, "pure": 4, "ne": 4, "reus": 4, "topic": 4, "s3_get_articl": 4, "ad": 4, "advertis": 4, "iter_bucket": 4, "cpu": 4, "dictionari": 4, "s3_get_pag": 4, "issue_id": 4, "page_nam": 4, "imp": 4, "1990": 4, "03": 4, "partition_nam": 4, "newspaper_prefix": 4, "upload_to_s3": 4, "local_path": 4, "path_within_bucket": 4, "help": 4, "prepar": 4, "view": 4, "orient": 4, "daskutil": 4, "cf": 4, "p": 4, "argument": 4, "create_even_partit": 4, "config_newspap": 4, "local_f": 4, "keep_ful": 4, 
"nb_partit": 4, "500": 4, "yearli": 4, "even": 4, "enabl": 4, "effici": 4, "bypass": 4, "shuffl": 4, "well": 4, "decid": 4, "what": 4, "memori": 4, "span": 4, "classic": 4, "produc": 4, "arg": 4, "partition": 4, "nbpart": 4, "export": 4, "compute_image_link": 4, "pad": 4, "20": 4, "iiif_endpoint": 4, "iiif_link": 4, "summari": 4, "descript": 4, "get_iiif_link": 4, "canonical_bucket": 4, "rebuilt2xmi": 4, "typesystem_path": 4, "iiif_map": 4, "pct_coordin": 4, "typesystem": 4, "defit": 4, "annot": 4, "layer": 4, "task": 4, "config_load": 4, "initi": 4, "method": 4, "check_bucket": 4, "attribut": 4, "check_param": 4, "classmethod": 4, "from_json": 4, "to_dict": 4, "partitionerconfig": 4, "As": 4, "now": 4, "solr_serv": 4, "server": 4, "solr_cor": 4, "s3_host": 4, "s3_bucket_rebuilt": 4, "s3_bucket_partit": 4, "s3_bucket_process": 4, "key_batch": 4, "batch": 4, "number_partit": 4, "1991": 4, "1998": 4}, "objects": {"impresso_commons.images": [[0, 0, 0, "-", "img_utils"], [0, 0, 0, "-", "olive_boxes"]], "impresso_commons.images.img_utils": [[0, 1, 1, "", "BoxStrategy"], [0, 3, 1, "", "compose"], [0, 3, 1, "", "get_img_from_archive"], [0, 3, 1, "", "get_imgdimensions"], [0, 3, 1, "", "get_jpg"], [0, 3, 1, "", "get_page_folders"], [0, 3, 1, "", "get_png"], [0, 3, 1, "", "get_tif"], [0, 3, 1, "", "run_cmd"]], "impresso_commons.images.img_utils.BoxStrategy": [[0, 2, 1, "", "jpg_highest"], [0, 2, 1, "", "jpg_uniq"], [0, 2, 1, "", "png_highest"], [0, 2, 1, "", "png_uniq"], [0, 2, 1, "", "tif"]], "impresso_commons.images.olive_boxes": [[0, 3, 1, "", "compute_box"], [0, 3, 1, "", "compute_scale_factor"], [0, 3, 1, "", "convert_box"], [0, 3, 1, "", "get_iiif_url"], [0, 3, 1, "", "get_scale_factor"], [0, 3, 1, "", "test"]], "impresso_commons": [[2, 0, 0, "-", "path"]], "impresso_commons.path": [[2, 3, 1, "", "id2IssueDir"], [2, 3, 1, "", "parse_canonical_filename"], [2, 0, 0, "-", "path_fs"], [2, 0, 0, "-", "path_s3"]], "impresso_commons.path.path_fs": [[2, 2, 1, "", 
"ContentItem"], [2, 1, 1, "", "IssueDir"], [2, 3, 1, "", "canonical_path"], [2, 3, 1, "", "check_filenaming"], [2, 3, 1, "", "detect_canonical_issues"], [2, 3, 1, "", "detect_issues"], [2, 3, 1, "", "detect_journal_issues"], [2, 3, 1, "", "get_issueshortpath"], [2, 3, 1, "", "pair_issue"], [2, 3, 1, "", "select_issues"]], "impresso_commons.path.path_fs.IssueDir": [[2, 2, 1, "", "date"], [2, 2, 1, "", "edition"], [2, 2, 1, "", "journal"], [2, 2, 1, "", "path"]], "impresso_commons.path.path_s3": [[2, 2, 1, "", "IssueDir"], [2, 3, 1, "", "fetch_files"], [2, 3, 1, "", "impresso_iter_bucket"], [2, 3, 1, "", "list_files"], [2, 3, 1, "", "list_newspapers"], [2, 3, 1, "", "read_s3_issues"], [2, 1, 1, "", "s3ContentItem"], [2, 3, 1, "", "s3_filter_archives"], [2, 3, 1, "", "s3_iter_bucket"]], "impresso_commons.text": [[3, 0, 0, "-", "helpers"], [3, 0, 0, "-", "rebuilder"]], "impresso_commons.text.helpers": [[3, 3, 1, "", "get_iiif_and_coords"], [3, 3, 1, "", "insert_whitespace"], [3, 3, 1, "", "pages_to_article"], [3, 3, 1, "", "read_issue"], [3, 3, 1, "", "read_issue_pages"], [3, 3, 1, "", "read_page"], [3, 3, 1, "", "reconstruct_iiif_link"], [3, 3, 1, "", "rejoin_articles"], [3, 3, 1, "", "text_apply_breaks"]], "impresso_commons.text.rebuilder": [[3, 3, 1, "", "cleanup"], [3, 3, 1, "", "compress"], [3, 3, 1, "", "init_logging"], [3, 3, 1, "", "main"], [3, 3, 1, "", "rebuild_for_passim"], [3, 3, 1, "", "rebuild_for_solr"], [3, 3, 1, "", "rebuild_issues"], [3, 3, 1, "", "rebuild_text"], [3, 3, 1, "", "rebuild_text_passim"], [3, 3, 1, "", "upload"]], "impresso_commons.utils": [[4, 0, 0, "-", "config_loader"], [4, 0, 0, "-", "daskutils"], [4, 0, 0, "-", "s3"], [4, 0, 0, "-", "uima"], [4, 0, 0, "-", "utils"]], "impresso_commons.utils.config_loader": [[4, 1, 1, "", "Base"], [4, 1, 1, "", "PartitionerConfig"], [4, 3, 1, "", "main"]], "impresso_commons.utils.config_loader.Base": [[4, 4, 1, "", "check_bucket"], [4, 4, 1, "", "check_params"], [4, 4, 1, "", "from_json"], [4, 4, 1, 
"", "to_dict"]], "impresso_commons.utils.daskutils": [[4, 3, 1, "", "create_even_partitions"], [4, 3, 1, "", "main"], [4, 3, 1, "", "partitioner"]], "impresso_commons.utils.s3": [[4, 3, 1, "", "alternative_read_text"], [4, 3, 1, "", "fixed_s3fs_glob"], [4, 3, 1, "", "get_boto3_bucket"], [4, 3, 1, "", "get_bucket"], [4, 3, 1, "", "get_bucket_boto3"], [4, 3, 1, "", "get_or_create_bucket"], [4, 3, 1, "", "get_s3_client"], [4, 3, 1, "", "get_s3_connection"], [4, 3, 1, "", "get_s3_resource"], [4, 3, 1, "", "get_s3_versions"], [4, 3, 1, "", "get_s3_versions_client"], [4, 3, 1, "", "get_storage_options"], [4, 3, 1, "", "read_jsonlines"], [4, 3, 1, "", "readtext_jsonlines"], [4, 3, 1, "", "s3_get_articles"], [4, 3, 1, "", "s3_get_pages"], [4, 3, 1, "", "upload"], [4, 3, 1, "", "upload_to_s3"]], "impresso_commons.utils.uima": [[4, 3, 1, "", "compute_image_links"], [4, 3, 1, "", "get_iiif_links"], [4, 3, 1, "", "rebuilt2xmi"]], "impresso_commons.utils.utils": [[4, 3, 1, "", "chunk"], [4, 3, 1, "", "get_pkg_resource"], [4, 3, 1, "", "parse_json"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:function", "4": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"], "4": ["py", "method", "Python method"]}, "titleterms": {"imag": 0, "handl": 0, "util": [0, 4], "oliv": 0, "box": 0, "background": 0, "inform": 0, "case": 0, "1": 0, "tif": 0, "2": 0, "sever": 0, "png": 0, "3": 0, "one": 0, "onli": 0, "4": 0, "jpg": 0, "welcom": 1, "impresso": 1, "pycommon": 1, "": 1, "document": 1, "content": 1, "input": 2, "output": 2, "gener": 2, "i": 2, "o": 2, "from": 2, "file": [2, 3, 4], "system": 2, "s3": [2, 4], "text": 3, "rebuild": 3, "function": [3, 4], "helper": 3, "config": [3, 4], "exampl": 3, "basic": 4, "dask": 4, "apach": 4, "uima": 4, "xmi": 4, "loader": 4}, "envversion": {"sphinx.domains.c": 3, 
"sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx": 60}, "alltitles": {"Image handling": [[0, "image-handling"]], "Image Utils": [[0, "module-impresso_commons.images.img_utils"]], "Olive Boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "Background information": [[0, "background-information"]], "Case 1: tif": [[0, "case-1-tif"]], "Case 2: several png": [[0, "case-2-several-png"]], "Case 3: one png only": [[0, "case-3-one-png-only"]], "Case 4: one jpg only": [[0, "case-4-one-jpg-only"]], "Welcome to Impresso PyCommons\u2019s documentation!": [[1, "welcome-to-impresso-pycommons-s-documentation"]], "Contents:": [[1, null]], "Input/Output": [[2, "input-output"]], "General": [[2, "module-impresso_commons.path"]], "I/O from file system": [[2, "module-impresso_commons.path.path_fs"]], "I/O from S3": [[2, "module-impresso_commons.path.path_s3"]], "Text Rebuild": [[3, "text-rebuild"]], "Rebuild functions": [[3, "rebuild-functions"]], "Helpers": [[3, "module-impresso_commons.text.helpers"]], "Config file example": [[3, "config-file-example"]], "Utilities": [[4, "utilities"]], "Basic Utils Functions": [[4, "module-impresso_commons.utils.utils"]], "S3 Utils Functions": [[4, "module-impresso_commons.utils.s3"]], "Dask Utils Functions": [[4, "module-impresso_commons.utils.daskutils"]], "Apache UIMA XMI Utils Functions": [[4, "module-impresso_commons.utils.uima"]], "Config File Loader": [[4, "module-impresso_commons.utils.config_loader"]]}, "indexentries": {"boxstrategy (class in impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.BoxStrategy"]], "compose() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.compose"]], "compute_box() (in module impresso_commons.images.olive_boxes)": 
[[0, "impresso_commons.images.olive_boxes.compute_box"]], "compute_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_scale_factor"]], "convert_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.convert_box"]], "get_iiif_url() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_iiif_url"]], "get_img_from_archive() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_img_from_archive"]], "get_imgdimensions() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_imgdimensions"]], "get_jpg() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_jpg"]], "get_page_folders() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_page_folders"]], "get_png() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_png"]], "get_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_scale_factor"]], "get_tif() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_tif"]], "impresso_commons.images.img_utils": [[0, "module-impresso_commons.images.img_utils"]], "impresso_commons.images.olive_boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "jpg_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_highest"]], "jpg_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_uniq"]], "module": [[0, "module-impresso_commons.images.img_utils"], [0, "module-impresso_commons.images.olive_boxes"], [2, "module-impresso_commons.path"], [2, "module-impresso_commons.path.path_fs"], [2, 
"module-impresso_commons.path.path_s3"], [3, "module-impresso_commons.text.helpers"], [3, "module-impresso_commons.text.rebuilder"], [4, "module-impresso_commons.utils.config_loader"], [4, "module-impresso_commons.utils.daskutils"], [4, "module-impresso_commons.utils.s3"], [4, "module-impresso_commons.utils.uima"], [4, "module-impresso_commons.utils.utils"]], "png_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_highest"]], "png_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_uniq"]], "run_cmd() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.run_cmd"]], "test() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.test"]], "tif (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.tif"]], "contentitem (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.ContentItem"]], "issuedir (class in impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.IssueDir"]], "issuedir (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.IssueDir"]], "canonical_path() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.canonical_path"]], "check_filenaming() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.check_filenaming"]], "date (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.date"]], "detect_canonical_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_canonical_issues"]], "detect_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_issues"]], "detect_journal_issues() (in module impresso_commons.path.path_fs)": [[2, 
"impresso_commons.path.path_fs.detect_journal_issues"]], "edition (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.edition"]], "fetch_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.fetch_files"]], "get_issueshortpath() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.get_issueshortpath"]], "id2issuedir() (in module impresso_commons.path)": [[2, "impresso_commons.path.id2IssueDir"]], "impresso_commons.path": [[2, "module-impresso_commons.path"]], "impresso_commons.path.path_fs": [[2, "module-impresso_commons.path.path_fs"]], "impresso_commons.path.path_s3": [[2, "module-impresso_commons.path.path_s3"]], "impresso_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.impresso_iter_bucket"]], "journal (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.journal"]], "list_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_files"]], "list_newspapers() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_newspapers"]], "pair_issue() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.pair_issue"]], "parse_canonical_filename() (in module impresso_commons.path)": [[2, "impresso_commons.path.parse_canonical_filename"]], "path (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.path"]], "read_s3_issues() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.read_s3_issues"]], "s3contentitem (class in impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3ContentItem"]], "s3_filter_archives() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_filter_archives"]], "s3_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, 
"impresso_commons.path.path_s3.s3_iter_bucket"]], "select_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.select_issues"]], "cleanup() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.cleanup"]], "compress() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.compress"]], "get_iiif_and_coords() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.get_iiif_and_coords"]], "impresso_commons.text.helpers": [[3, "module-impresso_commons.text.helpers"]], "impresso_commons.text.rebuilder": [[3, "module-impresso_commons.text.rebuilder"]], "init_logging() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.init_logging"]], "insert_whitespace() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.insert_whitespace"]], "main() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.main"]], "pages_to_article() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.pages_to_article"]], "read_issue() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue"]], "read_issue_pages() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue_pages"]], "read_page() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_page"]], "rebuild_for_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_passim"]], "rebuild_for_solr() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_solr"]], "rebuild_issues() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_issues"]], "rebuild_text() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text"]], "rebuild_text_passim() (in module 
impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text_passim"]], "reconstruct_iiif_link() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.reconstruct_iiif_link"]], "rejoin_articles() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.rejoin_articles"]], "text_apply_breaks() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.text_apply_breaks"]], "upload() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.upload"]], "base (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.Base"]], "partitionerconfig (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.PartitionerConfig"]], "alternative_read_text() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.alternative_read_text"]], "check_bucket() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_bucket"]], "check_params() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_params"]], "chunk() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.chunk"]], "compute_image_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.compute_image_links"]], "create_even_partitions() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.create_even_partitions"]], "fixed_s3fs_glob() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.fixed_s3fs_glob"]], "from_json() (impresso_commons.utils.config_loader.base class method)": [[4, "impresso_commons.utils.config_loader.Base.from_json"]], "get_boto3_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_boto3_bucket"]], "get_bucket() (in module impresso_commons.utils.s3)": [[4, 
"impresso_commons.utils.s3.get_bucket"]], "get_bucket_boto3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket_boto3"]], "get_iiif_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.get_iiif_links"]], "get_or_create_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_or_create_bucket"]], "get_pkg_resource() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.get_pkg_resource"]], "get_s3_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_client"]], "get_s3_connection() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_connection"]], "get_s3_resource() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_resource"]], "get_s3_versions() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions"]], "get_s3_versions_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions_client"]], "get_storage_options() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_storage_options"]], "impresso_commons.utils.config_loader": [[4, "module-impresso_commons.utils.config_loader"]], "impresso_commons.utils.daskutils": [[4, "module-impresso_commons.utils.daskutils"]], "impresso_commons.utils.s3": [[4, "module-impresso_commons.utils.s3"]], "impresso_commons.utils.uima": [[4, "module-impresso_commons.utils.uima"]], "impresso_commons.utils.utils": [[4, "module-impresso_commons.utils.utils"]], "main() (in module impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.main"]], "main() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.main"]], "parse_json() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.parse_json"]], "partitioner() (in module impresso_commons.utils.daskutils)": [[4, 
"impresso_commons.utils.daskutils.partitioner"]], "read_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.read_jsonlines"]], "readtext_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.readtext_jsonlines"]], "rebuilt2xmi() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.rebuilt2xmi"]], "s3_get_articles() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_articles"]], "s3_get_pages() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_pages"]], "to_dict() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.to_dict"]], "upload() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload"]], "upload_to_s3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload_to_s3"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["images", "index", "io", "rebuild", "utils", "versioning"], "filenames": ["images.rst", "index.rst", "io.rst", "rebuild.rst", "utils.rst", "versioning.rst"], "titles": ["Image handling", "Welcome to Impresso PyCommons\u2019s documentation!", "Input/Output", "Text Rebuild", "Utilities", "Data Versioning"], "terms": {"class": [0, 2, 4, 5], "impresso_common": [0, 2, 3, 4, 5], "img_util": 0, "boxstrategi": 0, "valu": [0, 2, 4, 5], "name": [0, 2, 3, 4, 5], "none": [0, 2, 3, 4, 5], "modul": [0, 1, 5], "qualnam": [0, 5], "type": [0, 2, 3, 4, 5], "start": [0, 4, 5], "boundari": [0, 5], "base": [0, 2, 3, 4, 5], "enum": [0, 5], "jpg_highest": 0, "jpg_uniq": 0, "png_highest": 0, "png_uniq": 0, "compos": 0, "path_img_on": 0, "path_img_two": 0, "path_img_thre": 0, "get_img_from_arch": 0, "archiv": [0, 3, 4], "path_check": 0, "ext_check": 0, "name_check": 0, "get_imgdimens": 0, "image_data": 0, "return": [0, 2, 3, 4, 5], "height": 0, "width": 0, "get_jpg": 0, "page_digit": 0, "get_page_fold": 0, "get_png": 0, "get_tif": 0, "run_cmd": 0, "cmd": 0, "execut": 0, "shell": 0, "result": [0, 2, 4, 5], "stdout": 0, "stderr": 0, "function": [0, 1, 5], "support": [0, 2, 4], "re": [0, 5], "comput": [0, 1, 2, 4], "coordin": [0, 3, 4], "olive_box": 0, "compute_box": 0, "scale_factor": 0, "input_box": 0, "iiif": [0, 3, 4], "rel": [0, 5], "paramet": [0, 2, 3, 4, 5], "float": [0, 4, 5], "ratio": 0, "between": [0, 5], "differ": [0, 3, 4, 5], "dimens": 0, "str": [0, 2, 3, 4, 5], "string": [0, 2, 3, 4, 5], "separ": [0, 2, 4, 5], "space": 0, "new": [0, 4, 5], "compute_scale_factor": 0, "img_source_path": 0, "img_dest_path": 0, "x": 0, "scale": [0, 3], "factor": 0, "bewteen": 0, "full": [0, 5], "path": [0, 2, 3, 4, 5], "sourc": [0, 4], "destin": 0, "convert_box": 0, "convert": [0, 4, 5], "y": 0, "w": 0, "h": 0, "upper": 0, "left": 0, "lower": [0, 5], "right": 0, "get_iiif_url": 0, "page_id": 0, "http": [0, 2, 3, 4, 5], "dhlabsrv17": [0, 4], "epfl": [0, 3, 4], "ch": [0, 
4], "iiif_impresso": [0, 4], "iiif_manifest_uri": 0, "pct": [0, 4], "bool": [0, 2, 3, 4, 5], "fals": [0, 2, 4, 5], "impresso": [0, 2, 3, 4, 5], "url": [0, 4, 5], "given": [0, 2, 3, 4, 5], "page": [0, 2, 3, 4, 5], "id": [0, 2, 3, 4, 5], "e": [0, 2, 3, 4, 5], "g": [0, 2, 3, 4], "exp": 0, "1930": 0, "06": 0, "10": [0, 2, 4, 5], "p0001": [0, 2, 4], "blank": [0, 2], "get_scale_factor": 0, "issue_dir_path": 0, "page_xml": 0, "box_strategi": 0, "img_source_nam": 0, "context": [0, 4], "strategi": 0, "choos": 0, "issu": [0, 2, 3, 4, 5], "zipfil": 0, "zip": 0, "byte": [0, 4], "xml": 0, "handler": 0, "found": [0, 4, 5], "info": [0, 3, 4, 5], "txt": [0, 2], "from": [0, 1, 3, 4, 5], "jp2": 0, "folder": 0, "hopefulli": 0, "correct": 0, "librari": [0, 4], "take": [0, 2, 4], "best": 0, "avail": 0, "highest": 0, "were": [0, 5], "accord": [0, 2], "an": [0, 2, 3, 4, 5], "which": [0, 3, 4, 5], "we": 0, "have": [0, 3, 4, 5], "identifi": 0, "among": 0, "format": [0, 3, 4, 5], "coverag": 0, "i": [0, 1, 3, 4, 5], "devis": 0, "The": [0, 2, 3, 4, 5], "present": [0, 2, 3, 5], "file": [0, 1, 5], "wa": [0, 3, 5], "dest": 0, "can": [0, 3, 4, 5], "therefor": 0, "us": [0, 2, 3, 4, 5], "need": [0, 3, 4, 5], "read": [0, 2, 3, 4, 5], "normal": 0, "In": [0, 2, 3, 5], "thi": [0, 3, 4, 5], "acquir": 0, "It": [0, 3, 4, 5], "look": [0, 5], "also": [0, 5], "took": [0, 5], "ocr": [0, 4], "possibl": [0, 2, 4, 5], "reli": 0, "resolut": 0, "indic": [0, 5], "should": [0, 3, 4, 5], "same": [0, 5], "our": [0, 2, 3, 4], "n": [0, 4], "b": [0, 2, 3, 4], "heigth": 0, "do": [0, 5], "correspond": [0, 2, 4, 5], "usual": 0, "discrep": [0, 4], "tag": 0, "images_resolut": 0, "hand": 0, "page_width": 0, "other": [0, 5], "seem": [0, 2, 4], "ignor": 0, "current": [0, 3, 5], "here": [0, 4, 5], "ar": [0, 2, 3, 4, 5], "equal": 0, "To": 0, "check": [0, 2, 4, 5], "happen": 0, "choic": 0, "acquisit": [0, 3], "addit": [0, 3, 5], "see": [0, 2, 3], "ones": [0, 5], "danger": 0, "anoth": [0, 5], "did": 0, "provid": [0, 4, 5], "thing": 
[0, 5], "fit": 0, "like": 0, "test": [0, 5], "move": [0, 5], "proper": [0, 4], "unit": [0, 4], "python": 1, "bit": 1, "code": [1, 2], "object": [1, 2, 3, 4, 5], "highli": 1, "reusabl": [1, 4], "within": [1, 2, 4, 5], "input": [1, 3, 4, 5], "output": [1, 3, 4, 5], "gener": [1, 3, 4, 5], "o": [1, 4], "system": [1, 5], "s3": [1, 3, 5], "text": [1, 4, 5], "rebuild": 1, "helper": 1, "config": [1, 2, 5], "exampl": [1, 2, 4, 5], "util": [1, 3, 5], "basic": [1, 3, 5], "dask": [1, 2, 3, 5], "apach": 1, "uima": 1, "xmi": 1, "loader": 1, "imag": [1, 2, 3, 5], "handl": [1, 4, 5], "oliv": 1, "box": [1, 4], "data": [1, 2, 3, 4], "version": [1, 2, 4], "statist": 1, "newspaperstatist": 1, "manifest": [1, 3, 4], "script": [1, 3], "id2issuedir": 2, "todo": [2, 5], "document": [2, 3, 5], "parse_canonical_filenam": 2, "filenam": [2, 4, 5], "pars": 2, "canon": [2, 3, 4, 5], "its": [2, 3, 5], "compon": 2, "tupl": [2, 3, 4, 5], "gdl": [2, 3, 4], "1950": 2, "01": 2, "02": 2, "i0002": 2, "2": [2, 3, 4, 5], "": [2, 3, 4, 5], "directori": [2, 3, 4, 5], "structur": [2, 5], "path_f": 2, "contentitem": [2, 4], "alia": 2, "item": [2, 3, 4, 5], "issuedir": [2, 3, 4], "journal": 2, "date": [2, 4, 5], "edit": 2, "field": [2, 5], "number": [2, 3, 4, 5], "1": [2, 3, 4, 5], "0": [2, 3, 4, 5], "3": [2, 5], "canonical_path": 2, "dir": [2, 3, 5], "extens": [2, 4, 5], "path_typ": 2, "creat": [2, 3, 4, 5], "repres": [2, 5], "newspap": [2, 3, 4, 5], "onli": [2, 4, 5], "build": 2, "check_filenam": 2, "file_basenam": 2, "whether": [2, 3, 4, 5], "compli": 2, "convent": 2, "1900": [2, 3], "detect_canonical_issu": 2, "base_dir": 2, "detect": [2, 4], "import": [2, 3], "nb": [2, 3, 4], "invalid": 2, "skip": 2, "warn": [2, 4], "messag": [2, 5], "log": [2, 3, 4, 5], "root": [2, 3, 4], "list": [2, 3, 4, 5], "consid": [2, 4, 5], "acronym": 2, "instanc": [2, 4], "detect_issu": 2, "journal_filt": 2, "exclud": [2, 5], "basestr": 2, "set": [2, 3, 4, 5], "filter": [2, 3, 4], "posit": 2, "neg": 2, "boolean": [2, 3, 4], 
"detect_journal_issu": 2, "get_issueshortpath": 2, "short": [2, 4], "pair_issu": 2, "issue_list1": 2, "issue_list2": 2, "associ": [2, 5], "pair": [2, 5], "origin": [2, 5], "repositori": [2, 3, 4, 5], "arrai": 2, "contain": [2, 3, 4, 5], "issue1": 2, "issue2": 2, "select_issu": 2, "config_dict": [2, 4, 5], "inp_dir": 2, "configur": [2, 3, 4, 5], "select": 2, "md": [2, 3, 5], "explan": 2, "usag": [2, 3, 4, 5], "config_fil": 2, "isfil": 2, "open": [2, 3], "r": 2, "f": [2, 3, 4], "json": [2, 3, 4, 5], "load": [2, 4, 5], "els": [2, 3], "dict": [2, 3, 4, 5], "dit": 2, "where": [2, 3, 4, 5], "get": [2, 3, 4, 5], "path_s3": 2, "issuedirectori": 2, "fetch_fil": 2, "bucket_nam": [2, 3, 4, 5], "true": [2, 4, 5], "file_typ": 2, "newspapers_filt": 2, "bag": [2, 4, 5], "fetch": [2, 3, 5], "bucket": [2, 3, 4, 5], "If": [2, 3, 4, 5], "content": [2, 3, 4, 5], "all": [2, 3, 4, 5], "specifi": [2, 3, 4], "remain": 2, "distribut": [2, 4], "both": [2, 5], "alwai": [2, 5], "first": [2, 3], "element": [2, 4, 5], "second": 2, "henc": 2, "entri": 2, "undesir": 2, "adapt": [2, 3], "github": [2, 3, 5], "com": [2, 3], "sanitycheck": 2, "tree": 2, "master": [2, 5], "sanity_check": 2, "s3_data": 2, "py": [2, 3, 4, 5], "form": [2, 5], "option": [2, 3, 4, 5], "default": [2, 3, 4, 5], "rais": [2, 4, 5], "notimplementederror": 2, "one": [2, 3, 4, 5], "db": [2, 4, 5], "core": [2, 4, 5], "impresso_iter_bucket": 2, "item_typ": 2, "prefix": [2, 4], "filter_config": 2, "partition_s": 2, "15": [2, 4], "iter": 2, "over": 2, "possibli": 2, "either": [2, 5], "valid": [2, 4, 5], "individu": 2, "articl": [2, 3, 4], "param": [2, 3, 4], "kei": [2, 3, 4, 5], "exclus": 2, "year": [2, 3, 4, 5], "interv": 2, "1960": 2, "jdg": 2, "1890": 2, "last": [2, 4, 5], "partit": [2, 4, 5], "size": [2, 4, 5], "list_fil": 2, "locat": 2, "list_newspap": 2, "s3_client": [2, 3], "botocor": 2, "client": [2, 3, 4, 5], "page_s": 2, "int": [2, 3, 4, 5], "10000": 2, "25": 2, "000": 2, "maximum": 2, "pages": 2, "switchengin": 2, 
"implement": [2, 5], "ceph": 2, "copi": 2, "get_s3_client": [2, 4], "pagin": 2, "alias": 2, "read_s3_issu": 2, "input_bucket": [2, 3], "s3contentitem": 2, "key_nam": [2, 4], "doc_typ": 2, "rebuilt_vers": 2, "canonical_vers": 2, "s3_filter_arch": 2, "suffix": 2, "jsonl": [2, 4, 5], "bz2": [2, 4, 5], "k": [2, 4], "v": [2, 5], "time": [2, 5], "rubric": 2, "1970": 2, "empti": [2, 5], "mean": [2, 5], "1798": 2, "1999": [2, 3], "each": [2, 4, 5], "10th": 2, "sequenc": 2, "key_suffix": 2, "end": [2, 5], "s3_iter_bucket": 2, "get_bucket": [2, 4], "mybucket": 2, "begin": 2, "how": [2, 5], "A": [3, 4, 5], "transform": 3, "purpos": 3, "cli": 3, "For": [3, 5], "member": [3, 5], "run": [3, 5], "runai": 3, "infrastructur": 3, "blob": 3, "main": [3, 4, 5], "howto": 3, "rebuild_articl": 3, "od": 3, "fc": 3, "fo": 3, "schedul": [3, 5], "sch": [3, 5], "ob": 3, "verbos": [3, 4, 5], "clear": 3, "languag": 3, "lg": 3, "nworker": [3, 5], "nw": [3, 5], "git": [3, 5], "repo": [3, 5], "gr": 3, "temp": 3, "tp": 3, "prev": 3, "pm": 3, "rebuilt": [3, 4, 5], "upload": [3, 4, 5], "otherwis": [3, 4, 5], "tell": [3, 5], "exist": [3, 4, 5], "ll": [3, 5], "level": [3, 4, 5], "debug": [3, 5], "remov": [3, 5], "befor": 3, "after": [3, 4, 5], "solr": [3, 5], "passim": [3, 5], "worker": [3, 4, 5], "local": [3, 4, 5], "includ": [3, 5], "temporari": 3, "clone": [3, 5], "releas": [3, 5], "previou": [3, 5], "cleanup": 3, "upload_success": 3, "filepath": [3, 5], "ha": [3, 5], "been": [3, 5], "successfulli": 3, "success": [3, 4, 5], "compress": 3, "json_fil": [3, 4], "output_dir": [3, 4], "merg": 3, "line": [3, 4, 5], "singl": 3, "signatur": 3, "write": [3, 4, 5], "sort": 3, "serial": [3, 5], "rytp": 3, "sort_kei": 3, "expect": [3, 5], "concaten": 3, "init_log": 3, "initialis": [3, 4], "logger": [3, 4], "desir": [3, 4, 5], "rootlogg": [3, 4], "duplic": 3, "init_logg": [3, 4], "could": [3, 4, 5], "work": 3, "properli": 3, "so": [3, 5], "keep": [3, 4], "rebuild_for_passim": 3, "content_item": 3, "ani": [3, 4, 
5], "metadata": [3, 4], "built": 3, "rebuild_for_solr": 3, "thought": 3, "especi": 3, "ingest": [3, 5], "index": [3, 5], "follow": [3, 4, 5], "schema": [3, 4, 5], "rebuild_issu": 3, "dask_client": 3, "_format": 3, "filter_languag": 3, "outp_dir": 3, "store": [3, 4, 5], "rebuild_text": 3, "append": [3, 5], "conform": 3, "being": 3, "fulltext": 3, "offset": [3, 4], "token": 3, "region": 3, "rebuild_text_passim": 3, "group": 3, "get_iiif_and_coord": 3, "ci": [3, 4, 5], "link": [3, 4], "variou": [3, 4, 5], "case": [3, 4, 5], "117": 3, "retriev": [3, 4, 5], "inform": [3, 4, 5], "part": [3, 5], "miss": [3, 5], "insert_whitespac": 3, "next_t": 3, "prev_t": 3, "lang": 3, "determin": [3, 5], "whitespac": 3, "insert": 3, "pages_to_articl": 3, "belong": 3, "read_issu": 3, "inject": 3, "s3_version": 3, "boto3": [3, 4], "resourc": [3, 4], "factori": [3, 4], "serviceresourc": [3, 4], "connect": [3, 4], "storag": 3, "represent": [3, 5], "read_issue_pag": 3, "issue_json": 3, "parallel": [3, 4], "read_pag": 3, "page_kei": 3, "reconstruct_iiif_link": 3, "construct": [3, 5], "api": 3, "endpoint": [3, 4], "process": [3, 4, 5], "some": [3, 5], "inconsist": [3, 5], "variat": 3, "more": [3, 4, 5], "detail": [3, 4], "area": 3, "rejoin_articl": 3, "text_apply_break": 3, "break": [3, 4], "appli": 3, "visual": 3, "charact": 3, "paragraph": 3, "etc": [3, 5], "chunk": [3, 4, 5], "cluster": 3, "bytes_to": 4, "bytes_nb": 4, "to_unit": 4, "bsize": 4, "1024": 4, "target": [4, 5], "kilobyt": 4, "m": [4, 5], "megabyt": 4, "gigabyt": 4, "t": [4, 5], "terabyt": 4, "p": [4, 5], "petabyt": 4, "exabyt": 4, "convers": 4, "keyerror": [4, 5], "chunksiz": 4, "yield": 4, "get_pkg_resourc": 4, "file_manag": 4, "exitstack": 4, "packag": [4, 5], "posixpath": 4, "manag": 4, "instanti": [4, 5], "prior": 4, "call": [4, 5], "close": 4, "onc": [4, 5], "longer": 4, "contextlib": 4, "pathlib": 4, "20": 4, "_description_": [4, 5], "parse_json": 4, "validate_against_schema": 4, "json_to_valid": 4, "path_to_schema": 4, 
"against": [4, 5], "drive": 4, "boto": 4, "kept": 4, "until": 4, "third": 4, "parti": 4, "lib": 4, "depend": 4, "solv": 4, "alternative_read_text": 4, "s3_kei": [4, 5], "s3_credenti": 4, "line_by_lin": 4, "reason": [4, 5], "bug": 4, "read_text": 4, "1000": 4, "filenotfounderror": 4, "fixed_s3fs_glob": [4, 5], "boto3_bucket": 4, "benoit": 4, "pyimag": 4, "custom": 4, "glob": 4, "s3f": 4, "unabl": 4, "than": 4, "switch": 4, "get_boto3_bucket": 4, "request": 4, "ask": 4, "doe": 4, "turn": 4, "newli": [4, 5], "testb": 4, "depreci": 4, "pleas": 4, "priorit": 4, "get_or_create_bucket": 4, "instead": [4, 5], "yet": 4, "get_bucket_boto3": 4, "host_url": 4, "zhdk": 4, "cloud": 4, "get_s3_connect": 4, "host": 4, "assum": 4, "two": [4, 5], "environ": 4, "variabl": 4, "se_access_kei": 4, "se_secret_kei": 4, "get_s3_resourc": 4, "get_s3_object_s": 4, "whose": 4, "you": 4, "want": 4, "doesn": 4, "relat": [4, 5], "get_s3_vers": 4, "modifi": [4, 5], "datetim": 4, "get_s3_versions_cli": 4, "get_storage_opt": 4, "read_jsonlin": 4, "point": [4, 5], "extract": [4, 5], "doc": 4, "per": [4, 5], "from_sequ": 4, "s3r": 4, "print": [4, 5], "count": [4, 5], "map": [4, 5], "pluck": 4, "ft": 4, "without": 4, "readtext_jsonlin": 4, "limit": 4, "textual": 4, "leav": 4, "out": 4, "serv": 4, "pure": 4, "ne": [4, 5], "reus": [4, 5], "topic": [4, 5], "s3_get_articl": 4, "ad": [4, 5], "advertis": 4, "iter_bucket": 4, "cpu": 4, "dictionari": [4, 5], "s3_get_pag": 4, "issue_id": 4, "page_nam": 4, "imp": 4, "1990": 4, "03": 4, "partition_nam": 4, "newspaper_prefix": 4, "upload_to_s3": 4, "local_path": 4, "path_within_bucket": 4, "help": 4, "prepar": 4, "view": 4, "orient": 4, "daskutil": 4, "cf": [4, 5], "argument": [4, 5], "create_even_partit": 4, "config_newspap": 4, "local_f": 4, "keep_ful": 4, "nb_partit": 4, "500": 4, "yearli": [4, 5], "even": [4, 5], "enabl": 4, "effici": 4, "bypass": 4, "shuffl": 4, "well": 4, "decid": 4, "what": [4, 5], "memori": 4, "span": 4, "classic": 4, "produc": 4, "arg": 
4, "partition": 4, "nbpart": 4, "export": [4, 5], "compute_image_link": 4, "pad": 4, "iiif_endpoint": 4, "iiif_link": 4, "summari": [4, 5], "descript": 4, "get_iiif_link": 4, "canonical_bucket": 4, "rebuilt2xmi": 4, "typesystem_path": 4, "iiif_map": 4, "pct_coordin": 4, "typesystem": 4, "defit": 4, "annot": 4, "layer": 4, "task": 4, "config_load": 4, "initi": [4, 5], "method": [4, 5], "check_bucket": 4, "attribut": [4, 5], "check_param": 4, "classmethod": [4, 5], "from_json": 4, "to_dict": 4, "partitionerconfig": 4, "As": [4, 5], "now": 4, "solr_serv": 4, "server": 4, "solr_cor": 4, "s3_host": 4, "s3_bucket_rebuilt": 4, "s3_bucket_partit": 4, "s3_bucket_process": 4, "key_batch": 4, "batch": 4, "number_partit": 4, "1991": 4, "1998": 4, "sever": 5, "allow": 5, "stage": 5, "pipelin": 5, "goal": 5, "approach": 5, "track": 5, "everi": 5, "ensur": 5, "consisteni": 5, "eas": 5, "consist": 5, "across": 5, "justifi": 5, "through": 5, "identif": 5, "leakag": 5, "partial": 5, "updat": 5, "subset": 5, "know": 5, "step": 5, "necessari": 5, "when": 5, "media": 5, "collect": 5, "arriv": 5, "patch": 5, "transpar": 5, "citat": 5, "dataset": 5, "straightforward": 5, "user": 5, "interfac": 5, "exactli": 5, "thei": 5, "abl": 5, "consult": 5, "precis": 5, "them": 5, "definit": 5, "datastatst": 5, "dure": 5, "preprocess": 5, "augment": 5, "project": 5, "progress": 5, "data_statist": 5, "datastatist": 5, "data_stag": 5, "datastag": 5, "granular": 5, "abc": 5, "specif": 5, "portion": 5, "stat": 5, "respect": 5, "count_kei": 5, "frequenc": 5, "add_count": 5, "new_count": 5, "replac": 5, "add": 5, "init_count": 5, "defin": 5, "pretty_print": 5, "modif_d": 5, "include_count": 5, "These": 5, "agnost": 5, "self": 5, "child": 5, "modif": 5, "union": 5, "about": 5, "abstract": 5, "same_count": 5, "other_stat": 5, "possible_count_kei": 5, "titl": 5, "content_items_out": 5, "ft_token": 5, "content_items_in": 5, "ne_ment": 5, "ne_ent": 5, "embeddings_el": 5, "lang_fd": 5, "text_reuse_clust": 5, 
"text_reuse_passag": 5, "nps_stat": 5, "pretti": 5, "data_manifest": 5, "datamanifest": 5, "s3_output_bucket": 5, "git_repo": 5, "temp_dir": 5, "s3_input_bucket": 5, "new_vers": 5, "is_patch": 5, "patched_field": 5, "previous_mft_path": 5, "only_count": 5, "note": 5, "push_to_git": 5, "add_by_ci_id": 5, "ci_id": 5, "add_by_title_year": 5, "add_count_list_by_title_year": 5, "all_count": 5, "lsit": 5, "aggregate_stats_for_titl": 5, "media_dict": 5, "aggreg": 5, "radio": 5, "radiostatist": 5, "don": 5, "displai": 5, "show": 5, "final": 5, "append_to_not": 5, "to_start": 5, "export_to_git_and_s3": 5, "commit_msg": 5, "perform": 5, "logic": 5, "lazi": 5, "behavior": 5, "readi": 5, "access": 5, "particular": 5, "_processing_stat": 5, "crystal": 5, "_generation_d": 5, "corpu": 5, "manifest_data": 5, "dump": 5, "validate_and_export_manifest": 5, "commit": 5, "define_update_info_for_titl": 5, "processed_year": 5, "prev_version_year": 5, "four": 5, "place": 5, "eg": 5, "4": 5, "alreadi": 5, "generate_media_dict": 5, "old_media_list": 5, "conclud": 5, "increas": 5, "flag": 5, "conduct": 5, "major": 5, "verison": 5, "get_count_kei": 5, "integr": 5, "init": 5, "has_title_year_kei": 5, "verifi": 5, "init_yearly_count_dict": 5, "new_media": 5, "By": 5, "update_typ": 5, "update_level": 5, "updated_year": 5, "updated_field": 5, "properti": 5, "output_mft_s3_path": 5, "versison": 5, "cannot": 5, "overall_stat": 5, "title_stat": 5, "overal": 5, "replace_by_ci_id": 5, "oper": 5, "overwrit": 5, "isn": 5, "better": 5, "suit": 5, "replace_by_title_year": 5, "title_level_stat": 5, "media_list": 5, "stats_as_dict": 5, "update_media_stat": 5, "yearly_stat": 5, "actual": 5, "chang": 5, "statisit": 5, "potenti": 5, "match": 5, "output_bucket_nam": 5, "overriden": 5, "problem": 5, "occur": 5, "push": 5, "critic": 5, "won": 5, "alter": 5, "overrid": 5, "strenum": 5, "requir": 5, "accordingli": 5, "exact": 5, "embed": 5, "entiti": 5, "langid": 5, "linguistic_process": 5, "lingproc": 5, 
"mysql_ci": 5, "mysql": 5, "ocrqa": 5, "solr_emb": 5, "emb": 5, "solr_ent": 5, "solr_text": 5, "text_reus": 5, "has_valu": 5, "cl": 5, "agg": 5, "clone_git_repo": 5, "repo_nam": 5, "branch": 5, "ideal": 5, "absolut": 5, "appear": 5, "fail": 5, "ssh": 5, "compute_stats_in_canonical_bag": 5, "s3_canonical_issu": 5, "compute_stats_in_entities_bag": 5, "s3_entiti": 5, "compute_stats_in_langident_bag": 5, "s3_langid": 5, "compute_stats_in_rebuilt_bag": 5, "rebuilt_articl": 5, "include_np": 5, "compute_stats_in_solr_text_bag": 5, "s3_solr_text": 5, "counts_for_canonical_issu": 5, "include_np_yr": 5, "later": 5, "counts_for_rebuilt": 5, "rebuilt_ci": 5, "extract_vers": 5, "name_or_path": 5, "as_int": 5, "_vm": 5, "filter_new_or_modified_media": 5, "rebuilt_mft_path": 5, "previous_mft_path_str": 5, "compar": 5, "typic": 5, "atom": 5, "ident": 5, "new_or_modifi": 5, "get_new_or_modified_media": 5, "new_manifest": 5, "previous_manifest": 5, "media_titl": 5, "new_media_item_1": 5, "last_modif_d": 5, "2024": 5, "04": 5, "04t12": 5, "00": 5, "00z": 5, "modified_media_item_2": 5, "03t12": 5, "find_s3_data_manifest_path": 5, "find": 5, "latest": 5, "On": 5, "wai": 5, "enrich": 5, "own": 5, "insid": 5, "get_head_commit_url": 5, "three": 5, "alreadai": 5, "outsid": 5, "previous": 5, "activ": 5, "get_media_item_year": 5, "mnf_json": 5, "mb": 5, "media_items_year": 5, "get_media_titl": 5, "input_data": 5, "ex": 5, "typeerror": 5, "git_commit_push": 5, "full_git_filepath": 5, "make": 5, "non": 5, "increment_vers": 5, "prev_vers": 5, "increment": 5, "accod": 5, "minor": 5, "reset": 5, "vesion": 5, "init_media_info": 5, "full_titl": 5, "relev": 5, "comparison": 5, "is_git_repo": 5, "manifest_summari": 5, "extended_summari": 5, "extend": 5, "manifest_json": 5, "8": 5, "5": 5, "media_list_from_mft_json": 5, "json_mft": 5, "still": 5, "along": 5, "read_manifest_from_s3": 5, "read_manifest_from_s3_path": 5, "manifest_s3_path": 5, "arbitrari": 5, "remove_media_in_manifest": 5, "white_list": 
5, "whitelist": 5, "whatev": 5, "retain": 5, "validate_granular": 5, "valueerror": 5, "validate_stag": 5, "return_value_str": 5, "neither": 5, "nor": 5, "validate_vers": 5, "regex": 5, "9": 5, "vm": 5, "integ": 5, "version_as_list": 5, "len": 5, "respec": 5, "write_and_push_to_git": 5, "file_cont": 5, "path_in_repo": 5, "write_dump_to_f": 5, "abs_path": 5, "filesystem": 5, "written": 5, "writen": 5, "ioerror": 5, "command": 5, "compute_manifest": 5, "lf": 5, "compute_stats_for_stag": 5, "files_bag": 5, "create_manifest": 5, "togeth": 5, "iption": 5, "np": 5, "further": 5, "markdown": 5, "manifest_config": 5, "guidelin": 5, "extract_np_kei": 5, "31": 5, "indeplux": 5, "1889": 5, "get_files_to_consid": 5, "file_extens": 5, "validate_config": 5, "mssing": 5}, "objects": {"impresso_commons.images": [[0, 0, 0, "-", "img_utils"], [0, 0, 0, "-", "olive_boxes"]], "impresso_commons.images.img_utils": [[0, 1, 1, "", "BoxStrategy"], [0, 3, 1, "", "compose"], [0, 3, 1, "", "get_img_from_archive"], [0, 3, 1, "", "get_imgdimensions"], [0, 3, 1, "", "get_jpg"], [0, 3, 1, "", "get_page_folders"], [0, 3, 1, "", "get_png"], [0, 3, 1, "", "get_tif"], [0, 3, 1, "", "run_cmd"]], "impresso_commons.images.img_utils.BoxStrategy": [[0, 2, 1, "", "jpg_highest"], [0, 2, 1, "", "jpg_uniq"], [0, 2, 1, "", "png_highest"], [0, 2, 1, "", "png_uniq"], [0, 2, 1, "", "tif"]], "impresso_commons.images.olive_boxes": [[0, 3, 1, "", "compute_box"], [0, 3, 1, "", "compute_scale_factor"], [0, 3, 1, "", "convert_box"], [0, 3, 1, "", "get_iiif_url"], [0, 3, 1, "", "get_scale_factor"], [0, 3, 1, "", "test"]], "impresso_commons": [[2, 0, 0, "-", "path"]], "impresso_commons.path": [[2, 3, 1, "", "id2IssueDir"], [2, 3, 1, "", "parse_canonical_filename"], [2, 0, 0, "-", "path_fs"], [2, 0, 0, "-", "path_s3"]], "impresso_commons.path.path_fs": [[2, 2, 1, "", "ContentItem"], [2, 1, 1, "", "IssueDir"], [2, 3, 1, "", "canonical_path"], [2, 3, 1, "", "check_filenaming"], [2, 3, 1, "", "detect_canonical_issues"], [2, 
3, 1, "", "detect_issues"], [2, 3, 1, "", "detect_journal_issues"], [2, 3, 1, "", "get_issueshortpath"], [2, 3, 1, "", "pair_issue"], [2, 3, 1, "", "select_issues"]], "impresso_commons.path.path_fs.IssueDir": [[2, 2, 1, "", "date"], [2, 2, 1, "", "edition"], [2, 2, 1, "", "journal"], [2, 2, 1, "", "path"]], "impresso_commons.path.path_s3": [[2, 2, 1, "", "IssueDir"], [2, 3, 1, "", "fetch_files"], [2, 3, 1, "", "impresso_iter_bucket"], [2, 3, 1, "", "list_files"], [2, 3, 1, "", "list_newspapers"], [2, 3, 1, "", "read_s3_issues"], [2, 1, 1, "", "s3ContentItem"], [2, 3, 1, "", "s3_filter_archives"], [2, 3, 1, "", "s3_iter_bucket"]], "impresso_commons.text": [[3, 0, 0, "-", "helpers"], [3, 0, 0, "-", "rebuilder"]], "impresso_commons.text.helpers": [[3, 3, 1, "", "get_iiif_and_coords"], [3, 3, 1, "", "insert_whitespace"], [3, 3, 1, "", "pages_to_article"], [3, 3, 1, "", "read_issue"], [3, 3, 1, "", "read_issue_pages"], [3, 3, 1, "", "read_page"], [3, 3, 1, "", "reconstruct_iiif_link"], [3, 3, 1, "", "rejoin_articles"], [3, 3, 1, "", "text_apply_breaks"]], "impresso_commons.text.rebuilder": [[3, 3, 1, "", "cleanup"], [3, 3, 1, "", "compress"], [3, 3, 1, "", "init_logging"], [3, 3, 1, "", "main"], [3, 3, 1, "", "rebuild_for_passim"], [3, 3, 1, "", "rebuild_for_solr"], [3, 3, 1, "", "rebuild_issues"], [3, 3, 1, "", "rebuild_text"], [3, 3, 1, "", "rebuild_text_passim"], [3, 3, 1, "", "upload"]], "impresso_commons.utils": [[4, 0, 0, "-", "config_loader"], [4, 0, 0, "-", "daskutils"], [4, 0, 0, "-", "s3"], [4, 0, 0, "-", "uima"], [4, 0, 0, "-", "utils"]], "impresso_commons.utils.config_loader": [[4, 1, 1, "", "Base"], [4, 1, 1, "", "PartitionerConfig"], [4, 3, 1, "", "main"]], "impresso_commons.utils.config_loader.Base": [[4, 4, 1, "", "check_bucket"], [4, 4, 1, "", "check_params"], [4, 4, 1, "", "from_json"], [4, 4, 1, "", "to_dict"]], "impresso_commons.utils.daskutils": [[4, 3, 1, "", "create_even_partitions"], [4, 3, 1, "", "main"], [4, 3, 1, "", "partitioner"]], 
"impresso_commons.utils.s3": [[4, 3, 1, "", "alternative_read_text"], [4, 3, 1, "", "fixed_s3fs_glob"], [4, 3, 1, "", "get_boto3_bucket"], [4, 3, 1, "", "get_bucket"], [4, 3, 1, "", "get_bucket_boto3"], [4, 3, 1, "", "get_or_create_bucket"], [4, 3, 1, "", "get_s3_client"], [4, 3, 1, "", "get_s3_connection"], [4, 3, 1, "", "get_s3_object_size"], [4, 3, 1, "", "get_s3_resource"], [4, 3, 1, "", "get_s3_versions"], [4, 3, 1, "", "get_s3_versions_client"], [4, 3, 1, "", "get_storage_options"], [4, 3, 1, "", "read_jsonlines"], [4, 3, 1, "", "readtext_jsonlines"], [4, 3, 1, "", "s3_get_articles"], [4, 3, 1, "", "s3_get_pages"], [4, 3, 1, "", "upload"], [4, 3, 1, "", "upload_to_s3"]], "impresso_commons.utils.uima": [[4, 3, 1, "", "compute_image_links"], [4, 3, 1, "", "get_iiif_links"], [4, 3, 1, "", "rebuilt2xmi"]], "impresso_commons.utils.utils": [[4, 3, 1, "", "bytes_to"], [4, 3, 1, "", "chunk"], [4, 3, 1, "", "get_pkg_resource"], [4, 3, 1, "", "init_logger"], [4, 3, 1, "", "parse_json"], [4, 3, 1, "", "validate_against_schema"]], "impresso_commons.versioning": [[5, 0, 0, "-", "compute_manifest"], [5, 0, 0, "-", "data_manifest"], [5, 0, 0, "-", "data_statistics"], [5, 0, 0, "-", "helpers"]], "impresso_commons.versioning.compute_manifest": [[5, 3, 1, "", "compute_stats_for_stage"], [5, 3, 1, "", "create_manifest"], [5, 3, 1, "", "extract_np_key"], [5, 3, 1, "", "get_files_to_consider"], [5, 3, 1, "", "main"], [5, 3, 1, "", "validate_config"]], "impresso_commons.versioning.data_manifest": [[5, 1, 1, "", "DataManifest"]], "impresso_commons.versioning.data_manifest.DataManifest": [[5, 4, 1, "", "add_by_ci_id"], [5, 4, 1, "", "add_by_title_year"], [5, 4, 1, "", "add_count_list_by_title_year"], [5, 4, 1, "", "aggregate_stats_for_title"], [5, 4, 1, "", "append_to_notes"], [5, 4, 1, "", "compute"], [5, 4, 1, "", "define_update_info_for_title"], [5, 4, 1, "", "generate_media_dict"], [5, 4, 1, "", "get_count_keys"], [5, 4, 1, "", "has_title_year_key"], [5, 4, 1, "", 
"init_yearly_count_dict"], [5, 4, 1, "", "new_media"], [5, 5, 1, "", "output_mft_s3_path"], [5, 4, 1, "", "overall_stats"], [5, 4, 1, "", "replace_by_ci_id"], [5, 4, 1, "", "replace_by_title_year"], [5, 4, 1, "", "title_level_stats"], [5, 4, 1, "", "update_media_stats"], [5, 4, 1, "", "validate_and_export_manifest"]], "impresso_commons.versioning.data_statistics": [[5, 1, 1, "", "DataStatistics"], [5, 1, 1, "", "NewspaperStatistics"]], "impresso_commons.versioning.data_statistics.DataStatistics": [[5, 4, 1, "", "add_counts"], [5, 2, 1, "", "count_keys"], [5, 2, 1, "", "counts"], [5, 2, 1, "", "element"], [5, 2, 1, "", "granularity"], [5, 4, 1, "", "init_counts"], [5, 4, 1, "", "pretty_print"], [5, 4, 1, "", "same_counts"], [5, 2, 1, "", "stage"]], "impresso_commons.versioning.data_statistics.NewspaperStatistics": [[5, 2, 1, "", "count_keys"], [5, 2, 1, "", "counts"], [5, 2, 1, "", "element"], [5, 2, 1, "", "granularity"], [5, 2, 1, "", "possible_count_keys"], [5, 4, 1, "", "pretty_print"], [5, 4, 1, "", "same_counts"], [5, 2, 1, "", "stage"]], "impresso_commons.versioning.helpers": [[5, 1, 1, "", "DataStage"], [5, 3, 1, "", "agg"], [5, 3, 1, "", "chunk"], [5, 3, 1, "", "clone_git_repo"], [5, 3, 1, "", "compute_stats_in_canonical_bag"], [5, 3, 1, "", "compute_stats_in_entities_bag"], [5, 3, 1, "", "compute_stats_in_langident_bag"], [5, 3, 1, "", "compute_stats_in_rebuilt_bag"], [5, 3, 1, "", "compute_stats_in_solr_text_bag"], [5, 3, 1, "", "counts_for_canonical_issue"], [5, 3, 1, "", "counts_for_rebuilt"], [5, 3, 1, "", "extract_version"], [5, 3, 1, "", "filter_new_or_modified_media"], [5, 3, 1, "", "finalize"], [5, 3, 1, "", "find_s3_data_manifest_path"], [5, 3, 1, "", "get_head_commit_url"], [5, 3, 1, "", "get_media_item_years"], [5, 3, 1, "", "get_media_titles"], [5, 3, 1, "", "git_commit_push"], [5, 3, 1, "", "increment_version"], [5, 3, 1, "", "init_media_info"], [5, 3, 1, "", "is_git_repo"], [5, 3, 1, "", "manifest_summary"], [5, 3, 1, "", 
"media_list_from_mft_json"], [5, 3, 1, "", "read_manifest_from_s3"], [5, 3, 1, "", "read_manifest_from_s3_path"], [5, 3, 1, "", "remove_media_in_manifest"], [5, 3, 1, "", "validate_granularity"], [5, 3, 1, "", "validate_stage"], [5, 3, 1, "", "validate_version"], [5, 3, 1, "", "version_as_list"], [5, 3, 1, "", "write_and_push_to_git"], [5, 3, 1, "", "write_dump_to_fs"]], "impresso_commons.versioning.helpers.DataStage": [[5, 2, 1, "", "CANONICAL"], [5, 2, 1, "", "EMBEDDINGS"], [5, 2, 1, "", "ENTITIES"], [5, 2, 1, "", "EVENIZED"], [5, 2, 1, "", "LANGIDENT"], [5, 2, 1, "", "LINGUISTIC_PROCESSING"], [5, 2, 1, "", "MYSQL_CIS"], [5, 2, 1, "", "OCRQA"], [5, 2, 1, "", "PASSIM"], [5, 2, 1, "", "REBUILT"], [5, 2, 1, "", "SOLR_EMBS"], [5, 2, 1, "", "SOLR_ENTITIES"], [5, 2, 1, "", "SOLR_TEXT"], [5, 2, 1, "", "TEXT_REUSE"], [5, 2, 1, "", "TOPICS"], [5, 4, 1, "", "has_value"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:function", "4": "py:method", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"], "4": ["py", "method", "Python method"], "5": ["py", "property", "Python property"]}, "titleterms": {"imag": 0, "handl": 0, "util": [0, 4], "oliv": 0, "box": 0, "background": 0, "inform": 0, "case": 0, "1": 0, "tif": 0, "2": 0, "sever": 0, "png": 0, "3": 0, "one": 0, "onli": 0, "4": 0, "jpg": 0, "welcom": 1, "impresso": 1, "pycommon": 1, "": 1, "document": 1, "content": 1, "input": 2, "output": 2, "gener": 2, "i": 2, "o": 2, "from": 2, "file": [2, 3, 4], "system": 2, "s3": [2, 4], "text": 3, "rebuild": 3, "function": [3, 4], "helper": [3, 5], "config": [3, 4], "exampl": 3, "basic": 4, "dask": 4, "apach": 4, "uima": 4, "xmi": 4, "loader": 4, "data": 5, "version": 5, "statist": 5, "newspaperstatist": 5, "manifest": 5, "comput": 5, "script": 5}, "envversion": {"sphinx.domains.c": 3, 
"sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx": 60}, "alltitles": {"Image handling": [[0, "image-handling"]], "Image Utils": [[0, "module-impresso_commons.images.img_utils"]], "Olive Boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "Background information": [[0, "background-information"]], "Case 1: tif": [[0, "case-1-tif"]], "Case 2: several png": [[0, "case-2-several-png"]], "Case 3: one png only": [[0, "case-3-one-png-only"]], "Case 4: one jpg only": [[0, "case-4-one-jpg-only"]], "Welcome to Impresso PyCommons\u2019s documentation!": [[1, "welcome-to-impresso-pycommons-s-documentation"]], "Contents:": [[1, null]], "Input/Output": [[2, "input-output"]], "General": [[2, "module-impresso_commons.path"]], "I/O from file system": [[2, "module-impresso_commons.path.path_fs"]], "I/O from S3": [[2, "module-impresso_commons.path.path_s3"]], "Text Rebuild": [[3, "text-rebuild"]], "Rebuild functions": [[3, "rebuild-functions"]], "Helpers": [[3, "module-impresso_commons.text.helpers"]], "Config file example": [[3, "config-file-example"]], "Utilities": [[4, "utilities"]], "Basic Utils Functions": [[4, "module-impresso_commons.utils.utils"]], "S3 Utils Functions": [[4, "module-impresso_commons.utils.s3"]], "Dask Utils Functions": [[4, "module-impresso_commons.utils.daskutils"]], "Apache UIMA XMI Utils Functions": [[4, "module-impresso_commons.utils.uima"]], "Config File Loader": [[4, "module-impresso_commons.utils.config_loader"]], "Data Versioning": [[5, "data-versioning"]], "Data Statistics and NewspaperStatistics": [[5, "module-impresso_commons.versioning.data_statistics"]], "Data Manifest": [[5, "module-impresso_commons.versioning.data_manifest"]], "Versioning Helpers": [[5, "module-impresso_commons.versioning.helpers"]], "Manifest 
Computing Script": [[5, "module-impresso_commons.versioning.compute_manifest"]]}, "indexentries": {"boxstrategy (class in impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.BoxStrategy"]], "compose() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.compose"]], "compute_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_box"]], "compute_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_scale_factor"]], "convert_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.convert_box"]], "get_iiif_url() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_iiif_url"]], "get_img_from_archive() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_img_from_archive"]], "get_imgdimensions() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_imgdimensions"]], "get_jpg() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_jpg"]], "get_page_folders() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_page_folders"]], "get_png() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_png"]], "get_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_scale_factor"]], "get_tif() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_tif"]], "impresso_commons.images.img_utils": [[0, "module-impresso_commons.images.img_utils"]], "impresso_commons.images.olive_boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "jpg_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, 
"impresso_commons.images.img_utils.BoxStrategy.jpg_highest"]], "jpg_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_uniq"]], "module": [[0, "module-impresso_commons.images.img_utils"], [0, "module-impresso_commons.images.olive_boxes"], [2, "module-impresso_commons.path"], [2, "module-impresso_commons.path.path_fs"], [2, "module-impresso_commons.path.path_s3"], [3, "module-impresso_commons.text.helpers"], [3, "module-impresso_commons.text.rebuilder"], [4, "module-impresso_commons.utils.config_loader"], [4, "module-impresso_commons.utils.daskutils"], [4, "module-impresso_commons.utils.s3"], [4, "module-impresso_commons.utils.uima"], [4, "module-impresso_commons.utils.utils"], [5, "module-impresso_commons.versioning.compute_manifest"], [5, "module-impresso_commons.versioning.data_manifest"], [5, "module-impresso_commons.versioning.data_statistics"], [5, "module-impresso_commons.versioning.helpers"]], "png_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_highest"]], "png_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_uniq"]], "run_cmd() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.run_cmd"]], "test() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.test"]], "tif (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.tif"]], "contentitem (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.ContentItem"]], "issuedir (class in impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.IssueDir"]], "issuedir (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.IssueDir"]], "canonical_path() (in module impresso_commons.path.path_fs)": [[2, 
"impresso_commons.path.path_fs.canonical_path"]], "check_filenaming() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.check_filenaming"]], "date (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.date"]], "detect_canonical_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_canonical_issues"]], "detect_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_issues"]], "detect_journal_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_journal_issues"]], "edition (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.edition"]], "fetch_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.fetch_files"]], "get_issueshortpath() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.get_issueshortpath"]], "id2issuedir() (in module impresso_commons.path)": [[2, "impresso_commons.path.id2IssueDir"]], "impresso_commons.path": [[2, "module-impresso_commons.path"]], "impresso_commons.path.path_fs": [[2, "module-impresso_commons.path.path_fs"]], "impresso_commons.path.path_s3": [[2, "module-impresso_commons.path.path_s3"]], "impresso_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.impresso_iter_bucket"]], "journal (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.journal"]], "list_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_files"]], "list_newspapers() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_newspapers"]], "pair_issue() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.pair_issue"]], "parse_canonical_filename() (in module impresso_commons.path)": [[2, 
"impresso_commons.path.parse_canonical_filename"]], "path (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.path"]], "read_s3_issues() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.read_s3_issues"]], "s3contentitem (class in impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3ContentItem"]], "s3_filter_archives() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_filter_archives"]], "s3_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_iter_bucket"]], "select_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.select_issues"]], "cleanup() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.cleanup"]], "compress() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.compress"]], "get_iiif_and_coords() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.get_iiif_and_coords"]], "impresso_commons.text.helpers": [[3, "module-impresso_commons.text.helpers"]], "impresso_commons.text.rebuilder": [[3, "module-impresso_commons.text.rebuilder"]], "init_logging() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.init_logging"]], "insert_whitespace() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.insert_whitespace"]], "main() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.main"]], "pages_to_article() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.pages_to_article"]], "read_issue() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue"]], "read_issue_pages() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue_pages"]], "read_page() (in module 
impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_page"]], "rebuild_for_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_passim"]], "rebuild_for_solr() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_solr"]], "rebuild_issues() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_issues"]], "rebuild_text() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text"]], "rebuild_text_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text_passim"]], "reconstruct_iiif_link() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.reconstruct_iiif_link"]], "rejoin_articles() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.rejoin_articles"]], "text_apply_breaks() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.text_apply_breaks"]], "upload() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.upload"]], "base (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.Base"]], "partitionerconfig (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.PartitionerConfig"]], "alternative_read_text() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.alternative_read_text"]], "bytes_to() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.bytes_to"]], "check_bucket() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_bucket"]], "check_params() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_params"]], "chunk() (in module impresso_commons.utils.utils)": [[4, 
"impresso_commons.utils.utils.chunk"]], "compute_image_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.compute_image_links"]], "create_even_partitions() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.create_even_partitions"]], "fixed_s3fs_glob() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.fixed_s3fs_glob"]], "from_json() (impresso_commons.utils.config_loader.base class method)": [[4, "impresso_commons.utils.config_loader.Base.from_json"]], "get_boto3_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_boto3_bucket"]], "get_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket"]], "get_bucket_boto3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket_boto3"]], "get_iiif_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.get_iiif_links"]], "get_or_create_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_or_create_bucket"]], "get_pkg_resource() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.get_pkg_resource"]], "get_s3_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_client"]], "get_s3_connection() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_connection"]], "get_s3_object_size() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_object_size"]], "get_s3_resource() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_resource"]], "get_s3_versions() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions"]], "get_s3_versions_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions_client"]], "get_storage_options() (in module impresso_commons.utils.s3)": [[4, 
"impresso_commons.utils.s3.get_storage_options"]], "impresso_commons.utils.config_loader": [[4, "module-impresso_commons.utils.config_loader"]], "impresso_commons.utils.daskutils": [[4, "module-impresso_commons.utils.daskutils"]], "impresso_commons.utils.s3": [[4, "module-impresso_commons.utils.s3"]], "impresso_commons.utils.uima": [[4, "module-impresso_commons.utils.uima"]], "impresso_commons.utils.utils": [[4, "module-impresso_commons.utils.utils"]], "init_logger() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.init_logger"]], "main() (in module impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.main"]], "main() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.main"]], "parse_json() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.parse_json"]], "partitioner() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.partitioner"]], "read_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.read_jsonlines"]], "readtext_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.readtext_jsonlines"]], "rebuilt2xmi() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.rebuilt2xmi"]], "s3_get_articles() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_articles"]], "s3_get_pages() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_pages"]], "to_dict() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.to_dict"]], "upload() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload"]], "upload_to_s3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload_to_s3"]], "validate_against_schema() (in module impresso_commons.utils.utils)": [[4, 
"impresso_commons.utils.utils.validate_against_schema"]], "canonical (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.CANONICAL"]], "datamanifest (class in impresso_commons.versioning.data_manifest)": [[5, "impresso_commons.versioning.data_manifest.DataManifest"]], "datastage (class in impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.DataStage"]], "datastatistics (class in impresso_commons.versioning.data_statistics)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics"]], "embeddings (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.EMBEDDINGS"]], "entities (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.ENTITIES"]], "evenized (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.EVENIZED"]], "langident (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.LANGIDENT"]], "linguistic_processing (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING"]], "mysql_cis (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.MYSQL_CIS"]], "newspaperstatistics (class in impresso_commons.versioning.data_statistics)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics"]], "ocrqa (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.OCRQA"]], "passim (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.PASSIM"]], "rebuilt (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.REBUILT"]], "solr_embs 
(impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_EMBS"]], "solr_entities (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES"]], "solr_text (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_TEXT"]], "text_reuse (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.TEXT_REUSE"]], "topics (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.TOPICS"]], "add_by_ci_id() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id"]], "add_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year"]], "add_count_list_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year"]], "add_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.add_counts"]], "agg() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.agg"]], "aggregate_stats_for_title() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title"]], "append_to_notes() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.append_to_notes"]], "chunk() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.chunk"]], "clone_git_repo() (in module 
impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.clone_git_repo"]], "compute() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.compute"]], "compute_stats_for_stage() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.compute_stats_for_stage"]], "compute_stats_in_canonical_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_canonical_bag"]], "compute_stats_in_entities_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_entities_bag"]], "compute_stats_in_langident_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_langident_bag"]], "compute_stats_in_rebuilt_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag"]], "compute_stats_in_solr_text_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag"]], "count_keys (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.count_keys"]], "count_keys (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys"]], "counts (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.counts"]], "counts (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.counts"]], "counts_for_canonical_issue() (in module impresso_commons.versioning.helpers)": [[5, 
"impresso_commons.versioning.helpers.counts_for_canonical_issue"]], "counts_for_rebuilt() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.counts_for_rebuilt"]], "create_manifest() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.create_manifest"]], "define_update_info_for_title() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title"]], "element (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.element"]], "element (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.element"]], "extract_np_key() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.extract_np_key"]], "extract_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.extract_version"]], "filter_new_or_modified_media() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.filter_new_or_modified_media"]], "finalize() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.finalize"]], "find_s3_data_manifest_path() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.find_s3_data_manifest_path"]], "generate_media_dict() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict"]], "get_count_keys() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.get_count_keys"]], "get_files_to_consider() (in module 
impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.get_files_to_consider"]], "get_head_commit_url() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_head_commit_url"]], "get_media_item_years() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_media_item_years"]], "get_media_titles() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_media_titles"]], "git_commit_push() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.git_commit_push"]], "granularity (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.granularity"]], "granularity (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity"]], "has_title_year_key() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key"]], "has_value() (impresso_commons.versioning.helpers.datastage class method)": [[5, "impresso_commons.versioning.helpers.DataStage.has_value"]], "impresso_commons.versioning.compute_manifest": [[5, "module-impresso_commons.versioning.compute_manifest"]], "impresso_commons.versioning.data_manifest": [[5, "module-impresso_commons.versioning.data_manifest"]], "impresso_commons.versioning.data_statistics": [[5, "module-impresso_commons.versioning.data_statistics"]], "impresso_commons.versioning.helpers": [[5, "module-impresso_commons.versioning.helpers"]], "increment_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.increment_version"]], "init_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, 
"impresso_commons.versioning.data_statistics.DataStatistics.init_counts"]], "init_media_info() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.init_media_info"]], "init_yearly_count_dict() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict"]], "is_git_repo() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.is_git_repo"]], "main() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.main"]], "manifest_summary() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.manifest_summary"]], "media_list_from_mft_json() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.media_list_from_mft_json"]], "new_media() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.new_media"]], "output_mft_s3_path (impresso_commons.versioning.data_manifest.datamanifest property)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path"]], "overall_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.overall_stats"]], "possible_count_keys (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys"]], "pretty_print() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.pretty_print"]], "pretty_print() (impresso_commons.versioning.data_statistics.newspaperstatistics method)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print"]], "read_manifest_from_s3() (in module 
impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.read_manifest_from_s3"]], "read_manifest_from_s3_path() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.read_manifest_from_s3_path"]], "remove_media_in_manifest() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.remove_media_in_manifest"]], "replace_by_ci_id() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id"]], "replace_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year"]], "same_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.same_counts"]], "same_counts() (impresso_commons.versioning.data_statistics.newspaperstatistics method)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts"]], "stage (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.stage"]], "stage (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.stage"]], "title_level_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.title_level_stats"]], "update_media_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.update_media_stats"]], "validate_and_export_manifest() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest"]], "validate_config() (in module 
impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.validate_config"]], "validate_granularity() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_granularity"]], "validate_stage() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_stage"]], "validate_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_version"]], "version_as_list() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.version_as_list"]], "write_and_push_to_git() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.write_and_push_to_git"]], "write_dump_to_fs() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.write_dump_to_fs"]]}})
\ No newline at end of file
diff --git a/docs/_build/html/utils.html b/docs/_build/html/utils.html
index 078dcca..174128f 100644
--- a/docs/_build/html/utils.html
+++ b/docs/_build/html/utils.html
@@ -48,9 +48,12 @@
 <li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Utilities</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.utils.utils">Basic Utils Functions</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.bytes_to"><code class="docutils literal notranslate"><span class="pre">bytes_to()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.chunk"><code class="docutils literal notranslate"><span class="pre">chunk()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.get_pkg_resource"><code class="docutils literal notranslate"><span class="pre">get_pkg_resource()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.init_logger"><code class="docutils literal notranslate"><span class="pre">init_logger()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.parse_json"><code class="docutils literal notranslate"><span class="pre">parse_json()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.validate_against_schema"><code class="docutils literal notranslate"><span class="pre">validate_against_schema()</span></code></a></li>
 </ul>
 </li>
 <li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.utils.s3">S3 Utils Functions</a><ul>
@@ -62,6 +65,7 @@
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_or_create_bucket"><code class="docutils literal notranslate"><span class="pre">get_or_create_bucket()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_client"><code class="docutils literal notranslate"><span class="pre">get_s3_client()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_connection"><code class="docutils literal notranslate"><span class="pre">get_s3_connection()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_object_size"><code class="docutils literal notranslate"><span class="pre">get_s3_object_size()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_resource"><code class="docutils literal notranslate"><span class="pre">get_s3_resource()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_versions"><code class="docutils literal notranslate"><span class="pre">get_s3_versions()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_versions_client"><code class="docutils literal notranslate"><span class="pre">get_s3_versions_client()</span></code></a></li>
@@ -101,6 +105,7 @@
 </ul>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="versioning.html">Data Versioning</a></li>
 </ul>
 
         </div>
@@ -131,6 +136,34 @@
 <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading"></a></h1>
 <section id="module-impresso_commons.utils.utils">
 <span id="basic-utils-functions"></span><h2>Basic Utils Functions<a class="headerlink" href="#module-impresso_commons.utils.utils" title="Link to this heading"></a></h2>
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.utils.bytes_to">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">bytes_to</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bytes_nb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">to_unit</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bsize</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1024</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">float</span></span></span><a class="headerlink" href="#impresso_commons.utils.utils.bytes_to" title="Link to this definition"></a></dt>
+<dd><p>Convert bytes to the specified unit.</p>
+<p>Supported target units:
+‘k’ (kilobytes), ‘m’ (megabytes),
+‘g’ (gigabytes), ‘t’ (terabytes),
+‘p’ (petabytes), ‘e’ (exabytes).</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>bytes_nb</strong> (<em>int</em>) – The number of bytes to be converted.</p></li>
+<li><p><strong>to_unit</strong> (<em>str</em>) – The target unit for conversion.</p></li>
+<li><p><strong>bsize</strong> (<em>int</em><em>, </em><em>optional</em>) – The base size used for conversion (default is 1024).</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The converted value in the specified unit.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>float</p>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>KeyError</strong> – If the specified target unit is not supported.</p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.utils.chunk">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">chunk</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">chunksize</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.utils.chunk" title="Link to this definition"></a></dt>
@@ -164,11 +197,49 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 </dl>
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.utils.init_logger">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">init_logger</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">level</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">RootLogger</span></span></span><a class="headerlink" href="#impresso_commons.utils.utils.init_logger" title="Link to this definition"></a></dt>
+<dd><p>Initialises the root logger.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>level</strong> (<em>int</em><em>, </em><em>optional</em>) – desired level of logging. Defaults to logging.INFO.</p></li>
+<li><p><strong>file</strong> (<em>str</em><em> | </em><em>None</em><em>, </em><em>optional</em>) – path of the file to write logs to, if any. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>the initialised logger</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>logging.RootLogger</p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.utils.parse_json">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">parse_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.utils.parse_json" title="Link to this definition"></a></dt>
 <dd></dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.utils.validate_against_schema">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">validate_against_schema</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">json_to_validate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path_to_schema</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'schemas/json/versioning/manifest.schema.json'</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.utils.utils.validate_against_schema" title="Link to this definition"></a></dt>
+<dd><p>Validate a dict corresponding to a JSON against a provided JSON schema.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>json_to_validate</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – JSON data to validate against a schema.</p></li>
+<li><p><strong>path_to_schema</strong> (<em>str</em><em>, </em><em>optional</em>) – Path to the JSON schema to validate against.
+Defaults to “schemas/json/versioning/manifest.schema.json”.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>e</strong> – The provided JSON could not be validated against the provided schema.</p>
+</dd>
+</dl>
+</dd></dl>
+
 </section>
 <section id="module-impresso_commons.utils.s3">
 <span id="s3-utils-functions"></span><h2>S3 Utils Functions<a class="headerlink" href="#module-impresso_commons.utils.s3" title="Link to this heading"></a></h2>
@@ -176,7 +247,7 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 Warning: 2 boto libraries are used, and need to be kept until third party lib dependencies are solved.</p>
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.s3.alternative_read_text">
-<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">alternative_read_text</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_key</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">s3_credentials</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.alternative_read_text" title="Link to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">alternative_read_text</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_key</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">s3_credentials</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">line_by_line</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span></span></span><a class="headerlink" href="#impresso_commons.utils.s3.alternative_read_text" title="Link to this definition"></a></dt>
 <dd><p>Read from S3 a line-separated text file (e.g. <cite>*.jsonl.bz2</cite>).</p>
 <div class="admonition note">
 <p class="admonition-title">Note</p>
@@ -319,6 +390,26 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 </dl>
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.s3.get_s3_object_size">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">get_s3_object_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">key</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.get_s3_object_size" title="Link to this definition"></a></dt>
+<dd><p>Get the size of an object (key) in an S3 bucket.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>bucket_name</strong> (<em>str</em>) – The name of the S3 bucket.</p></li>
+<li><p><strong>key</strong> (<em>str</em>) – The key (object) whose size you want to retrieve.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The size of the object in bytes, or None if the object doesn’t exist.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>int | None</p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.s3.get_s3_resource">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">get_s3_resource</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">host_url</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'https://os.zhdk.cloud.switch.ch/'</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.get_s3_resource" title="Link to this definition"></a></dt>
@@ -440,7 +531,7 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.s3.upload">
-<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">upload</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">partition_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspaper_prefix</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.upload" title="Link to this definition"></a></dt>
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">upload</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">partition_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspaper_prefix</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.upload" title="Link to this definition"></a></dt>
 <dd></dd></dl>
 
 <dl class="py function">
diff --git a/docs/_build/html/versioning.html b/docs/_build/html/versioning.html
new file mode 100644
index 0000000..bdec0ae
--- /dev/null
+++ b/docs/_build/html/versioning.html
@@ -0,0 +1,1829 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
+
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>Data Versioning &mdash; Impresso PyCommons  documentation</title>
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
+  <!--[if lt IE 9]>
+    <script src="_static/js/html5shiv.min.js"></script>
+  <![endif]-->
+  
+        <script src="_static/jquery.js?v=5d32c60e"></script>
+        <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
+        <script src="_static/documentation_options.js?v=5929fcd5"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
+        <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="_static/js/theme.js"></script>
+    <link rel="index" title="Index" href="genindex.html" />
+    <link rel="search" title="Search" href="search.html" />
+    <link rel="prev" title="Image handling" href="images.html" /> 
+</head>
+
+<body class="wy-body-for-nav"> 
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+      <div class="wy-side-scroll">
+        <div class="wy-side-nav-search" >
+
+          
+          
+          <a href="index.html" class="icon icon-home">
+            Impresso PyCommons
+          </a>
+<div role="search">
+  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
+    <input type="hidden" name="check_keywords" value="yes" />
+    <input type="hidden" name="area" value="default" />
+  </form>
+</div>
+        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
+<ul class="current">
+<li class="toctree-l1"><a class="reference internal" href="io.html">Input/Output</a></li>
+<li class="toctree-l1"><a class="reference internal" href="rebuild.html">Text Rebuild</a></li>
+<li class="toctree-l1"><a class="reference internal" href="utils.html">Utilities</a></li>
+<li class="toctree-l1"><a class="reference internal" href="images.html">Image handling</a></li>
+<li class="toctree-l1 current"><a class="current reference internal" href="#">Data Versioning</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.versioning.data_statistics">Data Statistics and NewspaperStatistics</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics"><code class="docutils literal notranslate"><span class="pre">DataStatistics</span></code></a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.stage"><code class="docutils literal notranslate"><span class="pre">DataStatistics.stage</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.granularity"><code class="docutils literal notranslate"><span class="pre">DataStatistics.granularity</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.element"><code class="docutils literal notranslate"><span class="pre">DataStatistics.element</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.count_keys"><code class="docutils literal notranslate"><span class="pre">DataStatistics.count_keys</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.counts"><code class="docutils literal notranslate"><span class="pre">DataStatistics.counts</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.add_counts"><code class="docutils literal notranslate"><span class="pre">DataStatistics.add_counts()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.init_counts"><code class="docutils literal notranslate"><span class="pre">DataStatistics.init_counts()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.pretty_print"><code class="docutils literal notranslate"><span class="pre">DataStatistics.pretty_print()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics.same_counts"><code class="docutils literal notranslate"><span class="pre">DataStatistics.same_counts()</span></code></a></li>
+</ul>
+</li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics</span></code></a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.stage"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.stage</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.granularity</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.element"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.element</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.count_keys</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.counts"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.counts</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.possible_count_keys</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.pretty_print()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts"><code class="docutils literal notranslate"><span class="pre">NewspaperStatistics.same_counts()</span></code></a></li>
+</ul>
+</li>
+</ul>
+</li>
+<li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.versioning.data_manifest">Data Manifest</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest"><code class="docutils literal notranslate"><span class="pre">DataManifest</span></code></a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id"><code class="docutils literal notranslate"><span class="pre">DataManifest.add_by_ci_id()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year"><code class="docutils literal notranslate"><span class="pre">DataManifest.add_by_title_year()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year"><code class="docutils literal notranslate"><span class="pre">DataManifest.add_count_list_by_title_year()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title"><code class="docutils literal notranslate"><span class="pre">DataManifest.aggregate_stats_for_title()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.append_to_notes"><code class="docutils literal notranslate"><span class="pre">DataManifest.append_to_notes()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.compute"><code class="docutils literal notranslate"><span class="pre">DataManifest.compute()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title"><code class="docutils literal notranslate"><span class="pre">DataManifest.define_update_info_for_title()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict"><code class="docutils literal notranslate"><span class="pre">DataManifest.generate_media_dict()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.get_count_keys"><code class="docutils literal notranslate"><span class="pre">DataManifest.get_count_keys()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key"><code class="docutils literal notranslate"><span class="pre">DataManifest.has_title_year_key()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict"><code class="docutils literal notranslate"><span class="pre">DataManifest.init_yearly_count_dict()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.new_media"><code class="docutils literal notranslate"><span class="pre">DataManifest.new_media()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path"><code class="docutils literal notranslate"><span class="pre">DataManifest.output_mft_s3_path</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.overall_stats"><code class="docutils literal notranslate"><span class="pre">DataManifest.overall_stats()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id"><code class="docutils literal notranslate"><span class="pre">DataManifest.replace_by_ci_id()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year"><code class="docutils literal notranslate"><span class="pre">DataManifest.replace_by_title_year()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.title_level_stats"><code class="docutils literal notranslate"><span class="pre">DataManifest.title_level_stats()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.update_media_stats"><code class="docutils literal notranslate"><span class="pre">DataManifest.update_media_stats()</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest"><code class="docutils literal notranslate"><span class="pre">DataManifest.validate_and_export_manifest()</span></code></a></li>
+</ul>
+</li>
+</ul>
+</li>
+<li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.versioning.helpers">Versioning Helpers</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage"><code class="docutils literal notranslate"><span class="pre">DataStage</span></code></a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.CANONICAL"><code class="docutils literal notranslate"><span class="pre">DataStage.CANONICAL</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.EMBEDDINGS"><code class="docutils literal notranslate"><span class="pre">DataStage.EMBEDDINGS</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.ENTITIES"><code class="docutils literal notranslate"><span class="pre">DataStage.ENTITIES</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.EVENIZED"><code class="docutils literal notranslate"><span class="pre">DataStage.EVENIZED</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.LANGIDENT"><code class="docutils literal notranslate"><span class="pre">DataStage.LANGIDENT</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING"><code class="docutils literal notranslate"><span class="pre">DataStage.LINGUISTIC_PROCESSING</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.MYSQL_CIS"><code class="docutils literal notranslate"><span class="pre">DataStage.MYSQL_CIS</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.OCRQA"><code class="docutils literal notranslate"><span class="pre">DataStage.OCRQA</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.PASSIM"><code class="docutils literal notranslate"><span class="pre">DataStage.PASSIM</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.REBUILT"><code class="docutils literal notranslate"><span class="pre">DataStage.REBUILT</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.SOLR_EMBS"><code class="docutils literal notranslate"><span class="pre">DataStage.SOLR_EMBS</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES"><code class="docutils literal notranslate"><span class="pre">DataStage.SOLR_ENTITIES</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.SOLR_TEXT"><code class="docutils literal notranslate"><span class="pre">DataStage.SOLR_TEXT</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.TEXT_REUSE"><code class="docutils literal notranslate"><span class="pre">DataStage.TEXT_REUSE</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.TOPICS"><code class="docutils literal notranslate"><span class="pre">DataStage.TOPICS</span></code></a></li>
+<li class="toctree-l4"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage.has_value"><code class="docutils literal notranslate"><span class="pre">DataStage.has_value()</span></code></a></li>
+</ul>
+</li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.agg"><code class="docutils literal notranslate"><span class="pre">agg()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.chunk"><code class="docutils literal notranslate"><span class="pre">chunk()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.clone_git_repo"><code class="docutils literal notranslate"><span class="pre">clone_git_repo()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.compute_stats_in_canonical_bag"><code class="docutils literal notranslate"><span class="pre">compute_stats_in_canonical_bag()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.compute_stats_in_entities_bag"><code class="docutils literal notranslate"><span class="pre">compute_stats_in_entities_bag()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.compute_stats_in_langident_bag"><code class="docutils literal notranslate"><span class="pre">compute_stats_in_langident_bag()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag"><code class="docutils literal notranslate"><span class="pre">compute_stats_in_rebuilt_bag()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag"><code class="docutils literal notranslate"><span class="pre">compute_stats_in_solr_text_bag()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.counts_for_canonical_issue"><code class="docutils literal notranslate"><span class="pre">counts_for_canonical_issue()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.counts_for_rebuilt"><code class="docutils literal notranslate"><span class="pre">counts_for_rebuilt()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.extract_version"><code class="docutils literal notranslate"><span class="pre">extract_version()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.filter_new_or_modified_media"><code class="docutils literal notranslate"><span class="pre">filter_new_or_modified_media()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.finalize"><code class="docutils literal notranslate"><span class="pre">finalize()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.find_s3_data_manifest_path"><code class="docutils literal notranslate"><span class="pre">find_s3_data_manifest_path()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.get_head_commit_url"><code class="docutils literal notranslate"><span class="pre">get_head_commit_url()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.get_media_item_years"><code class="docutils literal notranslate"><span class="pre">get_media_item_years()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.get_media_titles"><code class="docutils literal notranslate"><span class="pre">get_media_titles()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.git_commit_push"><code class="docutils literal notranslate"><span class="pre">git_commit_push()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.increment_version"><code class="docutils literal notranslate"><span class="pre">increment_version()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.init_media_info"><code class="docutils literal notranslate"><span class="pre">init_media_info()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.is_git_repo"><code class="docutils literal notranslate"><span class="pre">is_git_repo()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.manifest_summary"><code class="docutils literal notranslate"><span class="pre">manifest_summary()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.media_list_from_mft_json"><code class="docutils literal notranslate"><span class="pre">media_list_from_mft_json()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.read_manifest_from_s3"><code class="docutils literal notranslate"><span class="pre">read_manifest_from_s3()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.read_manifest_from_s3_path"><code class="docutils literal notranslate"><span class="pre">read_manifest_from_s3_path()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.remove_media_in_manifest"><code class="docutils literal notranslate"><span class="pre">remove_media_in_manifest()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.validate_granularity"><code class="docutils literal notranslate"><span class="pre">validate_granularity()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.validate_stage"><code class="docutils literal notranslate"><span class="pre">validate_stage()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.validate_version"><code class="docutils literal notranslate"><span class="pre">validate_version()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.version_as_list"><code class="docutils literal notranslate"><span class="pre">version_as_list()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.write_and_push_to_git"><code class="docutils literal notranslate"><span class="pre">write_and_push_to_git()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.helpers.write_dump_to_fs"><code class="docutils literal notranslate"><span class="pre">write_dump_to_fs()</span></code></a></li>
+</ul>
+</li>
+<li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.versioning.compute_manifest">Manifest Computing Script</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.compute_manifest.compute_stats_for_stage"><code class="docutils literal notranslate"><span class="pre">compute_stats_for_stage()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.compute_manifest.create_manifest"><code class="docutils literal notranslate"><span class="pre">create_manifest()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.compute_manifest.extract_np_key"><code class="docutils literal notranslate"><span class="pre">extract_np_key()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.compute_manifest.get_files_to_consider"><code class="docutils literal notranslate"><span class="pre">get_files_to_consider()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.compute_manifest.main"><code class="docutils literal notranslate"><span class="pre">main()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.versioning.compute_manifest.validate_config"><code class="docutils literal notranslate"><span class="pre">validate_config()</span></code></a></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+
+        </div>
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="index.html">Impresso PyCommons</a>
+      </nav>
+
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="Page navigation">
+  <ul class="wy-breadcrumbs">
+      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
+      <li class="breadcrumb-item active">Data Versioning</li>
+      <li class="wy-breadcrumbs-aside">
+            <a href="_sources/versioning.rst.txt" rel="nofollow"> View page source</a>
+      </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+             
+  <section id="data-versioning">
+<h1>Data Versioning<a class="headerlink" href="#data-versioning" title="Link to this heading"></a></h1>
+<p>The <cite>versioning</cite> package of <cite>impresso_commons</cite> contains several modules and scripts with classes and functions that make it possible to version Impresso’s data at various stages of the processing pipeline.</p>
+<p>The main goal of this approach is to version the data and track information at every stage to:
+1. <strong>Ensure data consistency and ease of debugging:</strong> Data elements should be consistent across stages, and inconsistencies/differences should be justifiable through the identification of data leakage points.
+2. <strong>Allow partial updates:</strong> It should be possible to (re)run all or part of the processes on subsets of the data, knowing which version of the data was used at each step. This can be necessary when new media collections arrive, or when an existing collection has been patched.
+3. <strong>Ensure transparency:</strong> Citation of the various data stages and datasets should be straightforward; users should know when using the interface exactly what versions they are using, and should be able to consult the precise statistics related to them.</p>
+<section id="module-impresso_commons.versioning.data_statistics">
+<span id="data-statistics-and-newspaperstatistics"></span><h2>Data Statistics and NewspaperStatistics<a class="headerlink" href="#module-impresso_commons.versioning.data_statistics" title="Link to this heading"></a></h2>
+<p>This module contains the definition of a data statistics class.</p>
+<p>A DataStatistics object should be instantiated during each processing step of
+the data preprocessing and augmentation of the Impresso project, and used to
+progressively count the number of elements modified or added by the processing.</p>
+<dl class="py class">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.data_statistics.</span></span><span class="sig-name descname"><span class="pre">DataStatistics</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data_stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><span class="pre">DataStage</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">granularity</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">element</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span 
class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics" title="Link to this definition"></a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">ABC</span></code></p>
+<p>Count statistics computed on a specific portion and granularity of the data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>data_stage</strong> (<a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><em>DataStage</em></a><em> | </em><em>str</em>) – The stage of data the stats are computed on.</p></li>
+<li><p><strong>granularity</strong> (<em>str</em>) – The granularity of the statistics with respect to the data.</p></li>
+<li><p><strong>element</strong> (<em>str</em><em>, </em><em>optional</em>) – The specific element associated with the statistics.
+Defaults to “” (empty string).</p></li>
+<li><p><strong>counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em> | </em><em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em><em>] </em><em>| </em><em>None</em><em>, </em><em>optional</em>) – Initial counts for
+statistics. Defaults to None.</p></li>
+</ul>
+</dd>
+</dl>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.stage">
+<span class="sig-name descname"><span class="pre">stage</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.stage" title="Link to this definition"></a></dt>
+<dd><p>The stage of data the stats are computed on.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage">DataStage</a></p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.granularity">
+<span class="sig-name descname"><span class="pre">granularity</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.granularity" title="Link to this definition"></a></dt>
+<dd><p>The granularity of the statistics with respect to the data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.element">
+<span class="sig-name descname"><span class="pre">element</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.element" title="Link to this definition"></a></dt>
+<dd><p>The specific element associated with the statistics.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.count_keys">
+<span class="sig-name descname"><span class="pre">count_keys</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.count_keys" title="Link to this definition"></a></dt>
+<dd><p>The count keys for these statistics.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[str]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.counts">
+<span class="sig-name descname"><span class="pre">counts</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.counts" title="Link to this definition"></a></dt>
+<dd><p>The count statistics computed on the
+specific data, can include frequency dicts.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, int | dict[str, int]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.add_counts">
+<span class="sig-name descname"><span class="pre">add_counts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">new_counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">replace</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.add_counts" title="Link to this definition"></a></dt>
+<dd><p>Add new counts to the existing counts if the new keys are validated.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>new_counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em> | </em><em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em><em>]</em>) – New counts to be added.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the counts were valid and could be added, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.init_counts">
+<span class="sig-name descname"><span class="pre">init_counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.init_counts" title="Link to this definition"></a></dt>
+<dd><p>Initialize a dict with all the keys associated to this object.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p><dl class="simple">
+<dt>A dict with all defined keys, and values</dt><dd><p>initialized to 0 (or to empty frequency dicts).</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>dict[str, int | dict[str, int]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.pretty_print">
+<span class="sig-name descname"><span class="pre">pretty_print</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">modif_date</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.pretty_print" title="Link to this definition"></a></dt>
+<dd><p>Generate a dict representation of these statistics to add to a json.</p>
+<p>These stats are agnostic to the type of statistics they represent so the values
+of <cite>self.counts</cite> are excluded by default, to be included in child classes.
+The modification date can also be included (when granularity=’year’)</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>modif_date</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Last modification date of the
+corresponding elements. Defaults to None.</p></li>
+<li><p><strong>include_counts</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to include the current counts with
+key “stats”. Defaults to False.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>A dict with the general information about these statistics.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, Any]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.DataStatistics.same_counts">
+<em class="property"><span class="pre">abstract</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">same_counts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">other_stats</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Self</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.DataStatistics.same_counts" title="Link to this definition"></a></dt>
+<dd><p>Given another dict of stats, check whether the values are the same.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py class">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.data_statistics.</span></span><span class="sig-name descname"><span class="pre">NewspaperStatistics</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data_stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><span class="pre">DataStage</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">granularity</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">element</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span 
class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics" title="Link to this definition"></a></dt>
+<dd><p>Bases: <a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics" title="impresso_commons.versioning.data_statistics.DataStatistics"><code class="xref py py-class docutils literal notranslate"><span class="pre">DataStatistics</span></code></a></p>
+<p>Count statistics computed on a specific portion and granularity of the data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>data_stage</strong> (<a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><em>DataStage</em></a><em> | </em><em>str</em>) – The stage of data the stats are computed on.</p></li>
+<li><p><strong>granularity</strong> (<em>str</em>) – The granularity of the statistics with respect to the data.</p></li>
+<li><p><strong>element</strong> (<em>str</em><em>, </em><em>optional</em>) – The specific element associated with the statistics.
+Defaults to “” (empty string).</p></li>
+<li><p><strong>counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>] </em><em>| </em><em>None</em><em>, </em><em>optional</em>) – Initial counts for statistics.
+Defaults to None.</p></li>
+</ul>
+</dd>
+</dl>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.stage">
+<span class="sig-name descname"><span class="pre">stage</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.stage" title="Link to this definition"></a></dt>
+<dd><p>The stage of data the stats are computed on.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage">DataStage</a></p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity">
+<span class="sig-name descname"><span class="pre">granularity</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity" title="Link to this definition"></a></dt>
+<dd><p>The granularity of the statistics with respect to the data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.element">
+<span class="sig-name descname"><span class="pre">element</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.element" title="Link to this definition"></a></dt>
+<dd><p>The specific element associated with the statistics.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys">
+<span class="sig-name descname"><span class="pre">count_keys</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys" title="Link to this definition"></a></dt>
+<dd><p>The count keys for these statistics.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[str]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.counts">
+<span class="sig-name descname"><span class="pre">counts</span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.counts" title="Link to this definition"></a></dt>
+<dd><p>The count statistics computed on the specific data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, int]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys">
+<span class="sig-name descname"><span class="pre">possible_count_keys</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">['titles',</span> <span class="pre">'issues',</span> <span class="pre">'pages',</span> <span class="pre">'content_items_out',</span> <span class="pre">'ft_tokens',</span> <span class="pre">'images',</span> <span class="pre">'content_items_in',</span> <span class="pre">'ne_mentions',</span> <span class="pre">'ne_entities',</span> <span class="pre">'embeddings_el',</span> <span class="pre">'topics',</span> <span class="pre">'lang_fd',</span> <span class="pre">'text_reuse_clusters',</span> <span class="pre">'text_reuse_passages']</span></em><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print">
+<span class="sig-name descname"><span class="pre">pretty_print</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">modif_date</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print" title="Link to this definition"></a></dt>
+<dd><p>Generate a dict representation of these statistics to add to a json.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>modif_date</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Last modification date of the
+corresponding elements. Defaults to None.</p></li>
+<li><p><strong>include_counts</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to include the current newspaper
+counts with key “nps_stats”. Defaults to True.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>A dict representation of these statistics.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, Any]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts">
+<span class="sig-name descname"><span class="pre">same_counts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">other_stats</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Self</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts" title="Link to this definition"></a></dt>
+<dd><p>Given another dict of stats, check whether the values are the same.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>other_stats</strong> (<em>Union</em><em>[</em><em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em><em>, </em><em>Self</em><em>]</em>) – Dict with pretty-printed
+newspaper statistics or other NewspaperStatistics object.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>True if the values for the various fields of <cite>nps_stats</cite> were the</dt><dd><p>same, False otherwise.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
+</section>
+<section id="module-impresso_commons.versioning.data_manifest">
+<span id="data-manifest"></span><h2>Data Manifest<a class="headerlink" href="#module-impresso_commons.versioning.data_manifest" title="Link to this heading"></a></h2>
+<p>This module contains the definition of a manifest class.</p>
+<p>A manifest object should be instantiated for each processing step of the data
+preprocessing and augmentation of the Impresso project.</p>
+<dl class="py class">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.data_manifest.</span></span><span class="sig-name descname"><span class="pre">DataManifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data_stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><span class="pre">DataStage</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">s3_output_bucket</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">git_repo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Repo</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">temp_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">s3_input_bucket</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">staging</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">new_version</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">is_patch</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">patched_fields</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span 
class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">previous_mft_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">only_counting</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">notes</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span 
class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">push_to_git</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest" title="Link to this definition"></a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id">
+<span class="sig-name descname"><span class="pre">add_by_ci_id</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ci_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id" title="Link to this definition"></a></dt>
+<dd><p>Add new counts corresponding to a specific content-item ID.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>ci_id</strong> (<em>str</em>) – Content-item canonical ID to which the counts correspond.</p></li>
+<li><p><strong>counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em>) – Counts corresponding to that ID.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the processing stats’ update was successful, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year">
+<span class="sig-name descname"><span class="pre">add_by_title_year</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">year</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year" title="Link to this definition"></a></dt>
+<dd><p>Add new counts corresponding to a specific media title and year.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>title</strong> (<em>str</em>) – Media title to which the counts correspond.</p></li>
+<li><p><strong>year</strong> (<em>str</em>) – Year to which the counts correspond.</p></li>
+<li><p><strong>counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em>) – Counts corresponding to that title and year.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the processing stats’ update was successful, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year">
+<span class="sig-name descname"><span class="pre">add_count_list_by_title_year</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">year</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">all_counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year" title="Link to this definition"></a></dt>
+<dd><p>Add a list of new counts corresponding to a specific media title and year.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>title</strong> (<em>str</em>) – Media title to which the counts correspond.</p></li>
+<li><p><strong>year</strong> (<em>str</em>) – Year to which the counts correspond.</p></li>
+<li><p><strong>all_counts</strong> (<em>list</em><em>[</em><em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em><em>]</em>) – List of counts for that title and year.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if all the updates were successful, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title">
+<span class="sig-name descname"><span class="pre">aggregate_stats_for_title</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">media_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics" title="impresso_commons.versioning.data_statistics.NewspaperStatistics"><span class="pre">NewspaperStatistics</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title" title="Link to this definition"></a></dt>
+<dd><p>Aggregate all stats of given title and export them to a “pretty print” dict.</p>
+<p>TODO once the radio data is handled, add RadioStatistics</p>
+<p>The <cite>DataStatistics</cite> objects don’t display in the dict format by default,
+but need to be converted to dicts to show as desired on the final manifest.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>title</strong> (<em>str</em>) – Media title for which to aggregate the yearly stats.</p></li>
+<li><p><strong>media_dict</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Title’s media dict with formatted statistics.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>Updated media dict and</dt><dd><p>corresponding title-level DataStatistics object.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>tuple[dict[str, Any], <a class="reference internal" href="#impresso_commons.versioning.data_statistics.NewspaperStatistics" title="impresso_commons.versioning.data_statistics.NewspaperStatistics">NewspaperStatistics</a>]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.append_to_notes">
+<span class="sig-name descname"><span class="pre">append_to_notes</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">contents</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">to_start</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.append_to_notes" title="Link to this definition"></a></dt>
+<dd><p>Append a string content to the manifest notes, initialize them if needed.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>contents</strong> (<em>str</em>) – Text to add to the manifest notes.</p></li>
+<li><p><strong>to_start</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether the contents should be added to the
+start of the notes instead of the end. Defaults to True.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.compute">
+<span class="sig-name descname"><span class="pre">compute</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">export_to_git_and_s3</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">commit_msg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.compute" title="Link to this definition"></a></dt>
+<dd><p>Perform all necessary logic to compute and construct the resulting manifest.</p>
+<p>This lazy behavior ensures all necessary information is ready and accessible
+when generating the manifest (in particular the <cite>_processing_stats</cite>).</p>
+<p>The steps of this computation are the following:
+- Ensure <cite>_processing_stats</cite> is not empty so the manifest can be computed and
+crystallize the time this function is called as the <cite>_generation_date</cite>.
+- Fetch the previous version of this manifest from S3, extract its media list.
+- Generate the new media list given the previous one and <cite>_processing_stats</cite>.
+- Compute the new title and corpus level statistics using the new media list.
+- Compute the new version based on the performed updates.
+- Define the <cite>manifest_data</cite> attribute corresponding to the final manifest.
+- Optionally, dump it to JSON, export it to S3 and Git.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>export_to_git_and_s3</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to export the final
+<cite>manifest_data</cite> as JSON to S3 and GitHub. Defaults to True. If False,
+<cite>validate_and_export_manifest</cite> can be called separately to do it.</p></li>
+<li><p><strong>commit_msg</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Commit message to use instead of
+the default from <cite>validate_and_export_manifest</cite>. Defaults to None.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title">
+<span class="sig-name descname"><span class="pre">define_update_info_for_title</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">processed_years</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">set</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prev_version_years</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">set</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title" title="Link to this definition"></a></dt>
+<dd><p>Define a title’s update info from the previous and newly updated years.</p>
+<p>The update information for a given title corresponds to four keys, for which
+the values provide information about what modifications took place during the
+processing this manifest is documenting.
+They are defined based on various values:
+- <cite>self.patched_fields</cite>: fields updated during the processing (e.g. for a patch).
+- <cite>processed_years</cite> and <cite>prev_version_years</cite></p>
+<p>Four cases exist:
+1. All newly processed years were in the previous version
+-&gt; full title update, only modification.
+2. Part of the previous years were updated, and no newly added years:
+-&gt; year-specific update, where all modified years will be listed.
+3. All previous years were updated, and new years were added:
+-&gt; full title update with addition.
+4. Part of the previous years were updated, and new years were added:
+-&gt; year-specific update, with addition.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>processed_years</strong> (<em>set</em><em>[</em><em>str</em><em>]</em>) – Years for which statistics were computed for
+this manifest.</p></li>
+<li><p><strong>prev_version_years</strong> (<em>set</em><em>[</em><em>str</em><em>]</em>) – Years for which statistics have already been
+computed for the previous version of this manifest.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>New update info dict for the given title.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, Union[str, list]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict">
+<span class="sig-name descname"><span class="pre">generate_media_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">old_media_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">bool</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict" title="Link to this definition"></a></dt>
+<dd><p>Given the previous manifest’s and current statistics, generate new media dict.</p>
+<p>The previous version media list is updated with current processing media list:
+- Setting new modification date &amp; git url for each modified title.
+- Compute update level &amp; targets if the processing is not a patch.</p>
+<p>From this update, also conclude whether new data was added, which informs
+how the version should be increased: if new title-year keys exist, the “addition”
+flag will lead to a major version increase.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>old_media_list</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>dict</em><em>]</em>) – Media list of the previous version of this manifest.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The updated media dict, and whether new data was added.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>tuple[dict, bool]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.get_count_keys">
+<span class="sig-name descname"><span class="pre">get_count_keys</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.get_count_keys" title="Link to this definition"></a></dt>
+<dd><p>Get the list of count keys for this manifest’s media dict.</p>
+<p>TODO when integrating radio data: init RadioStatistics instead.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Count keys corresponding to this manifest’s DataStage.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>list[str]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key">
+<span class="sig-name descname"><span class="pre">has_title_year_key</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">year</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key" title="Link to this definition"></a></dt>
+<dd><p>Verify whether the provided title and year have been processed.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>title</strong> (<em>str</em>) – Media title to check.</p></li>
+<li><p><strong>year</strong> (<em>str</em>) – Year to check.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the title-year pair has instantiated counts, false otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict">
+<span class="sig-name descname"><span class="pre">init_yearly_count_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict" title="Link to this definition"></a></dt>
+<dd><p>Initialize new newspaper statistics counts for this manifest.</p>
+<p>TODO when integrating radio data: init RadioStatistics instead.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Initialized counts for this manifest.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>dict[str, int]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.new_media">
+<span class="sig-name descname"><span class="pre">new_media</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.new_media" title="Link to this definition"></a></dt>
+<dd><p>Add a new media dict to the media list, given its title.</p>
+<p>By default, this means the update information will be the following:
+- “update_type”: “addition”
+- “update_level”: “title”
+- “updated_years”: [] # all represented years will be new
+- “updated_fields”: [] # all fields will be new</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>title</strong> (<em>str</em>) – Media title for which to add a new media.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The new media dict created for the given title.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, Any]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py property">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path">
+<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">output_mft_s3_path</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path" title="Link to this definition"></a></dt>
+<dd><p>Get this manifest’s output S3 path based on its output bucket.</p>
+<p>The manifest will be uploaded to the S3 bucket and partition corresponding
+to the value provided for its input argument <cite>s3_output_bucket</cite>.
+If the version attribute for this manifest is not defined, the S3 output
+path cannot be provided and the empty string will be returned.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p><dl class="simple">
+<dt>Full S3 path of this manifest if the version is already defined,</dt><dd><p>the empty string otherwise.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.overall_stats">
+<span class="sig-name descname"><span class="pre">overall_stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title_stats</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics" title="impresso_commons.versioning.data_statistics.DataStatistics"><span class="pre">DataStatistics</span></a><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.overall_stats" title="Link to this definition"></a></dt>
+<dd><p>Generate the overall stats and append the ones from the input manifest.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>title_stats</strong> (<em>list</em><em>[</em><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics" title="impresso_commons.versioning.data_statistics.DataStatistics"><em>DataStatistics</em></a><em>]</em>) – List of all title-level statistics
+used to compute the overall stats.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>This manifest’s overall stats with the ones of previous stages.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[dict]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id">
+<span class="sig-name descname"><span class="pre">replace_by_ci_id</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ci_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id" title="Link to this definition"></a></dt>
+<dd><p>Replace the current counts for a CI id’s title-year pair with new ones.</p>
+<div class="admonition warning">
+<p class="admonition-title">Warning</p>
+<p>This operation will overwrite any current counts corresponding to the
+media title and year of the provided content-item ID. If the goal isn’t
+to overwrite these counts, <cite>add_by_ci_id</cite> is better suited.</p>
+</div>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>ci_id</strong> (<em>str</em>) – Content-item canonical ID to which the counts correspond.</p></li>
+<li><p><strong>counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em>) – Counts for that ID to overwrite current counts with.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the stats’ modification was successful, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year">
+<span class="sig-name descname"><span class="pre">replace_by_title_year</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">year</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">counts</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year" title="Link to this definition"></a></dt>
+<dd><p>Replace the current counts for a given title-year pair with new ones.</p>
+<div class="admonition warning">
+<p class="admonition-title">Warning</p>
+<p>This operation will overwrite any current counts corresponding to the
+provided media title and year. If the goal isn’t
+to overwrite these counts, <cite>add_by_title_year</cite> is better suited.</p>
+</div>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>title</strong> (<em>str</em>) – Media title to which the counts correspond.</p></li>
+<li><p><strong>year</strong> (<em>str</em>) – Year to which the counts correspond.</p></li>
+<li><p><strong>counts</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>int</em><em>]</em>) – Counts for that title-year pair to overwrite current counts with.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the stats’ modification was successful, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.title_level_stats">
+<span class="sig-name descname"><span class="pre">title_level_stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">media_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics" title="impresso_commons.versioning.data_statistics.DataStatistics"><span class="pre">DataStatistics</span></a><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.title_level_stats" title="Link to this definition"></a></dt>
+<dd><p>Compute the title-level statistics from the new media list.</p>
+<p>Also removes the <cite>stats_as_dict</cite> field from the media list, and returns
+the media list with each NewspaperStatistics object “pretty printed”.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>media_list</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>dict</em><em>]</em>) – Updated media list for this manifest.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>New title-level stats and</dt><dd><p>media list.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>tuple[list[<a class="reference internal" href="#impresso_commons.versioning.data_statistics.DataStatistics" title="impresso_commons.versioning.data_statistics.DataStatistics">DataStatistics</a>], dict[str, dict]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.update_media_stats">
+<span class="sig-name descname"><span class="pre">update_media_stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">yearly_stats</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">old_media_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.update_media_stats" title="Link to this definition"></a></dt>
+<dd><p>Update a title’s media statistics given its newly computed yearly stats.</p>
+<p>Note that it’s actually the <cite>old_media_list</cite>’s contents which are updated when
+necessary.
+In addition, the value of <cite>self.only_counting</cite> will change the behavior:
+- When False, the computation of the manifest should follow a processing, and
+all data within the <cite>_processing_stats</cite> (here <cite>yearly_stats</cite> for 1 title) will
+be considered to have been modified (or re-generated).
+- When True, the manifest is computed to verify the contents of the data, and
+the media’s information will be updated only if differences in statistics are
+found between the previous and current version.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>title</strong> (<em>str</em>) – Media title for which to update the media list.</p></li>
+<li><p><strong>yearly_stats</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>dict</em><em>]</em>) – New yearly statistics for the title.</p></li>
+<li><p><strong>old_media_list</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>dict</em><em>]</em>) – Previous version manifest’s media list.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>Previous manifest’s media list potentially updated to match</dt><dd><p>new counts, and the list of years which have been modified</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[dict, list[str]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest">
+<span class="sig-name descname"><span class="pre">validate_and_export_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">push_to_git</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">commit_msg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest" title="Link to this definition"></a></dt>
+<dd><p>Validate the current manifest against a schema and export it (s3 and Git).</p>
+<p>This function will always upload the generated manifest to S3, using a path
+constructed based on <cite>self.output_bucket_name</cite> and the DataStage.</p>
+<p>If <cite>push_to_git</cite> is True, by default the commit message used will be
+“Add generated manifest file {filename}.” It can be overridden.</p>
+<div class="admonition note">
+<p class="admonition-title">Note</p>
+<p>If a problem occurs when pushing to Git, a critical message will be logged,
+but it won’t modify or alter the upload of the manifest to S3.</p>
+</div>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>push_to_git</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to also push the generated manifest to
+GitHub (impresso/impresso-data-release). Defaults to False.</p></li>
+<li><p><strong>commit_msg</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Commit message to override the
+default message. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Whether the upload to s3 was successful.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
+</section>
+<section id="module-impresso_commons.versioning.helpers">
+<span id="versioning-helpers"></span><h2>Versioning Helpers<a class="headerlink" href="#module-impresso_commons.versioning.helpers" title="Link to this heading"></a></h2>
+<p>Helper functions to read, generate and write data versioning manifests.</p>
+<dl class="py class">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">DataStage</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">value</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">names</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">module</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">qualname</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">type</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">start</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">boundary</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage" title="Link to this definition"></a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
+<p>Enum all stages requiring a versioning manifest.</p>
+<p>Each member corresponds to a data stage and the associated string is used to name
+each generated manifest accordingly.</p>
+<p>TODO: finalize the exact list of names and strings based on needs.
+TODO: add options for data indexing in Solr</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.CANONICAL">
+<span class="sig-name descname"><span class="pre">CANONICAL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'canonical'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.CANONICAL" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.EMBEDDINGS">
+<span class="sig-name descname"><span class="pre">EMBEDDINGS</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'embeddings'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.EMBEDDINGS" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.ENTITIES">
+<span class="sig-name descname"><span class="pre">ENTITIES</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'entities'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.ENTITIES" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.EVENIZED">
+<span class="sig-name descname"><span class="pre">EVENIZED</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'evenized-rebuilt'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.EVENIZED" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.LANGIDENT">
+<span class="sig-name descname"><span class="pre">LANGIDENT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'langident'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.LANGIDENT" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING">
+<span class="sig-name descname"><span class="pre">LINGUISTIC_PROCESSING</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'lingproc'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.MYSQL_CIS">
+<span class="sig-name descname"><span class="pre">MYSQL_CIS</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'mysql-ingestion'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.MYSQL_CIS" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.OCRQA">
+<span class="sig-name descname"><span class="pre">OCRQA</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'ocrqa'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.OCRQA" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.PASSIM">
+<span class="sig-name descname"><span class="pre">PASSIM</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'passim'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.PASSIM" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.REBUILT">
+<span class="sig-name descname"><span class="pre">REBUILT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'rebuilt'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.REBUILT" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.SOLR_EMBS">
+<span class="sig-name descname"><span class="pre">SOLR_EMBS</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'solr-ingestion-emb'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.SOLR_EMBS" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES">
+<span class="sig-name descname"><span class="pre">SOLR_ENTITIES</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'solr-ingestion-entities'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.SOLR_TEXT">
+<span class="sig-name descname"><span class="pre">SOLR_TEXT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'solr-ingestion-text'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.SOLR_TEXT" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.TEXT_REUSE">
+<span class="sig-name descname"><span class="pre">TEXT_REUSE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'text-reuse'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.TEXT_REUSE" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.TOPICS">
+<span class="sig-name descname"><span class="pre">TOPICS</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'topics'</span></em><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.TOPICS" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.DataStage.has_value">
+<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">has_value</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">value</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.DataStage.has_value" title="Link to this definition"></a></dt>
+<dd><p>Check if enum contains given value</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>cls</strong> (<em>Self</em>) – This DataStage class</p></li>
+<li><p><strong>value</strong> (<em>str</em>) – Value to check</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the value provided is in this enum’s values, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.agg">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">agg</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.helpers.agg" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.chunk">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">chunk</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.helpers.chunk" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.clone_git_repo">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">clone_git_repo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repo_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'impresso/impresso-data-release'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">branch</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'master'</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">Repo</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.clone_git_repo" title="Link to this definition"></a></dt>
+<dd><p>Clone a git repository into a given path in the local file-system.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>path</strong> (<em>str</em>) – Path (ideally absolute) to the dir in which to clone the git repo.</p></li>
+<li><p><strong>repo_name</strong> (<em>str</em><em>, </em><em>optional</em>) – Full name of the git repository to clone, as it
+appears in its URL. Defaults to “impresso/impresso-data-release”.</p></li>
+<li><p><strong>branch</strong> (<em>str</em><em>, </em><em>optional</em>) – Specific branch to clone. Defaults to “master”.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>e</strong> – Cloning the repo failed, both using SSH and HTTPS.</p>
+</dd>
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Object representing the cloned repository if it was cloned.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>git.Repo</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.compute_stats_in_canonical_bag">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">compute_stats_in_canonical_bag</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_canonical_issues</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.compute_stats_in_canonical_bag" title="Link to this definition"></a></dt>
+<dd><p>Computes number of issues and pages per newspaper from a Dask bag of canonical data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>s3_canonical_issues</strong> (<em>db.core.Bag</em>) – Bag with the contents of canonical files to
+compute statistics on.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>List of counts that match canonical DataStatistics keys.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[dict[str, Any]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.compute_stats_in_entities_bag">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">compute_stats_in_entities_bag</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_entities</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.compute_stats_in_entities_bag" title="Link to this definition"></a></dt>
+<dd><p>TODO</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>s3_entities</strong> (<em>db.core.Bag</em>) – Bag with the contents of entity files.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>List of counts that match NE DataStatistics keys.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[dict[str, Any]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.compute_stats_in_langident_bag">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">compute_stats_in_langident_bag</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_langident</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.compute_stats_in_langident_bag" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">compute_stats_in_rebuilt_bag</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">rebuilt_articles</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">key</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_np</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">passim</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span 
class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">compute_stats_in_solr_text_bag</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_solr_text</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.counts_for_canonical_issue">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">counts_for_canonical_issue</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">issue</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_np_yr</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.counts_for_canonical_issue" title="Link to this definition"></a></dt>
+<dd><p>Given the canonical representation of an issue, get its counts.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>issue</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Canonical JSON representation of an issue.</p></li>
+<li><p><strong>include_np_yr</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether the newspaper title and year should
+be included in the returned dict for later aggregation. Defaults to False.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Dict listing the counts for this issue, ready to be aggregated.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, int]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.counts_for_rebuilt">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">counts_for_rebuilt</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">rebuilt_ci</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_np</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">passim</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" 
href="#impresso_commons.versioning.helpers.counts_for_rebuilt" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.extract_version">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">extract_version</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name_or_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">as_int</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">int</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.extract_version" title="Link to this definition"></a></dt>
+<dd><p>Extract the version from a string filename or path.</p>
+<p>This function is meant in particular to extract the version from paths or filenames
+of manifests, which are structured as [data-stage]_vM-m-p.json.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>name_or_path</strong> (<em>str</em>) – Filename or path from which to extract the version.</p></li>
+<li><p><strong>as_int</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to return the extracted version as int or str.
+Defaults to False.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Extracted version, as int or str based on <cite>as_int</cite>.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[str, int]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.filter_new_or_modified_media">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">filter_new_or_modified_media</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">rebuilt_mft_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">previous_mft_path_str</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.filter_new_or_modified_media" title="Link to this definition"></a></dt>
+<dd><p>Compares two manifests to determine new or modified media items.</p>
+<p>Typical use-case is during an atomic update, when only media items added or modified
+compared to the previous process need to be ingested or processed.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>rebuilt_mft_path</strong> (<em>str</em>) – Path of the rebuilt manifest (new).</p></li>
+<li><p><strong>previous_mft_path_str</strong> (<em>str</em>) – Path of the previous process manifest.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>A manifest identical to ‘rebuilt_mft_path’ but only with
+media items that are new or modified in the media list.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, Any]</p>
+</dd>
+</dl>
+<p>Example: &gt;&gt;&gt; new_or_modified = filter_new_or_modified_media(“new_manifest.json”,
+“previous_manifest.json”) &gt;&gt;&gt; print(new_or_modified) [{‘media_title’:
+‘new_media_item_1’, ‘last_modif_date’: ‘2024-04-04T12:00:00Z’, etc.},
+{‘media_title’: ‘modified_media_item_2’, ‘last_modif_date’:
+‘2024-04-03T12:00:00Z’, etc.}]</p>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.finalize">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">finalize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.helpers.finalize" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.find_s3_data_manifest_path">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">find_s3_data_manifest_path</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">partition</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.find_s3_data_manifest_path" title="Link to this definition"></a></dt>
+<dd><p>Find and return the latest data manifest in a given S3 bucket.</p>
+<p>On S3, different Data stages will be stored in different ways.
+In particular, data stages corresponding to enrichments are all placed in the
+same bucket but in different partitions.
+Data stages “canonical”, “rebuilt”, “evenized-rebuilt” &amp; ones related to Solr
+are the ones where each stage has its own bucket.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>bucket_name</strong> (<em>str</em>) – Name of the bucket in which to look.</p></li>
+<li><p><strong>data_stage</strong> (<em>str</em>) – Data stage corresponding to the manifest to fetch.</p></li>
+<li><p><strong>partition</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Partition within the bucket to look
+into. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>S3 path of the latest manifest in the bucket, None if no</dt><dd><p>manifests were found inside.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[str, None]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.get_head_commit_url">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">get_head_commit_url</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">repo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Repo</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.get_head_commit_url" title="Link to this definition"></a></dt>
+<dd><p>Get the URL of the last commit on a given Git repository.</p>
+<p>TODO: test the function when repo is the HTTPS URL of the repository.
+TODO: provide branch argument.</p>
+<p><cite>repo</cite> can be one of three things:</p>
+<ul class="simple"><li><p>a git.Repo instantiated object (if already instantiated outside).</p></li>
+<li><p>the local path to the git repository (previously cloned).</p></li>
+<li><p>the HTTPS URL to the Git repository.</p></li></ul>
+<div class="admonition note">
+<p class="admonition-title">Note</p>
+<p>The returned commit URL corresponds to the one on the repository’s active
+branch (master for the URL).</p>
+</div>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>repo</strong> (<em>str</em><em> | </em><em>git.Repo</em>) – local path, git.Repo object or URL of the repository.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The HTTPS URL of the last commit on the git repository’s master branch.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.get_media_item_years">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">get_media_item_years</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">mnf_json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.get_media_item_years" title="Link to this definition"></a></dt>
+<dd><p>Retrieves the s3 key and size in MB of each year of media items from a manifest.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>mnf_json</strong> (<em>dict</em>) – A manifest dictionary.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>A dictionary where media titles are keys,</dt><dd><p>and each value is a dictionary with s3 key as key and its size as value.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, dict[str, float]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.get_media_titles">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">get_media_titles</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input_data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.get_media_titles" title="Link to this definition"></a></dt>
+<dd><p>Extracts media titles from the input data which can be either a manifest
+or a media list.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>input_data</strong> (<em>Union</em><em>[</em><em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em><em>, </em><em>list</em><em>[</em><em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em><em>]</em><em>]</em>) – A manifest dictionary
+or the media list of a manifest.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>A list of media titles extracted from the input data.
+Ex:  [‘Title 1’, ‘Title 2’]</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[str]</p>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><ul class="simple">
+<li><p><strong>TypeError</strong> – If the input data is not in the expected format.</p></li>
+<li><p><strong>KeyError</strong> – If the ‘media_title’ key is not found in the input data.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.git_commit_push">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">git_commit_push</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">full_git_filepath</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">git_repo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Repo</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">commit_msg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.git_commit_push" title="Link to this definition"></a></dt>
+<dd><p>Commit and push the addition of a given file within the repository.</p>
+<p>TODO: make more general for non-manifest related uses?</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>full_git_filepath</strong> (<em>str</em>) – Path to the file added to the git repository.</p></li>
+<li><p><strong>git_repo</strong> (<em>git.Repo</em>) – git.Repo object of the repository to commit and push to.</p></li>
+<li><p><strong>commit_msg</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Message to use when committing. If not
+defined, a basic message on the added manifest will be used. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Whether the commit and push operations were successful.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.increment_version">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">increment_version</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">prev_version</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">increment</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.increment_version" title="Link to this definition"></a></dt>
+<dd><p>Update the given version according to the given increment.</p>
+<p>When the increment is major or minor, all following numbers are reset to 0.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>prev_version</strong> (<em>str</em>) – Version to increment</p></li>
+<li><p><strong>increment</strong> (<em>str</em>) – Increment, can be one of major, minor and patch.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>e</strong> – Increment value provided is not valid.</p>
+</dd>
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Version incremented accordingly.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.init_media_info">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">init_media_info</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">add</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">full_title</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">years</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">fields</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> 
</span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.init_media_info" title="Link to this definition"></a></dt>
+<dd><p>Initialize the media update dict for a title given relevant information.</p>
+<p>All the update information relates to the newly processed data, in
+comparison with the data computed during the last processing.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>add</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether new data was added. Defaults to True.</p></li>
+<li><p><strong>full_title</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether all the title’s years were modified.
+Defaults to True.</p></li>
+<li><p><strong>years</strong> (<em>Union</em><em>[</em><em>list</em><em>[</em><em>str</em><em>]</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – When <cite>full_title</cite> is False, the specific years
+which were modified/updated. Defaults to None.</p></li>
+<li><p><strong>fields</strong> (<em>Union</em><em>[</em><em>list</em><em>[</em><em>str</em><em>]</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – List of specific fields that were
+modified/updated. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Instantiated dict with the update information for a given media.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, Any]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.is_git_repo">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">is_git_repo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.is_git_repo" title="Link to this definition"></a></dt>
+<dd><p>Check if a directory contains a Git repository.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>path</strong> (<em>str</em>) – The path to the directory to be checked.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>True if the directory contains a Git repository, False otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>bool</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.manifest_summary">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">manifest_summary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">mnf_json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">extended_summary</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.manifest_summary" title="Link to this definition"></a></dt>
+<dd><p>Generate a summary of the manifest data.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>mnf_json</strong> (<em>dict</em>) – A dictionary containing manifest data.</p></li>
+<li><p><strong>extended_summary</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to include extended summary
+with year statistics. Defaults to False.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>None</p>
+</dd>
+</dl>
+<p>Prints: Summary of the manifest including the number of media items, additions,
+and modifications.</p>
+<p class="rubric">Example</p>
+<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">manifest_summary</span><span class="p">(</span><span class="n">manifest_json</span><span class="p">)</span>
+<span class="go">Summary of manifest /path/to/manifest.json:</span>
+<span class="go">Number of media items: 10 (8 from set)</span>
+<span class="go">Number of addition at title level: 5</span>
+<span class="go">Number of addition at year level: 3</span>
+<span class="go">Number of modification at title level: 2</span>
+<span class="go">Number of modification at year level: 1</span>
+</pre></div>
+</div>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.media_list_from_mft_json">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">media_list_from_mft_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">json_mft</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.media_list_from_mft_json" title="Link to this definition"></a></dt>
+<dd><p>Extract the <cite>media_list</cite> from a manifest as a dict where each title is a key.</p>
+<p>For each title, all fields from the original media list will still be present
+along with an additional <cite>stats_as_dict</cite> field containing a dict mapping each
+year to its specific statistics.
+As a result:
+- All represented titles are within the keys of the returned media list.
+- For each title, represented years are in the keys of its <cite>stats_as_dict</cite> field.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>json_mft</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Dict following the JSON schema of a manifest from
+which to extract the media list.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Media list of given manifest, with <cite>stats_as_dict</cite> field.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, dict]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.read_manifest_from_s3">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">read_manifest_from_s3</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><span class="pre">DataStage</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">partition</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span 
class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">None</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">None</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.read_manifest_from_s3" title="Link to this definition"></a></dt>
+<dd><p>Read and load manifest given an S3 bucket.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>bucket_name</strong> (<em>str</em>) – Name of the S3 bucket to look into.</p></li>
+<li><p><strong>data_stage</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><em>DataStage</em></a><em>, </em><em>str</em><em>]</em>) – Data stage corresponding to the
+manifest to fetch.</p></li>
+<li><p><strong>partition</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Partition within the bucket to look
+into. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>S3 path of the manifest</dt><dd><p>and corresponding contents, if a manifest was found, None otherwise.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[tuple[str, dict[str, Any]], tuple[None, None]]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.read_manifest_from_s3_path">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">read_manifest_from_s3_path</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">manifest_s3_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.read_manifest_from_s3_path" title="Link to this definition"></a></dt>
+<dd><p>Read and extract the contents of an arbitrary manifest.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>manifest_s3_path</strong> (<em>str</em>) – S3 path of the manifest to read.</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Contents of manifest if found on S3, None otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[dict[str, Any], None]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.remove_media_in_manifest">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">remove_media_in_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">mnf_json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">white_list</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.remove_media_in_manifest" title="Link to this definition"></a></dt>
+<dd><p>Removes media items from the given manifest JSON object based on a whitelist.
+Typical use case is ingestion or processing only part of the media for whatever reason.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>mnf_json</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – The manifest JSON object containing a ‘media_list’.</p></li>
+<li><p><strong>white_list</strong> (<em>list</em><em>[</em><em>str</em><em>]</em>) – A list of media titles to be retained in the manifest.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Modifies the input manifest JSON object in-place by removing media items
+not in the whitelist.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>None</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.validate_granularity">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">validate_granularity</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">value</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.validate_granularity" title="Link to this definition"></a></dt>
+<dd><p>Validate that the granularity value provided is valid.</p>
+<p>Statistics are computed on three granularity levels:
+corpus, title and year.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>value</strong> (<em>str</em>) – Granularity value to validate</p>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>ValueError</strong> – The provided granularity isn’t one of corpus, title and year.</p>
+</dd>
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>The provided value, in lower case, or None if not valid.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>Union[str, None]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.validate_stage">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">validate_stage</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data_stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_value_str</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><span class="pre">DataStage</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.validate_stage" title="Link to this definition"></a></dt>
+<dd><p>Validate the provided data stage if it’s in the DataStage Enum (key or value).</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>data_stage</strong> (<em>str</em>) – Data stage key or value to validate.</p></li>
+<li><p><strong>return_value_str</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to return the data stage’s value if
+it was valid. Defaults to False.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>e</strong> – The provided str is neither a data stage key nor value.</p>
+</dd>
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>The corresponding DataStage or value string if valid.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage">DataStage</a> | str | None</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.validate_version">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">validate_version</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">v</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">regex</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'^v([0-9]+[.]){2}[0-9]+$'</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.validate_version" title="Link to this definition"></a></dt>
+<dd><p>Validate the provided string version against a regex.</p>
+<p>The provided version should be in format “vM.m.p”, where M, m and p are
+integers representing respectively the Major, minor and patch version.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>v</strong> (<em>str</em>) – version in string format to validate.</p></li>
+<li><p><strong>regex</strong> (<em>str</em><em>, </em><em>optional</em>) – Regex against which to match the version.
+Defaults to “^v([0-9]+[.]){2}[0-9]+$”.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The provided version if it’s valid, None otherwise.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[str, None]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.version_as_list">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">version_as_list</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">version</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.version_as_list" title="Link to this definition"></a></dt>
+<dd><p>Return the provided string version as a list of three ints.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>version</strong> (<em>str</em>) – String version to return as list</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>list of len 3 where indices respectively correspond to the</dt><dd><p>Major, minor and patch versions.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[int]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.write_and_push_to_git">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">write_and_push_to_git</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">file_contents</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">git_repo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Repo</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path_in_repo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">filename</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">commit_msg</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">tuple</span><span class="p"><span class="pre">[</span></span><span class="pre">bool</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" 
href="#impresso_commons.versioning.helpers.write_and_push_to_git" title="Link to this definition"></a></dt>
+<dd><p>Given a serialized dump, write it in local git repo, commit and push.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>file_contents</strong> (<em>str</em>) – Serialized dump of a JSON file.</p></li>
+<li><p><strong>git_repo</strong> (<em>git.Repo</em>) – Object representing the git repository to push to.</p></li>
+<li><p><strong>path_in_repo</strong> (<em>str</em>) – Relative path where to write the file.</p></li>
+<li><p><strong>filename</strong> (<em>str</em>) – Desired name for the file, including extension.</p></li>
+<li><p><strong>commit_msg</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>None</em><em>]</em><em>, </em><em>optional</em>) – Commit message. If not defined, a
+basic message on the added manifest will be used. Defaults to None.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Whether the process was successful and corresponding filepath.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>tuple[bool, str]</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.helpers.write_dump_to_fs">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.helpers.</span></span><span class="sig-name descname"><span class="pre">write_dump_to_fs</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">file_contents</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">abs_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">filename</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.helpers.write_dump_to_fs" title="Link to this definition"></a></dt>
+<dd><p>Write a provided string dump to the local filesystem given its path and filename.</p>
+<p>TODO: Potentially moving this method to <cite>utils.py</cite>.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>file_contents</strong> (<em>str</em>) – Dumped contents in str format, ready to be written.</p></li>
+<li><p><strong>abs_path</strong> (<em>str</em>) – Local path to the directory in which the file will be.</p></li>
+<li><p><strong>filename</strong> (<em>str</em>) – Filename of the file to write, including its extension.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Full path of written file, or None if an IOError occurred.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Union[str, None]</p>
+</dd>
+</dl>
+</dd></dl>
+
+</section>
+<section id="module-impresso_commons.versioning.compute_manifest">
+<span id="manifest-computing-script"></span><h2>Manifest Computing Script<a class="headerlink" href="#module-impresso_commons.versioning.compute_manifest" title="Link to this heading"></a></h2>
+<p>Command-line script to generate a manifest for an S3 bucket or partition after a processing.</p>
+<dl class="simple">
+<dt>Usage:</dt><dd><p>compute_manifest.py --config-file=&lt;cf&gt; --log-file=&lt;lf&gt; [--scheduler=&lt;sch&gt; --nworkers=&lt;nw&gt; --verbose]</p>
+</dd>
+</dl>
+<p>Options:</p>
+<dl class="option-list">
+<dt><kbd><span class="option">--config-file=<var>&lt;cf&gt;</var></span></kbd></dt>
+<dd><p>Path to configuration json file containing all necessary arguments for the computation of the manifest.</p>
+</dd>
+</dl>
+<p>--log-file=&lt;lf&gt; Path to log file to use.
+--scheduler=&lt;sch&gt;  Tell dask to use an existing scheduler (otherwise it’ll create one)
+--nworkers=&lt;nw&gt;  number of workers for (local) Dask client.
+--verbose  Set logging level to DEBUG (by default is INFO).</p>
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.compute_manifest.compute_stats_for_stage">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.compute_manifest.</span></span><span class="sig-name descname"><span class="pre">compute_stats_for_stage</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">files_bag</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stage</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><span class="pre">DataStage</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.compute_manifest.compute_stats_for_stage" title="Link to this definition"></a></dt>
+<dd><p>Compute statistics for a specific data stage.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>files_bag</strong> (<em>db.core.Bag</em>) – A bag containing files for statistics computation.</p></li>
+<li><p><strong>stage</strong> (<a class="reference internal" href="#impresso_commons.versioning.helpers.DataStage" title="impresso_commons.versioning.helpers.DataStage"><em>DataStage</em></a>) – The data stage for which statistics are computed.</p></li>
+<li><p><strong>client</strong> (<em>Client</em><em> | </em><em>None</em><em>, </em><em>optional</em>) – Dask client to use.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>List of computed yearly statistics, or None if statistics</dt><dd><p>computation for the given stage is not implemented.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[dict] | None</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.compute_manifest.create_manifest">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.compute_manifest.</span></span><span class="sig-name descname"><span class="pre">create_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config_dict</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Client</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.compute_manifest.create_manifest" title="Link to this definition"></a></dt>
+<dd><p>Given its configuration, generate the manifest for a given s3 bucket partition.</p>
+<p>TODO: add option to agg for all titles together if desired
+TODO: add options to exclude NP for all agg types
+TODO: separate further into functions</p>
+<div class="admonition note">
+<p class="admonition-title">Note</p>
+<p>The contents of the configuration file (or dict) are given in markdown file
+<cite>impresso_commons/data/manifest_config/manifest.config.example.md</cite></p>
+</div>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>config_dict</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Configuration following the guidelines.</p></li>
+<li><p><strong>client</strong> (<em>Client</em><em> | </em><em>None</em><em>, </em><em>optional</em>) – Dask client to use.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.compute_manifest.extract_np_key">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.compute_manifest.</span></span><span class="sig-name descname"><span class="pre">extract_np_key</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">s3_key</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">str</span></span></span><a class="headerlink" href="#impresso_commons.versioning.compute_manifest.extract_np_key" title="Link to this definition"></a></dt>
+<dd><p>Extract the newspaper an s3:key corresponds to given the bucket and partition</p>
+<p>eg. s3_key is in format:
+- s3_key: ‘s3://31-passim-rebuilt-staging/passim/indeplux/indeplux-1889.jsonl.bz2’
+- bucket: ‘31-passim-rebuilt-staging/passim’
+--&gt; returns ‘indeplux’</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>s3_key</strong> (<em>str</em>) – Full S3 path of a file (as returned by fixed_s3fs_glob).</p></li>
+<li><p><strong>bucket</strong> (<em>str</em>) – S3 bucket, including partition, in which the newspaper dirs are.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>Name of the corresponding newspaper, extracted from the s3 path.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>str</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.compute_manifest.get_files_to_consider">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.compute_manifest.</span></span><span class="sig-name descname"><span class="pre">get_files_to_consider</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.versioning.compute_manifest.get_files_to_consider" title="Link to this definition"></a></dt>
+<dd><p>Get the list of S3 files to consider based on the provided configuration.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>config</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Configuration parameters with the s3 bucket, titles,
+and file extensions</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>Dict mapping each newspaper to the s3 files to</dt><dd><p>consider, or None if no files found.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>dict[str, list[str]] | None</p>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>ValueError</strong> – If <cite>file_extensions</cite> in the config is empty or None.</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.compute_manifest.main">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.compute_manifest.</span></span><span class="sig-name descname"><span class="pre">main</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.versioning.compute_manifest.main" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.versioning.compute_manifest.validate_config">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.versioning.compute_manifest.</span></span><span class="sig-name descname"><span class="pre">validate_config</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Any</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.versioning.compute_manifest.validate_config" title="Link to this definition"></a></dt>
+<dd><p>Ensure all required configurations are defined, add any missing optional ones.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>config</strong> (<em>dict</em><em>[</em><em>str</em><em>, </em><em>Any</em><em>]</em>) – Provided configuration dict to compute the manifest.</p>
+</dd>
+<dt class="field-even">Raises<span class="colon">:</span></dt>
+<dd class="field-even"><p><strong>ValueError</strong> – Some required arguments of the configuration are missing.</p>
+</dd>
+<dt class="field-odd">Returns<span class="colon">:</span></dt>
+<dd class="field-odd"><p>Updated config, with any missing optional argument set to None.</p>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>dict[str, Any]</p>
+</dd>
+</dl>
+</dd></dl>
+
+</section>
+</section>
+
+
+           </div>
+          </div>
+          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
+        <a href="images.html" class="btn btn-neutral float-left" title="Image handling" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <p>&#169; Copyright 2024, Impresso - Media Monitoring of the Past - EPFL-DHLAB, UZH-ICL, UNILU-C2DH.</p>
+  </div>
+
+  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
+    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
+    provided by <a href="https://readthedocs.org">Read the Docs</a>.
+   
+
+</footer>
+        </div>
+      </div>
+    </section>
+  </div>
+  <script>
+      jQuery(function () {
+          SphinxRtdTheme.Navigation.enable(true);
+      });
+  </script> 
+
+</body>
+</html>
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index 4eda377..65a983f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -17,4 +17,5 @@ Python module with bits of code (objects, functions) highly reusable within impr
    rebuild
    utils
    images
+   versioning
 
diff --git a/docs/versioning.rst b/docs/versioning.rst
new file mode 100644
index 0000000..03a69ea
--- /dev/null
+++ b/docs/versioning.rst
@@ -0,0 +1,43 @@
+Data Versioning
+================================
+
+The `versioning` package of `impresso_commons` contains several modules and scripts with classes and functions that allow versioning Impresso's data at various stages of the processing pipeline.
+
+The main goal of this approach is to version the data and track information at every stage to:
+1. **Ensure data consistency and ease of debugging:** Data elements should be consistent across stages, and inconsistencies/differences should be justifiable through the identification of data leakage points.
+2. **Allow partial updates:** It should be possible to (re)run all or part of the processes on subsets of the data, knowing which version of the data was used at each step. This can be necessary when new media collections arrive, or when an existing collection has been patched.
+3. **Ensure transparency:** Citation of the various data stages and datasets should be straightforward; users should know when using the interface exactly what versions they are using, and should be able to consult the precise statistics related to them.
+
+
+Data Statistics and NewspaperStatistics
+------------------------------------------
+
+.. automodule:: impresso_commons.versioning.data_statistics
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Data Manifest
+--------------------------------------------
+
+.. automodule:: impresso_commons.versioning.data_manifest
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Versioning Helpers
+--------------------------------------------
+
+.. automodule:: impresso_commons.versioning.helpers
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Manifest Computing Script
+--------------------------------------------
+
+.. automodule:: impresso_commons.versioning.compute_manifest
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
diff --git a/impresso_commons/text/rebuilder.py b/impresso_commons/text/rebuilder.py
index a57de1f..5e50759 100644
--- a/impresso_commons/text/rebuilder.py
+++ b/impresso_commons/text/rebuilder.py
@@ -19,7 +19,7 @@
 --nworkers=<nw>  number of workers for (local) Dask client.
 --git-repo=<gr>   Local path to the "impresso-text-acquisition" git directory (including it).
 --temp-dir=<tp>  Temporary directory in which to clone the impresso-data-release git repository.
---prev-manifest=<pm> Optional S3 path to the previous manifest to use for the manifest generation
+--prev-manifest=<pm>  Optional S3 path to the previous manifest to use for the manifest generation.
 """  # noqa: E501
 
 import sys
diff --git a/impresso_commons/utils/utils.py b/impresso_commons/utils/utils.py
index 673f94c..151437f 100644
--- a/impresso_commons/utils/utils.py
+++ b/impresso_commons/utils/utils.py
@@ -108,15 +108,16 @@ def validate_against_schema(
 
 
 def bytes_to(bytes_nb: int, to_unit: str, bsize: int = 1024) -> float:
-    """
-    Convert bytes to the specified unit.
+    """Convert bytes to the specified unit.
+
+    Supported target units:
+    - 'k' (kilobytes), 'm' (megabytes),
+    - 'g' (gigabytes), 't' (terabytes),
+    - 'p' (petabytes), 'e' (exabytes).
 
     Args:
         bytes_nb (int): The number of bytes to be converted.
         to_unit (str): The target unit for conversion.
-            Supported units: 'k' (kilobytes), 'm' (megabytes),
-                             'g' (gigabytes), 't' (terabytes),
-                             'p' (petabytes), 'e' (exabytes).
         bsize (int, optional): The base size used for conversion (default is 1024).
 
     Returns:
@@ -125,5 +126,5 @@ def bytes_to(bytes_nb: int, to_unit: str, bsize: int = 1024) -> float:
     Raises:
         KeyError: If the specified target unit is not supported.
     """
-    units = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6}
-    return float(bytes_nb) / (bsize ** units[to_unit])
\ No newline at end of file
+    units = {"k": 1, "m": 2, "g": 3, "t": 4, "p": 5, "e": 6}
+    return float(bytes_nb) / (bsize ** units[to_unit])
diff --git a/impresso_commons/versioning/data_manifest.py b/impresso_commons/versioning/data_manifest.py
index 6b8d69f..8becf0a 100644
--- a/impresso_commons/versioning/data_manifest.py
+++ b/impresso_commons/versioning/data_manifest.py
@@ -375,7 +375,7 @@ def validate_and_export_manifest(
         This function will always upload the generated manifest to S3, using a path
         constructed based on `self.output_bucket_name` and the DataStage.
 
-        If `push_to_git`is True, by default the commit message used will be
+        If `push_to_git` is True, by default the commit message used will be
         "Add generated manifest file {filename}." It can be overriden.
 
         Note:
@@ -706,13 +706,13 @@ def define_update_info_for_title(
 
         Four cases exist:
         1. All newly processed years were in the previous version
-            -> full title update, only modification.
+        -> full title update, only modification.
         2. Part of the previous years were updated, and no newly added years:
-            -> year-specific update, where all modified years will be listed.
+        -> year-specific update, where all modified years will be listed.
         3. All previous years were updated, and new years were added:
-            -> full title update with addition.
+        -> full title update with addition.
         4. Part of the previous years were updated, and new years were added:
-            -> year-specific update, with addition.
+        -> year-specific update, with addition.
 
         Args:
             processed_years (set[str]): Years for which statistics were computed for
@@ -765,11 +765,11 @@ def update_media_stats(
         necessary.
         In addition, the value of `self.only_counting` will change the behavior:
         - When False, the computation of the manifest should follow a processing, and
-            all data within the `_processing_stats` (here `yearly_stats` for 1 title)
-            will be considered to have been modified (or re-generated).
+        all data within the `_processing_stats` (here `yearly_stats` for 1 title) will
+        be considered to have been modified (or re-generated).
         - When True, the manifest is computed to verify the contents of the data, and
-            the media's information will be update only if differences in statisitics
-            are found between the previous and current version.
+        the media's information will be updated only if differences in statistics are
+        found between the previous and current version.
 
         Args:
             title (str): Media title for which to update the media list.
@@ -987,14 +987,15 @@ def compute(
         when generating the manifest (in particular the `_processing_stats`).
 
         The steps of this computation are the following:
-        1. Ensure `_processing_stats` is not empty so the manifest can be computed
-            and crystallize the time this function is called as the `_generation_date`.
-        2. Fetch the previous version of this manifest from S3, extract its media list.
-        3. Generate the new media list given the previous one and `_processing_stats`.
-        4. Compute the new title and corpus level statistics using the new media list.
-        5. Compute the new version based on the performed updates.
-        6. Define the `manifest_data` attribute corresponding to the final manifest.
-        7. Optionally, dump it to JSON, export it to S3 and Git.
+        - Ensure `_processing_stats` is not empty so the manifest can be computed and
+        crystallize the time this function is called as the `_generation_date` .
+        - Fetch the previous version of this manifest from S3, extract its media list.
+        - Generate the new media list given the previous one and `_processing_stats` .
+        - Compute the new title and corpus level statistics using the new media list.
+        - Compute the new version based on the performed updates.
+        - Define the `manifest_data` attribute corresponding to the final manifest.
+        - Optionally, dump it to JSON, export it to S3 and Git.
+
 
         Args:
             export_to_git_and_s3 (bool, optional): Whether to export the final
diff --git a/impresso_commons/versioning/data_statistics.py b/impresso_commons/versioning/data_statistics.py
index 241272e..8939864 100644
--- a/impresso_commons/versioning/data_statistics.py
+++ b/impresso_commons/versioning/data_statistics.py
@@ -190,9 +190,9 @@ class NewspaperStatistics(DataStatistics):
         element (str): The specific element associated with the statistics.
         count_keys (list[str]): The count keys for these statistics.
         counts (dict[str, int]): The count statistics computed on the specific data.
-        possible_count_keys (list[str]): All possible count keys for newspaper data.
     """
 
+    # All possible count keys for newspaper data.
     possible_count_keys = [
         "titles",
         "issues",