From e549e57b8fd7b029dd2577fa2c6cea3fb1ec9daf Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:48:07 +1000 Subject: [PATCH 001/222] Initial code inclusion, conversion from bash scripts --- bin/python-get-heroku-video-s3-acls.py | 67 ++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100755 bin/python-get-heroku-video-s3-acls.py diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py new file mode 100755 index 00000000..41086a87 --- /dev/null +++ b/bin/python-get-heroku-video-s3-acls.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Permissions required: +# heroku cli - access to app +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html + +import os +import subprocess + + +# Setup +# TODO See how difficult using native API calls would be. +HEROKU = "/usr/bin/heroku" +AWS = "/usr/local/bin/aws" + +TMPDIR = "/tmp/nzsl" +try: + os.makedirs(TMPDIR, exist_ok=True) +except OSError as err: + print(f"Error creating directory: {err}") + exit() + +NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" +NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" +S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" +S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" +S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" +S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" +for p in ( + NZSL_RAW_KEYS_FILE, + NZSL_COOKED_KEYS_FILE, + S3_BUCKET_RAW_KEYS_FILE, + S3_BUCKET_ERROR_KEYS_FILE, + S3_BUCKET_CONTENTS_FILE, + S3_KEYS_NOT_IN_NZSL +): + f = open(p, "a") + f.truncate() + f.close() + +RUN_MODE = "production" +if RUN_MODE == "production": + print("PRODUCTION") + NZSL_APP = "nzsl-signbank-production" + AWS_S3_BUCKET = "nzsl-signbank-media-production" +else: + print("STAGING") + NZSL_APP = "nzsl-signbank-uat" + AWS_S3_BUCKET = "nzsl-signbank-media-uat" + +new_env = os.environ.copy() +new_env["AWS_PROFILE"] = "nzsl" + + +# Get all keys from S3 +print("Getting raw S3 keys recursively ($AWS_S3_BUCKET) ...") +with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) +num_lines = sum(1 for _ in open(S3_BUCKET_RAW_KEYS_FILE)) +print(f"{num_lines} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + + + + From c07c66a160aafaf54b27676d57e2cd31432ff73e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:11:40 +1000 Subject: [PATCH 002/222] S3 retrieval working, NZSL Signbank retrieval working --- bin/python-get-heroku-video-s3-acls.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 41086a87..2a119fcc 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -54,14 +54,31 @@ # Get all keys from S3 -print("Getting raw S3 keys recursively ($AWS_S3_BUCKET) ...") +""" +print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) num_lines = sum(1 for _ in open(S3_BUCKET_RAW_KEYS_FILE)) 
print(f"{num_lines} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") +""" +# Get the video file keys from NZSL Signbank +print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") +with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", + "-c", "select videofile, is_public from video_glossvideo"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) +# Remove the first 2 and last 2 lines, as we cannot control pg:psql +with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: + lines = f_obj.readlines() + lines = lines[2:] + lines = lines[:-2] + for x in lines: + print(x) - +#num_lines = sum(1 for _ in open(NZSL_RAW_KEYS_FILE)) +#print(f"{num_lines} rows retrieved: {NZSL_RAW_KEYS_FILE}") From f2191b8a5e23a98d099a3d47906d2a15429d4efc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:15:48 +1000 Subject: [PATCH 003/222] pg:psql header and footer removed --- bin/python-get-heroku-video-s3-acls.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 2a119fcc..e748a575 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -71,14 +71,12 @@ "-c", "select videofile, is_public from video_glossvideo"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -# Remove the first 2 and last 2 lines, as we cannot control pg:psql +# Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: lines = f_obj.readlines() lines = lines[2:] lines = lines[:-2] - for x in lines: - print(x) - -#num_lines = sum(1 for _ in open(NZSL_RAW_KEYS_FILE)) -#print(f"{num_lines} rows retrieved: {NZSL_RAW_KEYS_FILE}") +with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + f_obj.writelines(lines) +print(f"{len(lines)} rows retrieved: {NZSL_RAW_KEYS_FILE}") From b5ed4b93bfb19a7230d0effd9b9179c8814999f1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:17:31 +1000 Subject: [PATCH 004/222] Sorting newlines --- bin/python-get-heroku-video-s3-acls.py | 53 +++++++++++++++++++++----- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index e748a575..41d3b897 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,9 +5,16 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# FIXME +# Currently pulling all data into text files the way the bash script +# this python script is based on did it. +# We may be able to get away with losing some of the files and doing +# most of it in memory. + + import os import subprocess - +from pprint import pprint # Setup # TODO See how difficult using native API calls would be. 
@@ -54,15 +61,26 @@ # Get all keys from S3 -""" print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -num_lines = sum(1 for _ in open(S3_BUCKET_RAW_KEYS_FILE)) -print(f"{num_lines} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") -""" + +# Get just the keys +# Put them in an in-memory list, stripping newlines +with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: + s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] +print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + +# Write them back to the file for completeness +with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + for line in s3_bucket_raw_keys_list: + f_obj.write(f"{line}\n") + +print(S3_BUCKET_RAW_KEYS_FILE) +print("DEBUG EXIT") +exit() # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") @@ -71,12 +89,27 @@ "-c", "select videofile, is_public from video_glossvideo"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) + +# Put them in an in-memory list, stripping newlines # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: - lines = f_obj.readlines() - lines = lines[2:] - lines = lines[:-2] + nzsl_raw_keys_list = [line.rstrip() for line in f_obj] + nzsl_raw_keys_list = nzsl_raw_keys_list[2:] + nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - f_obj.writelines(lines) -print(f"{len(lines)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + f_obj.writelines(nzsl_raw_keys_list) +print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") +#pprint(nzsl_raw_keys_list) + +# Write the NZSL keys to a dictionary so we can do fast operations on them +nzsl_raw_keys_dict = {} +for rawl in nzsl_raw_keys_list: + columns = rawl.split("|") + video_key = columns[0].strip() + is_public = columns[1].strip().lower() == 't' + nzsl_raw_keys_dict[video_key] = is_public +# Get the s3 keys present and absent from our NZSL keys +print("Getting S3 keys present and absent from NZSL Signbank ...") +nzsl_cooked_keys_list = [] +s3_keys_not_in_nzsl_list = [] From dfcba140e260e75d3a5a2daaf4873a462d261353 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:01:21 +1000 Subject: [PATCH 005/222] Differencing working --- bin/python-get-heroku-video-s3-acls.py | 33 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 41d3b897..4390ebd9 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -59,49 +59,47 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = "nzsl" - # Get all keys from S3 print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") +#TODO Change this to a file-like object with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -# Get just the keys -# Put them in an in-memory list, stripping newlines +# Separate out just the keys (also strips newlines) +# Put them in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: 
s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") -# Write them back to the file for completeness +# Write the keys back to the file with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: for line in s3_bucket_raw_keys_list: f_obj.write(f"{line}\n") -print(S3_BUCKET_RAW_KEYS_FILE) -print("DEBUG EXIT") -exit() - # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") +#TODO Change this to a file-like object with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", "-c", "select videofile, is_public from video_glossvideo"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -# Put them in an in-memory list, stripping newlines # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: - nzsl_raw_keys_list = [line.rstrip() for line in f_obj] + nzsl_raw_keys_list = f_obj.readlines() nzsl_raw_keys_list = nzsl_raw_keys_list[2:] nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] +print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + +# Put the raw lines back into the text file with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: f_obj.writelines(nzsl_raw_keys_list) -print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") -#pprint(nzsl_raw_keys_list) -# Write the NZSL keys to a dictionary so we can do fast operations on them +# Separate out the NZSL key columns +# Write them to a dictionary so we can do fast operations on them nzsl_raw_keys_dict = {} for rawl in nzsl_raw_keys_list: columns = rawl.split("|") @@ -113,3 +111,12 @@ print("Getting S3 keys present and absent from NZSL Signbank ...") nzsl_cooked_keys_list = [] s3_keys_not_in_nzsl_list = [] + +for video_key in s3_bucket_raw_keys_list: + if video_key in nzsl_raw_keys_dict: + nzsl_cooked_keys_list.append(video_key) + else: + s3_keys_not_in_nzsl_list.append(video_key) + +print(f"PRESENT: {len(nzsl_cooked_keys_list)} keys") +print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") From 4427cdffb196111d6e6a1b97ac11b79d4f5854ac Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:33:44 +1000 Subject: [PATCH 006/222] Comparing is_public with ACL return --- bin/python-get-heroku-video-s3-acls.py | 198 ++++++++++++++----------- 1 file changed, 114 insertions(+), 84 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 4390ebd9..5f75b833 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -7,45 +7,23 @@ # FIXME # Currently pulling all data into text files the way the bash script -# this python script is based on did it. -# We may be able to get away with losing some of the files and doing -# most of it in memory. +# that this python script is based on did it. +# We may be able to get away with losing some the files and doing most +# if not all of it in memory. import os import subprocess from pprint import pprint + +DEBUG = True + # Setup # TODO See how difficult using native API calls would be. 
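# A minimal sketch of what the native-API route in the TODO above could look
# like for the database half, using psycopg2 in place of shelling out to
# heroku pg:psql. An illustration under assumed dependencies (psycopg2
# installed, DATABASE_URL set in the environment), not code from any commit
# in this series:
import os
import psycopg2

conn = psycopg2.connect(os.environ["DATABASE_URL"])
with conn, conn.cursor() as cur:
    cur.execute("select videofile, is_public from video_glossvideo")
    # Rows come back as native Python types: no header/footer stripping,
    # no "|" splitting, and is_public is already a bool
    nzsl_raw_keys_dict = {
        videofile.strip(): is_public for videofile, is_public in cur.fetchall()
    }
conn.close()
print(f"{len(nzsl_raw_keys_dict)} rows retrieved")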
HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" -TMPDIR = "/tmp/nzsl" -try: - os.makedirs(TMPDIR, exist_ok=True) -except OSError as err: - print(f"Error creating directory: {err}") - exit() - -NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" -NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" -S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" -S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" -S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" -S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" -for p in ( - NZSL_RAW_KEYS_FILE, - NZSL_COOKED_KEYS_FILE, - S3_BUCKET_RAW_KEYS_FILE, - S3_BUCKET_ERROR_KEYS_FILE, - S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL -): - f = open(p, "a") - f.truncate() - f.close() - RUN_MODE = "production" if RUN_MODE == "production": print("PRODUCTION") @@ -59,64 +37,116 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = "nzsl" -# Get all keys from S3 -print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") -#TODO Change this to a file-like object -with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) - -# Separate out just the keys (also strips newlines) -# Put them in an in-memory list -with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: - s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] -print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") - -# Write the keys back to the file -with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - for line in s3_bucket_raw_keys_list: - f_obj.write(f"{line}\n") - -# Get the video file keys from NZSL Signbank -print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") -#TODO Change this to a file-like object -with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", - "-c", "select videofile, is_public from video_glossvideo"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) - -# Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting -with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: - nzsl_raw_keys_list = f_obj.readlines() - nzsl_raw_keys_list = nzsl_raw_keys_list[2:] - nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] -print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") - -# Put the raw lines back into the text file -with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - f_obj.writelines(nzsl_raw_keys_list) +TMPDIR = "/tmp/nzsl" +try: + os.makedirs(TMPDIR, exist_ok=True) +except OSError as err: + print(f"Error creating directory: {err}") + exit() +NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" +NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" +S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" +S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" +S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" +S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" -# Separate out the NZSL key columns -# Write them to a dictionary so we can do fast operations on them nzsl_raw_keys_dict = {} -for rawl in nzsl_raw_keys_list: - columns = rawl.split("|") - video_key = columns[0].strip() - is_public = columns[1].strip().lower() == 't' - nzsl_raw_keys_dict[video_key] = is_public - -# Get the s3 keys present and absent from our NZSL keys -print("Getting S3 keys present and absent from NZSL Signbank ...") 
-nzsl_cooked_keys_list = [] +nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] -for video_key in s3_bucket_raw_keys_list: - if video_key in nzsl_raw_keys_dict: - nzsl_cooked_keys_list.append(video_key) - else: - s3_keys_not_in_nzsl_list.append(video_key) +if not DEBUG: + for p in ( + NZSL_RAW_KEYS_FILE, + NZSL_COOKED_KEYS_FILE, + S3_BUCKET_RAW_KEYS_FILE, + S3_BUCKET_ERROR_KEYS_FILE, + S3_BUCKET_CONTENTS_FILE, + S3_KEYS_NOT_IN_NZSL + ): + f = open(p, "a") + f.truncate() + f.close() + + # Get all keys from S3 + print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") + # TODO Change this to a file-like object + with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) + + # Separate out just the keys (also strips newlines) + # Put them in an in-memory list + with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: + s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] + print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + + # Write the keys back to the file + with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + for line in s3_bucket_raw_keys_list: + f_obj.write(f"{line}\n") + + # Get the video file keys from NZSL Signbank + print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") + with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", + "-c", "select videofile, is_public from video_glossvideo"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) + + # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting + with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: + nzsl_raw_keys_list = f_obj.readlines() + nzsl_raw_keys_list = nzsl_raw_keys_list[2:] + nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] + print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + + # Put the raw lines back into the text file + with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + f_obj.writelines(nzsl_raw_keys_list) + + # Separate out the NZSL key columns + # Write them to a dictionary so we can do fast operations on them + for rawl in nzsl_raw_keys_list: + columns = rawl.split("|") + video_key = columns[0].strip() + is_public = columns[1].strip().lower() == 't' + nzsl_raw_keys_dict[video_key] = is_public + # for item in nzsl_raw_keys_dict.items(): + # print(item) + + # Get the s3 keys present and absent from our NZSL keys + print("Getting S3 keys present and absent from NZSL Signbank ...") + for video_key in s3_bucket_raw_keys_list: + if video_key in nzsl_raw_keys_dict: + nzsl_cooked_keys_dict[video_key] = nzsl_raw_keys_dict[video_key] + else: + s3_keys_not_in_nzsl_list.append(video_key) + print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") + print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") + # Write just the cooked keys back to a file + # This is mainly for Debug + with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: + for video_key, is_public in nzsl_cooked_keys_dict.items(): + f_obj.write(f"{video_key}, {str(is_public)}\n") + +if DEBUG: + # We used the ones we recorded on the last non-DEBUG run + with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: + for line in f_obj.readlines(): + video_key, is_public = line.strip().split(", ") + nzsl_cooked_keys_dict[video_key] = is_public + +# From the ones present, get all their ACL information +print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) 
...") +for video_key, is_public in nzsl_cooked_keys_dict.items(): + video_key = video_key.strip() + print(f"Key: {video_key}") + print(f"Public: {is_public}") + result = subprocess.run( + [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], + env=new_env, shell=False, check=True, + capture_output=True, text=True) + print(result.stdout) + -print(f"PRESENT: {len(nzsl_cooked_keys_list)} keys") -print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") From c4ecdcb26dcf6cbbf803a7502e6f8220577c5542 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:39:08 +1000 Subject: [PATCH 007/222] First pass at native boto s3 client use (messy) --- bin/python-get-heroku-video-s3-acls.py | 50 +++++++++++++++++--------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 5f75b833..28ddcf2f 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,22 +5,26 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# FIXME -# Currently pulling all data into text files the way the bash script -# that this python script is based on did it. -# We may be able to get away with losing some the files and doing most -# if not all of it in memory. - import os import subprocess +import boto3 from pprint import pprint +# Never store these in code +AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", None) +AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", None) + +print(AWS_ACCESS_KEY_ID) +print(AWS_SECRET_ACCESS_KEY) +# if DEBUG, we use the results stored in files and only process the ACLS online DEBUG = True # Setup # TODO See how difficult using native API calls would be. 
+# Answer: Heroku - no idea +# Answer: AWS - fairly simple HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" @@ -112,8 +116,6 @@ video_key = columns[0].strip() is_public = columns[1].strip().lower() == 't' nzsl_raw_keys_dict[video_key] = is_public - # for item in nzsl_raw_keys_dict.items(): - # print(item) # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") @@ -131,7 +133,7 @@ f_obj.write(f"{video_key}, {str(is_public)}\n") if DEBUG: - # We used the ones we recorded on the last non-DEBUG run + # We use the ones we recorded on the last non-DEBUG run with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): video_key, is_public = line.strip().split(", ") @@ -139,14 +141,30 @@ # From the ones present, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") +print("(Warning, this is a slow operation)") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() - print(f"Key: {video_key}") - print(f"Public: {is_public}") - result = subprocess.run( - [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, - capture_output=True, text=True) - print(result.stdout) + header = f"Key: {video_key}\nPublic: {is_public}" + + USE_S3_NATIVE = True + + if USE_S3_NATIVE: + # Be very careful, never write anything back + s3 = boto3.client( + "s3", + aws_access_key_id=AWS_ACCESS_KEY_ID, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + ) + acl = s3.get_object_acl(Key=video_key, Bucket=AWS_S3_BUCKET) + print(header) + pprint(acl) + else: + result = subprocess.run( + [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], + env=new_env, shell=False, check=True, capture_output=True, text=True) + print(f"Key: {video_key}") + print(f"Public: {is_public}") + print(header) + print(result.stdout) From 00e2d657e728cc5a84488e1dcc70dfa3fb32e41c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 26 Aug 2024 17:05:34 +1000 Subject: [PATCH 008/222] Revert "First pass at native boto s3 client use (messy)" Turns out setting up to use either path based or native is very un-simple. Try if necessary another time. This reverts commit c4ecdcb26dcf6cbbf803a7502e6f8220577c5542. --- bin/python-get-heroku-video-s3-acls.py | 50 +++++++++----------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 28ddcf2f..5f75b833 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,26 +5,22 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# FIXME +# Currently pulling all data into text files the way the bash script +# that this python script is based on did it. +# We may be able to get away with losing some the files and doing most +# if not all of it in memory. 
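# For future reference on the boto3 attempt reverted above: the blocker was
# wiring in raw access keys (which the first pass even printed). boto3 can
# instead reuse the same configured profile the CLI calls rely on. A minimal
# read-only sketch under that assumption -- the object key is a placeholder,
# and none of this is code from the commits in this series:
import boto3

session = boto3.Session(profile_name="nzsl")
s3 = session.client("s3")
acl = s3.get_object_acl(Bucket="nzsl-signbank-media-uat", Key="videos/example.mp4")
for grant in acl["Grants"]:
    grantee = grant["Grantee"]
    # Group grantees carry a URI; canonical users carry an ID
    print(grantee.get("URI", grantee.get("ID")), grant["Permission"])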
+ import os import subprocess -import boto3 from pprint import pprint -# Never store these in code -AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", None) -AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", None) - -print(AWS_ACCESS_KEY_ID) -print(AWS_SECRET_ACCESS_KEY) -# if DEBUG, we use the results stored in files and only process the ACLS online DEBUG = True # Setup # TODO See how difficult using native API calls would be. -# Answer: Heroku - no idea -# Answer: AWS - fairly simple HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" @@ -116,6 +112,8 @@ video_key = columns[0].strip() is_public = columns[1].strip().lower() == 't' nzsl_raw_keys_dict[video_key] = is_public + # for item in nzsl_raw_keys_dict.items(): + # print(item) # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") @@ -133,7 +131,7 @@ f_obj.write(f"{video_key}, {str(is_public)}\n") if DEBUG: - # We use the ones we recorded on the last non-DEBUG run + # We used the ones we recorded on the last non-DEBUG run with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): video_key, is_public = line.strip().split(", ") @@ -141,30 +139,14 @@ # From the ones present, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") -print("(Warning, this is a slow operation)") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() - header = f"Key: {video_key}\nPublic: {is_public}" - - USE_S3_NATIVE = True - - if USE_S3_NATIVE: - # Be very careful, never write anything back - s3 = boto3.client( - "s3", - aws_access_key_id=AWS_ACCESS_KEY_ID, - aws_secret_access_key=AWS_SECRET_ACCESS_KEY, - ) - acl = s3.get_object_acl(Key=video_key, Bucket=AWS_S3_BUCKET) - print(header) - pprint(acl) - else: - result = subprocess.run( - [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, capture_output=True, text=True) - print(f"Key: {video_key}") - print(f"Public: {is_public}") - print(header) - print(result.stdout) + print(f"Key: {video_key}") + print(f"Public: {is_public}") + result = subprocess.run( + [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], + env=new_env, shell=False, check=True, + capture_output=True, text=True) + print(result.stdout) From a487f596639a9fc40f272914714b8c2f6d50c799 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:25:02 +1000 Subject: [PATCH 009/222] Rudimentary command line parsing --- bin/python-get-heroku-video-s3-acls.py | 45 +++++++++++++------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 5f75b833..a2773423 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,22 +5,21 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# FIXME -# Currently pulling all data into text files the way the bash script -# that this python script is based on did it. -# We may be able to get away with losing some the files and doing most -# if not all of it in memory. 
- import os import subprocess +import argparse from pprint import pprint - -DEBUG = True +parser = argparse.ArgumentParser() +parser.add_argument("--cached", + default=False, + required=False, + action="store_true", + help="Use keys generated on a previous non-cache run") +args = parser.parse_args() # Setup -# TODO See how difficult using native API calls would be. HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" @@ -54,7 +53,18 @@ nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] -if not DEBUG: +if args.cached: + print("Using the video keys we recorded on the last non-cached run") + try: + with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: + for line in f_obj.readlines(): + video_key, is_public = line.strip().split(", ") + nzsl_cooked_keys_dict[video_key] = is_public + except FileNotFoundError: + print(f"File not found: {NZSL_COOKED_KEYS_FILE}") + exit() + print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") +else: for p in ( NZSL_RAW_KEYS_FILE, NZSL_COOKED_KEYS_FILE, @@ -69,7 +79,6 @@ # Get all keys from S3 print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") - # TODO Change this to a file-like object with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, @@ -130,14 +139,8 @@ for video_key, is_public in nzsl_cooked_keys_dict.items(): f_obj.write(f"{video_key}, {str(is_public)}\n") -if DEBUG: - # We used the ones we recorded on the last non-DEBUG run - with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: - for line in f_obj.readlines(): - video_key, is_public = line.strip().split(", ") - nzsl_cooked_keys_dict[video_key] = is_public -# From the ones present, get all their ACL information +# From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() @@ -145,8 +148,6 @@ print(f"Public: {is_public}") result = subprocess.run( [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, - capture_output=True, text=True) + env=new_env, shell=False, check=True, + capture_output=True, text=True) print(result.stdout) - - From b58f6bf37fc4b627a4058444c8f864b5cc5b1897 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:26:20 +1000 Subject: [PATCH 010/222] Rename --- bin/{python-get-heroku-video-s3-acls.py => get-video-s3-acls.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{python-get-heroku-video-s3-acls.py => get-video-s3-acls.py} (100%) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/get-video-s3-acls.py similarity index 100% rename from bin/python-get-heroku-video-s3-acls.py rename to bin/get-video-s3-acls.py From 58d2a9aaf4f0d50dfb5931a90d6c665274304672 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:58:31 +1000 Subject: [PATCH 011/222] Better command line arguments --- bin/get-video-s3-acls.py | 45 +++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a2773423..e947cc46 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,36 +11,58 @@ import argparse from pprint import pprint -parser = argparse.ArgumentParser() +HEROKU = "/usr/bin/heroku" 
+AWS = "/usr/local/bin/aws" + +parser = argparse.ArgumentParser(epilog="You must have a configured AWS profile to use this app. See the --awsprofile argument.") parser.add_argument("--cached", default=False, required=False, action="store_true", - help="Use keys generated on a previous non-cache run") + help="Use keys generated on a previous non-cache run (default: False)") +parser.add_argument("--production", + default=False, + required=False, + action="store_true", + help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)") +parser.add_argument("--pgclient", + default=HEROKU, + required=False, + help=f"Postgres client path (default: {HEROKU})") +parser.add_argument("--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: 'nzsl')") +parser.add_argument("--s3client", + default=AWS, + required=False, + help=f"AWS S3 client path (default: {AWS})") args = parser.parse_args() -# Setup -HEROKU = "/usr/bin/heroku" -AWS = "/usr/local/bin/aws" +HEROKU = args.pgclient +AWS = args.s3client -RUN_MODE = "production" -if RUN_MODE == "production": - print("PRODUCTION") +if args.production: + print("Mode: PRODUCTION") NZSL_APP = "nzsl-signbank-production" AWS_S3_BUCKET = "nzsl-signbank-media-production" else: - print("STAGING") + print("Mode: STAGING") NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" new_env = os.environ.copy() -new_env["AWS_PROFILE"] = "nzsl" +new_env["AWS_PROFILE"] = args.awsprofile + +print(f"Target NZSL app: {NZSL_APP}") +print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") +print(f"AWS profile using: {new_env['AWS_PROFILE']}") TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: - print(f"Error creating directory: {err}") + print(f"Error creating temporary directory: {TMPDIR} {err}") exit() NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" @@ -139,7 +161,6 @@ for video_key, is_public in nzsl_cooked_keys_dict.items(): f_obj.write(f"{video_key}, {str(is_public)}\n") - # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): From da626ae1e198ecca8e22922c5be58865f5e9da5e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:31:16 +1000 Subject: [PATCH 012/222] Command line arguments and external apps codified --- bin/get-video-s3-acls.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e947cc46..832a0b28 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,10 +11,13 @@ import argparse from pprint import pprint -HEROKU = "/usr/bin/heroku" +PGCLIENT = "/usr/bin/psql" AWS = "/usr/local/bin/aws" parser = argparse.ArgumentParser(epilog="You must have a configured AWS profile to use this app. See the --awsprofile argument.") +parser.add_argument("--dburl", + required=True, + help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)") parser.add_argument("--cached", default=False, required=False, @@ -26,34 +29,36 @@ action="store_true", help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)") parser.add_argument("--pgclient", - default=HEROKU, + default=PGCLIENT, required=False, - help=f"Postgres client path (default: {HEROKU})") + help=f"Postgres client path (default: {PGCLIENT})") parser.add_argument("--awsprofile", default="nzsl", required=False, help=f"AWS configured profile to use (default: 'nzsl')") -parser.add_argument("--s3client", +parser.add_argument("--awsclient", default=AWS, required=False, - help=f"AWS S3 client path (default: {AWS})") + help=f"AWS client path (default: {AWS})") args = parser.parse_args() -HEROKU = args.pgclient -AWS = args.s3client +DATABASE_URL = args.dburl +PGCLIENT = args.pgclient +AWS = args.awsclient if args.production: - print("Mode: PRODUCTION") + MODE_STR = "PRODUCTION" NZSL_APP = "nzsl-signbank-production" AWS_S3_BUCKET = "nzsl-signbank-media-production" else: - print("Mode: STAGING") + MODE_STR = "STAGING" NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" new_env = os.environ.copy() new_env["AWS_PROFILE"] = args.awsprofile +print(f"Mode: {MODE_STR}") print(f"Target NZSL app: {NZSL_APP}") print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") print(f"AWS profile using: {new_env['AWS_PROFILE']}") @@ -76,7 +81,7 @@ s3_keys_not_in_nzsl_list = [] if args.cached: - print("Using the video keys we recorded on the last non-cached run") + print("Using the video keys we recorded on the last non-cached run.") try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): @@ -87,6 +92,7 @@ exit() print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") else: + print("Generating keys from scratch.") for p in ( NZSL_RAW_KEYS_FILE, NZSL_COOKED_KEYS_FILE, @@ -120,31 +126,26 @@ # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", - "-c", "select videofile, is_public from video_glossvideo"], + result = subprocess.run([PGCLIENT, + "-t", + "-c", "select videofile, is_public from video_glossvideo", + f"{DATABASE_URL}"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) - - # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - nzsl_raw_keys_list = nzsl_raw_keys_list[2:] - nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") - # Put the raw lines back into the text file - with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - f_obj.writelines(nzsl_raw_keys_list) - # Separate out the NZSL key columns # Write them to a dictionary so we can do fast operations on them for rawl in nzsl_raw_keys_list: + rawl = rawl.strip() + if not rawl: + continue columns = rawl.split("|") video_key = columns[0].strip() is_public = columns[1].strip().lower() == 't' nzsl_raw_keys_dict[video_key] = is_public - # for item in nzsl_raw_keys_dict.items(): - # print(item) # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") From 3fdb0b21483b04858598e471e6f9fc695a8778ed Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:32:24 
+1000 Subject: [PATCH 013/222] Comments --- bin/get-video-s3-acls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 832a0b28..3b6498d1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,6 +11,9 @@ import argparse from pprint import pprint +# TODO +# We are using external apps just for the moment. +# These will be removed for native libraries. PGCLIENT = "/usr/bin/psql" AWS = "/usr/local/bin/aws" From 026c2fa98f6e991095738b650a8b9809035eb891 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:41:20 +1000 Subject: [PATCH 014/222] black --- bin/get-video-s3-acls.py | 129 +++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3b6498d1..e03327db 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,32 +17,44 @@ PGCLIENT = "/usr/bin/psql" AWS = "/usr/local/bin/aws" -parser = argparse.ArgumentParser(epilog="You must have a configured AWS profile to use this app. See the --awsprofile argument.") -parser.add_argument("--dburl", - required=True, - help=f"(REQUIRED) Database url (e.g. value of DATABASE_URL on Heroku)") -parser.add_argument("--cached", - default=False, - required=False, - action="store_true", - help="Use keys generated on a previous non-cache run (default: False)") -parser.add_argument("--production", - default=False, - required=False, - action="store_true", - help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)") -parser.add_argument("--pgclient", - default=PGCLIENT, - required=False, - help=f"Postgres client path (default: {PGCLIENT})") -parser.add_argument("--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: 'nzsl')") -parser.add_argument("--awsclient", - default=AWS, - required=False, - help=f"AWS client path (default: {AWS})") +parser = argparse.ArgumentParser( + epilog="You must have a configured AWS profile to use this app. See the --awsprofile " + "argument." +) +parser.add_argument( + "--dburl", + required=True, + help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)", +) +parser.add_argument( + "--cached", + default=False, + required=False, + action="store_true", + help="Use keys generated on a previous non-cache run (default: False)", +) +parser.add_argument( + "--production", + default=False, + required=False, + action="store_true", + help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)", +) +parser.add_argument( + "--pgclient", + default=PGCLIENT, + required=False, + help=f"Postgres client path (default: {PGCLIENT})", +) +parser.add_argument( + "--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: 'nzsl')", +) +parser.add_argument( + "--awsclient", default=AWS, required=False, help=f"AWS client path (default: {AWS})" +) args = parser.parse_args() DATABASE_URL = args.dburl @@ -97,12 +109,12 @@ else: print("Generating keys from scratch.") for p in ( - NZSL_RAW_KEYS_FILE, - NZSL_COOKED_KEYS_FILE, - S3_BUCKET_RAW_KEYS_FILE, - S3_BUCKET_ERROR_KEYS_FILE, - S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL + NZSL_RAW_KEYS_FILE, + NZSL_COOKED_KEYS_FILE, + S3_BUCKET_RAW_KEYS_FILE, + S3_BUCKET_ERROR_KEYS_FILE, + S3_BUCKET_CONTENTS_FILE, + S3_KEYS_NOT_IN_NZSL, ): f = open(p, "a") f.truncate() @@ -111,9 +123,14 @@ # Get all keys from S3 print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) + result = subprocess.run( + [AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + env=new_env, + shell=False, + check=True, + text=True, + stdout=f_obj, + ) # Separate out just the keys (also strips newlines) # Put them in an in-memory list @@ -129,12 +146,20 @@ # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([PGCLIENT, - "-t", - "-c", "select videofile, is_public from video_glossvideo", - f"{DATABASE_URL}"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) + result = subprocess.run( + [ + PGCLIENT, + "-t", + "-c", + "select videofile, is_public from video_glossvideo", + f"{DATABASE_URL}", + ], + env=new_env, + shell=False, + check=True, + text=True, + stdout=f_obj, + ) with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") @@ -147,7 +172,7 @@ continue columns = rawl.split("|") video_key = columns[0].strip() - is_public = columns[1].strip().lower() == 't' + is_public = columns[1].strip().lower() == "t" nzsl_raw_keys_dict[video_key] = is_public # Get the s3 keys present and absent from our NZSL keys @@ -172,7 +197,21 @@ print(f"Key: {video_key}") print(f"Public: {is_public}") result = subprocess.run( - [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, - capture_output=True, text=True) + [ + AWS, + "s3api", + "get-object-acl", + "--output", + "text", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=new_env, + shell=False, + check=True, + capture_output=True, + text=True, + ) print(result.stdout) From cc6acb2e6d9f0fcd8536e6546cb2c8f3d5696cd9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:45:08 +1000 Subject: [PATCH 015/222] 
Better arguments --- bin/get-video-s3-acls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e03327db..401981bd 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,11 +21,12 @@ epilog="You must have a configured AWS profile to use this app. See the --awsprofile " "argument." ) +# Positional args parser.add_argument( - "--dburl", - required=True, + "dburl", help=f"(REQUIRED) Database url (e.g. value of DATABASE_URL on Heroku)", ) +# Named args parser.add_argument( "--cached", default=False, From f9019fd79401b51e353a592c2ed97655f026c6b4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:58:27 +1000 Subject: [PATCH 016/222] Better args --- bin/get-video-s3-acls.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 401981bd..b1687607 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -32,29 +32,33 @@ default=False, required=False, action="store_true", - help="Use keys generated on a previous non-cache run (default: False)", + help="Use keys generated on a previous non-cache run (default: %(default)s) " + "(Don't mix PRODUCTION and STAGING!)", ) parser.add_argument( "--production", default=False, required=False, action="store_true", - help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)", + help="Run in PRODUCTION mode, instead of STAGING (default: %(default)s)", ) parser.add_argument( "--pgclient", default=PGCLIENT, required=False, - help=f"Postgres client path (default: {PGCLIENT})", + help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( - "--awsprofile", - default="nzsl", + "--awsclient", + default=AWS, required=False, - help=f"AWS configured profile to use (default: 'nzsl')", + help=f"AWS client path (default: %(default)s)", ) parser.add_argument( - "--awsclient", default=AWS, required=False, help=f"AWS client path (default: {AWS})" + "--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: '%(default)s')", ) args = parser.parse_args() @@ -195,8 +199,6 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() - print(f"Key: {video_key}") - print(f"Public: {is_public}") result = subprocess.run( [ AWS, @@ -215,4 +217,6 @@ capture_output=True, text=True, ) + print(f"Key: {video_key}") + print(f"Public: {is_public}") print(result.stdout) From 32cf39d425174b142b872921e70c855fc337d8c8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:01:43 +1000 Subject: [PATCH 017/222] Better arg help and ordering --- bin/get-video-s3-acls.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b1687607..b42f15ba 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -18,22 +18,20 @@ AWS = "/usr/local/bin/aws" parser = argparse.ArgumentParser( - epilog="You must have a configured AWS profile to use this app. See the --awsprofile " + description="You must have a configured AWS profile to use this app. See the --awsprofile " "argument." ) -# Positional args +# Positional arguments parser.add_argument( "dburl", help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)", ) -# Named args +# Optional arguments parser.add_argument( - "--cached", - default=False, + "--awsprofile", + default="nzsl", required=False, - action="store_true", - help="Use keys generated on a previous non-cache run (default: %(default)s) " - "(Don't mix PRODUCTION and STAGING!)", + help=f"AWS configured profile to use (default: '%(default)s')", ) parser.add_argument( "--production", @@ -42,6 +40,14 @@ action="store_true", help="Run in PRODUCTION mode, instead of STAGING (default: %(default)s)", ) +parser.add_argument( + "--cached", + default=False, + required=False, + action="store_true", + help="Use keys generated on a previous non-cached run (default: %(default)s) " + "(Don't mix PRODUCTION and STAGING!)", +) parser.add_argument( "--pgclient", default=PGCLIENT, @@ -54,12 +60,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: '%(default)s')", -) args = parser.parse_args() DATABASE_URL = args.dburl From 49ea76266e33cbb6a7b14bc1ebd4bb7b0793e19c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:52:18 +1000 Subject: [PATCH 018/222] Better cached handling --- bin/get-video-s3-acls.py | 65 +++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b42f15ba..a6142e35 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -15,23 +15,29 @@ # We are using external apps just for the moment. # These will be removed for native libraries. PGCLIENT = "/usr/bin/psql" -AWS = "/usr/local/bin/aws" +AWSCLIENT = "/usr/local/bin/aws" + +# NZSL: Is there a database url defined in the environment? +DATABASE_URL = os.getenv("DATABASE_URL", None) parser = argparse.ArgumentParser( - description="You must have a configured AWS profile to use this app. See the --awsprofile " + description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " "argument." ) # Positional arguments -parser.add_argument( - "dburl", - help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)", -) +if DATABASE_URL: + print("DATABASE_URL defined in environment") +else: + parser.add_argument( + "dburl", + help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", + ) # Optional arguments parser.add_argument( "--awsprofile", default="nzsl", required=False, - help=f"AWS configured profile to use (default: '%(default)s')", + help=f"AWSCLIENT configured profile to use (default: '%(default)s')", ) parser.add_argument( "--production", @@ -56,16 +62,12 @@ ) parser.add_argument( "--awsclient", - default=AWS, + default=AWSCLIENT, required=False, - help=f"AWS client path (default: %(default)s)", + help=f"AWSCLIENT client path (default: %(default)s)", ) args = parser.parse_args() -DATABASE_URL = args.dburl -PGCLIENT = args.pgclient -AWS = args.awsclient - if args.production: MODE_STR = "PRODUCTION" NZSL_APP = "nzsl-signbank-production" @@ -78,10 +80,22 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = args.awsprofile +PGCLIENT = args.pgclient +AWSCLIENT = args.awsclient + +if not DATABASE_URL: + DATABASE_URL = args.dburl + +if args.cached: + print("Using the video keys we recorded on the last non-cached run.") + print(f"Mode: {MODE_STR}") print(f"Target NZSL app: {NZSL_APP}") print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") print(f"AWS profile using: {new_env['AWS_PROFILE']}") +print(f"PGCLIENT: {PGCLIENT}") +print(f"AWSCLIENT: {AWSCLIENT}") +print(f"DATABASE_URL:\n{DATABASE_URL}") TMPDIR = "/tmp/nzsl" try: @@ -94,14 +108,13 @@ S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" -S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" +S3_KEYS_NOT_IN_NZSL_FILE = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" nzsl_raw_keys_dict = {} nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] if args.cached: - print("Using the video keys we recorded on the last non-cached run.") try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): @@ -110,7 +123,14 @@ except FileNotFoundError: print(f"File not found: {NZSL_COOKED_KEYS_FILE}") exit() + try: + with open(S3_KEYS_NOT_IN_NZSL_FILE, "r") as f_obj: + s3_keys_not_in_nzsl_list = [line.strip() for line in f_obj.readlines()] + except FileNotFoundError: + print(f"File not found: {S3_KEYS_NOT_IN_NZSL_FILE}") + exit() print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") + print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") else: print("Generating keys from scratch.") for p in ( @@ -119,7 +139,7 @@ S3_BUCKET_RAW_KEYS_FILE, S3_BUCKET_ERROR_KEYS_FILE, S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL, + S3_KEYS_NOT_IN_NZSL_FILE, ): f = open(p, "a") f.truncate() @@ -129,7 +149,7 @@ print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( - [AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, @@ -189,19 +209,24 @@ s3_keys_not_in_nzsl_list.append(video_key) print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") - # Write just the cooked keys back to a file - # This is mainly for Debug + + # Write the "cooked" (i.e. 
present) keys back to a file with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: for video_key, is_public in nzsl_cooked_keys_dict.items(): f_obj.write(f"{video_key}, {str(is_public)}\n") + # Write the absent keys back to a file + with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: + for video_key in s3_keys_not_in_nzsl_list: + f_obj.write(f"{video_key}\n") + # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() result = subprocess.run( [ - AWS, + AWSCLIENT, "s3api", "get-object-acl", "--output", From e692cb9d146707675013ca32eb38a61669282488 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:52:49 +1000 Subject: [PATCH 019/222] Debug removed --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a6142e35..bdab8285 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -9,7 +9,6 @@ import os import subprocess import argparse -from pprint import pprint # TODO # We are using external apps just for the moment. From ed40d189af4ae7b7ebf4280069ad105bb7d7c108 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:55:32 +1000 Subject: [PATCH 020/222] Cut n pasted text fixed --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bdab8285..1d8de80a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -36,7 +36,7 @@ "--awsprofile", default="nzsl", required=False, - help=f"AWSCLIENT configured profile to use (default: '%(default)s')", + help=f"AWS configured profile to use (default: '%(default)s')", ) parser.add_argument( "--production", @@ -63,7 +63,7 @@ "--awsclient", default=AWSCLIENT, required=False, - help=f"AWSCLIENT client path (default: %(default)s)", + help=f"AWS client path (default: %(default)s)", ) args = parser.parse_args() From 29e0fda91387d62db83d9db613331a44822e73f1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:11:32 +1000 Subject: [PATCH 021/222] Incremental improvements --- bin/get-video-s3-acls.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1d8de80a..2a1bde6f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -13,8 +13,8 @@ # TODO # We are using external apps just for the moment. # These will be removed for native libraries. -PGCLIENT = "/usr/bin/psql" AWSCLIENT = "/usr/local/bin/aws" +PGCLIENT = "/usr/bin/psql" # NZSL: Is there a database url defined in the environment? 
DATABASE_URL = os.getenv("DATABASE_URL", None) @@ -79,21 +79,23 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = args.awsprofile -PGCLIENT = args.pgclient AWSCLIENT = args.awsclient +PGCLIENT = args.pgclient if not DATABASE_URL: DATABASE_URL = args.dburl if args.cached: print("Using the video keys we recorded on the last non-cached run.") +else: + print("Generating keys from scratch.") -print(f"Mode: {MODE_STR}") -print(f"Target NZSL app: {NZSL_APP}") -print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") -print(f"AWS profile using: {new_env['AWS_PROFILE']}") -print(f"PGCLIENT: {PGCLIENT}") -print(f"AWSCLIENT: {AWSCLIENT}") +print(f"Mode: {MODE_STR}") +print(f"NZSL app: {NZSL_APP}") +print(f"AWS S3 bucket: {AWS_S3_BUCKET}") +print(f"AWS profile: {new_env['AWS_PROFILE']}") +print(f"AWSCLIENT: {AWSCLIENT}") +print(f"PGCLIENT: {PGCLIENT}") print(f"DATABASE_URL:\n{DATABASE_URL}") TMPDIR = "/tmp/nzsl" @@ -104,6 +106,7 @@ exit() NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" +COOKED_DELIMITER = ", " S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" @@ -114,10 +117,11 @@ s3_keys_not_in_nzsl_list = [] if args.cached: + # Pull all info from existing files try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - video_key, is_public = line.strip().split(", ") + video_key, is_public = line.strip().split(COOKED_DELIMITER) nzsl_cooked_keys_dict[video_key] = is_public except FileNotFoundError: print(f"File not found: {NZSL_COOKED_KEYS_FILE}") @@ -131,7 +135,7 @@ print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") else: - print("Generating keys from scratch.") + # Zero-out files for p in ( NZSL_RAW_KEYS_FILE, NZSL_COOKED_KEYS_FILE, @@ -144,8 +148,8 @@ f.truncate() f.close() - # Get all keys from S3 - print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") + # Get all keys from AWS S3 + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], @@ -212,7 +216,7 @@ # Write the "cooked" (i.e. 
present) keys back to a file with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: for video_key, is_public in nzsl_cooked_keys_dict.items(): - f_obj.write(f"{video_key}, {str(is_public)}\n") + f_obj.write(f"{video_key}{COOKED_DELIMITER}{str(is_public)}\n") # Write the absent keys back to a file with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: From 516f6a7a367a4dee3ff900faa27f98d9fab0b674 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:06:24 +1000 Subject: [PATCH 022/222] Minor feedback text fix --- bin/get-video-s3-acls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2a1bde6f..707fb78d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -90,12 +90,12 @@ else: print("Generating keys from scratch.") -print(f"Mode: {MODE_STR}") -print(f"NZSL app: {NZSL_APP}") -print(f"AWS S3 bucket: {AWS_S3_BUCKET}") -print(f"AWS profile: {new_env['AWS_PROFILE']}") -print(f"AWSCLIENT: {AWSCLIENT}") -print(f"PGCLIENT: {PGCLIENT}") +print(f"Mode: {MODE_STR}") +print(f" NZSL app: {NZSL_APP}") +print(f" S3 bucket: {AWS_S3_BUCKET}") +print(f"AWS profile: {new_env['AWS_PROFILE']}") +print(f"AWSCLIENT: {AWSCLIENT}") +print(f"PGCLIENT: {PGCLIENT}") print(f"DATABASE_URL:\n{DATABASE_URL}") TMPDIR = "/tmp/nzsl" From dbb1a43d40bc6650df5a40de51faa3bf6fe552f4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:36:01 +1000 Subject: [PATCH 023/222] set_public() functions renamed for clarity --- signbank/video/admin.py | 10 +++++----- signbank/video/models.py | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/signbank/video/admin.py b/signbank/video/admin.py index f2d62811..5c1282b7 100644 --- a/signbank/video/admin.py +++ b/signbank/video/admin.py @@ -75,18 +75,18 @@ def queryset(self, request, queryset): return queryset -def set_public(modeladmin, request, queryset): +def admin_set_public(modeladmin, request, queryset): for glossvideo in queryset.all(): glossvideo.set_public(True) -def set_hidden(modeladmin, request, queryset): +def admin_set_hidden(modeladmin, request, queryset): for glossvideo in queryset.all(): glossvideo.set_public(False) -set_public.short_description = _lazy("Set selected videos public") -set_hidden.short_description = _lazy("Set selected videos hidden") +admin_set_public.short_description = _lazy("Set selected videos public") +admin_set_hidden.short_description = _lazy("Set selected videos hidden") class GlossVideoAdmin(admin.ModelAdmin): @@ -98,7 +98,7 @@ class GlossVideoAdmin(admin.ModelAdmin): 'videofile', 'video_type', 'posterfile', 'id', 'version') list_filter = ('is_public', 'video_type', 'gloss__dataset', HasGlossFilter, 'dataset', HasPosterFilter, GlossesVideoCountFilter) - actions = [set_public, set_hidden] + actions = [admin_set_public, admin_set_hidden] def get_queryset(self, request): qs = super(GlossVideoAdmin, self).get_queryset(request) diff --git a/signbank/video/models.py b/signbank/video/models.py index 8de46671..e1617765 100644 --- a/signbank/video/models.py +++ b/signbank/video/models.py @@ -48,7 +48,7 @@ def public_url(self, name): return f'{domain}{path}' - def set_public(self, name, is_public): + def set_public_acl(self, name, is_public): """ Set the object ACL on the object. 
This is only supported for S3 storage, and is a no-op for local file storage """ @@ -62,8 +62,6 @@ def set_public(self, name, is_public): ) - - class GlossVideo(models.Model): """A video that represents a particular idgloss""" #: Descriptive title of the GlossVideo. @@ -247,7 +245,7 @@ def is_video(self): def set_public(self, is_public): self.is_public = is_public self.save() - self.videofile.storage.set_public(self.videofile.name, is_public) + self.videofile.storage.set_public_acl(self.videofile.name, is_public) True From 1c294a29fe3cd05058eea8708f2ded24ded82df5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 15:52:54 +1000 Subject: [PATCH 024/222] Basics of final output collection working --- bin/get-video-s3-acls.py | 67 +++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 707fb78d..5f587ec1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Permissions required: -# heroku cli - access to app +# psql - access to heroku app's postgres # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html @@ -9,6 +9,7 @@ import os import subprocess import argparse +from pprint import pprint # TODO # We are using external apps just for the moment. @@ -106,22 +107,28 @@ exit() NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" -COOKED_DELIMITER = ", " +CSV_DELIMITER = ", " S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" S3_KEYS_NOT_IN_NZSL_FILE = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" +ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" nzsl_raw_keys_dict = {} nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] +# TODO This will replace everything +all_keys_dict = {} + if args.cached: + print("NOT READY!") + exit() # Pull all info from existing files try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - video_key, is_public = line.strip().split(COOKED_DELIMITER) + video_key, is_public = line.strip().split(CSV_DELIMITER) nzsl_cooked_keys_dict[video_key] = is_public except FileNotFoundError: print(f"File not found: {NZSL_COOKED_KEYS_FILE}") @@ -143,6 +150,7 @@ S3_BUCKET_ERROR_KEYS_FILE, S3_BUCKET_CONTENTS_FILE, S3_KEYS_NOT_IN_NZSL_FILE, + ALL_KEYS_FILE ): f = open(p, "a") f.truncate() @@ -160,26 +168,26 @@ stdout=f_obj, ) - # Separate out just the keys (also strips newlines) - # Put them in an in-memory list + # Separate out just the key (also strip newline) from date, time, size, key + # Put the keys in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") - # Write the keys back to the file + # Write the keys back to the file, for cleanliness with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: for line in s3_bucket_raw_keys_list: f_obj.write(f"{line}\n") - # Get the video file keys from NZSL Signbank - print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") + # Get the video files info from NZSL Signbank + print(f"Getting raw list of video file info from NZSL Signbank 
({NZSL_APP}) ...") with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ PGCLIENT, "-t", "-c", - "select videofile, is_public from video_glossvideo", + "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", f"{DATABASE_URL}", ], env=new_env, @@ -193,40 +201,63 @@ print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") # Separate out the NZSL key columns - # Write them to a dictionary so we can do fast operations on them + # Write them to a dictionary, so we can do fast operations on them for rawl in nzsl_raw_keys_list: rawl = rawl.strip() if not rawl: continue columns = rawl.split("|") - video_key = columns[0].strip() - is_public = columns[1].strip().lower() == "t" - nzsl_raw_keys_dict[video_key] = is_public + db_id = columns[0].strip() + gloss_id = columns[1].strip() + is_public = columns[2].strip().lower() == "t" + # 'videofile' data is also the key for S3 + video_key = columns[3].strip() + # Each dictionary entry is all of these values + nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") + nkeys_present = 0 + nkeys_absent = 0 for video_key in s3_bucket_raw_keys_list: if video_key in nzsl_raw_keys_dict: + nkeys_present += 1 + # Add 'Present' column to start + all_keys_dict[video_key] = [True] + nzsl_raw_keys_dict[video_key] nzsl_cooked_keys_dict[video_key] = nzsl_raw_keys_dict[video_key] else: + nkeys_absent += 1 s3_keys_not_in_nzsl_list.append(video_key) - print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") - print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") + # Add 'Present' (absent) column to start + all_keys_dict[video_key] = [False, "", "", ""] + print(f"PRESENT: {nkeys_present} keys") + print(f"ABSENT: {nkeys_absent} keys") # Write the "cooked" (i.e. 
present) keys back to a file with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: for video_key, is_public in nzsl_cooked_keys_dict.items(): - f_obj.write(f"{video_key}{COOKED_DELIMITER}{str(is_public)}\n") + f_obj.write(f"{video_key}{CSV_DELIMITER}{str(is_public)}\n") # Write the absent keys back to a file with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: for video_key in s3_keys_not_in_nzsl_list: f_obj.write(f"{video_key}\n") + # Write all keys back to a file + with open(ALL_KEYS_FILE, "w") as f_obj: + for video_key, item_list in all_keys_dict.items(): + outstr = f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + f_obj.write(outstr) + # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") -for video_key, is_public in nzsl_cooked_keys_dict.items(): - video_key = video_key.strip() +for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): + if not is_present: + continue + + print("HUMPHREY") + print(video_key) + result = subprocess.run( [ AWSCLIENT, From eab29ba176017e32a484eae7d073598db3f4ad94 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:08:47 +1000 Subject: [PATCH 025/222] Basics of final output collection working --- bin/get-video-s3-acls.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5f587ec1..c72dd7ab 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -255,7 +255,6 @@ if not is_present: continue - print("HUMPHREY") print(video_key) result = subprocess.run( @@ -276,6 +275,8 @@ capture_output=True, text=True, ) - print(f"Key: {video_key}") - print(f"Public: {is_public}") + print(f"Key: {video_key}") + print(f"Public: {is_public}") + print(f"db_id: {db_id}") + print(f"gloss_id: {gloss_id}") print(result.stdout) From 54595ffe63f930175dd4447ef66263dd231e2c1a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:34:19 +1000 Subject: [PATCH 026/222] About to remove legacy files output --- bin/get-video-s3-acls.py | 43 +++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c72dd7ab..00a31dfb 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -105,9 +105,9 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}") exit() +CSV_DELIMITER = "," NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" -CSV_DELIMITER = ", " S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" @@ -120,11 +120,45 @@ # TODO This will replace everything all_keys_dict = {} +nkeys_present = 0 +nkeys_absent = 0 if args.cached: - print("NOT READY!") - exit() # Pull all info from existing files + try: + with open(ALL_KEYS_FILE, "r") as f_obj: + for line in f_obj.readlines(): + + print(line, end="") + + video_key, is_present_str, db_id_str, gloss_id_str, is_public_str = line.strip().split(CSV_DELIMITER) + + is_present = is_present_str.strip().lower() == "true" + if is_present: + nkeys_present += 1 + db_id = int(db_id_str) + try: + gloss_id = int(gloss_id_str) + except ValueError: + gloss_id = None + is_public = 
is_public_str.strip().lower() == "true" + else: + nkeys_absent += 1 + db_id = None + gloss_id = None + is_public = None + + all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + + print(video_key, end=" ") + pprint(all_keys_dict[video_key]) + + print(f"PRESENT: {nkeys_present} keys") + print(f"ABSENT: {nkeys_absent} keys") + except FileNotFoundError: + print(f"File not found: {ALL_KEYS_FILE}") + exit() + """ try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): @@ -141,6 +175,7 @@ exit() print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") + """ else: # Zero-out files for p in ( @@ -217,8 +252,6 @@ # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") - nkeys_present = 0 - nkeys_absent = 0 for video_key in s3_bucket_raw_keys_list: if video_key in nzsl_raw_keys_dict: nkeys_present += 1 From c8c51620f7b8841e1bcddf66e8074cc3f56d8859 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:39:39 +1000 Subject: [PATCH 027/222] Legacy output files removed --- bin/get-video-s3-acls.py | 44 ++-------------------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 00a31dfb..bd68d607 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -105,21 +105,15 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}") exit() + CSV_DELIMITER = "," NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" -NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" -S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" -S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" -S3_KEYS_NOT_IN_NZSL_FILE = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" nzsl_raw_keys_dict = {} -nzsl_cooked_keys_dict = {} -s3_keys_not_in_nzsl_list = [] - -# TODO This will replace everything all_keys_dict = {} + nkeys_present = 0 nkeys_absent = 0 @@ -158,33 +152,11 @@ except FileNotFoundError: print(f"File not found: {ALL_KEYS_FILE}") exit() - """ - try: - with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: - for line in f_obj.readlines(): - video_key, is_public = line.strip().split(CSV_DELIMITER) - nzsl_cooked_keys_dict[video_key] = is_public - except FileNotFoundError: - print(f"File not found: {NZSL_COOKED_KEYS_FILE}") - exit() - try: - with open(S3_KEYS_NOT_IN_NZSL_FILE, "r") as f_obj: - s3_keys_not_in_nzsl_list = [line.strip() for line in f_obj.readlines()] - except FileNotFoundError: - print(f"File not found: {S3_KEYS_NOT_IN_NZSL_FILE}") - exit() - print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") - print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") - """ else: # Zero-out files for p in ( NZSL_RAW_KEYS_FILE, - NZSL_COOKED_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, - S3_BUCKET_ERROR_KEYS_FILE, - S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL_FILE, ALL_KEYS_FILE ): f = open(p, "a") @@ -257,25 +229,13 @@ nkeys_present += 1 # Add 'Present' column to start all_keys_dict[video_key] = [True] + nzsl_raw_keys_dict[video_key] - nzsl_cooked_keys_dict[video_key] = nzsl_raw_keys_dict[video_key] else: nkeys_absent += 1 - s3_keys_not_in_nzsl_list.append(video_key) # Add 'Present' (absent) column to start all_keys_dict[video_key] = [False, "", "", ""] print(f"PRESENT: {nkeys_present} keys") 
print(f"ABSENT: {nkeys_absent} keys") - # Write the "cooked" (i.e. present) keys back to a file - with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: - for video_key, is_public in nzsl_cooked_keys_dict.items(): - f_obj.write(f"{video_key}{CSV_DELIMITER}{str(is_public)}\n") - - # Write the absent keys back to a file - with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: - for video_key in s3_keys_not_in_nzsl_list: - f_obj.write(f"{video_key}\n") - # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: for video_key, item_list in all_keys_dict.items(): From d2ebec3e7c32b20635c4ec31ece2f670af33b7dc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:44:48 +1000 Subject: [PATCH 028/222] cleanups --- bin/get-video-s3-acls.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bd68d607..3a30c637 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -107,7 +107,7 @@ exit() CSV_DELIMITER = "," -NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" +NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.txt" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" @@ -118,19 +118,17 @@ nkeys_absent = 0 if args.cached: - # Pull all info from existing files + # Pull all info from existing file try: with open(ALL_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - - print(line, end="") - video_key, is_present_str, db_id_str, gloss_id_str, is_public_str = line.strip().split(CSV_DELIMITER) is_present = is_present_str.strip().lower() == "true" if is_present: nkeys_present += 1 db_id = int(db_id_str) + # Some don't have gloss_id's try: gloss_id = int(gloss_id_str) except ValueError: @@ -144,18 +142,15 @@ all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] - print(video_key, end=" ") - pprint(all_keys_dict[video_key]) - print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"ABSENT: {nkeys_absent} keys") except FileNotFoundError: print(f"File not found: {ALL_KEYS_FILE}") exit() else: # Zero-out files for p in ( - NZSL_RAW_KEYS_FILE, + NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE ): @@ -188,7 +183,7 @@ # Get the video files info from NZSL Signbank print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...") - with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ PGCLIENT, @@ -203,9 +198,9 @@ text=True, stdout=f_obj, ) - with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: + with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}") # Separate out the NZSL key columns # Write them to a dictionary, so we can do fast operations on them @@ -234,7 +229,7 @@ # Add 'Present' (absent) column to start all_keys_dict[video_key] = [False, "", "", ""] print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"ABSENT: {nkeys_absent} keys") # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: From ce51fa442a8097e86f3f089ff19c34239ee58c53 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:49:36 +1000 Subject: 
[PATCH 029/222] s3_bucket_raw_keys_list --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3a30c637..3b233382 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -112,6 +112,7 @@ ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] all_keys_dict = {} nkeys_present = 0 From fedf7e9544ac20495aa144a6cdc4725a2c3878ec Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 18:19:06 +1000 Subject: [PATCH 030/222] Output changed to JSON -> py dict for processing --- bin/get-video-s3-acls.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3b233382..c4ac5919 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -9,6 +9,7 @@ import os import subprocess import argparse +import json from pprint import pprint # TODO @@ -203,8 +204,8 @@ nzsl_raw_keys_list = f_obj.readlines() print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}") - # Separate out the NZSL key columns - # Write them to a dictionary, so we can do fast operations on them + # Separate out the NZSL db columns + # Write them to a dictionary, so we can do fast operations for rawl in nzsl_raw_keys_list: rawl = rawl.strip() if not rawl: @@ -215,7 +216,7 @@ is_public = columns[2].strip().lower() == "t" # 'videofile' data is also the key for S3 video_key = columns[3].strip() - # Each dictionary entry is all of these values + # Each dictionary slot contains these values nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] # Get the s3 keys present and absent from our NZSL keys @@ -244,15 +245,13 @@ if not is_present: continue - print(video_key) - result = subprocess.run( [ AWSCLIENT, "s3api", "get-object-acl", "--output", - "text", + "json", "--bucket", AWS_S3_BUCKET, "--key", @@ -268,4 +267,6 @@ print(f"Public: {is_public}") print(f"db_id: {db_id}") print(f"gloss_id: {gloss_id}") - print(result.stdout) + + # Still figuring out how to make this into canned ACLS, shouldn't be hard + pprint(json.loads(result.stdout)) From 274ff5a1fb77133d68980cd219fa873d260076f9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:05:02 +1000 Subject: [PATCH 031/222] All fields represented and ACL logic working --- bin/get-video-s3-acls.py | 68 ++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c4ac5919..dad43695 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -242,31 +242,45 @@ # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): - if not is_present: - continue - - result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], - env=new_env, - shell=False, - check=True, - capture_output=True, - text=True, - ) + canned_acl = "" + canned_acl_expected = "" + if is_present: + canned_acl_expected = "public-read" if is_public else "private" + result = subprocess.run( + [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + 
video_key, + ], + env=new_env, + shell=False, + check=True, + capture_output=True, + text=True, + ) + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if acls_grants_json[0]["Permission"] == "FULL_CONTROL" and acls_grants_json[1]["Permission"] == "READ": + canned_acl = "public-read" + else: + canned_acl = "Unknown ACL" + else: + if acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" + else: + canned_acl = "Unknown ACL" print(f"Key: {video_key}") - print(f"Public: {is_public}") - print(f"db_id: {db_id}") - print(f"gloss_id: {gloss_id}") - - # Still figuring out how to make this into canned ACLS, shouldn't be hard - pprint(json.loads(result.stdout)) + print(f"Present: {is_present}") + print(f"db_id: {db_id if is_present else ''}") + print(f"gloss_id: {gloss_id if is_present else ''}") + print(f"Public: {is_public if is_present else ''}") + print(f"Expected: {canned_acl_expected}") + print(f"Got: {canned_acl}") + print(f"Match: {str(canned_acl_expected == canned_acl) if is_present else ''}") + print("--------------------------------------") From 8fc2e414b7f6f20d2da1184dedf7242e7f598cd6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:09:36 +1000 Subject: [PATCH 032/222] black --- bin/get-video-s3-acls.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index dad43695..eb82fc46 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -124,7 +124,13 @@ try: with open(ALL_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - video_key, is_present_str, db_id_str, gloss_id_str, is_public_str = line.strip().split(CSV_DELIMITER) + ( + video_key, + is_present_str, + db_id_str, + gloss_id_str, + is_public_str, + ) = line.strip().split(CSV_DELIMITER) is_present = is_present_str.strip().lower() == "true" if is_present: @@ -151,11 +157,7 @@ exit() else: # Zero-out files - for p in ( - NZSL_POSTGRES_RAW_KEYS_FILE, - S3_BUCKET_RAW_KEYS_FILE, - ALL_KEYS_FILE - ): + for p in (NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE): f = open(p, "a") f.truncate() f.close() @@ -236,11 +238,17 @@ # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: for video_key, item_list in all_keys_dict.items(): - outstr = f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + outstr = ( + f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + ) f_obj.write(outstr) # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") +# CSV header +print( + f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" +) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" canned_acl_expected = "" @@ -266,7 +274,10 @@ ) acls_grants_json = json.loads(result.stdout)["Grants"] if len(acls_grants_json) > 1: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL" and acls_grants_json[1]["Permission"] == "READ": + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): canned_acl = "public-read" else: canned_acl = "Unknown ACL" @@ -275,7 +286,8 @@ canned_acl = "private" else: canned_acl = "Unknown ACL" - print(f"Key: {video_key}") + # CSV columns + print(f"Key: 
{video_key}", end=CSV_DELIMITER) print(f"Present: {is_present}") print(f"db_id: {db_id if is_present else ''}") print(f"gloss_id: {gloss_id if is_present else ''}") From c143b2f92eb9fa5bb4198df82bfaf6feb05319ef Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:13:37 +1000 Subject: [PATCH 033/222] black --- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index eb82fc46..39b6a573 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -286,13 +286,13 @@ canned_acl = "private" else: canned_acl = "Unknown ACL" + # CSV columns - print(f"Key: {video_key}", end=CSV_DELIMITER) - print(f"Present: {is_present}") - print(f"db_id: {db_id if is_present else ''}") - print(f"gloss_id: {gloss_id if is_present else ''}") - print(f"Public: {is_public if is_present else ''}") - print(f"Expected: {canned_acl_expected}") - print(f"Got: {canned_acl}") - print(f"Match: {str(canned_acl_expected == canned_acl) if is_present else ''}") - print("--------------------------------------") + print(f"{video_key}", end=CSV_DELIMITER) + print(f"{is_present}", end=CSV_DELIMITER) + print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) + print(f"{canned_acl_expected}", end=CSV_DELIMITER) + print(f"{canned_acl}", end=CSV_DELIMITER) + print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") From 1c267ff958f4c91337cde99ab0e774f1f4b1c819 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:12:21 +1000 Subject: [PATCH 034/222] remove pprint --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 39b6a573..969c665a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -10,7 +10,6 @@ import subprocess import argparse import json -from pprint import pprint # TODO # We are using external apps just for the moment. 
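A note on the TODO carried in the hunk above ("We are using external apps just for the moment"): the psql subprocess that patch 024 sets up, together with its temp file, "-t" flag, and manual split on "|", maps onto a plain database driver in a few lines. The sketch below is illustrative only — it assumes psycopg2 as that driver (the project could equally use the Django ORM, this being a Signbank app) and the helper name is invented; the query and the [db_id, gloss_id, is_public] slot layout are taken straight from the script.

    # Illustrative sketch, not part of any patch: a native replacement for the
    # PGCLIENT/psql subprocess. Assumes psycopg2; the helper name is invented.
    import psycopg2

    def fetch_nzsl_raw_keys(database_url):
        nzsl_raw_keys_dict = {}
        with psycopg2.connect(database_url) as conn:
            with conn.cursor() as cur:
                # Same query the script passes to psql -c
                cur.execute(
                    "SELECT id, gloss_id, is_public, videofile FROM video_glossvideo"
                )
                for db_id, gloss_id, is_public, video_key in cur:
                    # 'videofile' doubles as the S3 key; is_public arrives as a
                    # real boolean, so the 't'/'f' string comparison disappears
                    nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public]
        return nzsl_raw_keys_dict

Heroku-style DATABASE_URL strings (postgres://...) are accepted directly by psycopg2.connect(), so the environment-variable handling the script already has would carry over unchanged.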
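Likewise on the S3 side: the Grants-reduction logic from patch 031 has a direct boto3 equivalent. Same caveats — boto3 and the helper name are assumptions, not anything a patch here introduces — but the grant-to-canned-ACL mapping below is exactly the one the patch implements, and credential resolution honours the same AWS_PROFILE the script requires.

    # Illustrative sketch: boto3 equivalent of the `aws s3api get-object-acl`
    # subprocess call. Credentials resolve via AWS_PROFILE, as the script
    # already requires.
    import boto3

    def fetch_canned_acl(bucket, key):
        grants = boto3.client("s3").get_object_acl(Bucket=bucket, Key=key)["Grants"]
        if len(grants) > 1:
            # Owner FULL_CONTROL plus a READ grant is what "public-read" leaves behind
            if (
                grants[0]["Permission"] == "FULL_CONTROL"
                and grants[1]["Permission"] == "READ"
            ):
                return "public-read"
            return "Unknown ACL"
        # A single FULL_CONTROL grant corresponds to "private"
        if grants[0]["Permission"] == "FULL_CONTROL":
            return "private"
        return "Unknown ACL"

The corresponding write-side call in the same client is put_object_acl(Bucket=..., Key=..., ACL="public-read") or ACL="private".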
From 1937bf3e65d22d4f0580c559eee2cfeacb350ce6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:26:45 +1000 Subject: [PATCH 035/222] Header writes to stderr --- bin/get-video-s3-acls.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 969c665a..298f5219 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -7,6 +7,7 @@ import os +import sys import subprocess import argparse import json @@ -26,7 +27,7 @@ ) # Positional arguments if DATABASE_URL: - print("DATABASE_URL defined in environment") + print("DATABASE_URL defined in environment", file=sys.stderr) else: parser.add_argument( "dburl", @@ -87,23 +88,23 @@ DATABASE_URL = args.dburl if args.cached: - print("Using the video keys we recorded on the last non-cached run.") + print("Using the video keys we recorded on the last non-cached run.", file=sys.stderr) else: - print("Generating keys from scratch.") + print("Generating keys from scratch.", file=sys.stderr) -print(f"Mode: {MODE_STR}") -print(f" NZSL app: {NZSL_APP}") -print(f" S3 bucket: {AWS_S3_BUCKET}") -print(f"AWS profile: {new_env['AWS_PROFILE']}") -print(f"AWSCLIENT: {AWSCLIENT}") -print(f"PGCLIENT: {PGCLIENT}") -print(f"DATABASE_URL:\n{DATABASE_URL}") +print(f"Mode: {MODE_STR}", file=sys.stderr) +print(f" NZSL app: {NZSL_APP}", file=sys.stderr) +print(f" S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) +print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) +print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +print(f"DATABASE_URL:\n{DATABASE_URL}", file=sys.stderr) TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: - print(f"Error creating temporary directory: {TMPDIR} {err}") + print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() CSV_DELIMITER = "," @@ -149,10 +150,10 @@ all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] - print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) + print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) except FileNotFoundError: - print(f"File not found: {ALL_KEYS_FILE}") + print(f"File not found: {ALL_KEYS_FILE}", file=sys.stderr) exit() else: # Zero-out files @@ -162,7 +163,7 @@ f.close() # Get all keys from AWS S3 - print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...") + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], @@ -177,7 +178,7 @@ # Put the keys in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] - print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", file=sys.stderr) # Write the keys back to the file, for cleanliness with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: @@ -185,7 +186,7 @@ f_obj.write(f"{line}\n") # Get the video files info from NZSL Signbank - print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...") + print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", file=sys.stderr) with 
open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ @@ -203,7 +204,7 @@ ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}") + print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", file=sys.stderr) # Separate out the NZSL db columns # Write them to a dictionary, so we can do fast operations @@ -221,7 +222,7 @@ nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] # Get the s3 keys present and absent from our NZSL keys - print("Getting S3 keys present and absent from NZSL Signbank ...") + print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) for video_key in s3_bucket_raw_keys_list: if video_key in nzsl_raw_keys_dict: nkeys_present += 1 @@ -231,8 +232,8 @@ nkeys_absent += 1 # Add 'Present' (absent) column to start all_keys_dict[video_key] = [False, "", "", ""] - print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) + print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: @@ -243,7 +244,7 @@ f_obj.write(outstr) # From the keys present in NZSL, get all their ACL information -print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") +print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" From 79e1361705951cbcc6ab0346c7a6539e28933331 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:26:53 +1000 Subject: [PATCH 036/222] black --- bin/get-video-s3-acls.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 298f5219..128a3955 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -88,7 +88,9 @@ DATABASE_URL = args.dburl if args.cached: - print("Using the video keys we recorded on the last non-cached run.", file=sys.stderr) + print( + "Using the video keys we recorded on the last non-cached run.", file=sys.stderr + ) else: print("Generating keys from scratch.", file=sys.stderr) @@ -178,7 +180,10 @@ # Put the keys in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] - print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", file=sys.stderr) + print( + f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", + file=sys.stderr, + ) # Write the keys back to the file, for cleanliness with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: @@ -186,7 +191,10 @@ f_obj.write(f"{line}\n") # Get the video files info from NZSL Signbank - print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", file=sys.stderr) + print( + f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", + file=sys.stderr, + ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ @@ -204,7 +212,10 @@ ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", file=sys.stderr) + print( + 
f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", + file=sys.stderr, + ) # Separate out the NZSL db columns # Write them to a dictionary, so we can do fast operations From 08afe68f998c569ee36b48ac2ddef26869f8adfb Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:28:41 +1000 Subject: [PATCH 037/222] Long line that black missed --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 128a3955..249b2c8d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -258,7 +258,8 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( - f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" + f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}" + "Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" From 4b64642600519841d9875a238c6e1c04ca718c01 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:20:44 +1000 Subject: [PATCH 038/222] AWS_PROFILE purely environment var --- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 249b2c8d..b20011e6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,10 +21,16 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) +# AWS: Is there an AWS_PROFILE defined in the environment? +AWS_PROFILE = os.getenv("AWS_PROFILE", None) +if not AWS_PROFILE: + print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") + exit() + parser = argparse.ArgumentParser( - description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " - "argument." + description="You must define, in the environment: AWS_PROFILE" ) + # Positional arguments if DATABASE_URL: print("DATABASE_URL defined in environment", file=sys.stderr) @@ -34,12 +40,6 @@ help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", ) # Optional arguments -parser.add_argument( - "--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: '%(default)s')", -) parser.add_argument( "--production", default=False, @@ -78,8 +78,8 @@ NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" +# Get the environment new_env = os.environ.copy() -new_env["AWS_PROFILE"] = args.awsprofile AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From e907e2e02f70a9923e4a1047462d181bc5418be9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:21:26 +1000 Subject: [PATCH 039/222] Revert "AWS_PROFILE purely environment var" Jumped the gun, wrong variable. This reverts commit 4b64642600519841d9875a238c6e1c04ca718c01. 
--- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b20011e6..249b2c8d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,16 +21,10 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) -# AWS: Is there an AWS_PROFILE defined in the environment? -AWS_PROFILE = os.getenv("AWS_PROFILE", None) -if not AWS_PROFILE: - print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") - exit() - parser = argparse.ArgumentParser( - description="You must define, in the environment: AWS_PROFILE" + description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " + "argument." ) - # Positional arguments if DATABASE_URL: print("DATABASE_URL defined in environment", file=sys.stderr) @@ -40,6 +34,12 @@ help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", ) # Optional arguments +parser.add_argument( + "--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: '%(default)s')", +) parser.add_argument( "--production", default=False, @@ -78,8 +78,8 @@ NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" -# Get the environment new_env = os.environ.copy() +new_env["AWS_PROFILE"] = args.awsprofile AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From 1f8f70bfe6d54d897a83de969157e076112d44ce Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:27:42 +1000 Subject: [PATCH 040/222] Revert "Revert "AWS_PROFILE purely environment var"" No, I actually had it right the first time. This reverts commit e907e2e02f70a9923e4a1047462d181bc5418be9. --- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 249b2c8d..b20011e6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,10 +21,16 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) +# AWS: Is there an AWS_PROFILE defined in the environment? +AWS_PROFILE = os.getenv("AWS_PROFILE", None) +if not AWS_PROFILE: + print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") + exit() + parser = argparse.ArgumentParser( - description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " - "argument." 
+ description="You must define, in the environment: AWS_PROFILE" ) + # Positional arguments if DATABASE_URL: print("DATABASE_URL defined in environment", file=sys.stderr) @@ -34,12 +40,6 @@ help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", ) # Optional arguments -parser.add_argument( - "--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: '%(default)s')", -) parser.add_argument( "--production", default=False, @@ -78,8 +78,8 @@ NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" +# Get the environment new_env = os.environ.copy() -new_env["AWS_PROFILE"] = args.awsprofile AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From 5100287ed042508fa355479578c696aee2800d82 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:32:47 +1000 Subject: [PATCH 041/222] DATABASE_URL purely environment var. Missing stderrs added. --- bin/get-video-s3-acls.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b20011e6..6cba5805 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -20,25 +20,20 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) +if not DATABASE_URL: + print("You must define DATABASE_URL in the environment.", file=sys.stderr) + exit() # AWS: Is there an AWS_PROFILE defined in the environment? AWS_PROFILE = os.getenv("AWS_PROFILE", None) if not AWS_PROFILE: - print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") + print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'", file=sys.stderr) exit() parser = argparse.ArgumentParser( - description="You must define, in the environment: AWS_PROFILE" + description="You must define, in the environment: AWS_PROFILE, DATABASE_URL" ) -# Positional arguments -if DATABASE_URL: - print("DATABASE_URL defined in environment", file=sys.stderr) -else: - parser.add_argument( - "dburl", - help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", - ) # Optional arguments parser.add_argument( "--production", @@ -84,9 +79,6 @@ AWSCLIENT = args.awsclient PGCLIENT = args.pgclient -if not DATABASE_URL: - DATABASE_URL = args.dburl - if args.cached: print( "Using the video keys we recorded on the last non-cached run.", file=sys.stderr @@ -100,7 +92,7 @@ print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -print(f"DATABASE_URL:\n{DATABASE_URL}", file=sys.stderr) +print(f"DATABASE_URL:\n{new_env['DATABASE_URL']}", file=sys.stderr) TMPDIR = "/tmp/nzsl" try: From 114d5c49ccc97d0d9f61f62ace6dd4d59f897985 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:39:57 +1000 Subject: [PATCH 042/222] Production/UAT mode changed to string --- bin/get-video-s3-acls.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 6cba5805..ff91972f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -36,11 +36,10 @@ # Optional arguments parser.add_argument( - "--production", - default=False, + "--mode", + default="uat", required=False, - action="store_true", - help="Run in PRODUCTION mode, instead of STAGING 
(default: %(default)s)", + help="Mode to run in, eg 'production, 'uat', etc (default: '%(default)s')", ) parser.add_argument( "--cached", @@ -64,14 +63,8 @@ ) args = parser.parse_args() -if args.production: - MODE_STR = "PRODUCTION" - NZSL_APP = "nzsl-signbank-production" - AWS_S3_BUCKET = "nzsl-signbank-media-production" -else: - MODE_STR = "STAGING" - NZSL_APP = "nzsl-signbank-uat" - AWS_S3_BUCKET = "nzsl-signbank-media-uat" +NZSL_APP = f"nzsl-signbank-{args.mode}" +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" # Get the environment new_env = os.environ.copy() @@ -86,7 +79,7 @@ else: print("Generating keys from scratch.", file=sys.stderr) -print(f"Mode: {MODE_STR}", file=sys.stderr) +print(f"Mode: {args.mode}", file=sys.stderr) print(f" NZSL app: {NZSL_APP}", file=sys.stderr) print(f" S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) From dd94fb2e4fd9c92d427351c660893d731e06eda1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:44:58 +1000 Subject: [PATCH 043/222] Better column names --- bin/get-video-s3-acls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ff91972f..9435b471 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -243,8 +243,9 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( - f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}" - "Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" + f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" + f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" + f"{CSV_DELIMITER}Correct Canned ACL?" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" From c477058c619adc6216acdc0a1db252a89a3e6d05 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:46:07 +1000 Subject: [PATCH 044/222] black --- bin/get-video-s3-acls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9435b471..2d05b523 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -27,7 +27,10 @@ # AWS: Is there an AWS_PROFILE defined in the environment? AWS_PROFILE = os.getenv("AWS_PROFILE", None) if not AWS_PROFILE: - print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'", file=sys.stderr) + print( + "You must define AWS_PROFILE in the environment. Eg. 
AWS_PROFILE='nzsl'", + file=sys.stderr, + ) exit() parser = argparse.ArgumentParser( From e4ca16bb226ccfe7063e19759f19334173496631 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:01:14 +1000 Subject: [PATCH 045/222] Output raw ACL data as well --- bin/get-video-s3-acls.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2d05b523..43cd9eca 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -248,11 +248,12 @@ print( f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" - f"{CSV_DELIMITER}Correct Canned ACL?" + f"{CSV_DELIMITER}Correct Canned ACL?{CSV_DELIMITER}Raw ACL data" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" canned_acl_expected = "" + raw_acl = "" if is_present: canned_acl_expected = "public-read" if is_public else "private" result = subprocess.run( @@ -273,6 +274,7 @@ capture_output=True, text=True, ) + raw_acl = result.stdout.replace("\n", " ") acls_grants_json = json.loads(result.stdout)["Grants"] if len(acls_grants_json) > 1: if ( @@ -296,4 +298,5 @@ print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) print(f"{canned_acl_expected}", end=CSV_DELIMITER) print(f"{canned_acl}", end=CSV_DELIMITER) - print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") + print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}", end=CSV_DELIMITER) + print(f"{raw_acl}") From 9e24507d7488173a8fa94d98b0c29d3c3811a2a5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:06:49 +1000 Subject: [PATCH 046/222] Comment showing where canned ACLs are set in main app --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 43cd9eca..1f0a6e49 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -255,6 +255,7 @@ canned_acl_expected = "" raw_acl = "" if is_present: + # See signbank/video/models.py, line 59, in function set_public_acl() canned_acl_expected = "public-read" if is_public else "private" result = subprocess.run( [ From 15af142829818966becc9c2893fb0eec95edb249 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:43:13 +1000 Subject: [PATCH 047/222] Raw ACL data and header removed again --- bin/get-video-s3-acls.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1f0a6e49..d73c911a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -248,7 +248,7 @@ print( f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" - f"{CSV_DELIMITER}Correct Canned ACL?{CSV_DELIMITER}Raw ACL data" + f"{CSV_DELIMITER}Correct Canned ACL?" 
) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" @@ -275,7 +275,6 @@ capture_output=True, text=True, ) - raw_acl = result.stdout.replace("\n", " ") acls_grants_json = json.loads(result.stdout)["Grants"] if len(acls_grants_json) > 1: if ( @@ -299,5 +298,4 @@ print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) print(f"{canned_acl_expected}", end=CSV_DELIMITER) print(f"{canned_acl}", end=CSV_DELIMITER) - print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}", end=CSV_DELIMITER) - print(f"{raw_acl}") + print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") From e85309a66bbcc25ca5673585ad60e758f44c5cbd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:00:50 +1000 Subject: [PATCH 048/222] Extraneous columns removed --- bin/get-video-s3-acls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index d73c911a..b5d4e466 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -246,9 +246,8 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( - f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" + f"Video S3 Key{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" - f"{CSV_DELIMITER}Correct Canned ACL?" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" @@ -292,10 +291,8 @@ # CSV columns print(f"{video_key}", end=CSV_DELIMITER) - print(f"{is_present}", end=CSV_DELIMITER) print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) print(f"{canned_acl_expected}", end=CSV_DELIMITER) - print(f"{canned_acl}", end=CSV_DELIMITER) - print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") + print(f"{canned_acl}") From 9eed309168bf7b92062f2079ea42d84a5eb4fa11 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:03:52 +1000 Subject: [PATCH 049/222] NSZL_APP removed, no longer needed --- bin/get-video-s3-acls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b5d4e466..e7564912 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -5,7 +5,6 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html - import os import sys import subprocess @@ -66,7 +65,6 @@ ) args = parser.parse_args() -NZSL_APP = f"nzsl-signbank-{args.mode}" AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" # Get the environment @@ -83,8 +81,7 @@ print("Generating keys from scratch.", file=sys.stderr) print(f"Mode: {args.mode}", file=sys.stderr) -print(f" NZSL app: {NZSL_APP}", file=sys.stderr) -print(f" S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) @@ -180,7 +177,7 @@ # Get the video files info from NZSL Signbank print( - 
f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", + f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: From 0e48d3a472a0446277d69f09f23655136f6aa377 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:09:29 +1000 Subject: [PATCH 050/222] AWS_PROFILE requirement removed --- bin/get-video-s3-acls.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e7564912..9354c55d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -23,17 +23,8 @@ print("You must define DATABASE_URL in the environment.", file=sys.stderr) exit() -# AWS: Is there an AWS_PROFILE defined in the environment? -AWS_PROFILE = os.getenv("AWS_PROFILE", None) -if not AWS_PROFILE: - print( - "You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'", - file=sys.stderr, - ) - exit() - parser = argparse.ArgumentParser( - description="You must define, in the environment: AWS_PROFILE, DATABASE_URL" + description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable., DATABASE_URL" ) # Optional arguments From 4aa6add07d63826610cf8b297a5221cfa72e868c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:30:21 +1000 Subject: [PATCH 051/222] Removed question mark --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9354c55d..28108ce0 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -235,7 +235,7 @@ # CSV header print( f"Video S3 Key{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" - f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" + f"Signbank is_public{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" From d7af5055582fbb5de13820d3c0ead240723f4eb6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:23:07 +1000 Subject: [PATCH 052/222] Header refactored --- bin/get-video-s3-acls.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 28108ce0..b448b090 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -232,11 +232,18 @@ # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) + # CSV header -print( - f"Video S3 Key{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" - f"Signbank is_public{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" -) +csv_header_list = [ + "Video S3 Key", + "Postgres ID", + "Gloss ID", + "Signbank Public", + "Expected S3 Canned ACL", + "Actual S3 Canned ACL", +] +print(CSV_DELIMITER.join(csv_header_list)) + for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" canned_acl_expected = "" From 32b20425bc198b83cc672fffb083c2c63d47dd5e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:26:41 +1000 Subject: [PATCH 053/222] AWS_PROFILE printing 
conditional --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b448b090..8b1799ff 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -73,7 +73,8 @@ print(f"Mode: {args.mode}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) +if "AWS_PROFILE" in new_env: + print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) print(f"DATABASE_URL:\n{new_env['DATABASE_URL']}", file=sys.stderr) From 4bc617c5d56807d564adfb8a7ebfdb67127567e7 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:30:22 +1000 Subject: [PATCH 054/222] Tidy up --- bin/get-video-s3-acls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 8b1799ff..830e8f21 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -57,13 +57,13 @@ args = parser.parse_args() AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +AWSCLIENT = args.awsclient +PGCLIENT = args.pgclient # Get the environment new_env = os.environ.copy() -AWSCLIENT = args.awsclient -PGCLIENT = args.pgclient - if args.cached: print( "Using the video keys we recorded on the last non-cached run.", file=sys.stderr From ea689f4daa090e63aaffa066d27a567be78257ea Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:03:55 +1000 Subject: [PATCH 055/222] duplicated line removed --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 830e8f21..53d2eead 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -56,7 +56,6 @@ ) args = parser.parse_args() -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From 53c4154b84d1995fb88fd09a2cc873f55a9bf182 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:58:53 +1000 Subject: [PATCH 056/222] Initial organisation into functions, and cleanup --- bin/get-video-s3-acls.py | 290 +++++++++++++++++++++------------------ 1 file changed, 160 insertions(+), 130 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 53d2eead..3f809514 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,23 +11,20 @@ import argparse import json -# TODO -# We are using external apps just for the moment. -# These will be removed for native libraries. -AWSCLIENT = "/usr/local/bin/aws" -PGCLIENT = "/usr/bin/psql" - -# NZSL: Is there a database url defined in the environment? +# Globals DATABASE_URL = os.getenv("DATABASE_URL", None) if not DATABASE_URL: print("You must define DATABASE_URL in the environment.", file=sys.stderr) exit() +NEW_ENV = os.environ.copy() +CSV_DELIMITER = "," +nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] +all_keys_dict = {} parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. 
AWS_PROFILE environment variable., DATABASE_URL" ) - -# Optional arguments parser.add_argument( "--mode", default="uat", @@ -44,63 +41,49 @@ ) parser.add_argument( "--pgclient", - default=PGCLIENT, + default="/usr/bin/psql", required=False, help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( "--awsclient", - default=AWSCLIENT, + default="/usr/local/bin/aws", required=False, help=f"AWS client path (default: %(default)s)", ) args = parser.parse_args() -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" AWSCLIENT = args.awsclient PGCLIENT = args.pgclient +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" -# Get the environment -new_env = os.environ.copy() - -if args.cached: - print( - "Using the video keys we recorded on the last non-cached run.", file=sys.stderr - ) -else: - print("Generating keys from scratch.", file=sys.stderr) - -print(f"Mode: {args.mode}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -if "AWS_PROFILE" in new_env: - print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) -print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) -print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -print(f"DATABASE_URL:\n{new_env['DATABASE_URL']}", file=sys.stderr) - +# Files TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() - -CSV_DELIMITER = "," NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.txt" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} +# Truncate files, creating them if necessary +def init_files(files_list): + # Zero-out files + for p in files_list: + f = open(p, "a") + f.truncate() + f.close() -nkeys_present = 0 -nkeys_absent = 0 -if args.cached: - # Pull all info from existing file +# Pull all info from existing file +def get_keys_from_cache_file(cache_file): + nkeys_present = 0 + nkeys_absent = 0 + this_all_keys_dict = {} try: - with open(ALL_KEYS_FILE, "r") as f_obj: + with open(cache_file, "r") as f_obj: for line in f_obj.readlines(): ( video_key, @@ -126,26 +109,25 @@ gloss_id = None is_public = None - all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) + + return this_all_keys_dict + except FileNotFoundError: - print(f"File not found: {ALL_KEYS_FILE}", file=sys.stderr) + print(f"File not found: {cache_file}", file=sys.stderr) exit() -else: - # Zero-out files - for p in (NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE): - f = open(p, "a") - f.truncate() - f.close() - # Get all keys from AWS S3 - print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) - with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + +# Get all keys from AWS S3 +def get_keys_from_s3(s3_bucket, keys_file): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + with open(keys_file, "w") as f_obj: result = subprocess.run( - [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], - env=new_env, + [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], + env=NEW_ENV, shell=False, check=True, text=True, @@ -154,24 +136,29 @@ # Separate out just the key (also strip newline) from date, time, size, key # Put the 
keys in an in-memory list - with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: - s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] + with open(keys_file, "r") as f_obj: + this_s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] print( - f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved: {keys_file}", file=sys.stderr, ) # Write the keys back to the file, for cleanliness - with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - for line in s3_bucket_raw_keys_list: + with open(keys_file, "w") as f_obj: + for line in this_s3_bucket_raw_keys_list: f_obj.write(f"{line}\n") - # Get the video files info from NZSL Signbank + return this_s3_bucket_raw_keys_list + + +# Get the video files info from NZSL Signbank +def get_keys_from_nzsl(keys_file): + this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) - with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: + with open(keys_file, "w") as f_obj: result = subprocess.run( [ PGCLIENT, @@ -180,16 +167,16 @@ "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", f"{DATABASE_URL}", ], - env=new_env, + env=NEW_ENV, shell=False, check=True, text=True, stdout=f_obj, ) - with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: + with open(keys_file, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() print( - f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", + f"{len(nzsl_raw_keys_list)} rows retrieved: {keys_file}", file=sys.stderr, ) @@ -206,88 +193,131 @@ # 'videofile' data is also the key for S3 video_key = columns[3].strip() # Each dictionary slot contains these values - nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] + this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] + + return this_nzsl_raw_keys_dict + - # Get the s3 keys present and absent from our NZSL keys +# Get the s3 keys present and absent from our NZSL keys +def create_all_keys_dict( + this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file +): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) - for video_key in s3_bucket_raw_keys_list: - if video_key in nzsl_raw_keys_dict: + nkeys_present = 0 + nkeys_absent = 0 + this_all_keys_dict = {} + for video_key in this_s3_bucket_raw_keys_list: + if video_key in this_nzsl_raw_keys_dict: nkeys_present += 1 # Add 'Present' column to start - all_keys_dict[video_key] = [True] + nzsl_raw_keys_dict[video_key] + this_all_keys_dict[video_key] = [True] + this_nzsl_raw_keys_dict[video_key] else: nkeys_absent += 1 # Add 'Present' (absent) column to start - all_keys_dict[video_key] = [False, "", "", ""] + this_all_keys_dict[video_key] = [False, "", "", ""] print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) # Write all keys back to a file - with open(ALL_KEYS_FILE, "w") as f_obj: - for video_key, item_list in all_keys_dict.items(): + with open(all_keys_file, "w") as f_obj: + for video_key, item_list in this_all_keys_dict.items(): outstr = ( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) f_obj.write(outstr) + return this_all_keys_dict + + # From the keys present in NZSL, get all their ACL information -print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) - -# CSV header -csv_header_list = [ - "Video S3 Key", - "Postgres ID", - "Gloss ID", - "Signbank Public", - "Expected S3 
Canned ACL", - "Actual S3 Canned ACL", -] -print(CSV_DELIMITER.join(csv_header_list)) - -for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): - canned_acl = "" - canned_acl_expected = "" - raw_acl = "" - if is_present: - # See signbank/video/models.py, line 59, in function set_public_acl() - canned_acl_expected = "public-read" if is_public else "private" - result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], - env=new_env, - shell=False, - check=True, - capture_output=True, - text=True, - ) - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: - if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" - ): - canned_acl = "public-read" - else: - canned_acl = "Unknown ACL" - else: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL": - canned_acl = "private" +def output_csv(this_all_keys_dict): + print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) + + # CSV header + csv_header_list = [ + "Video S3 Key", + "Postgres ID", + "Gloss ID", + "Signbank Public", + "Expected S3 Canned ACL", + "Actual S3 Canned ACL", + ] + print(CSV_DELIMITER.join(csv_header_list)) + + for video_key, [ + is_present, + db_id, + gloss_id, + is_public, + ] in this_all_keys_dict.items(): + canned_acl = "" + canned_acl_expected = "" + raw_acl = "" + if is_present: + # See signbank/video/models.py, line 59, in function set_public_acl() + canned_acl_expected = "public-read" if is_public else "private" + result = subprocess.run( + [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=NEW_ENV, + shell=False, + check=True, + capture_output=True, + text=True, + ) + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): + canned_acl = "public-read" + else: + canned_acl = "Unknown ACL" else: - canned_acl = "Unknown ACL" - - # CSV columns - print(f"{video_key}", end=CSV_DELIMITER) - print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) - print(f"{canned_acl_expected}", end=CSV_DELIMITER) - print(f"{canned_acl}") + if acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" + else: + canned_acl = "Unknown ACL" + + # CSV columns + print(f"{video_key}", end=CSV_DELIMITER) + print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) + print(f"{canned_acl_expected}", end=CSV_DELIMITER) + print(f"{canned_acl}") + + +print(f"Mode: {args.mode}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +if "AWS_PROFILE" in NEW_ENV: + print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) +print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) +print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) + +if args.cached: + print( + "Using the video keys we recorded on the last non-cached run.", file=sys.stderr + ) + all_keys_dict = get_keys_from_cache_file(ALL_KEYS_FILE) +else: + print("Generating keys from 
scratch.", file=sys.stderr) + init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) + s3_bucket_raw_keys_list = get_keys_from_s3(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) + nzsl_raw_keys_dict = get_keys_from_nzsl(NZSL_POSTGRES_RAW_KEYS_FILE) + all_keys_dict = create_all_keys_dict( + s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE + ) + +output_csv(all_keys_dict) From bb6a53e1d2b83f54ad96643f206215aa11359fc1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 18:06:46 +1000 Subject: [PATCH 057/222] Tidy ups and renaming --- bin/get-video-s3-acls.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3f809514..7b65cd3a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,16 +11,6 @@ import argparse import json -# Globals -DATABASE_URL = os.getenv("DATABASE_URL", None) -if not DATABASE_URL: - print("You must define DATABASE_URL in the environment.", file=sys.stderr) - exit() -NEW_ENV = os.environ.copy() -CSV_DELIMITER = "," -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable., DATABASE_URL" @@ -53,9 +43,21 @@ ) args = parser.parse_args() +# Globals AWSCLIENT = args.awsclient PGCLIENT = args.pgclient +DATABASE_URL = os.getenv("DATABASE_URL", None) +if not DATABASE_URL: + print("You must define DATABASE_URL in the environment.", file=sys.stderr) + exit() +NEW_ENV = os.environ.copy() +CSV_DELIMITER = "," + +# Vars AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] +all_keys_dict = {} # Files TMPDIR = "/tmp/nzsl" @@ -68,9 +70,9 @@ S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" + # Truncate files, creating them if necessary def init_files(files_list): - # Zero-out files for p in files_list: f = open(p, "a") f.truncate() @@ -122,10 +124,10 @@ def get_keys_from_cache_file(cache_file): # Get all keys from AWS S3 -def get_keys_from_s3(s3_bucket, keys_file): +def get_s3_bucket_raw_keys_list(s3_bucket, keys_file): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: - result = subprocess.run( + subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], env=NEW_ENV, shell=False, @@ -152,14 +154,14 @@ def get_keys_from_s3(s3_bucket, keys_file): # Get the video files info from NZSL Signbank -def get_keys_from_nzsl(keys_file): +def get_nzsl_raw_keys_dict(keys_file): this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) with open(keys_file, "w") as f_obj: - result = subprocess.run( + subprocess.run( [ PGCLIENT, "-t", @@ -314,8 +316,8 @@ def output_csv(this_all_keys_dict): else: print("Generating keys from scratch.", file=sys.stderr) init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) - s3_bucket_raw_keys_list = get_keys_from_s3(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) - nzsl_raw_keys_dict = get_keys_from_nzsl(NZSL_POSTGRES_RAW_KEYS_FILE) + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) + nzsl_raw_keys_dict = get_nzsl_raw_keys_dict(NZSL_POSTGRES_RAW_KEYS_FILE) all_keys_dict = create_all_keys_dict( 
s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE ) From 1b93c81bb0de2969f5ab405bea0c9179a5daa2b9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 08:50:50 +1000 Subject: [PATCH 058/222] DATABASE_URL warning message removed --- bin/get-video-s3-acls.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7b65cd3a..9a02ebbd 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,9 +11,9 @@ import argparse import json - parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable., DATABASE_URL" + description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable." + "Postgres access details, eg. DATABASE_URL" ) parser.add_argument( "--mode", @@ -47,9 +47,6 @@ AWSCLIENT = args.awsclient PGCLIENT = args.pgclient DATABASE_URL = os.getenv("DATABASE_URL", None) -if not DATABASE_URL: - print("You must define DATABASE_URL in the environment.", file=sys.stderr) - exit() NEW_ENV = os.environ.copy() CSV_DELIMITER = "," @@ -316,7 +313,9 @@ def output_csv(this_all_keys_dict): else: print("Generating keys from scratch.", file=sys.stderr) init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list( + AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE + ) nzsl_raw_keys_dict = get_nzsl_raw_keys_dict(NZSL_POSTGRES_RAW_KEYS_FILE) all_keys_dict = create_all_keys_dict( s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE From ea751e5e8497d7869dc816af833046e3c751d6cc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:21:39 +1000 Subject: [PATCH 059/222] whitespace --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9a02ebbd..ba90eb59 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,7 +12,7 @@ import json parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable." + description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable. " "Postgres access details, eg. DATABASE_URL" ) parser.add_argument( From 0b3ce8db0b26317a1f63a308bf95862376f0cac0 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:23:31 +1000 Subject: [PATCH 060/222] Adding OSV ignores just to silence warnings. Remove later. 
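These ignores are a stopgap to quieten CI and are meant to be revisited.
As a rough audit aid, the silenced advisories can be listed alongside their
advisory URLs with a sketch like the following (illustrative only, not part
of this patch; assumes PyYAML is installed):

    import yaml

    with open(".osv-detector.yml") as f:
        config = yaml.safe_load(f)

    # Inline comments in the YAML are dropped by the parser; only the IDs remain.
    for advisory_id in config.get("ignore", []):
        print(f"{advisory_id}: https://github.com/advisories/{advisory_id}")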
--- .osv-detector.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.osv-detector.yml b/.osv-detector.yml index 794fb52a..4b9be1f1 100644 --- a/.osv-detector.yml +++ b/.osv-detector.yml @@ -6,3 +6,5 @@ ignore: - GHSA-257q-pv89-v3xv # GHSA says affected versions are jQuery v.2.2.0 until v.3.5.0 - GHSA-vm8q-m57g-pff3 - GHSA-w3h3-4rj7-4ph4 + - GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc) + - GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2) From fbe33feeb00869c9b35793ad78d179d9154a4adb Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:25:06 +1000 Subject: [PATCH 061/222] More OSV ignores --- .osv-detector.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.osv-detector.yml b/.osv-detector.yml index 4b9be1f1..2d4acb90 100644 --- a/.osv-detector.yml +++ b/.osv-detector.yml @@ -8,3 +8,4 @@ ignore: - GHSA-w3h3-4rj7-4ph4 - GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc) - GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2) + - GHSA-9mvj-f7w8-pvh2 # Bootstrap Cross-Site Scripting (XSS) vulnerability (https://github.com/advisories/GHSA-9mvj-f7w8-pvh2) From 765963287eaebd6d4a53ee19a25acd39f297549b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:27:58 +1000 Subject: [PATCH 062/222] tidy ups --- bin/get-video-s3-acls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ba90eb59..c8e78a07 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,8 +12,8 @@ import json parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable. " - "Postgres access details, eg. DATABASE_URL" + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
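# NB: the two description strings above are adjacent literals, which Python
# concatenates at compile time; the trailing space on the first keeps the
# joined sentence readable.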
) parser.add_argument( "--mode", @@ -46,7 +46,7 @@ # Globals AWSCLIENT = args.awsclient PGCLIENT = args.pgclient -DATABASE_URL = os.getenv("DATABASE_URL", None) +DATABASE_URL = os.getenv("DATABASE_URL", "") NEW_ENV = os.environ.copy() CSV_DELIMITER = "," From cb2b3a03bca237522ba0956000fce3f1e3d5e5b4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:36:37 +1000 Subject: [PATCH 063/222] File names hidden --- bin/get-video-s3-acls.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c8e78a07..9c459015 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -69,7 +69,7 @@ # Truncate files, creating them if necessary -def init_files(files_list): +def init_files(files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE)): for p in files_list: f = open(p, "a") f.truncate() @@ -77,7 +77,7 @@ def init_files(files_list): # Pull all info from existing file -def get_keys_from_cache_file(cache_file): +def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} @@ -121,7 +121,7 @@ def get_keys_from_cache_file(cache_file): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket, keys_file): +def get_s3_bucket_raw_keys_list(s3_bucket, keys_file=S3_BUCKET_RAW_KEYS_FILE): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( @@ -151,7 +151,7 @@ def get_s3_bucket_raw_keys_list(s3_bucket, keys_file): # Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(keys_file): +def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", @@ -199,7 +199,7 @@ def get_nzsl_raw_keys_dict(keys_file): # Get the s3 keys present and absent from our NZSL keys def create_all_keys_dict( - this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file + this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file=ALL_KEYS_FILE ): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 @@ -309,16 +309,14 @@ def output_csv(this_all_keys_dict): print( "Using the video keys we recorded on the last non-cached run.", file=sys.stderr ) - all_keys_dict = get_keys_from_cache_file(ALL_KEYS_FILE) + all_keys_dict = get_keys_from_cache_file() else: print("Generating keys from scratch.", file=sys.stderr) - init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list( - AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE - ) - nzsl_raw_keys_dict = get_nzsl_raw_keys_dict(NZSL_POSTGRES_RAW_KEYS_FILE) + init_files() + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET) + nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict( - s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE + s3_bucket_raw_keys_list, nzsl_raw_keys_dict ) output_csv(all_keys_dict) From 9b502dde632da60b087c00b0a99ff0fe52f3a978 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:38:17 +1000 Subject: [PATCH 064/222] Bunch of things could be made global, starting here --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9c459015..58857446 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -121,7 +121,7 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket, keys_file=S3_BUCKET_RAW_KEYS_FILE): +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( @@ -313,7 +313,7 @@ def output_csv(this_all_keys_dict): else: print("Generating keys from scratch.", file=sys.stderr) init_files() - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET) + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict( s3_bucket_raw_keys_list, nzsl_raw_keys_dict From a4d978c2ca5fab82a94c2f9c6a8c38d5ddab8477 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:41:07 +1000 Subject: [PATCH 065/222] More tidying --- bin/get-video-s3-acls.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 58857446..a9a46a1d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -49,14 +49,7 @@ DATABASE_URL = os.getenv("DATABASE_URL", "") NEW_ENV = os.environ.copy() CSV_DELIMITER = "," - -# Vars AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} - -# Files TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) @@ -67,6 +60,10 @@ S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" +# Vars +nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] +all_keys_dict = {} # Truncate files, creating them if necessary def init_files(files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE)): From 8364354f2e0d7b15869886b00e52e96fb102ce6e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:41:26 +1000 Subject: [PATCH 066/222] black --- bin/get-video-s3-acls.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a9a46a1d..0d81756f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -65,8 +65,11 @@ s3_bucket_raw_keys_list = [] all_keys_dict = {} + # Truncate files, creating them if necessary -def init_files(files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE)): +def init_files( + files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE) +): for p in files_list: f = open(p, "a") f.truncate() @@ -118,7 +121,9 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE): +def get_s3_bucket_raw_keys_list( + s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE +): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( @@ -312,8 +317,6 @@ def output_csv(this_all_keys_dict): init_files() s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() - all_keys_dict = create_all_keys_dict( 
- s3_bucket_raw_keys_list, nzsl_raw_keys_dict - ) + all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From 15f5443b93939f31c685f6dda8874ce958b476c5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:56:51 +1000 Subject: [PATCH 067/222] Simpler and cleaner --- bin/get-video-s3-acls.py | 92 ++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 0d81756f..a5efe3c4 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -251,52 +251,53 @@ def output_csv(this_all_keys_dict): gloss_id, is_public, ] in this_all_keys_dict.items(): - canned_acl = "" - canned_acl_expected = "" - raw_acl = "" - if is_present: - # See signbank/video/models.py, line 59, in function set_public_acl() - canned_acl_expected = "public-read" if is_public else "private" - result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], - env=NEW_ENV, - shell=False, - check=True, - capture_output=True, - text=True, - ) - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: - if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" - ): - canned_acl = "public-read" - else: - canned_acl = "Unknown ACL" - else: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL": - canned_acl = "private" - else: - canned_acl = "Unknown ACL" + + if not is_present: + print(f"{video_key},,,,,") + continue + + # See signbank/video/models.py, line 59, in function set_public_acl() + canned_acl_expected = "public-read" if is_public else "private" + result = subprocess.run( + [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=NEW_ENV, + shell=False, + check=True, + capture_output=True, + text=True, + ) + canned_acl = "unknown" + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): + canned_acl = "public-read" + else: + if acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" # CSV columns - print(f"{video_key}", end=CSV_DELIMITER) - print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) - print(f"{canned_acl_expected}", end=CSV_DELIMITER) - print(f"{canned_acl}") + csv_column_list = [ + f"{video_key}", + f"{db_id}", + f"{gloss_id}", + f"{is_public}", + f"{canned_acl_expected}", + f"{canned_acl}", + ] + print(CSV_DELIMITER.join(csv_column_list)) print(f"Mode: {args.mode}", file=sys.stderr) @@ -305,7 +306,8 @@ def output_csv(this_all_keys_dict): print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) +if "DATABASE_URL" in NEW_ENV: + print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) if args.cached: print( From e4189fe48f0a173e7f348d23508364d10ba34e14 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 
Sep 2024 11:59:02 +1000 Subject: [PATCH 068/222] Output text --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a5efe3c4..aa904fcd 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -302,10 +302,10 @@ def output_csv(this_all_keys_dict): print(f"Mode: {args.mode}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -if "AWS_PROFILE" in NEW_ENV: - print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +if "AWS_PROFILE" in NEW_ENV: + print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) if "DATABASE_URL" in NEW_ENV: print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) From 0d13787b688f377d53c24bba2e303c2d683373e0 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:48:28 +1000 Subject: [PATCH 069/222] DATABASE_URL output removed as security issue --- bin/get-video-s3-acls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index aa904fcd..adf3ae5b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -306,8 +306,6 @@ def output_csv(this_all_keys_dict): print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) if "AWS_PROFILE" in NEW_ENV: print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) -if "DATABASE_URL" in NEW_ENV: - print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) if args.cached: print( From ec62dc64063380d53842d015c564b8e0d92701da Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:51:29 +1000 Subject: [PATCH 070/222] os.environ used everywhere --- bin/get-video-s3-acls.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index adf3ae5b..0e44ff21 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -47,7 +47,6 @@ AWSCLIENT = args.awsclient PGCLIENT = args.pgclient DATABASE_URL = os.getenv("DATABASE_URL", "") -NEW_ENV = os.environ.copy() CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" TMPDIR = "/tmp/nzsl" @@ -128,7 +127,7 @@ def get_s3_bucket_raw_keys_list( with open(keys_file, "w") as f_obj: subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], - env=NEW_ENV, + env=os.environ, shell=False, check=True, text=True, @@ -168,7 +167,7 @@ def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", f"{DATABASE_URL}", ], - env=NEW_ENV, + env=os.environ, shell=False, check=True, text=True, @@ -270,7 +269,7 @@ def output_csv(this_all_keys_dict): "--key", video_key, ], - env=NEW_ENV, + env=os.environ, shell=False, check=True, capture_output=True, @@ -304,8 +303,8 @@ def output_csv(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -if "AWS_PROFILE" in NEW_ENV: - print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) +if "AWS_PROFILE" in os.environ: + print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) if args.cached: print( From 5c0fb65132cfdd95f91e86d6d6951bfc8391378f Mon Sep 17 00:00:00 2001 From: jonholdsworth 
<82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:53:19 +1000 Subject: [PATCH 071/222] Exception test removed --- bin/get-video-s3-acls.py | 55 ++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 0e44ff21..2602a735 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -80,44 +80,39 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} - try: - with open(cache_file, "r") as f_obj: - for line in f_obj.readlines(): - ( - video_key, - is_present_str, - db_id_str, - gloss_id_str, - is_public_str, - ) = line.strip().split(CSV_DELIMITER) - - is_present = is_present_str.strip().lower() == "true" - if is_present: - nkeys_present += 1 - db_id = int(db_id_str) - # Some don't have gloss_id's - try: - gloss_id = int(gloss_id_str) - except ValueError: - gloss_id = None - is_public = is_public_str.strip().lower() == "true" - else: - nkeys_absent += 1 - db_id = None + with open(cache_file, "r") as f_obj: + for line in f_obj.readlines(): + ( + video_key, + is_present_str, + db_id_str, + gloss_id_str, + is_public_str, + ) = line.strip().split(CSV_DELIMITER) + + is_present = is_present_str.strip().lower() == "true" + if is_present: + nkeys_present += 1 + db_id = int(db_id_str) + # Some don't have gloss_id's + try: + gloss_id = int(gloss_id_str) + except ValueError: gloss_id = None - is_public = None + is_public = is_public_str.strip().lower() == "true" + else: + nkeys_absent += 1 + db_id = None + gloss_id = None + is_public = None - this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) return this_all_keys_dict - except FileNotFoundError: - print(f"File not found: {cache_file}", file=sys.stderr) - exit() - # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list( From 1a3a61254aa595557a5ad502677b450dcbe97654 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:26:33 +1000 Subject: [PATCH 072/222] PSQL client works smarter using COPY --- bin/get-video-s3-acls.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2602a735..dd2370de 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -55,7 +55,7 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.txt" +NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.csv" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" @@ -154,12 +154,13 @@ def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): file=sys.stderr, ) with open(keys_file, "w") as f_obj: + # In theory postgres COPY could output directly to our file, but subprocess.run throws an error subprocess.run( [ PGCLIENT, - "-t", "-c", - "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", + "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " + "TO STDOUT WITH (FORMAT CSV)", f"{DATABASE_URL}", ], env=os.environ, @@ -168,6 +169,7 @@ def 
get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): text=True, stdout=f_obj, ) + with open(keys_file, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() print( @@ -181,14 +183,8 @@ def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): rawl = rawl.strip() if not rawl: continue - columns = rawl.split("|") - db_id = columns[0].strip() - gloss_id = columns[1].strip() - is_public = columns[2].strip().lower() == "t" - # 'videofile' data is also the key for S3 - video_key = columns[3].strip() - # Each dictionary slot contains these values - this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] + [db_id, gloss_id, is_public, video_key] = rawl.split(",") + this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public.lower() == "t"] return this_nzsl_raw_keys_dict From 8ff16e3acc4b3b95d172acf1c76d124a43d7760f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 5 Sep 2024 18:09:04 +1000 Subject: [PATCH 073/222] Output canned ACL even if video_key absent from NZSL Signback postgres database. Intermediate step using re.split() --- bin/get-video-s3-acls.py | 47 +++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index dd2370de..50b4159d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -10,6 +10,7 @@ import subprocess import argparse import json +import re parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -121,7 +122,15 @@ def get_s3_bucket_raw_keys_list( print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( - [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], + [ + AWSCLIENT, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + "--output", + "json", + ], env=os.environ, shell=False, check=True, @@ -132,7 +141,9 @@ def get_s3_bucket_raw_keys_list( # Separate out just the key (also strip newline) from date, time, size, key # Put the keys in an in-memory list with open(keys_file, "r") as f_obj: - this_s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] + this_s3_bucket_raw_keys_list = [ + re.split(r"\s+", line, 3)[3].strip() for line in f_obj + ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved: {keys_file}", file=sys.stderr, @@ -242,24 +253,22 @@ def output_csv(this_all_keys_dict): is_public, ] in this_all_keys_dict.items(): - if not is_present: - print(f"{video_key},,,,,") - continue - # See signbank/video/models.py, line 59, in function set_public_acl() canned_acl_expected = "public-read" if is_public else "private" + run_array = [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], + run_array, env=os.environ, shell=False, check=True, @@ -278,6 +287,10 @@ def output_csv(this_all_keys_dict): if acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" + if not is_present: + print(f"{video_key},,,,,{canned_acl}") + continue + # CSV columns csv_column_list = [ f"{video_key}", From 05618312822d3259c769d2328e15058b4d00f2b9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:13:21 +1000 Subject: [PATCH 
074/222] S3 intermediate file removed --- bin/get-video-s3-acls.py | 54 ++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 50b4159d..b6379f3b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -57,7 +57,6 @@ print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.csv" -S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" # Vars @@ -68,7 +67,7 @@ # Truncate files, creating them if necessary def init_files( - files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE) + files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, ALL_KEYS_FILE) ): for p in files_list: f = open(p, "a") @@ -117,43 +116,33 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list( - s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE + s3_bucket=AWS_S3_BUCKET ): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - with open(keys_file, "w") as f_obj: - subprocess.run( - [ - AWSCLIENT, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - "--output", - "json", - ], - env=os.environ, - shell=False, - check=True, - text=True, - stdout=f_obj, - ) + result = subprocess.run( + [ + AWSCLIENT, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) - # Separate out just the key (also strip newline) from date, time, size, key - # Put the keys in an in-memory list - with open(keys_file, "r") as f_obj: - this_s3_bucket_raw_keys_list = [ - re.split(r"\s+", line, 3)[3].strip() for line in f_obj - ] + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split('\n'): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved: {keys_file}", + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", file=sys.stderr, ) - # Write the keys back to the file, for cleanliness - with open(keys_file, "w") as f_obj: - for line in this_s3_bucket_raw_keys_list: - f_obj.write(f"{line}\n") - return this_s3_bucket_raw_keys_list @@ -266,7 +255,6 @@ def output_csv(this_all_keys_dict): "--key", video_key, ] - result = subprocess.run( run_array, env=os.environ, From 47494caf733bd35f6478ce0be99c95cd9567910b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:31:04 +1000 Subject: [PATCH 075/222] Intermediate files gone. Cache file only. Tidy up. 
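The pattern used throughout after this change, as a standalone sketch
(illustrative only; the bucket name is a placeholder and the AWS CLI is
assumed to be installed and configured):

    import re
    import subprocess

    result = subprocess.run(
        ["aws", "s3", "ls", "s3://example-bucket", "--recursive"],
        capture_output=True,
        check=True,
        text=True,
    )
    # Each row is "date time size key"; maxsplit=3 keeps keys containing spaces intact.
    keys = [re.split(r"\s+", line, 3)[3] for line in result.stdout.splitlines() if line]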
--- bin/get-video-s3-acls.py | 75 +++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b6379f3b..25974631 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -56,8 +56,7 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.csv" -ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" +ALL_KEYS_CACHE_FILE = f"{TMPDIR}/all_keys_cache.csv" # Vars nzsl_raw_keys_dict = {} @@ -66,17 +65,15 @@ # Truncate files, creating them if necessary -def init_files( - files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, ALL_KEYS_FILE) -): +def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): for p in files_list: f = open(p, "a") f.truncate() f.close() -# Pull all info from existing file -def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): +# Pull all info from existing cache file +def get_keys_from_cache_file(cache_file=ALL_KEYS_CACHE_FILE): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} @@ -115,9 +112,7 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list( - s3_bucket=AWS_S3_BUCKET -): +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) result = subprocess.run( [ @@ -135,9 +130,10 @@ def get_s3_bucket_raw_keys_list( # Separate out just the key from date, time, size, key this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split('\n'): + for line in result.stdout.split("\n"): if line: this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", file=sys.stderr, @@ -147,52 +143,45 @@ def get_s3_bucket_raw_keys_list( # Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): +def get_nzsl_raw_keys_dict(): this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) - with open(keys_file, "w") as f_obj: - # In theory postgres COPY could output directly to our file, but subprocess.run throws an error - subprocess.run( - [ - PGCLIENT, - "-c", - "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " - "TO STDOUT WITH (FORMAT CSV)", - f"{DATABASE_URL}", - ], - env=os.environ, - shell=False, - check=True, - text=True, - stdout=f_obj, - ) - - with open(keys_file, "r") as f_obj: - nzsl_raw_keys_list = f_obj.readlines() - print( - f"{len(nzsl_raw_keys_list)} rows retrieved: {keys_file}", - file=sys.stderr, + result = subprocess.run( + [ + PGCLIENT, + "-c", + "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " + "TO STDOUT WITH (FORMAT CSV)", + f"{DATABASE_URL}", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, ) # Separate out the NZSL db columns # Write them to a dictionary, so we can do fast operations - for rawl in nzsl_raw_keys_list: + for rawl in result.stdout.split("\n"): rawl = rawl.strip() if not rawl: continue [db_id, gloss_id, is_public, video_key] = rawl.split(",") this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public.lower() == "t"] + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + return this_nzsl_raw_keys_dict # Get the s3 keys present and absent from our NZSL keys -def create_all_keys_dict( - 
this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file=ALL_KEYS_FILE -): +def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 nkeys_absent = 0 @@ -209,8 +198,8 @@ def create_all_keys_dict( print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) - # Write all keys back to a file - with open(all_keys_file, "w") as f_obj: + # Write all keys back to a cache file + with open(ALL_KEYS_CACHE_FILE, "w") as f_obj: for video_key, item_list in this_all_keys_dict.items(): outstr = ( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" @@ -299,12 +288,10 @@ def output_csv(this_all_keys_dict): print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) if args.cached: - print( - "Using the video keys we recorded on the last non-cached run.", file=sys.stderr - ) + print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) all_keys_dict = get_keys_from_cache_file() else: - print("Generating keys from scratch.", file=sys.stderr) + print("Generating video keys from scratch.", file=sys.stderr) init_files() s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() From 81d77897d44290b8224a96ac55c33106c5b26525 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:37:21 +1000 Subject: [PATCH 076/222] Text, whitespace --- bin/get-video-s3-acls.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 25974631..3c7010a4 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -27,17 +27,17 @@ default=False, required=False, action="store_true", - help="Use keys generated on a previous non-cached run (default: %(default)s) " - "(Don't mix PRODUCTION and STAGING!)", + help="Use video keys generated on a previous non-cached run (default: %(default)s) " + "(Do not mix production and staging!)", ) parser.add_argument( - "--pgclient", + "--pgcli", default="/usr/bin/psql", required=False, help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( - "--awsclient", + "--awscli", default="/usr/local/bin/aws", required=False, help=f"AWS client path (default: %(default)s)", @@ -45,8 +45,8 @@ args = parser.parse_args() # Globals -AWSCLIENT = args.awsclient -PGCLIENT = args.pgclient +AWSCLI = args.awscli +PGCLI = args.pgcli DATABASE_URL = os.getenv("DATABASE_URL", "") CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" @@ -116,7 +116,7 @@ def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) result = subprocess.run( [ - AWSCLIENT, + AWSCLI, "s3", "ls", f"s3://{s3_bucket}", @@ -151,7 +151,7 @@ def get_nzsl_raw_keys_dict(): ) result = subprocess.run( [ - PGCLIENT, + PGCLI, "-c", "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " "TO STDOUT WITH (FORMAT CSV)", @@ -195,6 +195,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): nkeys_absent += 1 # Add 'Present' (absent) column to start this_all_keys_dict[video_key] = [False, "", "", ""] + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) @@ -234,7 +235,7 @@ def 
output_csv(this_all_keys_dict): # See signbank/video/models.py, line 59, in function set_public_acl() canned_acl_expected = "public-read" if is_public else "private" run_array = [ - AWSCLIENT, + AWSCLI, "s3api", "get-object-acl", "--output", @@ -280,10 +281,10 @@ def output_csv(this_all_keys_dict): print(CSV_DELIMITER.join(csv_column_list)) -print(f"Mode: {args.mode}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) -print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +print(f"Mode: {args.mode}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 0fe589bfdbdc8b9583574ebc896577b692a45448 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:39:40 +1000 Subject: [PATCH 077/222] Cache file made global --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3c7010a4..de3548fc 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -73,11 +73,11 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # Pull all info from existing cache file -def get_keys_from_cache_file(cache_file=ALL_KEYS_CACHE_FILE): +def get_keys_from_cache_file(): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} - with open(cache_file, "r") as f_obj: + with open(ALL_KEYS_CACHE_FILE, "r") as f_obj: for line in f_obj.readlines(): ( video_key, From 278cb49350e96cf453ad862380651a0adc53bc05 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:47:44 +1000 Subject: [PATCH 078/222] Unbuffered python output --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index de3548fc..913c041f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env -S python3 -u +# Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres # aws s3 - NZSL IAM access From 666c8de798a7474c02b8383d0fd78c86577098df Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:34:34 +1000 Subject: [PATCH 079/222] Text change: mode -> env --- bin/get-video-s3-acls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 913c041f..7eadeec3 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -18,10 +18,10 @@ "Postgres access details, eg. DATABASE_URL env var." 
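A note on the "env -S" shebang that PATCH 078 introduces: it relies on env supporting option splitting, which GNU coreutils gained in 8.30 (the BSD and macOS env has it too); on an older env the interpreter line fails outright. A minimal in-script fallback with much the same effect for line-oriented progress output, assuming Python 3.7+ where reconfigure() exists on the standard text streams:

    import sys

    # Roughly what "python3 -u" gives print()-style output: flush at each
    # newline instead of waiting for a block buffer to fill when piped.
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)

    print("progress message", file=sys.stderr)  # appears immediately, even through a pipe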
) parser.add_argument( - "--mode", + "--env", default="uat", required=False, - help="Mode to run in, eg 'production, 'uat', etc (default: '%(default)s')", + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", ) parser.add_argument( "--cached", @@ -50,7 +50,7 @@ PGCLI = args.pgcli DATABASE_URL = os.getenv("DATABASE_URL", "") CSV_DELIMITER = "," -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) @@ -282,7 +282,7 @@ def output_csv(this_all_keys_dict): print(CSV_DELIMITER.join(csv_column_list)) -print(f"Mode: {args.mode}", file=sys.stderr) +print(f"Mode: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) From a884e3babbab52700c9f2e8a581cf0227695dbee Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:06:55 +1000 Subject: [PATCH 080/222] Cache file written in same loop as keys dictionary --- bin/get-video-s3-acls.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7eadeec3..bf69d26b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -186,27 +186,27 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 nkeys_absent = 0 + item_list = [] this_all_keys_dict = {} - for video_key in this_s3_bucket_raw_keys_list: - if video_key in this_nzsl_raw_keys_dict: - nkeys_present += 1 - # Add 'Present' column to start - this_all_keys_dict[video_key] = [True] + this_nzsl_raw_keys_dict[video_key] - else: - nkeys_absent += 1 - # Add 'Present' (absent) column to start - this_all_keys_dict[video_key] = [False, "", "", ""] - - print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) - print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) + with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: + for video_key in this_s3_bucket_raw_keys_list: + if video_key in this_nzsl_raw_keys_dict: + nkeys_present += 1 + # Add 'Present' column to start + item_list = [True] + this_nzsl_raw_keys_dict[video_key] + else: + nkeys_absent += 1 + # Add 'Present' (absent) column to start + item_list = [False, "", "", ""] + this_all_keys_dict[video_key] = item_list - # Write all keys back to a cache file - with open(ALL_KEYS_CACHE_FILE, "w") as f_obj: - for video_key, item_list in this_all_keys_dict.items(): - outstr = ( + # Write all keys back to a cache file + cache_file.write( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) - f_obj.write(outstr) + + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) + print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) return this_all_keys_dict From e0e7a8e45cd791ad806497d99c34fa6423f11c49 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:08:40 +1000 Subject: [PATCH 081/222] Simplified conditional --- bin/get-video-s3-acls.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bf69d26b..8a7ad557 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -262,9 +262,8 @@ def output_csv(this_all_keys_dict): and 
acls_grants_json[1]["Permission"] == "READ" ): canned_acl = "public-read" - else: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL": - canned_acl = "private" + elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" if not is_present: print(f"{video_key},,,,,{canned_acl}") From 7249def8d08db329bb79e257c30e48e4d9347b1d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:47:30 +1000 Subject: [PATCH 082/222] CSV construction deconstructed --- bin/get-video-s3-acls.py | 129 ++++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 8a7ad557..c0e68c89 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -211,20 +211,76 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): return this_all_keys_dict +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "Video S3 Key", + "Postgres ID", + "Gloss ID", + "Signbank Public", + "Expected S3 Canned ACL", + "Actual S3 Canned ACL", + ] + ) + + +def build_csv_row( + video_key, is_present=False, db_id=None, gloss_id=None, is_public=False +): + + run_array = [ + AWSCLI, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + result = subprocess.run( + run_array, + env=os.environ, + shell=False, + check=True, + capture_output=True, + text=True, + ) + canned_acl = "unknown" + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): + canned_acl = "public-read" + elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" + + # See signbank/video/models.py, line 59, in function set_public_acl() + if is_present: + canned_acl_expected = "public-read" if is_public else "private" + else: + canned_acl_expected = "" + + return CSV_DELIMITER.join( + [ + f"{video_key}", + f"{db_id}", + f"{gloss_id}", + f"{is_public}", + f"{canned_acl_expected}", + f"{canned_acl}", + ] + ) + + # From the keys present in NZSL, get all their ACL information def output_csv(this_all_keys_dict): print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) - # CSV header - csv_header_list = [ - "Video S3 Key", - "Postgres ID", - "Gloss ID", - "Signbank Public", - "Expected S3 Canned ACL", - "Actual S3 Canned ACL", - ] - print(CSV_DELIMITER.join(csv_header_list)) + print(build_csv_header()) for video_key, [ is_present, @@ -233,52 +289,15 @@ def output_csv(this_all_keys_dict): is_public, ] in this_all_keys_dict.items(): - # See signbank/video/models.py, line 59, in function set_public_acl() - canned_acl_expected = "public-read" if is_public else "private" - run_array = [ - AWSCLI, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - result = subprocess.run( - run_array, - env=os.environ, - shell=False, - check=True, - capture_output=True, - text=True, + print( + build_csv_row( + video_key, + is_present, + db_id, + gloss_id, + is_public, + ) ) - canned_acl = "unknown" - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: - if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" - ): - canned_acl = "public-read" - elif acls_grants_json[0]["Permission"] 
== "FULL_CONTROL": - canned_acl = "private" - - if not is_present: - print(f"{video_key},,,,,{canned_acl}") - continue - - # CSV columns - csv_column_list = [ - f"{video_key}", - f"{db_id}", - f"{gloss_id}", - f"{is_public}", - f"{canned_acl_expected}", - f"{canned_acl}", - ] - print(CSV_DELIMITER.join(csv_column_list)) print(f"Mode: {args.env}", file=sys.stderr) From 70d65fa036dee6e5b5ea355f3b835789d35f40e8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:07:36 +1000 Subject: [PATCH 083/222] Superfluous variable removed --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c0e68c89..cde4175b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -186,7 +186,6 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 nkeys_absent = 0 - item_list = [] this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: for video_key in this_s3_bucket_raw_keys_list: From 1c558b4fcd88fdf352e5478197bb6c1ad020d7cf Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:36:20 +1000 Subject: [PATCH 084/222] First approximation of bidirectional matching --- bin/get-video-s3-acls.py | 67 ++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index cde4175b..b269a3f0 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,6 +17,14 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) +# This debug will be removed +parser.add_argument( + "--debug", + default=False, + required=False, + action="store_true", + help="Turn on some debug actions (default: %(default)s) " +) parser.add_argument( "--env", default="uat", @@ -181,32 +189,52 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get the s3 keys present and absent from our NZSL keys +# Get the s3 keys present and absent from our NZSL keys, to dictionary: +# video_key(str) -> in_nzsl(bool), in_s3(bool), db_id(int), gloss_id(int), is_public(bool) def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): - print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) - nkeys_present = 0 - nkeys_absent = 0 + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: + + # Debug, we inject fake keys: grep for 'This_' + if args.debug: + this_nzsl_raw_keys_dict["This_key_is_in_both"] = [0, 1, True] + this_s3_bucket_raw_keys_list.append("This_key_is_in_both") + this_nzsl_raw_keys_dict["This_nzsl_key_is_not_in_s3"] = [0, 1, True] + this_s3_bucket_raw_keys_list.append("This_s3_key_is_not_in_nzsl") + + # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - nkeys_present += 1 - # Add 'Present' column to start - item_list = [True] + this_nzsl_raw_keys_dict[video_key] + if args.debug: + print(f"'{video_key}' in BOTH NZSL and S3") + # NZSL PRESENT, S3 PRESENT + this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ + video_key + ] else: - nkeys_absent += 1 - # Add 'Present' (absent) column to start - item_list = [False, "", "", ""] - this_all_keys_dict[video_key] = item_list - - # Write all keys back to a cache file + if args.debug: + print(f"'{video_key}' NOT in NZSL, but in S3") + # NZSL Absent, S3 PRESENT + this_all_keys_dict[video_key] = [False, True, "", "", ""] + + # Find NZSL keys that are absent from S3 (present handled already above) + for video_key, item_list in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + if args.debug: + print(f"'{video_key}' in NZSL, but NOT in S3") + # NZSL PRESENT, S3 Absent + this_all_keys_dict[video_key] = [True, False] + item_list + + # Write all keys back to a cache file + for video_key, item_list in this_all_keys_dict.items(): cache_file.write( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) - print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) - print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) - return this_all_keys_dict @@ -308,7 +336,9 @@ def output_csv(this_all_keys_dict): if args.cached: print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) - all_keys_dict = get_keys_from_cache_file() + print("We are not yet worthy.") + exit() + # all_keys_dict = get_keys_from_cache_file() else: print("Generating video keys from scratch.", file=sys.stderr) init_files() @@ -316,4 +346,7 @@ def output_csv(this_all_keys_dict): nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) +print("DEBUG EXIT") +exit() + output_csv(all_keys_dict) From 1ad888cd732ae3ff144201fc42dfd14a71c19413 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:52:13 +1000 Subject: [PATCH 085/222] Presence/Absence S3 
vs NZSL now bi-directional --- bin/get-video-s3-acls.py | 64 +++++++++++++--------------------------- 1 file changed, 20 insertions(+), 44 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b269a3f0..780e5bc3 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,14 +17,6 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." ) -# This debug will be removed -parser.add_argument( - "--debug", - default=False, - required=False, - action="store_true", - help="Turn on some debug actions (default: %(default)s) " -) parser.add_argument( "--env", default="uat", @@ -83,22 +75,21 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # Pull all info from existing cache file def get_keys_from_cache_file(): - nkeys_present = 0 - nkeys_absent = 0 this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "r") as f_obj: for line in f_obj.readlines(): ( video_key, - is_present_str, + key_in_nzsl_str, + key_in_s3_str, db_id_str, gloss_id_str, is_public_str, ) = line.strip().split(CSV_DELIMITER) - is_present = is_present_str.strip().lower() == "true" - if is_present: - nkeys_present += 1 + # If possible, get NZSL db info + key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" + if key_in_nzsl: db_id = int(db_id_str) # Some don't have gloss_id's try: @@ -107,15 +98,13 @@ def get_keys_from_cache_file(): gloss_id = None is_public = is_public_str.strip().lower() == "true" else: - nkeys_absent += 1 - db_id = None - gloss_id = None - is_public = None + db_id = "" + gloss_id = "" + is_public = "" - this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + key_in_s3 = key_in_s3_str.strip().lower() == "true" - print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) - print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) + this_all_keys_dict[video_key] = [key_in_nzsl, key_in_s3, db_id, gloss_id, is_public] return this_all_keys_dict @@ -199,33 +188,20 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: - # Debug, we inject fake keys: grep for 'This_' - if args.debug: - this_nzsl_raw_keys_dict["This_key_is_in_both"] = [0, 1, True] - this_s3_bucket_raw_keys_list.append("This_key_is_in_both") - this_nzsl_raw_keys_dict["This_nzsl_key_is_not_in_s3"] = [0, 1, True] - this_s3_bucket_raw_keys_list.append("This_s3_key_is_not_in_nzsl") - # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - if args.debug: - print(f"'{video_key}' in BOTH NZSL and S3") # NZSL PRESENT, S3 PRESENT this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ video_key ] else: - if args.debug: - print(f"'{video_key}' NOT in NZSL, but in S3") # NZSL Absent, S3 PRESENT this_all_keys_dict[video_key] = [False, True, "", "", ""] # Find NZSL keys that are absent from S3 (present handled already above) for video_key, item_list in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: - if args.debug: - print(f"'{video_key}' in NZSL, but NOT in S3") # NZSL PRESENT, S3 Absent this_all_keys_dict[video_key] = [True, False] + item_list @@ -252,9 +228,12 @@ def build_csv_header(): def build_csv_row( - video_key, is_present=False, db_id=None, gloss_id=None, is_public=False + video_key, key_in_nzsl=False, key_in_s3=False, db_id=None, gloss_id=None, is_public=False ): + if not key_in_s3: + 
return + run_array = [ AWSCLI, "s3api", @@ -286,7 +265,7 @@ def build_csv_row( canned_acl = "private" # See signbank/video/models.py, line 59, in function set_public_acl() - if is_present: + if key_in_nzsl: canned_acl_expected = "public-read" if is_public else "private" else: canned_acl_expected = "" @@ -310,7 +289,8 @@ def output_csv(this_all_keys_dict): print(build_csv_header()) for video_key, [ - is_present, + key_in_nzsl, + key_in_s3, db_id, gloss_id, is_public, @@ -319,7 +299,8 @@ def output_csv(this_all_keys_dict): print( build_csv_row( video_key, - is_present, + key_in_nzsl, + key_in_s3, db_id, gloss_id, is_public, @@ -336,9 +317,7 @@ def output_csv(this_all_keys_dict): if args.cached: print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) - print("We are not yet worthy.") - exit() - # all_keys_dict = get_keys_from_cache_file() + all_keys_dict = get_keys_from_cache_file() else: print("Generating video keys from scratch.", file=sys.stderr) init_files() @@ -346,7 +325,4 @@ def output_csv(this_all_keys_dict): nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) -print("DEBUG EXIT") -exit() - output_csv(all_keys_dict) From 2c7d3752059e43150031955d756a6f233a27e44f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:03:34 +1000 Subject: [PATCH 086/222] NZSL Present S3 Absent case now outputs to CSV --- bin/get-video-s3-acls.py | 65 ++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 780e5bc3..fb17a0b6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -104,7 +104,13 @@ def get_keys_from_cache_file(): key_in_s3 = key_in_s3_str.strip().lower() == "true" - this_all_keys_dict[video_key] = [key_in_nzsl, key_in_s3, db_id, gloss_id, is_public] + this_all_keys_dict[video_key] = [ + key_in_nzsl, + key_in_s3, + db_id, + gloss_id, + is_public, + ] return this_all_keys_dict @@ -228,25 +234,46 @@ def build_csv_header(): def build_csv_row( - video_key, key_in_nzsl=False, key_in_s3=False, db_id=None, gloss_id=None, is_public=False + video_key, + key_in_nzsl=False, + key_in_s3=False, + db_id=None, + gloss_id=None, + is_public=False, ): + # See signbank/video/models.py, line 59, in function set_public_acl() + if key_in_nzsl: + canned_acl_expected = "public-read" if is_public else "private" + else: + canned_acl_expected = "" + + # If key not in S3, just return its NZSL info if not key_in_s3: - return - - run_array = [ - AWSCLI, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] + return CSV_DELIMITER.join( + [ + f"{video_key}", + f"{db_id}", + f"{gloss_id}", + f"{is_public}", + f"{canned_acl_expected}", + "", + ] + ) + + # Get S3 object's ACL result = subprocess.run( - run_array, + [ + AWSCLI, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], env=os.environ, shell=False, check=True, @@ -264,12 +291,6 @@ def build_csv_row( elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" - # See signbank/video/models.py, line 59, in function set_public_acl() - if key_in_nzsl: - canned_acl_expected = "public-read" if is_public else "private" - else: - canned_acl_expected = "" - return CSV_DELIMITER.join( [ f"{video_key}", From 3438dd9c869d13da16be3eb6ae28323e4b444bca 
Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:08:05 +1000 Subject: [PATCH 087/222] Added --tmpdir argument --- bin/get-video-s3-acls.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index fb17a0b6..a5590027 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -43,6 +43,12 @@ required=False, help=f"AWS client path (default: %(default)s)", ) +parser.add_argument( + "--tmpdir", + default="/tmp/nzsl", + required=False, + help=f"Temp dir path (default: %(default)s)", +) args = parser.parse_args() # Globals @@ -51,7 +57,7 @@ DATABASE_URL = os.getenv("DATABASE_URL", "") CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" -TMPDIR = "/tmp/nzsl" +TMPDIR = args.tmpdir try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: From f13b7037129f6d70b76f2f201829b105b89339c4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:54:00 +1000 Subject: [PATCH 088/222] Minor tidy-ups --- bin/get-video-s3-acls.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a5590027..63bbff46 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -52,10 +52,10 @@ args = parser.parse_args() # Globals +CSV_DELIMITER = "," +DATABASE_URL = os.getenv("DATABASE_URL", "") AWSCLI = args.awscli PGCLI = args.pgcli -DATABASE_URL = os.getenv("DATABASE_URL", "") -CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" TMPDIR = args.tmpdir try: @@ -82,8 +82,8 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # Pull all info from existing cache file def get_keys_from_cache_file(): this_all_keys_dict = {} - with open(ALL_KEYS_CACHE_FILE, "r") as f_obj: - for line in f_obj.readlines(): + with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: + for line in cache_file.readlines(): ( video_key, key_in_nzsl_str, @@ -97,12 +97,12 @@ def get_keys_from_cache_file(): key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" if key_in_nzsl: db_id = int(db_id_str) + is_public = is_public_str.strip().lower() == "true" # Some don't have gloss_id's try: gloss_id = int(gloss_id_str) except ValueError: gloss_id = None - is_public = is_public_str.strip().lower() == "true" else: db_id = "" gloss_id = "" @@ -154,11 +154,11 @@ def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): - this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) + this_nzsl_raw_keys_dict = {} result = subprocess.run( [ PGCLI, @@ -190,7 +190,7 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get the s3 keys present and absent from our NZSL keys, to dictionary: +# Get the keys present and absent across NZSL Signbank and S3, to dictionary: # video_key(str) -> in_nzsl(bool), in_s3(bool), db_id(int), gloss_id(int), is_public(bool) def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( From fb5c8b728beed5a8b3604f9d021b5e26dc287dca Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:00:11 +1000 Subject: [PATCH 089/222] Initial code for new columns --- bin/get-video-s3-acls.py | 142 +++++++++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 37 
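PATCH 089 below widens the Signbank query into a two-table JOIN and moves to a '|' delimiter, since gloss names can contain commas. The query can be smoke-tested on its own; a sketch that mirrors the patch's SQL, assuming DATABASE_URL is set and psql sits at the script's default path:

    import os
    import subprocess

    QUERY = (
        "COPY ("
        "SELECT "
        "dg.id AS gloss_id, "
        "dg.idgloss AS gloss_idgloss, "
        "dg.created_at, "
        "dg.published AS gloss_public, "
        "vg.is_public AS video_public, "
        "vg.id AS video_id, "
        "vg.videofile AS video_key "
        "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id"
        ") TO STDOUT WITH DELIMITER AS '|'"
    )

    result = subprocess.run(
        ["/usr/bin/psql", "-c", QUERY, os.environ["DATABASE_URL"]],
        capture_output=True,
        check=True,
        text=True,
    )
    for raw in result.stdout.split("\n"):
        if raw.strip():
            # Seven pipe-separated columns, video key last
            print(raw.split("|"))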
deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 63bbff46..bdff1f24 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -79,6 +79,21 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): f.close() +# DICTIONARY and CACHE FILE format +# This is used at several points +# Essentially video_key + in_nzsl + in_s3 + nzsl_raw_keys_dict +# video_key(str) -> +# in_nzsl(bool), +# in_s3(bool), +# gloss_id(int), +# gloss_idgloss(str), +# created_at(str), +# gloss_public(bool), +# video_public(bool) +# video_id(int) +# TODO For cache file format maybe move the video key to the end of the row, for consistency + + # Pull all info from existing cache file def get_keys_from_cache_file(): this_all_keys_dict = {} @@ -88,34 +103,40 @@ def get_keys_from_cache_file(): video_key, key_in_nzsl_str, key_in_s3_str, - db_id_str, gloss_id_str, - is_public_str, + gloss_idgloss, + created_at, + gloss_public_str, + video_public_str, + video_id_str, ) = line.strip().split(CSV_DELIMITER) # If possible, get NZSL db info key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" if key_in_nzsl: - db_id = int(db_id_str) - is_public = is_public_str.strip().lower() == "true" + video_id = int(video_id_str) + gloss_public = gloss_public_str.strip().lower() == "true" + video_public = video_public_str.strip().lower() == "true" # Some don't have gloss_id's try: gloss_id = int(gloss_id_str) except ValueError: gloss_id = None else: - db_id = "" gloss_id = "" - is_public = "" + video_id = "" + gloss_public = "" + video_public = "" key_in_s3 = key_in_s3_str.strip().lower() == "true" this_all_keys_dict[video_key] = [ key_in_nzsl, key_in_s3, - db_id, + video_id, gloss_id, - is_public, + gloss_public, + video_public, ] return this_all_keys_dict @@ -159,12 +180,23 @@ def get_nzsl_raw_keys_dict(): file=sys.stderr, ) this_nzsl_raw_keys_dict = {} + # Column renaming is purely for readability + # We use a specific delimiter because columns might contain commas result = subprocess.run( [ PGCLI, "-c", - "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " - "TO STDOUT WITH (FORMAT CSV)", + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH DELIMITER AS '|'", f"{DATABASE_URL}", ], env=os.environ, @@ -173,14 +205,33 @@ def get_nzsl_raw_keys_dict(): text=True, ) - # Separate out the NZSL db columns + from pprint import pprint + + # Separate the NZSL db columns # Write them to a dictionary, so we can do fast operations for rawl in result.stdout.split("\n"): rawl = rawl.strip() + print(f">>>{rawl}<<<") + pprint(rawl.split(",")) if not rawl: continue - [db_id, gloss_id, is_public, video_key] = rawl.split(",") - this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public.lower() == "t"] + [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + this_nzsl_raw_keys_dict[video_key] = [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public.lower() == "t", + video_public.lower() == "t", + video_id, + ] print( f"{len(this_nzsl_raw_keys_dict)} rows retrieved", @@ -190,8 +241,8 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get the keys present and absent across NZSL Signbank and S3, to dictionary: -# 
video_key(str) -> in_nzsl(bool), in_s3(bool), db_id(int), gloss_id(int), is_public(bool) +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +# See DICTIONARY and CACHE FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( "Getting keys present and absent across NZSL Signbank and S3 ...", @@ -209,7 +260,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): ] else: # NZSL Absent, S3 PRESENT - this_all_keys_dict[video_key] = [False, True, "", "", ""] + this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] # Find NZSL keys that are absent from S3 (present handled already above) for video_key, item_list in this_nzsl_raw_keys_dict.items(): @@ -229,28 +280,34 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): def build_csv_header(): return CSV_DELIMITER.join( [ - "Video S3 Key", - "Postgres ID", "Gloss ID", - "Signbank Public", - "Expected S3 Canned ACL", - "Actual S3 Canned ACL", + "Gloss", + "Created at", + "Gloss public", + "Vido public", + "Video ID", + "Video key", + "Expected Canned ACL", + "Actual Canned ACL", ] ) def build_csv_row( - video_key, key_in_nzsl=False, key_in_s3=False, - db_id=None, gloss_id=None, - is_public=False, + gloss_idgloss=None, + created_at=None, + gloss_public=False, + video_public=False, + video_id=None, + video_key=None, ): # See signbank/video/models.py, line 59, in function set_public_acl() if key_in_nzsl: - canned_acl_expected = "public-read" if is_public else "private" + canned_acl_expected = "public-read" if video_public else "private" else: canned_acl_expected = "" @@ -258,10 +315,13 @@ def build_csv_row( if not key_in_s3: return CSV_DELIMITER.join( [ - f"{video_key}", - f"{db_id}", f"{gloss_id}", - f"{is_public}", + f"{gloss_idgloss}", + f"{created_at}", + f"{gloss_public}", + f"{video_public}", + f"{video_id}", + f"{video_key}", f"{canned_acl_expected}", "", ] @@ -299,10 +359,13 @@ def build_csv_row( return CSV_DELIMITER.join( [ - f"{video_key}", - f"{db_id}", f"{gloss_id}", - f"{is_public}", + f"{gloss_idgloss}", + f"{created_at}", + f"{gloss_public}", + f"{video_public}", + f"{video_id}", + f"{video_key}", f"{canned_acl_expected}", f"{canned_acl}", ] @@ -314,23 +377,28 @@ def output_csv(this_all_keys_dict): print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) - for video_key, [ key_in_nzsl, key_in_s3, - db_id, gloss_id, - is_public, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, ] in this_all_keys_dict.items(): print( build_csv_row( - video_key, key_in_nzsl, key_in_s3, - db_id, gloss_id, - is_public, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + video_key, ) ) From abc430fd497632f1ef500036caa89de758a5ad28 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:38:59 +1000 Subject: [PATCH 090/222] Debug removed, tmpdir announced --- bin/get-video-s3-acls.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bdff1f24..f8b3cbe5 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -211,8 +211,6 @@ def get_nzsl_raw_keys_dict(): # Write them to a dictionary, so we can do fast operations for rawl in result.stdout.split("\n"): rawl = rawl.strip() - print(f">>>{rawl}<<<") - pprint(rawl.split(",")) if not rawl: continue [ @@ -407,6 +405,7 @@ def 
output_csv(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"TMPDIR: {TMPDIR}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 70027e38e72fa848953de92ea9f57baf8acac605 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:20:16 +1000 Subject: [PATCH 091/222] typo --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index f8b3cbe5..ec9d6d4e 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -282,7 +282,7 @@ def build_csv_header(): "Gloss", "Created at", "Gloss public", - "Vido public", + "Video public", "Video ID", "Video key", "Expected Canned ACL", From f9727f26976327bc8022486479003095b0471f95 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:23:07 +1000 Subject: [PATCH 092/222] Debug removed --- bin/get-video-s3-acls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ec9d6d4e..ab1f9e7f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -205,8 +205,6 @@ def get_nzsl_raw_keys_dict(): text=True, ) - from pprint import pprint - # Separate the NZSL db columns # Write them to a dictionary, so we can do fast operations for rawl in result.stdout.split("\n"): From 336241cfed18e3f1c8e5754e771155b559bd8936 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:39:02 +1000 Subject: [PATCH 093/222] Tidy ups --- bin/get-video-s3-acls.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ab1f9e7f..bd748e75 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -111,25 +111,23 @@ def get_keys_from_cache_file(): video_id_str, ) = line.strip().split(CSV_DELIMITER) - # If possible, get NZSL db info key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" + key_in_s3 = key_in_s3_str.strip().lower() == "true" if key_in_nzsl: video_id = int(video_id_str) - gloss_public = gloss_public_str.strip().lower() == "true" - video_public = video_public_str.strip().lower() == "true" - # Some don't have gloss_id's + # Some have no gloss_id try: gloss_id = int(gloss_id_str) except ValueError: gloss_id = None + gloss_public = gloss_public_str.strip().lower() == "true" + video_public = video_public_str.strip().lower() == "true" else: - gloss_id = "" video_id = "" + gloss_id = "" gloss_public = "" video_public = "" - key_in_s3 = key_in_s3_str.strip().lower() == "true" - this_all_keys_dict[video_key] = [ key_in_nzsl, key_in_s3, @@ -181,7 +179,7 @@ def get_nzsl_raw_keys_dict(): ) this_nzsl_raw_keys_dict = {} # Column renaming is purely for readability - # We use a specific delimiter because columns might contain commas + # Special delimiter because columns might contain commas result = subprocess.run( [ PGCLI, From f6ffc184c2f5be3dcc400b9c777ac69cd9d65de0 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:27:34 +1000 Subject: [PATCH 094/222] Video key moved, functions reordered, gloss quoting hardened --- bin/get-video-s3-acls.py | 76 
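The gloss quoting hardening named in PATCH 094's subject strips the CSV delimiter out of gloss names (gloss_idgloss.replace(CSV_DELIMITER, "")) because the report lines are built by plain string joins. The alternative trade-off, sketched here with the stdlib csv writer, is to quote awkward fields instead of mutating them (the row values are made up for illustration):

    import csv
    import sys

    writer = csv.writer(sys.stdout)
    writer.writerow(["Gloss ID", "Gloss", "Video key"])
    # A gloss containing the delimiter survives intact because csv quotes it:
    # 1234,"example, with comma",glossvideo/example.mp4
    writer.writerow([1234, "example, with comma", "glossvideo/example.mp4"])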
+++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bd748e75..cc9f793b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -91,7 +91,6 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # gloss_public(bool), # video_public(bool) # video_id(int) -# TODO For cache file format maybe move the video key to the end of the row, for consistency # Pull all info from existing cache file @@ -100,7 +99,6 @@ def get_keys_from_cache_file(): with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: for line in cache_file.readlines(): ( - video_key, key_in_nzsl_str, key_in_s3_str, gloss_id_str, @@ -109,6 +107,7 @@ def get_keys_from_cache_file(): gloss_public_str, video_public_str, video_id_str, + video_key, ) = line.strip().split(CSV_DELIMITER) key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" @@ -140,37 +139,6 @@ def get_keys_from_cache_file(): return this_all_keys_dict -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( - [ - AWSCLI, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -179,7 +147,6 @@ def get_nzsl_raw_keys_dict(): ) this_nzsl_raw_keys_dict = {} # Column renaming is purely for readability - # Special delimiter because columns might contain commas result = subprocess.run( [ PGCLI, @@ -193,8 +160,8 @@ def get_nzsl_raw_keys_dict(): "vg.is_public AS video_public, " "vg.id AS video_id, " "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH DELIMITER AS '|'", + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id " + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|') ", f"{DATABASE_URL}", ], env=os.environ, @@ -220,7 +187,7 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ gloss_id, - gloss_idgloss, + gloss_idgloss.replace(CSV_DELIMITER, ""), created_at, gloss_public.lower() == "t", video_public.lower() == "t", @@ -235,6 +202,37 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = subprocess.run( + [ + AWSCLI, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + # Get the keys present and absent across NZSL Signbank and S3, to dictionary # See DICTIONARY and CACHE 
FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): @@ -265,7 +263,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Write all keys back to a cache file for video_key, item_list in this_all_keys_dict.items(): cache_file.write( - f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + f"{CSV_DELIMITER.join(map(str, item_list))}{CSV_DELIMITER}{video_key}\n" ) return this_all_keys_dict @@ -411,8 +409,8 @@ def output_csv(this_all_keys_dict): else: print("Generating video keys from scratch.", file=sys.stderr) init_files() - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From 76966ab9770fad531760e114e3f58d5143594afe Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:49:46 +1000 Subject: [PATCH 095/222] Revert "Video key moved, functions reordered, gloss quoting hardened" Total mess, doing it another way. This reverts commit f6ffc184c2f5be3dcc400b9c777ac69cd9d65de0. --- bin/get-video-s3-acls.py | 76 +++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index cc9f793b..bd748e75 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -91,6 +91,7 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # gloss_public(bool), # video_public(bool) # video_id(int) +# TODO For cache file format maybe move the video key to the end of the row, for consistency # Pull all info from existing cache file @@ -99,6 +100,7 @@ def get_keys_from_cache_file(): with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: for line in cache_file.readlines(): ( + video_key, key_in_nzsl_str, key_in_s3_str, gloss_id_str, @@ -107,7 +109,6 @@ def get_keys_from_cache_file(): gloss_public_str, video_public_str, video_id_str, - video_key, ) = line.strip().split(CSV_DELIMITER) key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" @@ -139,6 +140,37 @@ def get_keys_from_cache_file(): return this_all_keys_dict +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = subprocess.run( + [ + AWSCLI, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -147,6 +179,7 @@ def get_nzsl_raw_keys_dict(): ) this_nzsl_raw_keys_dict = {} # Column renaming is purely for readability + # Special delimiter because columns might contain commas result = subprocess.run( [ PGCLI, @@ -160,8 +193,8 @@ def get_nzsl_raw_keys_dict(): "vg.is_public AS video_public, " "vg.id AS video_id, " "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id " - ") TO STDOUT WITH (FORMAT CSV, DELIMITER 
'|') ", + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH DELIMITER AS '|'", f"{DATABASE_URL}", ], env=os.environ, @@ -187,7 +220,7 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ gloss_id, - gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_idgloss, created_at, gloss_public.lower() == "t", video_public.lower() == "t", @@ -202,37 +235,6 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( - [ - AWSCLI, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - # Get the keys present and absent across NZSL Signbank and S3, to dictionary # See DICTIONARY and CACHE FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): @@ -263,7 +265,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Write all keys back to a cache file for video_key, item_list in this_all_keys_dict.items(): cache_file.write( - f"{CSV_DELIMITER.join(map(str, item_list))}{CSV_DELIMITER}{video_key}\n" + f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) return this_all_keys_dict @@ -409,8 +411,8 @@ def output_csv(this_all_keys_dict): else: print("Generating video keys from scratch.", file=sys.stderr) init_files() - nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() + nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From afcfa813072d136959e145a66fb661d596930ec1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:01:12 +1000 Subject: [PATCH 096/222] Cache file removed Too difficult to maintain --- bin/get-video-s3-acls.py | 126 ++++++++------------------------------- 1 file changed, 25 insertions(+), 101 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bd748e75..57f8605a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,6 +12,7 @@ import argparse import json import re +from pprint import pprint parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" @@ -23,14 +24,6 @@ required=False, help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", ) -parser.add_argument( - "--cached", - default=False, - required=False, - action="store_true", - help="Use video keys generated on a previous non-cached run (default: %(default)s) " - "(Do not mix production and staging!)", -) parser.add_argument( "--pgcli", default="/usr/bin/psql", @@ -63,23 +56,13 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -ALL_KEYS_CACHE_FILE = f"{TMPDIR}/all_keys_cache.csv" # Vars nzsl_raw_keys_dict = {} s3_bucket_raw_keys_list = [] all_keys_dict = {} - -# Truncate files, creating them if necessary -def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): - for p in files_list: - f = open(p, "a") - f.truncate() - f.close() - - -# DICTIONARY and CACHE FILE format +# DICTIONARY format # This is used at several points # Essentially video_key + in_nzsl + in_s3 + nzsl_raw_keys_dict # video_key(str) -> @@ -91,53 +74,6 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # gloss_public(bool), # video_public(bool) # video_id(int) -# TODO For cache file format maybe move the video key to the end of the row, for consistency - - -# Pull all info from existing cache file -def get_keys_from_cache_file(): - this_all_keys_dict = {} - with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: - for line in cache_file.readlines(): - ( - video_key, - key_in_nzsl_str, - key_in_s3_str, - gloss_id_str, - gloss_idgloss, - created_at, - gloss_public_str, - video_public_str, - video_id_str, - ) = line.strip().split(CSV_DELIMITER) - - key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" - key_in_s3 = key_in_s3_str.strip().lower() == "true" - if key_in_nzsl: - video_id = int(video_id_str) - # Some have no gloss_id - try: - gloss_id = int(gloss_id_str) - except ValueError: - gloss_id = None - gloss_public = gloss_public_str.strip().lower() == "true" - video_public = video_public_str.strip().lower() == "true" - else: - video_id = "" - gloss_id = "" - gloss_public = "" - video_public = "" - - this_all_keys_dict[video_key] = [ - key_in_nzsl, - key_in_s3, - video_id, - gloss_id, - gloss_public, - video_public, - ] - - return this_all_keys_dict # Get all keys from AWS S3 @@ -194,7 +130,7 @@ def get_nzsl_raw_keys_dict(): "vg.id AS video_id, " "vg.videofile AS video_key " "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH DELIMITER AS '|'", + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", f"{DATABASE_URL}", ], env=os.environ, @@ -220,7 +156,7 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ gloss_id, - gloss_idgloss, + gloss_idgloss.replace(CSV_DELIMITER, ""), created_at, gloss_public.lower() == "t", video_public.lower() == "t", @@ -243,30 +179,23 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): file=sys.stderr, ) this_all_keys_dict = {} - with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - if video_key in this_nzsl_raw_keys_dict: - # NZSL PRESENT, S3 PRESENT - this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ - video_key - ] - else: - # NZSL Absent, S3 PRESENT - this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] - - # Find NZSL keys that are absent from S3 (present handled already above) - for video_key, item_list in this_nzsl_raw_keys_dict.items(): - 
if video_key not in this_s3_bucket_raw_keys_list: - # NZSL PRESENT, S3 Absent - this_all_keys_dict[video_key] = [True, False] + item_list - - # Write all keys back to a cache file - for video_key, item_list in this_all_keys_dict.items(): - cache_file.write( - f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" - ) + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + if video_key in this_nzsl_raw_keys_dict: + # NZSL PRESENT, S3 PRESENT + this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ + video_key + ] + else: + # NZSL Absent, S3 PRESENT + this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] + + # Find NZSL keys that are absent from S3 (present handled already above) + for video_key, item_list in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + # NZSL PRESENT, S3 Absent + this_all_keys_dict[video_key] = [True, False] + item_list return this_all_keys_dict @@ -371,6 +300,7 @@ def output_csv(this_all_keys_dict): print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) + for video_key, [ key_in_nzsl, key_in_s3, @@ -405,14 +335,8 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -if args.cached: - print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) - all_keys_dict = get_keys_from_cache_file() -else: - print("Generating video keys from scratch.", file=sys.stderr) - init_files() - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() - nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() - all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) +s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() +nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() +all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From a5ed6328b1b6556c53bfc7f68c621394f6d33478 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:32:56 +1000 Subject: [PATCH 097/222] Fields broken out for clarity --- bin/get-video-s3-acls.py | 58 +++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 57f8605a..ff7bb19f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -172,7 +172,6 @@ def get_nzsl_raw_keys_dict(): # Get the keys present and absent across NZSL Signbank and S3, to dictionary -# See DICTIONARY and CACHE FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( "Getting keys present and absent across NZSL Signbank and S3 ...", @@ -183,19 +182,60 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - # NZSL PRESENT, S3 PRESENT - this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ - video_key + + # This is split out purely for human readability + [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + ] = this_nzsl_raw_keys_dict[video_key] + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + 
video_id, ] else: - # NZSL Absent, S3 PRESENT - this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + "", # gloss_id + "", # gloss_idgloss, + "", # created_at, + "", # gloss_public, + "", # video_public, + "" # video_id, + ] # Find NZSL keys that are absent from S3 (present handled already above) - for video_key, item_list in this_nzsl_raw_keys_dict.items(): + for (video_key, + [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + ] + ) in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: - # NZSL PRESENT, S3 Absent - this_all_keys_dict[video_key] = [True, False] + item_list + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + ] return this_all_keys_dict From b5bec417ef1787e945caac7fc493cb63d0ffdd4f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:36:01 +1000 Subject: [PATCH 098/222] Reordered function declarations --- bin/get-video-s3-acls.py | 75 +++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ff7bb19f..15942b31 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -62,50 +62,6 @@ s3_bucket_raw_keys_list = [] all_keys_dict = {} -# DICTIONARY format -# This is used at several points -# Essentially video_key + in_nzsl + in_s3 + nzsl_raw_keys_dict -# video_key(str) -> -# in_nzsl(bool), -# in_s3(bool), -# gloss_id(int), -# gloss_idgloss(str), -# created_at(str), -# gloss_public(bool), -# video_public(bool) -# video_id(int) - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( - [ - AWSCLI, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): @@ -171,6 +127,37 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = subprocess.run( + [ + AWSCLI, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + # Get the keys present and absent across NZSL Signbank and S3, to dictionary def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( From 
7b7f111efa8cde94f0bf099a6ee803dee42361f4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:39:11 +1000 Subject: [PATCH 099/222] Renamed created_at prior to S3 replacement --- bin/get-video-s3-acls.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 15942b31..c6ef4967 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -80,7 +80,7 @@ def get_nzsl_raw_keys_dict(): "SELECT " "dg.id AS gloss_id, " "dg.idgloss AS gloss_idgloss, " - "dg.created_at, " + "dg.created_at AS gloss_created_at, " "dg.published AS gloss_public, " "vg.is_public AS video_public, " "vg.id AS video_id, " @@ -104,7 +104,7 @@ def get_nzsl_raw_keys_dict(): [ gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -113,7 +113,7 @@ def get_nzsl_raw_keys_dict(): this_nzsl_raw_keys_dict[video_key] = [ gloss_id, gloss_idgloss.replace(CSV_DELIMITER, ""), - created_at, + gloss_created_at, gloss_public.lower() == "t", video_public.lower() == "t", video_id, @@ -174,7 +174,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): [ gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -184,7 +184,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): True, # S3 PRESENT gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -195,7 +195,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): True, # S3 PRESENT "", # gloss_id "", # gloss_idgloss, - "", # created_at, + "", # gloss_created_at, "", # gloss_public, "", # video_public, "" # video_id, @@ -206,7 +206,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): [ gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -218,7 +218,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): False, # S3 Absent gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -232,7 +232,7 @@ def build_csv_header(): [ "Gloss ID", "Gloss", - "Created at", + "Gloss created at", "Gloss public", "Video public", "Video ID", @@ -248,7 +248,7 @@ def build_csv_row( key_in_s3=False, gloss_id=None, gloss_idgloss=None, - created_at=None, + gloss_created_at=None, gloss_public=False, video_public=False, video_id=None, @@ -267,7 +267,7 @@ def build_csv_row( [ f"{gloss_id}", f"{gloss_idgloss}", - f"{created_at}", + f"{gloss_created_at}", f"{gloss_public}", f"{video_public}", f"{video_id}", @@ -311,7 +311,7 @@ def build_csv_row( [ f"{gloss_id}", f"{gloss_idgloss}", - f"{created_at}", + f"{gloss_created_at}", f"{gloss_public}", f"{video_public}", f"{video_id}", @@ -333,7 +333,7 @@ def output_csv(this_all_keys_dict): key_in_s3, gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -345,7 +345,7 @@ def output_csv(this_all_keys_dict): key_in_s3, gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, From 6360bf641bc0b4e7d02e1035aec4976f9b8b3205 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:02:58 +1000 Subject: [PATCH 100/222] Reformatted --- bin/get-video-s3-acls.py | 46 
++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c6ef4967..1672078c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -70,7 +70,7 @@ def get_nzsl_raw_keys_dict(): file=sys.stderr, ) this_nzsl_raw_keys_dict = {} - # Column renaming is purely for readability + # Column renaming is for readability # Special delimiter because columns might contain commas result = subprocess.run( [ @@ -110,6 +110,7 @@ def get_nzsl_raw_keys_dict(): video_id, video_key, ] = rawl.split("|") + this_nzsl_raw_keys_dict[video_key] = [ gloss_id, gloss_idgloss.replace(CSV_DELIMITER, ""), @@ -170,7 +171,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - # This is split out purely for human readability + # Split out for readability [ gloss_id, gloss_idgloss, @@ -179,9 +180,10 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): video_public, video_id, ] = this_nzsl_raw_keys_dict[video_key] + this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT + True, # NZSL PRESENT + True, # S3 PRESENT gloss_id, gloss_idgloss, gloss_created_at, @@ -192,29 +194,27 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): else: this_all_keys_dict[video_key] = [ False, # NZSL Absent - True, # S3 PRESENT - "", # gloss_id - "", # gloss_idgloss, - "", # gloss_created_at, - "", # gloss_public, - "", # video_public, - "" # video_id, + True, # S3 PRESENT + "", # gloss_id + "", # gloss_idgloss, + "", # gloss_created_at, + "", # gloss_public, + "", # video_public, + "", # video_id, ] - # Find NZSL keys that are absent from S3 (present handled already above) - for (video_key, - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - ] - ) in this_nzsl_raw_keys_dict.items(): + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT + True, # NZSL PRESENT False, # S3 Absent gloss_id, gloss_idgloss, From f985cd8a8e6bfb9e08e331e8dde4403fba3b1889 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:21:47 +1000 Subject: [PATCH 101/222] Fields rearranged (still gloss created_at) --- bin/get-video-s3-acls.py | 70 ++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1672078c..7316c74b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -112,12 +112,12 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ - gloss_id, gloss_idgloss.replace(CSV_DELIMITER, ""), gloss_created_at, + gloss_id, + video_id, gloss_public.lower() == "t", video_public.lower() == "t", - video_id, ] print( @@ -173,55 +173,55 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Split out for readability [ - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] = this_nzsl_raw_keys_dict[video_key] this_all_keys_dict[video_key] = [ 
True, # NZSL PRESENT True, # S3 PRESENT - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] else: this_all_keys_dict[video_key] = [ False, # NZSL Absent True, # S3 PRESENT - "", # gloss_id "", # gloss_idgloss, "", # gloss_created_at, + "", # gloss_id + "", # video_id, "", # gloss_public, "", # video_public, - "", # video_id, ] # Find NZSL keys that are absent from S3 (present handled above) for video_key, [ - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ True, # NZSL PRESENT False, # S3 Absent - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] return this_all_keys_dict @@ -230,15 +230,15 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): def build_csv_header(): return CSV_DELIMITER.join( [ - "Gloss ID", + "Video key", "Gloss", "Gloss created at", - "Gloss public", - "Video public", - "Video ID", - "Video key", "Expected Canned ACL", "Actual Canned ACL", + "Gloss ID", + "Video ID", + "Gloss public", + "Video public", ] ) @@ -246,13 +246,13 @@ def build_csv_header(): def build_csv_row( key_in_nzsl=False, key_in_s3=False, - gloss_id=None, + video_key=None, gloss_idgloss=None, gloss_created_at=None, + gloss_id=None, + video_id=None, gloss_public=False, video_public=False, - video_id=None, - video_key=None, ): # See signbank/video/models.py, line 59, in function set_public_acl() @@ -265,15 +265,15 @@ def build_csv_row( if not key_in_s3: return CSV_DELIMITER.join( [ - f"{gloss_id}", + f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", + f"{canned_acl_expected}", + "", # Actual Canned ACL + f"{gloss_id}", + f"{video_id}", f"{gloss_public}", f"{video_public}", - f"{video_id}", - f"{video_key}", - f"{canned_acl_expected}", - "", ] ) @@ -309,15 +309,15 @@ def build_csv_row( return CSV_DELIMITER.join( [ - f"{gloss_id}", + f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", - f"{gloss_public}", - f"{video_public}", - f"{video_id}", - f"{video_key}", f"{canned_acl_expected}", f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", ] ) @@ -331,25 +331,25 @@ def output_csv(this_all_keys_dict): for video_key, [ key_in_nzsl, key_in_s3, - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] in this_all_keys_dict.items(): print( build_csv_row( key_in_nzsl, key_in_s3, - gloss_id, + video_key, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, - video_key, ) ) From 8387571dd727cdea3d478346caa62ce135743c7b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:24:54 +1000 Subject: [PATCH 102/222] Comments --- bin/get-video-s3-acls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7316c74b..887b68ea 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -255,7 +255,7 @@ def build_csv_row( video_public=False, ): - # See signbank/video/models.py, line 59, in function set_public_acl() + # See signbank/video/models.py, line 59, function set_public_acl() if key_in_nzsl: canned_acl_expected = "public-read" if video_public else "private" else: @@ -307,6 +307,9 @@ def 
build_csv_row( elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" + # TODO Get S3 object's LastModified date/time + # --- + return CSV_DELIMITER.join( [ f"{video_key}", From 51caea740f48cde34fc9aea779616109a7cb0912 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:50:16 +1000 Subject: [PATCH 103/222] S3 Lastmodified datetime. Reordering. Column names updated. --- bin/get-video-s3-acls.py | 48 ++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 887b68ea..4da8e49b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -230,15 +230,16 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): def build_csv_header(): return CSV_DELIMITER.join( [ - "Video key", - "Gloss", - "Gloss created at", - "Expected Canned ACL", - "Actual Canned ACL", - "Gloss ID", - "Video ID", - "Gloss public", - "Video public", + "S3 Video key", + "Sbank Gloss", + "Sbank Gloss created at", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", ] ) @@ -307,14 +308,35 @@ def build_csv_row( elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" - # TODO Get S3 object's LastModified date/time - # --- + # Get S3 object's LastModified date/time + result = subprocess.run( + [ + AWSCLI, + "s3api", + "get-object-attributes", + "--object-attributes", + "ObjectParts", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=os.environ, + shell=False, + check=True, + capture_output=True, + text=True, + ) + s3_lastmodified = json.loads(result.stdout)["LastModified"] return CSV_DELIMITER.join( [ f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", + f"{s3_lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", @@ -365,8 +387,10 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() + +s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() + all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From 0e92d203d624fbfaf50451c8587efdb6d3607d8e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:52:37 +1000 Subject: [PATCH 104/222] Message updated --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4da8e49b..cddde0c4 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -349,7 +349,7 @@ def build_csv_row( # From the keys present in NZSL, get all their ACL information def output_csv(this_all_keys_dict): - print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) + print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) From c26b9aac0f6c22e56721387e2493f56e73a6360d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:54:56 +1000 Subject: [PATCH 105/222] Message --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
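Patch 103's canned-ACL inference works because S3 materialises canned ACLs as grant lists: the bucket owner always holds FULL_CONTROL, and a public-read object carries one extra READ grant for the AllUsers group. A minimal sketch of the same check against boto3 rather than the shelled-out CLI (guess_canned_acl is an illustrative name, not part of this series, and credentials are assumed to be configured already, e.g. via AWS_PROFILE):

    import boto3

    def guess_canned_acl(bucket, key):
        # Same rule as patch 103: owner FULL_CONTROL plus a READ grant
        # reads back as public-read; a lone FULL_CONTROL as private.
        s3 = boto3.client("s3")
        grants = s3.get_object_acl(Bucket=bucket, Key=key)["Grants"]
        permissions = [g["Permission"] for g in grants]
        if permissions[:2] == ["FULL_CONTROL", "READ"]:
            return "public-read"
        if permissions == ["FULL_CONTROL"]:
            return "private"
        return "unknown"

For LastModified, boto3's head_object(Bucket=bucket, Key=key)["LastModified"] returns the same timestamp that patch 103 extracts from the get-object-attributes JSON; patch 122 later makes the equivalent switch to head-object on the CLI side.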
diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index cddde0c4..83a2c8b8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -379,7 +379,7 @@ def output_csv(this_all_keys_dict): ) -print(f"Mode: {args.env}", file=sys.stderr) +print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) From 86e4d34a21359d8708d9ad403d6f1abaa26ca4e9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:03:26 +1000 Subject: [PATCH 106/222] Rearranged. Intermediate variables removed. Black. --- bin/get-video-s3-acls.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 83a2c8b8..dabadd43 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -57,11 +57,6 @@ print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -# Vars -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} - # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): @@ -160,7 +155,7 @@ def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): # Get the keys present and absent across NZSL Signbank and S3, to dictionary -def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): print( "Getting keys present and absent across NZSL Signbank and S3 ...", file=sys.stderr, @@ -387,10 +382,6 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() - -s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() - -all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) - -output_csv(all_keys_dict) +output_csv( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) From 0d564a9f161f3f0982e03d6903dc4668e76a41ce Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:06:59 +1000 Subject: [PATCH 107/222] Comment --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index dabadd43..e270dc41 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -342,7 +342,7 @@ def build_csv_row( ) -# From the keys present in NZSL, get all their ACL information +# From the keys present in NZSL, get all their S3 information def output_csv(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) From 45064bad067f25e8a5fe66cd65914e26634722c8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:08:14 +1000 Subject: [PATCH 108/222] Missing empty LastModified for case not in S3 --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e270dc41..fe886862 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -264,6 +264,7 @@ def build_csv_row( f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", + "", # S3 LastModified f"{canned_acl_expected}", "", # Actual Canned ACL f"{gloss_id}", From 
4cff3da932afbcee24d9f569c9ac4ed01893f1b4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 13:58:37 +1000 Subject: [PATCH 109/222] TMPDIR removed --- bin/get-video-s3-acls.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index fe886862..5fd14e0a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -50,12 +50,6 @@ AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" -TMPDIR = args.tmpdir -try: - os.makedirs(TMPDIR, exist_ok=True) -except OSError as err: - print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) - exit() # Get the video files info from NZSL Signbank @@ -379,7 +373,6 @@ def output_csv(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"TMPDIR: {TMPDIR}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 7795405ecfcd1cf8760885901ff2fda16ea18a87 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:00:36 +1000 Subject: [PATCH 110/222] More efficient dictionary lookup --- bin/get-video-s3-acls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5fd14e0a..817d7cb2 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -158,8 +158,8 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: - if video_key in this_nzsl_raw_keys_dict: - + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: # Split out for readability [ gloss_idgloss, @@ -168,7 +168,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): video_id, gloss_public, video_public, - ] = this_nzsl_raw_keys_dict[video_key] + ] = dict_row this_all_keys_dict[video_key] = [ True, # NZSL PRESENT From a68b89bb2d470cc9b9d095fee93c285bb74c969e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:18:56 +1000 Subject: [PATCH 111/222] comments --- bin/get-video-s3-acls.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 817d7cb2..88589d0e 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -321,6 +321,32 @@ def build_csv_row( ) s3_lastmodified = json.loads(result.stdout)["LastModified"] + + #TODO Logic goes here + # We decide what to do + # We mark what to do, but we don't actually do it + # If we are in 'Go' mode, and there is a previous mark, we do what that mark says + # Then we change the mark to past tense + + # Cases + + # Only a Signbank entry, no S3 + # - nothing to do, no action, NOOP + + # Only an S3 entry, no Signbank + # - DELETE the S3 entry + + # Both: + + # Private gloss + # - set S3 PRIVATE, that's it + + # Public gloss, video private, S3 public + # - set S3 PRIVATE + + # Public gloss, video public, S3 private + # - set S3 PUBLIC + return CSV_DELIMITER.join( [ f"{video_key}", From 6a5514ab681634731e2be4237604161f06f812c8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 3 
Oct 2024 12:25:12 +1000 Subject: [PATCH 112/222] Basic 'action' recommendation working --- bin/get-video-s3-acls.py | 64 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 88589d0e..c0a12c0b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -18,6 +18,14 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." ) +# 'Go' mode, args.do_actions +parser.add_argument( + "--do-actions", + action='store_true', + default=False, + required=False, + help="Actually perform Delete objects or change ACLs (DESTRUCTIVE operation)", +) parser.add_argument( "--env", default="uat", @@ -183,7 +191,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): else: this_all_keys_dict[video_key] = [ False, # NZSL Absent - True, # S3 PRESENT + True, # S3 PRESENT "", # gloss_idgloss, "", # gloss_created_at, "", # gloss_id @@ -203,7 +211,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT + True, # NZSL PRESENT False, # S3 Absent gloss_idgloss, gloss_created_at, @@ -229,6 +237,7 @@ def build_csv_header(): "Sbank Video ID", "Sbank Gloss public", "Sbank Video public", + "Action", ] ) @@ -245,13 +254,29 @@ def build_csv_row( video_public=False, ): + action = "" + # Cases + # In S3 In NZSL Action + # Is No Delete! + # Is Is Check ACL + # Not Is Review + # (F F impossible) + + if key_in_s3: + if key_in_nzsl: + action = "Check ACL" + else: + action = "Delete" + else: + action = "Review" + # See signbank/video/models.py, line 59, function set_public_acl() if key_in_nzsl: canned_acl_expected = "public-read" if video_public else "private" else: canned_acl_expected = "" - # If key not in S3, just return its NZSL info + # If key not in S3, just return its NZSL info and action if not key_in_s3: return CSV_DELIMITER.join( [ @@ -265,6 +290,7 @@ def build_csv_row( f"{video_id}", f"{gloss_public}", f"{video_public}", + action, ] ) @@ -321,32 +347,6 @@ def build_csv_row( ) s3_lastmodified = json.loads(result.stdout)["LastModified"] - - #TODO Logic goes here - # We decide what to do - # We mark what to do, but we don't actually do it - # If we are in 'Go' mode, and there is a previous mark, we do what that mark says - # Then we change the mark to past tense - - # Cases - - # Only a Signbank entry, no S3 - # - nothing to do, no action, NOOP - - # Only an S3 entry, no Signbank - # - DELETE the S3 entry - - # Both: - - # Private gloss - # - set S3 PRIVATE, that's it - - # Public gloss, video private, S3 public - # - set S3 PRIVATE - - # Public gloss, video public, S3 private - # - set S3 PUBLIC - return CSV_DELIMITER.join( [ f"{video_key}", @@ -359,12 +359,14 @@ def build_csv_row( f"{video_id}", f"{gloss_public}", f"{video_public}", + action, ] ) # From the keys present in NZSL, get all their S3 information -def output_csv(this_all_keys_dict): +# If we are in 'Go' mode, perform actions +def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) @@ -402,6 +404,6 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -output_csv( +process_keys( 
create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 27b389d9015ff54dcf0c10b9f22bede2373273c6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 14:18:42 +1100 Subject: [PATCH 113/222] Pass whole row to build_csv_header() --- bin/get-video-s3-acls.py | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c0a12c0b..743dc960 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -243,9 +243,9 @@ def build_csv_header(): def build_csv_row( + video_key, key_in_nzsl=False, key_in_s3=False, - video_key=None, gloss_idgloss=None, gloss_created_at=None, gloss_id=None, @@ -371,36 +371,14 @@ def process_keys(this_all_keys_dict): print(build_csv_header()) - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in this_all_keys_dict.items(): - - print( - build_csv_row( - key_in_nzsl, - key_in_s3, - video_key, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ) - ) + for video_key, values in this_all_keys_dict.items(): + print(build_csv_row(video_key, *values)) -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 777dac0000ec9417a718c38a556a6c726413b39f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:14:31 +1100 Subject: [PATCH 114/222] tweaks --- bin/get-video-s3-acls.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 743dc960..818003ae 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,7 +21,7 @@ # 'Go' mode, args.do_actions parser.add_argument( "--do-actions", - action='store_true', + action="store_true", default=False, required=False, help="Actually perform Delete objects or change ACLs (DESTRUCTIVE operation)", @@ -191,7 +191,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): else: this_all_keys_dict[video_key] = [ False, # NZSL Absent - True, # S3 PRESENT + True, # S3 PRESENT "", # gloss_idgloss, "", # gloss_created_at, "", # gloss_id @@ -211,7 +211,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT + True, # NZSL PRESENT False, # S3 Absent gloss_idgloss, gloss_created_at, @@ -379,8 +379,7 @@ def process_keys(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) -if "AWS_PROFILE" in os.environ: - print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) From 
5b2715f2cfc205674c51bcea5da15fc86855dc4d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:40:57 +1100 Subject: [PATCH 115/222] Actions in a function, rearranged --- bin/get-video-s3-acls.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 818003ae..2d3c85ea 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -242,6 +242,20 @@ def build_csv_header(): ) +# Cases +# In S3 In NZSL Action +# Is Not Delete S3 Object +# Is Is Update ACL +# Not Is Review +def get_action(key_in_nzsl, key_in_s3): + if key_in_s3: + if key_in_nzsl: + return "Update ACL" + else: + return "Delete S3 Object" + return "Review" + + def build_csv_row( video_key, key_in_nzsl=False, @@ -254,21 +268,7 @@ def build_csv_row( video_public=False, ): - action = "" - # Cases - # In S3 In NZSL Action - # Is No Delete! - # Is Is Check ACL - # Not Is Review - # (F F impossible) - - if key_in_s3: - if key_in_nzsl: - action = "Check ACL" - else: - action = "Delete" - else: - action = "Review" + action = get_action(key_in_nzsl, key_in_s3) # See signbank/video/models.py, line 59, function set_public_acl() if key_in_nzsl: From 57ef78005ba076384accaedea9ebbb9c284c458b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:01:05 +1100 Subject: [PATCH 116/222] LastModified retrieved via query path --- bin/get-video-s3-acls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2d3c85ea..1d193cf1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -332,6 +332,8 @@ def build_csv_row( "get-object-attributes", "--object-attributes", "ObjectParts", + "--query", + "LastModified", "--output", "json", "--bucket", @@ -345,7 +347,7 @@ def build_csv_row( capture_output=True, text=True, ) - s3_lastmodified = json.loads(result.stdout)["LastModified"] + s3_lastmodified = result.stdout.strip("\n\"") return CSV_DELIMITER.join( [ From 35021d6e9d87b753bb406b3dfcbab2f882e39236 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:19:40 +1100 Subject: [PATCH 117/222] Lots of refactoring --- bin/get-video-s3-acls.py | 97 +++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1d193cf1..a22a9070 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -256,52 +256,17 @@ def get_action(key_in_nzsl, key_in_s3): return "Review" -def build_csv_row( - video_key, - key_in_nzsl=False, - key_in_s3=False, - gloss_idgloss=None, - gloss_created_at=None, - gloss_id=None, - video_id=None, - gloss_public=False, - video_public=False, -): - - action = get_action(key_in_nzsl, key_in_s3) - - # See signbank/video/models.py, line 59, function set_public_acl() - if key_in_nzsl: - canned_acl_expected = "public-read" if video_public else "private" - else: - canned_acl_expected = "" - - # If key not in S3, just return its NZSL info and action - if not key_in_s3: - return CSV_DELIMITER.join( - [ - f"{video_key}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - "", # S3 LastModified - f"{canned_acl_expected}", - "", # Actual Canned ACL - f"{gloss_id}", - f"{video_id}", - f"{gloss_public}", - f"{video_public}", - action, - ] - ) - - # Get S3 object's ACL 
+# Get S3 object's ACL +def get_s3_canned_acl(video_key): result = subprocess.run( [ AWSCLI, "s3api", "get-object-acl", "--output", - "json", + "text", + "--query", + "Grants[*].Permission", "--bucket", AWS_S3_BUCKET, "--key", @@ -313,18 +278,23 @@ def build_csv_row( capture_output=True, text=True, ) + acls_grants = result.stdout.strip().split("\t") + canned_acl = "unknown" - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: + if len(acls_grants) > 1: if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" + acls_grants[0] == "FULL_CONTROL" + and acls_grants[1] == "READ" ): canned_acl = "public-read" - elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": + elif acls_grants[0] == "FULL_CONTROL": canned_acl = "private" - # Get S3 object's LastModified date/time + return canned_acl + + +# Get S3 object's LastModified date/time +def get_s3_lastmodified(video_key): result = subprocess.run( [ AWSCLI, @@ -332,10 +302,10 @@ def build_csv_row( "get-object-attributes", "--object-attributes", "ObjectParts", + "--output", + "text", "--query", "LastModified", - "--output", - "json", "--bucket", AWS_S3_BUCKET, "--key", @@ -347,14 +317,41 @@ def build_csv_row( capture_output=True, text=True, ) - s3_lastmodified = result.stdout.strip("\n\"") + return result.stdout.strip() + + +def build_csv_row( + video_key, + key_in_nzsl=False, + key_in_s3=False, + gloss_idgloss=None, + gloss_created_at=None, + gloss_id=None, + video_id=None, + gloss_public=False, + video_public=False, +): + # See signbank/video/models.py, line 59, function set_public_acl() + if key_in_nzsl: + canned_acl_expected = "public-read" if video_public else "private" + else: + canned_acl_expected = "" + + if key_in_s3: + lastmodified = get_s3_lastmodified(video_key) + canned_acl = get_s3_canned_acl(video_key) + else: + lastmodified = "" + canned_acl = "" + + action = get_action(key_in_nzsl, key_in_s3) return CSV_DELIMITER.join( [ f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", - f"{s3_lastmodified}", + f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", From 63979e1bbb8b5d1e4a934686a07ae0e5d8c3b60b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:21:40 +1100 Subject: [PATCH 118/222] Default shell=false redundancy --- bin/get-video-s3-acls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a22a9070..90cad367 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -273,7 +273,6 @@ def get_s3_canned_acl(video_key): video_key, ], env=os.environ, - shell=False, check=True, capture_output=True, text=True, @@ -312,7 +311,6 @@ def get_s3_lastmodified(video_key): video_key, ], env=os.environ, - shell=False, check=True, capture_output=True, text=True, From f4a858ac0d11f7294096a75ea27c049c57a71997 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:39:57 +1100 Subject: [PATCH 119/222] subprocess.run wrapped for PGCLI --- bin/get-video-s3-acls.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 90cad367..8efab63c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -60,6 +60,25 @@ AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +def pg_cli(cmd): + return subprocess.run( + [ + PGCLI, + "-c", 
+ cmd, + f"{DATABASE_URL}", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + +def aws_cli(): + pass + + # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -69,10 +88,7 @@ def get_nzsl_raw_keys_dict(): this_nzsl_raw_keys_dict = {} # Column renaming is for readability # Special delimiter because columns might contain commas - result = subprocess.run( - [ - PGCLI, - "-c", + result = pg_cli( "COPY (" "SELECT " "dg.id AS gloss_id, " @@ -84,12 +100,6 @@ def get_nzsl_raw_keys_dict(): "vg.videofile AS video_key " "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - f"{DATABASE_URL}", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, ) # Separate the NZSL db columns From 31b41f21c90f47ffd035f2e8a9aaec7027d4aa80 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:51:38 +1100 Subject: [PATCH 120/222] subprocess.run wrapped for AWSCLI. Better argument handling. --- bin/get-video-s3-acls.py | 68 ++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 8efab63c..2e5b1308 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -60,14 +60,11 @@ AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" -def pg_cli(cmd): +def pg_cli(args_list): + if not isinstance(args_list, list): + args_list = [args_list] return subprocess.run( - [ - PGCLI, - "-c", - cmd, - f"{DATABASE_URL}", - ], + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], env=os.environ, capture_output=True, check=True, @@ -75,8 +72,16 @@ def pg_cli(cmd): ) -def aws_cli(): - pass +def aws_cli(args_list): + if not isinstance(args_list, list): + args_list = [args_list] + return subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) # Get the video files info from NZSL Signbank @@ -89,17 +94,17 @@ def get_nzsl_raw_keys_dict(): # Column renaming is for readability # Special delimiter because columns might contain commas result = pg_cli( - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", ) # Separate the NZSL db columns @@ -138,18 +143,13 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( + result = aws_cli( [ - AWSCLI, "s3", "ls", f"s3://{s3_bucket}", "--recursive", ], - env=os.environ, - capture_output=True, - check=True, - text=True, ) # Separate out just the key from date, time, size, key @@ -268,9 +268,8 @@ def get_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def 
get_s3_canned_acl(video_key): - result = subprocess.run( + result = aws_cli( [ - AWSCLI, "s3api", "get-object-acl", "--output", @@ -281,20 +280,13 @@ def get_s3_canned_acl(video_key): AWS_S3_BUCKET, "--key", video_key, - ], - env=os.environ, - check=True, - capture_output=True, - text=True, + ] ) acls_grants = result.stdout.strip().split("\t") - + canned_acl = "unknown" if len(acls_grants) > 1: - if ( - acls_grants[0] == "FULL_CONTROL" - and acls_grants[1] == "READ" - ): + if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": canned_acl = "public-read" elif acls_grants[0] == "FULL_CONTROL": canned_acl = "private" From 975540e7c07e437955653aa99b2476afb480a9e9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:52:53 +1100 Subject: [PATCH 121/222] Another AWSCLI wrap. --- bin/get-video-s3-acls.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2e5b1308..c8b99b00 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -283,7 +283,7 @@ def get_s3_canned_acl(video_key): ] ) acls_grants = result.stdout.strip().split("\t") - + canned_acl = "unknown" if len(acls_grants) > 1: if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": @@ -296,9 +296,8 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - result = subprocess.run( + result = aws_cli( [ - AWSCLI, "s3api", "get-object-attributes", "--object-attributes", @@ -311,11 +310,7 @@ def get_s3_lastmodified(video_key): AWS_S3_BUCKET, "--key", video_key, - ], - env=os.environ, - check=True, - capture_output=True, - text=True, + ] ) return result.stdout.strip() From 6370e2124e28a69e8a3629fc822136607d54b5e4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:01:22 +1100 Subject: [PATCH 122/222] get-object-attributes -> head-object for LastModified --- bin/get-video-s3-acls.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c8b99b00..f98f46c1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -299,9 +299,7 @@ def get_s3_lastmodified(video_key): result = aws_cli( [ "s3api", - "get-object-attributes", - "--object-attributes", - "ObjectParts", + "head-object", "--output", "text", "--query", From a773a6381817484bf10560d34f2d2ef0dfa19b43 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:03:14 +1100 Subject: [PATCH 123/222] Removed json module --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index f98f46c1..9c04fe0a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -10,7 +10,6 @@ import sys import subprocess import argparse -import json import re from pprint import pprint From 1adbe925ea67bf36d19ed4ff28b76ad3d937bbe3 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:30:50 +1100 Subject: [PATCH 124/222] Moved header fn closer to row fn --- bin/get-video-s3-acls.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9c04fe0a..4dbccf7b 100755 --- a/bin/get-video-s3-acls.py +++ 
b/bin/get-video-s3-acls.py @@ -233,24 +233,6 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): return this_all_keys_dict -def build_csv_header(): - return CSV_DELIMITER.join( - [ - "S3 Video key", - "Sbank Gloss", - "Sbank Gloss created at", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Action", - ] - ) - - # Cases # In S3 In NZSL Action # Is Not Delete S3 Object @@ -312,6 +294,24 @@ def get_s3_lastmodified(video_key): return result.stdout.strip() +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "S3 Video key", + "Sbank Gloss", + "Sbank Gloss created at", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Action", + ] + ) + + def build_csv_row( video_key, key_in_nzsl=False, From 2dd442fd1a03137274d98589aecf538a62e5b1d9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:26:55 +1100 Subject: [PATCH 125/222] Reorder fields --- bin/get-video-s3-acls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4dbccf7b..e5748d93 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -297,17 +297,17 @@ def get_s3_lastmodified(video_key): def build_csv_header(): return CSV_DELIMITER.join( [ + "Action", "S3 Video key", - "Sbank Gloss", "Sbank Gloss created at", - "S3 LastModified", "S3 Expected Canned ACL", "S3 Actual Canned ACL", "Sbank Gloss ID", "Sbank Video ID", "Sbank Gloss public", "Sbank Video public", - "Action", + "Sbank Gloss", + "S3 LastModified", ] ) @@ -340,17 +340,17 @@ def build_csv_row( return CSV_DELIMITER.join( [ + action, f"{video_key}", - f"{gloss_idgloss}", f"{gloss_created_at}", - f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", f"{video_id}", f"{gloss_public}", f"{video_public}", - action, + f"{gloss_idgloss}", + f"{lastmodified}", ] ) From 6f38ac0b26e967f956d4c22731b85b79201541a8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:33:39 +1100 Subject: [PATCH 126/222] Canned ACL variable removed --- bin/get-video-s3-acls.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e5748d93..32b4a936 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -265,14 +265,13 @@ def get_s3_canned_acl(video_key): ) acls_grants = result.stdout.strip().split("\t") - canned_acl = "unknown" if len(acls_grants) > 1: if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": - canned_acl = "public-read" + return "public-read" elif acls_grants[0] == "FULL_CONTROL": - canned_acl = "private" + return "private" - return canned_acl + return "unknown" # Get S3 object's LastModified date/time From 92988c4bb8cd8dac0b5ddffe7a1b8c447be2f7bc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:35:47 +1100 Subject: [PATCH 127/222] Renamed action function --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 32b4a936..71e2bf66 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -238,7 
+238,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): # Is Not Delete S3 Object # Is Is Update ACL # Not Is Review -def get_action(key_in_nzsl, key_in_s3): +def get_recommended_action(key_in_nzsl, key_in_s3): if key_in_s3: if key_in_nzsl: return "Update ACL" @@ -335,7 +335,7 @@ def build_csv_row( lastmodified = "" canned_acl = "" - action = get_action(key_in_nzsl, key_in_s3) + action = get_recommended_action(key_in_nzsl, key_in_s3) return CSV_DELIMITER.join( [ From 438a83e31cf5d23415b866bc83082bbb3a0c96ca Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:40:16 +1100 Subject: [PATCH 128/222] Else's defaulted --- bin/get-video-s3-acls.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 71e2bf66..e36df4c1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -323,17 +323,15 @@ def build_csv_row( video_public=False, ): # See signbank/video/models.py, line 59, function set_public_acl() + canned_acl_expected = "" if key_in_nzsl: canned_acl_expected = "public-read" if video_public else "private" - else: - canned_acl_expected = "" + lastmodified = "" + canned_acl = "" if key_in_s3: lastmodified = get_s3_lastmodified(video_key) canned_acl = get_s3_canned_acl(video_key) - else: - lastmodified = "" - canned_acl = "" action = get_recommended_action(key_in_nzsl, key_in_s3) From ea5d6bb490a49a9196314a24584e15b5597618e8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:44:34 +1100 Subject: [PATCH 129/222] Guard code removed. --- bin/get-video-s3-acls.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e36df4c1..7d88ccb8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -60,8 +60,6 @@ def pg_cli(args_list): - if not isinstance(args_list, list): - args_list = [args_list] return subprocess.run( [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], env=os.environ, @@ -72,8 +70,6 @@ def pg_cli(args_list): def aws_cli(args_list): - if not isinstance(args_list, list): - args_list = [args_list] return subprocess.run( [AWSCLI] + args_list, env=os.environ, From c214dde0220f7144c45addf1038083ab5cac324b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:45:47 +1100 Subject: [PATCH 130/222] Debug removed. --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7d88ccb8..6b404ba2 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,7 +11,6 @@ import subprocess import argparse import re -from pprint import pprint parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" From e342fac27d8232c5ab3a85098e6922960b7e57de Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:51:22 +1100 Subject: [PATCH 131/222] Array argument fix --- bin/get-video-s3-acls.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 6b404ba2..6974727d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -88,17 +88,19 @@ def get_nzsl_raw_keys_dict(): # Column renaming is for readability # Special delimiter because columns might contain commas result = pg_cli( - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] ) # Separate the NZSL db columns From b64e1adb5413c2e05d25f35c8e4433f323ff7191 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 23 Oct 2024 10:33:53 +1100 Subject: [PATCH 132/222] Swapped created_at and lastmodified columns --- bin/get-video-s3-acls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 6974727d..573de55a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -295,7 +295,7 @@ def build_csv_header(): [ "Action", "S3 Video key", - "Sbank Gloss created at", + "S3 LastModified", "S3 Expected Canned ACL", "S3 Actual Canned ACL", "Sbank Gloss ID", @@ -303,7 +303,7 @@ def build_csv_header(): "Sbank Gloss public", "Sbank Video public", "Sbank Gloss", - "S3 LastModified", + "Sbank Gloss created at", ] ) @@ -336,7 +336,7 @@ def build_csv_row( [ action, f"{video_key}", - f"{gloss_created_at}", + f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", @@ -344,7 +344,7 @@ def build_csv_row( f"{gloss_public}", f"{video_public}", f"{gloss_idgloss}", - f"{lastmodified}", + f"{gloss_created_at}", ] ) From ba137dcd07d56497f718b54a91c224549b8868c2 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 12:53:26 +1100 Subject: [PATCH 133/222] FULL JOIN from INNER JOIN --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 573de55a..4aa4b469 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -98,7 +98,8 @@ def get_nzsl_raw_keys_dict(): "vg.is_public AS video_public, " "vg.id AS video_id, " "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", ] ) From a50455418fb42b46d4ffa5a4e1a60137249d32f8 Mon Sep 17 00:00:00 2001 From: jonholdsworth 
<82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:16:52 +1100 Subject: [PATCH 134/222] Internal changes --- bin/get-video-s3-acls.py | 47 +++++----------------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4aa4b469..c27adee7 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -175,58 +175,23 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: - # Split out for readability - [ - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] = dict_row - this_all_keys_dict[video_key] = [ True, # NZSL PRESENT True, # S3 PRESENT - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] + ] + dict_row else: this_all_keys_dict[video_key] = [ False, # NZSL Absent True, # S3 PRESENT - "", # gloss_idgloss, - "", # gloss_created_at, - "", # gloss_id - "", # video_id, - "", # gloss_public, - "", # video_public, - ] + ] + [""] * 6 # Find NZSL keys that are absent from S3 (present handled above) - for video_key, [ - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in this_nzsl_raw_keys_dict.items(): + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ True, # NZSL PRESENT False, # S3 Absent - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] + ] + dict_row return this_all_keys_dict @@ -357,8 +322,8 @@ def process_keys(this_all_keys_dict): print(build_csv_header()) - for video_key, values in this_all_keys_dict.items(): - print(build_csv_row(video_key, *values)) + for video_key, dict_row in this_all_keys_dict.items(): + print(build_csv_row(video_key, *dict_row)) print(f"Env: {args.env}", file=sys.stderr) From 75107c98e91dffc4434f5568f6ceb32e0279f06d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:20:22 +1100 Subject: [PATCH 135/222] Comments --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c27adee7..f8a888ca 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -120,6 +120,7 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") + # This sets the initial field ordering in the dictionary this_nzsl_raw_keys_dict[video_key] = [ gloss_idgloss.replace(CSV_DELIMITER, ""), gloss_created_at, From 66b5db1e15283ae79ace7c1613e7e45772b62ed6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:21:35 +1100 Subject: [PATCH 136/222] Comments --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index f8a888ca..a329ff07 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -120,7 +120,7 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # This sets the initial field ordering in the dictionary + # This sets the initial field ordering in the all_keys dictionary row this_nzsl_raw_keys_dict[video_key] = [ gloss_idgloss.replace(CSV_DELIMITER, ""), gloss_created_at, From e166694114f35c6e2d01d91cce4569c0983d4a76 Mon Sep 
17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:27:05 +1100 Subject: [PATCH 137/222] Comments --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a329ff07..89d952f1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -317,7 +317,6 @@ def build_csv_row( # From the keys present in NZSL, get all their S3 information -# If we are in 'Go' mode, perform actions def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) From 0642a259b58d86058b3b94e1826e35e73ebcd309 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:29:50 +1100 Subject: [PATCH 138/222] Added forced retries to AWS command --- bin/get-video-s3-acls.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 89d952f1..b28598bf 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,6 +11,7 @@ import subprocess import argparse import re +from time import sleep parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -69,13 +70,26 @@ def pg_cli(args_list): def aws_cli(args_list): - return subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output # Get the video files info from NZSL Signbank From 97b559a43040486f5c29e6ca9da8f3f03daae52c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 14:04:56 +1100 Subject: [PATCH 139/222] Superfluous arguments removed --- bin/get-video-s3-acls.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b28598bf..4328cd08 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,14 +17,6 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) -# 'Go' mode, args.do_actions -parser.add_argument( - "--do-actions", - action="store_true", - default=False, - required=False, - help="Actually perform Delete objects or change ACLs (DESTRUCTIVE operation)", -) parser.add_argument( "--env", default="uat", @@ -43,12 +35,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--tmpdir", - default="/tmp/nzsl", - required=False, - help=f"Temp dir path (default: %(default)s)", -) args = parser.parse_args() # Globals From a588af28e660e29c16478ee1c0a22c335c3c314f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 15:32:11 +1100 Subject: [PATCH 140/222] Experimental Django code --- bin/get-video-s3-acls.py | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4328cd08..be20313c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,6 +12,22 @@ import argparse import re from time import sleep +from pprint import pprint +import boto3 +import django + +# Magic required to allow this script to use Signbank Django classes +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +pprint(sys.path) +os.environ.setdefault( + "DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application +get_wsgi_application() +from django.contrib.auth import get_user_model +User = get_user_model() + +# Test +from signbank.dictionary.models import FieldChoice, Gloss parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -35,11 +51,23 @@ required=False, help=f"AWS client path (default: %(default)s)", ) +parser.add_argument( + "--tests", + action="store_true", + default=False, + required=False, + help="Run remote tests instead of generating CSV output", +) + args = parser.parse_args() # Globals CSV_DELIMITER = "," -DATABASE_URL = os.getenv("DATABASE_URL", "") +DATABASE_URL = ( + "postgres://postgres:postgres@localhost:5432/postgres" + if args.tests + else os.getenv("DATABASE_URL", "") +) AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -316,6 +344,20 @@ def build_csv_row( ) +# Run some tests against the remote endpoints +# This is a test-harness for now +# Takes advantage of the fact we have a lot of setup infrastructure in this script already +def do_tests(): + # Debugging safety + if args.env != "dev": + print("Error: tests must be in 'dev' environment") + exit() + print(f"DATABASE_URL:{DATABASE_URL}") + print("Running tests") + s3 = boto3.client("s3") + #pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) + #get_nzsl_raw_keys_dict() + # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) @@ -332,6 +374,9 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) +if args.tests: + do_tests() +else: + process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) + ) From 5d11500ca02f68fb5aa902e76d1ce46041444509 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 
2024 16:29:50 +1100 Subject: [PATCH 141/222] Experimental Django code --- bin/get-video-s3-acls.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index be20313c..82e82638 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -26,9 +26,6 @@ from django.contrib.auth import get_user_model User = get_user_model() -# Test -from signbank.dictionary.models import FieldChoice, Gloss - parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." @@ -347,6 +344,7 @@ def build_csv_row( # Run some tests against the remote endpoints # This is a test-harness for now # Takes advantage of the fact we have a lot of setup infrastructure in this script already +from signbank.dictionary.models import FieldChoice, Gloss def do_tests(): # Debugging safety if args.env != "dev": From d430c59e2a13ebc7c81be947b5aeefd069b34847 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:32:51 +1100 Subject: [PATCH 142/222] black --- bin/get-video-s3-acls.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 82e82638..ec4b9f4c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -19,11 +19,12 @@ # Magic required to allow this script to use Signbank Django classes sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) pprint(sys.path) -os.environ.setdefault( - "DJANGO_SETTINGS_MODULE", "signbank.settings.development") +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application + get_wsgi_application() from django.contrib.auth import get_user_model + User = get_user_model() parser = argparse.ArgumentParser( @@ -345,6 +346,8 @@ def build_csv_row( # This is a test-harness for now # Takes advantage of the fact we have a lot of setup infrastructure in this script already from signbank.dictionary.models import FieldChoice, Gloss + + def do_tests(): # Debugging safety if args.env != "dev": @@ -353,8 +356,9 @@ def do_tests(): print(f"DATABASE_URL:{DATABASE_URL}") print("Running tests") s3 = boto3.client("s3") - #pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) - #get_nzsl_raw_keys_dict() + # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) + # get_nzsl_raw_keys_dict() + # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): From 566ad2a75b7a6d9fe268ad2b2670f3f7f596ba82 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:33:55 +1100 Subject: [PATCH 143/222] black --- bin/get-video-s3-acls.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ec4b9f4c..37f5b73f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -14,11 +14,9 @@ from time import sleep from pprint import pprint import boto3 -import django # Magic required to allow this script to use Signbank Django classes sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -pprint(sys.path) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application @@ -26,6 +24,7 @@ from django.contrib.auth import get_user_model User = get_user_model() 
+from signbank.dictionary.models import FieldChoice, Gloss parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -345,9 +344,6 @@ def build_csv_row( # Run some tests against the remote endpoints # This is a test-harness for now # Takes advantage of the fact we have a lot of setup infrastructure in this script already -from signbank.dictionary.models import FieldChoice, Gloss - - def do_tests(): # Debugging safety if args.env != "dev": From 4e62dbd6ac6459377cd043e99a02827c95985899 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:09:31 +1100 Subject: [PATCH 144/222] Django imports only occur under tests (requires a virtualenv). Postgres call handles and reports exceptions informatively. --- bin/get-video-s3-acls.py | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 37f5b73f..c537abfe 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -15,16 +15,6 @@ from pprint import pprint import boto3 -# Magic required to allow this script to use Signbank Django classes -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() -from django.contrib.auth import get_user_model - -User = get_user_model() -from signbank.dictionary.models import FieldChoice, Gloss parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -58,6 +48,22 @@ args = parser.parse_args() + +if args.tests: + # Magic required to allow this script to use Signbank Django classes + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth import get_user_model + + User = get_user_model() + + from signbank.dictionary.models import FieldChoice, Gloss + + # Globals CSV_DELIMITER = "," DATABASE_URL = ( @@ -71,13 +77,19 @@ def pg_cli(args_list): - return subprocess.run( - [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) def aws_cli(args_list): From 7c997a662573790caa02875fc1d0353c16358f74 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:13:17 +1100 Subject: [PATCH 145/222] exit on postgres exception --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c537abfe..5203b34a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -90,6 +90,7 @@ def pg_cli(args_list): print(e.cmd, file=sys.stderr) print(e.stdout, file=sys.stderr) print(e.stderr, file=sys.stderr) + exit() def aws_cli(args_list): From a1e10a71a2122699903da236be2cde1064a7233f 
Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:39:25 +1100 Subject: [PATCH 146/222] Postgres tests safety guard --- bin/get-video-s3-acls.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5203b34a..7625239a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -61,16 +61,20 @@ User = get_user_model() - from signbank.dictionary.models import FieldChoice, Gloss - + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) # Globals CSV_DELIMITER = "," -DATABASE_URL = ( - "postgres://postgres:postgres@localhost:5432/postgres" - if args.tests - else os.getenv("DATABASE_URL", "") -) +DATABASE_URL = os.getenv("DATABASE_URL", "") AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -363,10 +367,15 @@ def do_tests(): print("Error: tests must be in 'dev' environment") exit() print(f"DATABASE_URL:{DATABASE_URL}") + if DATABASE_URL.find("@localhost") < 0: + print("Error: database url must contain '@localhost'") + exit() + print("Running tests") s3 = boto3.client("s3") # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) # get_nzsl_raw_keys_dict() + pprint(Gloss.objects.all()) # From the keys present in NZSL, get all their S3 information @@ -387,7 +396,8 @@ def process_keys(this_all_keys_dict): if args.tests: do_tests() -else: - process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) - ) + exit() + +process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) From a05eb003fca162354b76ea95c05c492b4f8ad606 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:39:31 +1100 Subject: [PATCH 147/222] Experimental refactor csv_import --- signbank/dictionary/csv_import.py | 884 ++++++++++++++++++------------ 1 file changed, 538 insertions(+), 346 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0bfdff69..0685d701 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from pprint import pprint + import codecs import csv import datetime @@ -26,8 +28,16 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, - ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) +from .models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -35,7 +45,7 @@ @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -44,31 +54,53 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. 
- if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] - if request.method == 'POST': + if request.method == "POST": form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data['dataset'] - if 'view_dataset' not in get_perms(request.user, dataset): + dataset = form.cleaned_data["dataset"] + if "view_dataset" not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _("You do not have permissions to import glosses to this lexicon.") + msg = _( + "You do not have permissions to import glosses to this lexicon." + ) messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') + glossreader = csv.reader( + codecs.iterdecode(form.cleaned_data["file"], "utf-8"), + delimiter=",", + quotechar='"', + ) except csv.Error as e: # Can't open file, remove session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. - messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("File must be UTF-8 encoded!") + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) for row in glossreader: if glossreader.line_num == 1: @@ -87,74 +119,113 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session['dataset_id'] = dataset.id - request.session['glosses_new'] = glosses_new - - return render(request, 'dictionary/import_gloss_csv_confirmation.html', - {'glosses_new': glosses_new, - 'glosses_exists': glosses_exists, - 'dataset': dataset, }) + request.session["dataset_id"] = dataset.id + request.session["glosses_new"] = glosses_new + + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + { + "glosses_new": glosses_new, + "glosses_exists": glosses_exists, + "dataset": dataset, + }, + ) else: # If form is not valid, set a error message and return to the original form. 
- messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' - 'or there is some other problem.')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": form}, + ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') + allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_gloss_csv.html", - {'import_csv_form': csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": csv_form}, + ) @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == 'POST': - if 'cancel' in request.POST: + if request.method == "POST": + if "cancel" in request.POST: # If user cancels adding data, flush session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + messages.add_message( + request, messages.WARNING, _("Cancelled adding CSV data.") + ) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) - elif 'confirm' in request.POST: + elif "confirm" in request.POST: glosses_added = [] dataset = None - if 'glosses_new' and 'dataset_id' in request.session: - dataset = Dataset.objects.get(id=request.session['dataset_id']) - for gloss in request.session['glosses_new']: + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + for gloss in request.session["glosses_new"]: # If the Gloss does not already exist, continue adding. 
- if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): + if not Gloss.objects.filter( + dataset=dataset, idgloss=gloss[0] + ).exists(): try: - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + idgloss_mi=gloss[1], + created_by=request.user, + updated_by=request.user, + ) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + created_by=request.user, + updated_by=request.user, + ) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session['glosses_new'] - del request.session['dataset_id'] + del request.session["glosses_new"] + del request.session["dataset_id"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) - return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, - 'dataset': dataset.name}) + messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset.name}, + ) else: - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) share_csv_header_list = [ @@ -191,20 +262,32 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." 
+ ), + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, + ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -218,7 +301,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"' + quotechar='"', ) skipped_existing_glosses = [] @@ -254,29 +337,40 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) # Store dataset's id and the list of glosses to be added in session. request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses - }) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses, + }, + ) def update_retrieval_videos(videos, gloss_data): - """ prep videos, illustrations and usage example for video retrieval """ + """prep videos, illustrations and usage example for video retrieval""" gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -284,16 +378,14 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = ( - f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" - ) + file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0 + "version": 0, } videos.append(glossvideo) @@ -309,7 +401,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i + "version": i, } videos.append(glossvideo) @@ -325,14 +417,18 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i + "version": i, } videos.append(glossvideo) + @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): + + pprint(request.session.__dict__) + """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -348,6 +444,31 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not 
"confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) + if "glosses_new" and "dataset_id" in request.session: + [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( + request.session["glosses_new"], request.session["dataset_id"] + ) + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. + messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset_name}, + ) + + +def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): + """Does the thing""" + + print("IN CONFIRM INNER") + glosses_added = [] dataset = None translations = [] @@ -362,49 +483,49 @@ def confirm_import_nzsl_share_gloss_csv(request): bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter( - field="semantic_field" - ).values_list("english_name", "pk") - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) - - for row_num, gloss_data in enumerate(request.session["glosses_new"]): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass + dataset = Dataset.objects.get(id=session_dataset_id) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( + "english_name", "pk" + ) + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", 
flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append(Gloss( + for row_num, gloss_data in enumerate(session_glosses_new): + # will iterate over these glosses again after bulk creating + # and to ensure we get the correct gloss_data for words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass + + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append( + Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -415,183 +536,174 @@ def confirm_import_nzsl_share_gloss_csv(request): created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - )) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: + ) + ) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: + new_machine_value = random.randint(0, 99999999) + while new_machine_value in existing_machine_values: new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: - new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append(FieldChoice( + existing_machine_values.append(new_machine_value) + create_signers.append( + FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value - )) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic] - ) - ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: + machine_value=new_machine_value, + ) + ) + new_signers = 
FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"] + fieldchoice_id=semantic_fields_dict[topic], ) ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: + bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict["Miscellaneous"], + ) + ) - # create GlossTranslations for english and maori words - translations.append(GlossTranslations( + # create GlossTranslations for english and maori words + translations.append( + GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None) - )) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, - language=language_mi, - translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None), + ) + ) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - translations.append(translation) + translation = GlossTranslations( + gloss=gloss, language=language_mi, translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + translations.append(translation) - # Create comment for gloss_data notes - comments.append(Comment( + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) + + # Create comment for gloss_data notes + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - submit_date=comment_submit_date - )) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except 
IndexError: - comment_content = comment - user_name = "Unknown" - comments.append(Comment( + submit_date=comment_submit_date, + ) + ) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = comment_content[1] + except IndexError: + comment_content = comment + user_name = "Unknown" + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date - )) + submit_date=comment_submit_date, + ) + ) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append(ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append( + ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]) - )) - - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) - - glosses_added.append(gloss) - - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=nzsl_share_tag - - )) - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=not_public_tag - - )) + disagrees=int(gloss_data["disagrees"]), + ) + ) - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + glosses_added.append(gloss) - # start Thread to process gloss video retrieval in the background - t = threading.Thread( - target=retrieve_videos_for_glosses, - args=[videos], - daemon=True + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag + ) + ) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag + ) ) - t.start() - del request.session["glosses_new"] - del request.session["dataset_id"] + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) - # Set a message to be shown so that the user knows what is going on. 
- messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) - return render( - request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_added": glosses_added, - "dataset": dataset.name - } - ) + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + # start Thread to process gloss video retrieval in the background + t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) + t.start() + + return [glosses_added, dataset.name] @login_required @@ -608,18 +720,29 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, + ) validation_records = [] skipped_rows = [] @@ -627,7 +750,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"' + quotechar='"', ) question_numbers = [] @@ -669,22 +792,33 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render(request, "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}) + return render( + request, + "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}, + ) @login_required @@ -714,13 +848,21 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: + if ( + "validation_records" + and "question_numbers" + and "question_glossvideo_map" in request.session + ): # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( + glossvideo_pk_list + ) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) + ready_for_validation_tag = Tag.objects.get( + name=settings.TAG_READY_FOR_VALIDATION + ) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -740,35 +882,43 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss - validation_records_added.append(ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - )) + gloss = glossvideo_dict[ + question_glossvideo_map[question_number] + ].gloss + validation_records_added.append( + ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + ) + ) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ - question_number] + missing_gloss_pk_question_pairs[question_number] = ( + question_glossvideo_map[question_number] + ) for gloss_pk in gloss_pks: - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag - - )) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag, + ) + ) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) + ValidationRecord.objects.bulk_create( + validation_records_added, ignore_conflicts=True + ) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag + tag=ready_for_validation_tag, ).delete() del request.session["validation_records"] @@ -776,17 +926,19 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_qualtrics_csv_confirmation.html", + request, + "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs - } + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, + }, ) @@ -815,18 +967,29 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, + ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -837,29 +1000,38 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments" + "comments", ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"' + quotechar='"', + ) + missing_headers = set(required_headers) - set( + validation_record_reader.fieldnames ) - missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
- messages.add_message(request, messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}")) - return render(request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, + messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}"), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) + _check_row_can_be_converted_to_integer( + row, ["yes", "no", "abstain or not sure"] + ) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -868,35 +1040,49 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("File contains non-compliant data:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count) - } + "group_gloss_count": dict(group_gloss_count), + }, ) @@ -940,14 +1126,18 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append(ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, - comments=comments - )) + manual_validation_aggregations.append( + ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=( + int(sign_seen_not_sure) if sign_seen_not_sure else 0 + ), + comments=comments, + ) + ) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -955,13 +1145,15 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses - } + "missing_glosses": missing_glosses, + }, ) From 00a58b6e9ae1b43465cf013c3c16e04b2c5a336b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:40:27 +1100 Subject: [PATCH 148/222] Revert "Experimental refactor csv_import" Inadvertently left debug behind. This reverts commit a05eb003fca162354b76ea95c05c492b4f8ad606. 
--- signbank/dictionary/csv_import.py | 884 ++++++++++++------------------ 1 file changed, 346 insertions(+), 538 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0685d701..0bfdff69 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from pprint import pprint - import codecs import csv import datetime @@ -28,16 +26,8 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) +from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, + ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -45,7 +35,7 @@ @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -54,53 +44,31 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] - if request.method == "POST": + if request.method == 'POST': form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data["dataset"] - if "view_dataset" not in get_perms(request.user, dataset): + dataset = form.cleaned_data['dataset'] + if 'view_dataset' not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _( - "You do not have permissions to import glosses to this lexicon." - ) + msg = _("You do not have permissions to import glosses to this lexicon.") messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader( - codecs.iterdecode(form.cleaned_data["file"], "utf-8"), - delimiter=",", - quotechar='"', - ) + glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') except csv.Error as e: # Can't open file, remove session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. 
- messages.add_message( - request, messages.ERROR, _("File must be UTF-8 encoded!") - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) for row in glossreader: if glossreader.line_num == 1: @@ -119,113 +87,74 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session["dataset_id"] = dataset.id - request.session["glosses_new"] = glosses_new - - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - { - "glosses_new": glosses_new, - "glosses_exists": glosses_exists, - "dataset": dataset, - }, - ) + request.session['dataset_id'] = dataset.id + request.session['glosses_new'] = glosses_new + + return render(request, 'dictionary/import_gloss_csv_confirmation.html', + {'glosses_new': glosses_new, + 'glosses_exists': glosses_exists, + 'dataset': dataset, }) else: # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' + 'or there is some other problem.')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") + allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_gloss_csv.html", + {'import_csv_form': csv_form}, ) @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == "POST": - if "cancel" in request.POST: + if request.method == 'POST': + if 'cancel' in request.POST: # If user cancels adding data, flush session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, messages.WARNING, _("Cancelled adding CSV data.") - ) - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) - elif "confirm" in request.POST: + elif 'confirm' in request.POST: glosses_added = [] dataset = None - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - for gloss in request.session["glosses_new"]: + if 'glosses_new' and 'dataset_id' in request.session: + dataset = Dataset.objects.get(id=request.session['dataset_id']) + for gloss in request.session['glosses_new']: # If the Gloss does not already exist, continue adding. - if not Gloss.objects.filter( - dataset=dataset, idgloss=gloss[0] - ).exists(): + if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): try: - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - idgloss_mi=gloss[1], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], + created_by=request.user, updated_by=request.user) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], + created_by=request.user, updated_by=request.user) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session["glosses_new"] - del request.session["dataset_id"] + del request.session['glosses_new'] + del request.session['dataset_id'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset.name}, - ) + messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) + return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, + 'dataset': dataset.name}) else: - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) share_csv_header_list = [ @@ -262,32 +191,20 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. 
- csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -301,7 +218,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"', + quotechar='"' ) skipped_existing_glosses = [] @@ -337,40 +254,29 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) # Store dataset's id and the list of glosses to be added in session. 
request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses, - }, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses + }) def update_retrieval_videos(videos, gloss_data): - """prep videos, illustrations and usage example for video retrieval""" + """ prep videos, illustrations and usage example for video retrieval """ gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -378,14 +284,16 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + file_name = ( + f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + ) glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0, + "version": 0 } videos.append(glossvideo) @@ -401,7 +309,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i, + "version": i } videos.append(glossvideo) @@ -417,18 +325,14 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i, + "version": i } videos.append(glossvideo) - @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): - - pprint(request.session.__dict__) - """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -444,31 +348,6 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not "confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) - if "glosses_new" and "dataset_id" in request.session: - [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( - request.session["glosses_new"], request.session["dataset_id"] - ) - - del request.session["glosses_new"] - del request.session["dataset_id"] - - # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset_name}, - ) - - -def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Does the thing""" - - print("IN CONFIRM INNER") - glosses_added = [] dataset = None translations = [] @@ -483,49 +362,49 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - dataset = Dataset.objects.get(id=session_dataset_id) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( - "english_name", "pk" - ) - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter( + field="semantic_field" + ).values_list("english_name", "pk") + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) + + for row_num, gloss_data in enumerate(request.session["glosses_new"]): + # will iterate over these glosses again after bulk creating + # and to ensure we get the correct gloss_data for words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass - for row_num, gloss_data in enumerate(session_glosses_new): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words 
that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass - - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append( - Gloss( + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append(Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -536,174 +415,183 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - ) - ) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: - new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: + )) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append( - FieldChoice( + while new_machine_value in existing_machine_values: + new_machine_value = random.randint(0, 99999999) + existing_machine_values.append(new_machine_value) + create_signers.append(FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value, - ) - ) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): + machine_value=new_machine_value + )) + new_signers = FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): + 
bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict[topic] + ) + ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic], + fieldchoice_id=semantic_fields_dict["Miscellaneous"] ) ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"], - ) - ) - # create GlossTranslations for english and maori words - translations.append( - GlossTranslations( + # create GlossTranslations for english and maori words + translations.append(GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None), - ) - ) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, language=language_mi, translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None) + )) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" + + translation = GlossTranslations( + gloss=gloss, + language=language_mi, + translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - translations.append(translation) + translations.append(translation) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) - # Create comment for gloss_data notes - comments.append( - Comment( + # Create comment for gloss_data notes + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - submit_date=comment_submit_date, - ) - ) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except IndexError: - comment_content = comment - user_name = "Unknown" - comments.append( - Comment( + submit_date=comment_submit_date + )) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = 
comment_content[1] + except IndexError: + comment_content = comment + user_name = "Unknown" + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date, - ) - ) + submit_date=comment_submit_date + )) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append( - ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append(ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]), - ) - ) + disagrees=int(gloss_data["disagrees"]) + )) - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - glosses_added.append(gloss) + glosses_added.append(gloss) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag - ) - ) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=nzsl_share_tag - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + )) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=not_public_tag - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + )) - # start Thread to process gloss video retrieval in the background - t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) - t.start() + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + + # start Thread to process gloss video retrieval in the background + t = threading.Thread( + target=retrieve_videos_for_glosses, + args=[videos], + daemon=True + ) + t.start() + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. 
+ messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) + return render( + request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_added": glosses_added, + "dataset": dataset.name + } + ) - return [glosses_added, dataset.name] @login_required @@ -720,29 +608,18 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, ) validation_records = [] skipped_rows = [] @@ -750,7 +627,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"', + quotechar='"' ) question_numbers = [] @@ -792,33 +669,22 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}, - ) + return render(request, "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}) @login_required @@ -848,21 +714,13 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if ( - "validation_records" - and "question_numbers" - and "question_glossvideo_map" in request.session - ): + if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( - glossvideo_pk_list - ) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get( - name=settings.TAG_READY_FOR_VALIDATION - ) + ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -882,43 +740,35 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[ - question_glossvideo_map[question_number] - ].gloss - validation_records_added.append( - ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - ) - ) + gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss + validation_records_added.append(ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + )) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = ( - question_glossvideo_map[question_number] - ) + missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ + question_number] for gloss_pk in gloss_pks: - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag, - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag + + )) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create( - validation_records_added, ignore_conflicts=True - ) + ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag, + tag=ready_for_validation_tag ).delete() del request.session["validation_records"] @@ -926,19 +776,17 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", + request, "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, - }, + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs + } ) @@ -967,29 +815,18 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -1000,38 +837,29 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments", + "comments" ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"', - ) - missing_headers = set(required_headers) - set( - validation_record_reader.fieldnames + quotechar='"' ) + missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, - messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}"), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}")) + return render(request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer( - row, ["yes", "no", "abstain or not sure"] - ) + _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -1040,49 +868,35 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("File contains non-compliant data:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count), - }, + "group_gloss_count": dict(group_gloss_count) + } ) @@ -1126,18 +940,14 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append( - ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=( - int(sign_seen_not_sure) if sign_seen_not_sure else 0 - ), - comments=comments, - ) - ) + manual_validation_aggregations.append(ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, + comments=comments + )) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -1145,15 +955,13 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses, - }, + "missing_glosses": missing_glosses + } ) From 5c5fcc60ecef0ea2d62e00987ea5fa62a92d0266 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:41:36 +1100 Subject: [PATCH 149/222] Revert "Revert "Experimental refactor csv_import"" This reverts commit 00a58b6e9ae1b43465cf013c3c16e04b2c5a336b. 
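Note: re-applying the refactor restores the extraction of the bulk-import body into confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id), which takes plain session values rather than a request object. One practical payoff of that split is testability; the following is a rough, hypothetical sketch (assuming a pytest-style setup in which a dataset fixture exists and the importer user, tags, languages, and field choices the helper looks up are already seeded):

# Hypothetical test made possible by the extracted helper: no HttpRequest,
# session middleware, or messages framework is needed to drive the import.
def test_share_import_creates_gloss(dataset):
    glosses_added, dataset_name = confirm_import_nzsl_share_gloss_csv_inner(
        session_glosses_new=[{
            "id": "123",                       # NZSL Share id
            "word": "example",                 # English gloss word
            "contributor_username": "someone",
            "agrees": "1",
            "disagrees": "0",
        }],
        session_dataset_id=dataset.id,
    )
    assert dataset_name == dataset.name
    assert len(glosses_added) == 1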
--- signbank/dictionary/csv_import.py | 884 ++++++++++++++++++------------ 1 file changed, 538 insertions(+), 346 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0bfdff69..0685d701 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from pprint import pprint + import codecs import csv import datetime @@ -26,8 +28,16 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, - ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) +from .models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -35,7 +45,7 @@ @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -44,31 +54,53 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] - if request.method == 'POST': + if request.method == "POST": form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data['dataset'] - if 'view_dataset' not in get_perms(request.user, dataset): + dataset = form.cleaned_data["dataset"] + if "view_dataset" not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _("You do not have permissions to import glosses to this lexicon.") + msg = _( + "You do not have permissions to import glosses to this lexicon." + ) messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') + glossreader = csv.reader( + codecs.iterdecode(form.cleaned_data["file"], "utf-8"), + delimiter=",", + quotechar='"', + ) except csv.Error as e: # Can't open file, remove session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. 
- messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("File must be UTF-8 encoded!") + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) for row in glossreader: if glossreader.line_num == 1: @@ -87,74 +119,113 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session['dataset_id'] = dataset.id - request.session['glosses_new'] = glosses_new - - return render(request, 'dictionary/import_gloss_csv_confirmation.html', - {'glosses_new': glosses_new, - 'glosses_exists': glosses_exists, - 'dataset': dataset, }) + request.session["dataset_id"] = dataset.id + request.session["glosses_new"] = glosses_new + + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + { + "glosses_new": glosses_new, + "glosses_exists": glosses_exists, + "dataset": dataset, + }, + ) else: # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' - 'or there is some other problem.')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": form}, + ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') + allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_gloss_csv.html", - {'import_csv_form': csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": csv_form}, + ) @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == 'POST': - if 'cancel' in request.POST: + if request.method == "POST": + if "cancel" in request.POST: # If user cancels adding data, flush session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. 
- messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + messages.add_message( + request, messages.WARNING, _("Cancelled adding CSV data.") + ) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) - elif 'confirm' in request.POST: + elif "confirm" in request.POST: glosses_added = [] dataset = None - if 'glosses_new' and 'dataset_id' in request.session: - dataset = Dataset.objects.get(id=request.session['dataset_id']) - for gloss in request.session['glosses_new']: + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + for gloss in request.session["glosses_new"]: # If the Gloss does not already exist, continue adding. - if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): + if not Gloss.objects.filter( + dataset=dataset, idgloss=gloss[0] + ).exists(): try: - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + idgloss_mi=gloss[1], + created_by=request.user, + updated_by=request.user, + ) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + created_by=request.user, + updated_by=request.user, + ) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session['glosses_new'] - del request.session['dataset_id'] + del request.session["glosses_new"] + del request.session["dataset_id"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) - return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, - 'dataset': dataset.name}) + messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset.name}, + ) else: - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) share_csv_header_list = [ @@ -191,20 +262,32 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. 
- csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, + ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -218,7 +301,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"' + quotechar='"', ) skipped_existing_glosses = [] @@ -254,29 +337,40 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) # Store dataset's id and the list of glosses to be added in session. 
request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses - }) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses, + }, + ) def update_retrieval_videos(videos, gloss_data): - """ prep videos, illustrations and usage example for video retrieval """ + """prep videos, illustrations and usage example for video retrieval""" gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -284,16 +378,14 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = ( - f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" - ) + file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0 + "version": 0, } videos.append(glossvideo) @@ -309,7 +401,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i + "version": i, } videos.append(glossvideo) @@ -325,14 +417,18 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i + "version": i, } videos.append(glossvideo) + @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): + + pprint(request.session.__dict__) + """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -348,6 +444,31 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not "confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) + if "glosses_new" and "dataset_id" in request.session: + [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( + request.session["glosses_new"], request.session["dataset_id"] + ) + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. 
+ messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset_name}, + ) + + +def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): + """Does the thing""" + + print("IN CONFIRM INNER") + glosses_added = [] dataset = None translations = [] @@ -362,49 +483,49 @@ def confirm_import_nzsl_share_gloss_csv(request): bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter( - field="semantic_field" - ).values_list("english_name", "pk") - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) - - for row_num, gloss_data in enumerate(request.session["glosses_new"]): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass + dataset = Dataset.objects.get(id=session_dataset_id) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( + "english_name", "pk" + ) + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append(Gloss( + for row_num, gloss_data in enumerate(session_glosses_new): + # will iterate over these glosses again after bulk creating + # and to 
ensure we get the correct gloss_data for words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass + + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append( + Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -415,183 +536,174 @@ def confirm_import_nzsl_share_gloss_csv(request): created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - )) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: + ) + ) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: + new_machine_value = random.randint(0, 99999999) + while new_machine_value in existing_machine_values: new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: - new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append(FieldChoice( + existing_machine_values.append(new_machine_value) + create_signers.append( + FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value - )) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic] - ) - ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: + machine_value=new_machine_value, + ) + ) + new_signers = FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + 
cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"] + fieldchoice_id=semantic_fields_dict[topic], ) ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: + bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict["Miscellaneous"], + ) + ) - # create GlossTranslations for english and maori words - translations.append(GlossTranslations( + # create GlossTranslations for english and maori words + translations.append( + GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None) - )) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, - language=language_mi, - translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None), + ) + ) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - translations.append(translation) + translation = GlossTranslations( + gloss=gloss, language=language_mi, translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + translations.append(translation) - # Create comment for gloss_data notes - comments.append(Comment( + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) + + # Create comment for gloss_data notes + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - submit_date=comment_submit_date - )) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except IndexError: - comment_content = comment - user_name = "Unknown" - comments.append(Comment( + submit_date=comment_submit_date, + ) + ) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = comment_content[1] + except IndexError: + comment_content = 
comment + user_name = "Unknown" + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date - )) + submit_date=comment_submit_date, + ) + ) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append(ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append( + ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]) - )) - - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) - - glosses_added.append(gloss) - - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=nzsl_share_tag - - )) - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=not_public_tag - - )) + disagrees=int(gloss_data["disagrees"]), + ) + ) - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + glosses_added.append(gloss) - # start Thread to process gloss video retrieval in the background - t = threading.Thread( - target=retrieve_videos_for_glosses, - args=[videos], - daemon=True + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag + ) + ) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag + ) ) - t.start() - del request.session["glosses_new"] - del request.session["dataset_id"] + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) - # Set a message to be shown so that the user knows what is going on. 
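# NOTE: a minimal, self-contained sketch of the stream-decode-and-parse
# pattern shared by import_qualtrics_csv above and the other import views
# (the function and variable names in the sketch are illustrative, not
# taken from this module):
#
#     import codecs
#     import csv
#
#     def read_rows(uploaded_file):
#         """Decode an uploaded binary file chunk by chunk and parse as CSV."""
#         reader = csv.DictReader(
#             codecs.iterdecode(uploaded_file, "utf-8"),
#             delimiter=",",
#             quotechar='"',
#         )
#         # csv.Error and UnicodeDecodeError propagate to the caller, which
#         # mirrors the try/except blocks used by the views here.
#         return list(reader)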
- messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) - return render( - request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_added": glosses_added, - "dataset": dataset.name - } - ) + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + # start Thread to process gloss video retrieval in the background + t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) + t.start() + + return [glosses_added, dataset.name] @login_required @@ -608,18 +720,29 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, + ) validation_records = [] skipped_rows = [] @@ -627,7 +750,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"' + quotechar='"', ) question_numbers = [] @@ -669,22 +792,33 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
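# NOTE: these import views share a two-step shape: parse the CSV, stage the
# parsed rows in the session, then bulk-create in the confirm view once the
# user approves. Django's default session serializer is JSON, so everything
# staged this way has to stay JSON-serializable (lists and dicts of plain
# strings and numbers, as below). A sketch of the round trip, with
# illustrative names:
#
#     request.session["rows"] = rows           # import view: stage
#     ...
#     rows = request.session.pop("rows")       # confirm view: consume once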
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render(request, "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}) + return render( + request, + "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}, + ) @login_required @@ -714,13 +848,21 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: + if ( + "validation_records" + and "question_numbers" + and "question_glossvideo_map" in request.session + ): # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( + glossvideo_pk_list + ) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) + ready_for_validation_tag = Tag.objects.get( + name=settings.TAG_READY_FOR_VALIDATION + ) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -740,35 +882,43 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss - validation_records_added.append(ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - )) + gloss = glossvideo_dict[ + question_glossvideo_map[question_number] + ].gloss + validation_records_added.append( + ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + ) + ) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ - question_number] + missing_gloss_pk_question_pairs[question_number] = ( + question_glossvideo_map[question_number] + ) for gloss_pk in gloss_pks: - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag - - )) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag, + ) + ) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) + ValidationRecord.objects.bulk_create( + validation_records_added, ignore_conflicts=True + ) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag + tag=ready_for_validation_tag, ).delete() del request.session["validation_records"] @@ -776,17 +926,19 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_qualtrics_csv_confirmation.html", + request, + "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs - } + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, + }, ) @@ -815,18 +967,29 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, + ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -837,29 +1000,38 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments" + "comments", ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"' + quotechar='"', + ) + missing_headers = set(required_headers) - set( + validation_record_reader.fieldnames ) - missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
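# NOTE: the "utf-8-sig" codec used for the manual-validation reader above
# strips a leading UTF-8 byte order mark, which spreadsheet exports often
# prepend. With plain "utf-8" the BOM would stay glued to the first header
# name and trip the set-difference check, for example:
#
#     >>> set(["group"]) - set(["\ufeffgroup"])
#     {'group'}    # reported missing even though the column is present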
- messages.add_message(request, messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}")) - return render(request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, + messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}"), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) + _check_row_can_be_converted_to_integer( + row, ["yes", "no", "abstain or not sure"] + ) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -868,35 +1040,49 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("File contains non-compliant data:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
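# NOTE: _check_row_can_be_converted_to_integer is not shown in this patch;
# judging from the row loop and the ValueError handler above, it presumably
# verifies that the named count columns hold integers. A sketch under that
# assumption (the signature and the empty-cell handling are guesses):
#
#     def _check_row_can_be_converted_to_integer(row, fields):
#         """Raise ValueError if any of the given fields is non-numeric."""
#         for field in fields:
#             if row[field]:           # empty cells are treated as 0 later
#                 int(row[field])      # raises ValueError on bad data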
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count) - } + "group_gloss_count": dict(group_gloss_count), + }, ) @@ -940,14 +1126,18 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append(ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, - comments=comments - )) + manual_validation_aggregations.append( + ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=( + int(sign_seen_not_sure) if sign_seen_not_sure else 0 + ), + comments=comments, + ) + ) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -955,13 +1145,15 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses - } + "missing_glosses": missing_glosses, + }, ) From 3de214c844c138d568d9c5e36c234a1b1e6d692e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:43:23 +1100 Subject: [PATCH 150/222] Debug removed --- signbank/dictionary/csv_import.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0685d701..ec6fed32 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from pprint import pprint - import codecs import csv import datetime @@ -426,9 +424,6 @@ def update_retrieval_videos(videos, gloss_data): @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): - - pprint(request.session.__dict__) - """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -465,10 +460,7 @@ def confirm_import_nzsl_share_gloss_csv(request): def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Does the thing""" - - print("IN CONFIRM INNER") - + """Performs CSV import actions""" glosses_added = [] dataset = None translations = [] From 335f07dbbf4c129620eb9d60a2b2e3afbcdbd0d3 Mon Sep 17 
00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:45:56 +1100 Subject: [PATCH 151/222] Revert "Debug removed" This reverts commit 3de214c844c138d568d9c5e36c234a1b1e6d692e. --- signbank/dictionary/csv_import.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index ec6fed32..0685d701 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from pprint import pprint + import codecs import csv import datetime @@ -424,6 +426,9 @@ def update_retrieval_videos(videos, gloss_data): @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): + + pprint(request.session.__dict__) + """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -460,7 +465,10 @@ def confirm_import_nzsl_share_gloss_csv(request): def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Performs CSV import actions""" + """Does the thing""" + + print("IN CONFIRM INNER") + glosses_added = [] dataset = None translations = [] From 65de3ca1d902598de75c42d87ccf3a411df9895f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:46:00 +1100 Subject: [PATCH 152/222] Revert "Revert "Revert "Experimental refactor csv_import""" This reverts commit 5c5fcc60ecef0ea2d62e00987ea5fa62a92d0266. --- signbank/dictionary/csv_import.py | 884 ++++++++++++------------------ 1 file changed, 346 insertions(+), 538 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0685d701..0bfdff69 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from pprint import pprint - import codecs import csv import datetime @@ -28,16 +26,8 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) +from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, + ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -45,7 +35,7 @@ @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -54,53 +44,31 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. 
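# NOTE: the membership-test-then-del flush just below has a one-line
# equivalent that the newer views in this module already use:
#
#     request.session.pop("dataset_id", None)
#     request.session.pop("glosses_new", None)
#
# pop() with a default is safe whether or not the key exists.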
- if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] - if request.method == "POST": + if request.method == 'POST': form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data["dataset"] - if "view_dataset" not in get_perms(request.user, dataset): + dataset = form.cleaned_data['dataset'] + if 'view_dataset' not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _( - "You do not have permissions to import glosses to this lexicon." - ) + msg = _("You do not have permissions to import glosses to this lexicon.") messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader( - codecs.iterdecode(form.cleaned_data["file"], "utf-8"), - delimiter=",", - quotechar='"', - ) + glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') except csv.Error as e: # Can't open file, remove session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. - messages.add_message( - request, messages.ERROR, _("File must be UTF-8 encoded!") - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) for row in glossreader: if glossreader.line_num == 1: @@ -119,113 +87,74 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session["dataset_id"] = dataset.id - request.session["glosses_new"] = glosses_new - - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - { - "glosses_new": glosses_new, - "glosses_exists": glosses_exists, - "dataset": dataset, - }, - ) + request.session['dataset_id'] = dataset.id + request.session['glosses_new'] = glosses_new + + return render(request, 'dictionary/import_gloss_csv_confirmation.html', + {'glosses_new': glosses_new, + 'glosses_exists': glosses_exists, + 'dataset': dataset, }) else: # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." 
- ), - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' + 'or there is some other problem.')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") + allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_gloss_csv.html", + {'import_csv_form': csv_form}, ) @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == "POST": - if "cancel" in request.POST: + if request.method == 'POST': + if 'cancel' in request.POST: # If user cancels adding data, flush session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.WARNING, _("Cancelled adding CSV data.") - ) - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) - elif "confirm" in request.POST: + elif 'confirm' in request.POST: glosses_added = [] dataset = None - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - for gloss in request.session["glosses_new"]: + if 'glosses_new' and 'dataset_id' in request.session: + dataset = Dataset.objects.get(id=request.session['dataset_id']) + for gloss in request.session['glosses_new']: # If the Gloss does not already exist, continue adding. 
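# NOTE: the exists()-then-save() sequence below leaves a small race window
# between the check and the insert if two imports run concurrently.
# Django's get_or_create() is the usual single-call idiom; a sketch only,
# with `row` standing in for the CSV row (the real loop also handles an
# optional idgloss_mi column, which could ride along in defaults):
#
#     gloss, created = Gloss.objects.get_or_create(
#         dataset=dataset,
#         idgloss=row[0],
#         defaults={"created_by": request.user, "updated_by": request.user},
#     )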
- if not Gloss.objects.filter( - dataset=dataset, idgloss=gloss[0] - ).exists(): + if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): try: - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - idgloss_mi=gloss[1], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], + created_by=request.user, updated_by=request.user) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], + created_by=request.user, updated_by=request.user) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session["glosses_new"] - del request.session["dataset_id"] + del request.session['glosses_new'] + del request.session['dataset_id'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset.name}, - ) + messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) + return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, + 'dataset': dataset.name}) else: - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) share_csv_header_list = [ @@ -262,32 +191,20 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." 
- ), - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -301,7 +218,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"', + quotechar='"' ) skipped_existing_glosses = [] @@ -337,40 +254,29 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) # Store dataset's id and the list of glosses to be added in session. request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses, - }, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses + }) def update_retrieval_videos(videos, gloss_data): - """prep videos, illustrations and usage example for video retrieval""" + """ prep videos, illustrations and usage example for video retrieval """ gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -378,14 +284,16 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + file_name = ( + f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + ) glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0, + "version": 0 } videos.append(glossvideo) @@ -401,7 +309,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i, + "version": i } videos.append(glossvideo) @@ -417,18 +325,14 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i, + "version": i } videos.append(glossvideo) - @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): - - pprint(request.session.__dict__) - """This 
view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -444,31 +348,6 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not "confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) - if "glosses_new" and "dataset_id" in request.session: - [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( - request.session["glosses_new"], request.session["dataset_id"] - ) - - del request.session["glosses_new"] - del request.session["dataset_id"] - - # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset_name}, - ) - - -def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Does the thing""" - - print("IN CONFIRM INNER") - glosses_added = [] dataset = None translations = [] @@ -483,49 +362,49 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - dataset = Dataset.objects.get(id=session_dataset_id) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( - "english_name", "pk" - ) - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter( + field="semantic_field" + ).values_list("english_name", "pk") + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) + + for row_num, gloss_data in enumerate(request.session["glosses_new"]): + # will iterate over these glosses again after bulk creating + # and to ensure we get the correct gloss_data for 
words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass - for row_num, gloss_data in enumerate(session_glosses_new): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass - - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append( - Gloss( + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append(Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -536,174 +415,183 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - ) - ) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: - new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: + )) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append( - FieldChoice( + while new_machine_value in existing_machine_values: + new_machine_value = random.randint(0, 99999999) + existing_machine_values.append(new_machine_value) + create_signers.append(FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value, - ) - ) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): + 
machine_value=new_machine_value + )) + new_signers = FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): + bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict[topic] + ) + ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic], + fieldchoice_id=semantic_fields_dict["Miscellaneous"] ) ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"], - ) - ) - # create GlossTranslations for english and maori words - translations.append( - GlossTranslations( + # create GlossTranslations for english and maori words + translations.append(GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None), - ) - ) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, language=language_mi, translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None) + )) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" + + translation = GlossTranslations( + gloss=gloss, + language=language_mi, + translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - translations.append(translation) + translations.append(translation) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) - # Create comment for gloss_data notes - comments.append( - Comment( + # Create comment for gloss_data notes + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - 
submit_date=comment_submit_date, - ) - ) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except IndexError: - comment_content = comment - user_name = "Unknown" - comments.append( - Comment( + submit_date=comment_submit_date + )) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = comment_content[1] + except IndexError: + comment_content = comment + user_name = "Unknown" + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date, - ) - ) + submit_date=comment_submit_date + )) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append( - ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append(ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]), - ) - ) + disagrees=int(gloss_data["disagrees"]) + )) - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - glosses_added.append(gloss) + glosses_added.append(gloss) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag - ) - ) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=nzsl_share_tag - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + )) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=not_public_tag - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + )) - # start Thread to process gloss video retrieval in the background - t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) - t.start() + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + 
ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + + # start Thread to process gloss video retrieval in the background + t = threading.Thread( + target=retrieve_videos_for_glosses, + args=[videos], + daemon=True + ) + t.start() + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. + messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) + return render( + request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_added": glosses_added, + "dataset": dataset.name + } + ) - return [glosses_added, dataset.name] @login_required @@ -720,29 +608,18 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, ) validation_records = [] skipped_rows = [] @@ -750,7 +627,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"', + quotechar='"' ) question_numbers = [] @@ -792,33 +669,22 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
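# NOTE: the retrieval thread started in confirm_import_nzsl_share_gloss_csv
# above is a daemon thread launched inside a @transaction.atomic() view.
# Daemon threads are killed abruptly when the process exits, so in-flight
# downloads can be lost on worker shutdown, and a thread that starts before
# the transaction commits may not yet see the glosses created in this
# request. One common remedy (a sketch, not part of this patch) is to defer
# the kickoff until after commit:
#
#     from django.db import transaction
#
#     transaction.on_commit(
#         lambda: threading.Thread(
#             target=retrieve_videos_for_glosses, args=[videos], daemon=True
#         ).start()
#     )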
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}, - ) + return render(request, "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}) @login_required @@ -848,21 +714,13 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if ( - "validation_records" - and "question_numbers" - and "question_glossvideo_map" in request.session - ): + if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( - glossvideo_pk_list - ) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get( - name=settings.TAG_READY_FOR_VALIDATION - ) + ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -882,43 +740,35 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[ - question_glossvideo_map[question_number] - ].gloss - validation_records_added.append( - ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - ) - ) + gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss + validation_records_added.append(ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + )) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = ( - question_glossvideo_map[question_number] - ) + missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ + question_number] for gloss_pk in gloss_pks: - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag, - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag + + )) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create( - validation_records_added, ignore_conflicts=True - ) + ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag, + tag=ready_for_validation_tag ).delete() del request.session["validation_records"] @@ -926,19 +776,17 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", + request, "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, - }, + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs + } ) @@ -967,29 +815,18 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -1000,38 +837,29 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments", + "comments" ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"', - ) - missing_headers = set(required_headers) - set( - validation_record_reader.fieldnames + quotechar='"' ) + missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, - messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}"), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}")) + return render(request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer( - row, ["yes", "no", "abstain or not sure"] - ) + _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -1040,49 +868,35 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("File contains non-compliant data:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
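     # (group_row_map is a defaultdict(list) here; the JSON session serializer
     # writes it out as a plain dict, so the confirm view reads back an
     # ordinary dict.)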
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count), - }, + "group_gloss_count": dict(group_gloss_count) + } ) @@ -1126,18 +940,14 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append( - ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=( - int(sign_seen_not_sure) if sign_seen_not_sure else 0 - ), - comments=comments, - ) - ) + manual_validation_aggregations.append(ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, + comments=comments + )) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -1145,15 +955,13 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses, - }, + "missing_glosses": missing_glosses + } ) From 95c31790487ce6e7083bffbb0c363cb936c1c1d1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:45:57 +1100 Subject: [PATCH 153/222] More experimental client code --- bin/get-video-s3-acls.py | 81 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7625239a..60199b1c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -14,6 +14,8 @@ from time import sleep from pprint import pprint import boto3 +import copy +import csv parser = argparse.ArgumentParser( @@ -57,6 +59,7 @@ get_wsgi_application() + from django.contrib.auth.models import Permission from django.contrib.auth import get_user_model User = get_user_model() @@ -71,6 +74,11 @@ ShareValidationAggregation, ValidationRecord, ) + from signbank.video.models import GlossVideo + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError # Globals CSV_DELIMITER = "," @@ -366,16 +374,83 @@ def do_tests(): if args.env != "dev": print("Error: tests must be in 'dev' environment") exit() - print(f"DATABASE_URL:{DATABASE_URL}") if DATABASE_URL.find("@localhost") < 0: print("Error: database url must contain '@localhost'") exit() + 
print(f"DATABASE_URL:{DATABASE_URL}") print("Running tests") - s3 = boto3.client("s3") + #s3 = boto3.client("s3") # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) # get_nzsl_raw_keys_dict() - pprint(Gloss.objects.all()) + # pprint(Gloss.objects.all()) + + # This is a cut and paste of the mock tests, but we're doing it "live" on dev + _csv_content = { + "id": "111", + "word": "Test", + "maori": "maori, maori 2", + "secondary": "test", + "notes": "a note", + "created_at": "2023-09-12 22:37:59 UTC", + "contributor_email": "ops@ackama.com", + "contributor_username": "Ackama Ops", + "agrees": "0", + "disagrees": "1", + "topic_names": "Test Topic|Test", + "videos": "/VID_20170815_153446275.mp4", + "illustrations": "/kiwifruit-2-6422.png", + "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", + "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), + } + file_name = "test.csv" + csv_content = [copy.deepcopy(_csv_content)] + csv_content[0]["id"] = "12345" + with open(file_name, "w") as file: + writer = csv.writer(file) + writer.writerow(csv_content[0].keys()) + for row in csv_content: + writer.writerow(row.values()) + data = open(file_name, "rb") + file = SimpleUploadedFile( + content=data.read(), name=data.name, content_type="content/multipart" + ) + dataset = Dataset.objects.get(name="NZSL") + + try: + Gloss.objects.get(idgloss="Share:11").delete() + except ValueError: + pass + Gloss.objects.create( + dataset=dataset, + idgloss="Share:11", + nzsl_share_id="12345", + ) + + # Create user and add permissions + try: + user = User.objects.create_user(username="test", email=None, password="test") + csv_permission = Permission.objects.get(codename='import_csv') + user.user_permissions.add(csv_permission) + except IntegrityError: + user = User.objects.get(username="test") + + # Create client with change_gloss permission. + client = Client() + client.force_login(user) + s = client.session + s.update({ + "dataset_id": dataset.pk, + "glosses_new": csv_content + }) + s.save() + pprint("CLIENT SESSION") + pprint(client.session.items()) + response = client.post( + reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), + {"confirm": True} + ) + pprint(response.__dict__) # From the keys present in NZSL, get all their S3 information From 11dffb62cab3f3aaa2805083e6db307d225e58f9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:06:47 +1100 Subject: [PATCH 154/222] Forking video tests away from ACL script --- bin/test-videos-s3.py | 478 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100755 bin/test-videos-s3.py diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py new file mode 100755 index 00000000..9dbc6359 --- /dev/null +++ b/bin/test-videos-s3.py @@ -0,0 +1,478 @@ +#!/usr/bin/env -S python3 -u +# Bang line above passes '-u' to python, for unbuffered output +# Permissions required: +# psql - access to heroku app's postgres +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html + +import os +import sys +import subprocess +import argparse +import re +from time import sleep +from pprint import pprint +import boto3 +import copy +import csv + + +parser = argparse.ArgumentParser( + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +parser.add_argument( + "--tests", + action="store_true", + default=False, + required=False, + help="Run remote tests instead of generating CSV output", +) + +args = parser.parse_args() + + +if args.tests: + # Magic required to allow this script to use Signbank Django classes + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + +# Globals +CSV_DELIMITER = "," +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + 
gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +# Cases +# In S3 In NZSL Action +# Is Not Delete S3 Object +# Is Is Update ACL +# Not Is Review +def get_recommended_action(key_in_nzsl, key_in_s3): + if key_in_s3: + if key_in_nzsl: + return "Update ACL" + else: + return "Delete S3 Object" + return "Review" + + +# Get S3 object's ACL +def get_s3_canned_acl(video_key): + result = aws_cli( + [ + "s3api", + "get-object-acl", + "--output", + "text", + "--query", + "Grants[*].Permission", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + acls_grants = result.stdout.strip().split("\t") + + if len(acls_grants) > 1: + if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": + return "public-read" + elif acls_grants[0] == "FULL_CONTROL": + return "private" + + return "unknown" + + +# Get S3 object's LastModified date/time +def get_s3_lastmodified(video_key): + result = aws_cli( + [ + "s3api", + "head-object", + "--output", + "text", + "--query", + "LastModified", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + return result.stdout.strip() + + +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "Action", + "S3 Video key", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Sbank Gloss", + "Sbank Gloss created at", + ] + ) + + +def build_csv_row( + video_key, + key_in_nzsl=False, + key_in_s3=False, + gloss_idgloss=None, + gloss_created_at=None, + gloss_id=None, + 
video_id=None, + gloss_public=False, + video_public=False, +): + # See signbank/video/models.py, line 59, function set_public_acl() + canned_acl_expected = "" + if key_in_nzsl: + canned_acl_expected = "public-read" if video_public else "private" + + lastmodified = "" + canned_acl = "" + if key_in_s3: + lastmodified = get_s3_lastmodified(video_key) + canned_acl = get_s3_canned_acl(video_key) + + action = get_recommended_action(key_in_nzsl, key_in_s3) + + return CSV_DELIMITER.join( + [ + action, + f"{video_key}", + f"{lastmodified}", + f"{canned_acl_expected}", + f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", + f"{gloss_idgloss}", + f"{gloss_created_at}", + ] + ) + + +# Run some tests against the remote endpoints +# This is a test-harness for now +# Takes advantage of the fact we have a lot of setup infrastructure in this script already +def do_tests(): + # Debugging safety + if args.env != "dev": + print("Error: tests must be in 'dev' environment") + exit() + if DATABASE_URL.find("@localhost") < 0: + print("Error: database url must contain '@localhost'") + exit() + print(f"DATABASE_URL:{DATABASE_URL}") + + print("Running tests") + #s3 = boto3.client("s3") + # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) + # get_nzsl_raw_keys_dict() + # pprint(Gloss.objects.all()) + + # This is a cut and paste of the mock tests, but we're doing it "live" on dev + _csv_content = { + "id": "111", + "word": "Test", + "maori": "maori, maori 2", + "secondary": "test", + "notes": "a note", + "created_at": "2023-09-12 22:37:59 UTC", + "contributor_email": "ops@ackama.com", + "contributor_username": "Ackama Ops", + "agrees": "0", + "disagrees": "1", + "topic_names": "Test Topic|Test", + "videos": "/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBc2pFIiwiZXhwIjoiMjAyNC0xMS0wM1QyMzoyNzo1Ni4yNDNaIiwicHVyIjoiYmxvYl9pZCJ9fQ==--53448dc4efcf056e7ba7fe6b711d6b1ae551d171/Zimbabwe.mp4", + "illustrations": "/kiwifruit-2-6422.png", + "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", + "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), + } + file_name = "test.csv" + csv_content = [copy.deepcopy(_csv_content)] + csv_content[0]["id"] = "12345" + with open(file_name, "w") as file: + writer = csv.writer(file) + writer.writerow(csv_content[0].keys()) + for row in csv_content: + writer.writerow(row.values()) + data = open(file_name, "rb") + file = SimpleUploadedFile( + content=data.read(), name=data.name, content_type="content/multipart" + ) + dataset = Dataset.objects.get(name="NZSL") + + try: + Gloss.objects.get(idgloss="Share:11").delete() + except ValueError: + pass + Gloss.objects.create( + dataset=dataset, + idgloss="Share:11", + nzsl_share_id="12345", + ) + + # Create user and add permissions + try: + user = User.objects.create_user(username="test", email=None, password="test") + csv_permission = Permission.objects.get(codename='import_csv') + user.user_permissions.add(csv_permission) + except IntegrityError: + user = User.objects.get(username="test") + + # Create client with change_gloss permission. 
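+    # (Django test-Client gotcha: every access to client.session constructs a
+    # new SessionStore, so the session has to be captured in a variable,
+    # mutated, then save()d, as below, or the POST will not see
+    # dataset_id/glosses_new.)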
+ client = Client() + client.force_login(user) + s = client.session + s.update({ + "dataset_id": dataset.pk, + "glosses_new": csv_content + }) + s.save() + response = client.post( + reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), + {"confirm": True} + ) + + # test to see if we have to wait for thread + sleep(20) + + +# From the keys present in NZSL, get all their S3 information +def process_keys(this_all_keys_dict): + print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) + + print(build_csv_header()) + + for video_key, dict_row in this_all_keys_dict.items(): + print(build_csv_row(video_key, *dict_row)) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +if args.tests: + do_tests() + exit() + +process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) From 3f63e4206483066ec453f07a96919add098bb1dd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:17:50 +1100 Subject: [PATCH 155/222] Moved video tests out of this script --- bin/get-video-s3-acls.py | 131 --------------------------------------- 1 file changed, 131 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 60199b1c..30e2dca6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -13,11 +13,9 @@ import re from time import sleep from pprint import pprint -import boto3 import copy import csv - parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
@@ -40,46 +38,8 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--tests", - action="store_true", - default=False, - required=False, - help="Run remote tests instead of generating CSV output", -) - args = parser.parse_args() - -if args.tests: - # Magic required to allow this script to use Signbank Django classes - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - # Globals CSV_DELIMITER = "," DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -366,93 +326,6 @@ def build_csv_row( ) -# Run some tests against the remote endpoints -# This is a test-harness for now -# Takes advantage of the fact we have a lot of setup infrastructure in this script already -def do_tests(): - # Debugging safety - if args.env != "dev": - print("Error: tests must be in 'dev' environment") - exit() - if DATABASE_URL.find("@localhost") < 0: - print("Error: database url must contain '@localhost'") - exit() - print(f"DATABASE_URL:{DATABASE_URL}") - - print("Running tests") - #s3 = boto3.client("s3") - # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) - # get_nzsl_raw_keys_dict() - # pprint(Gloss.objects.all()) - - # This is a cut and paste of the mock tests, but we're doing it "live" on dev - _csv_content = { - "id": "111", - "word": "Test", - "maori": "maori, maori 2", - "secondary": "test", - "notes": "a note", - "created_at": "2023-09-12 22:37:59 UTC", - "contributor_email": "ops@ackama.com", - "contributor_username": "Ackama Ops", - "agrees": "0", - "disagrees": "1", - "topic_names": "Test Topic|Test", - "videos": "/VID_20170815_153446275.mp4", - "illustrations": "/kiwifruit-2-6422.png", - "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", - "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), - } - file_name = "test.csv" - csv_content = [copy.deepcopy(_csv_content)] - csv_content[0]["id"] = "12345" - with open(file_name, "w") as file: - writer = csv.writer(file) - writer.writerow(csv_content[0].keys()) - for row in csv_content: - writer.writerow(row.values()) - data = open(file_name, "rb") - file = SimpleUploadedFile( - content=data.read(), name=data.name, content_type="content/multipart" - ) - dataset = Dataset.objects.get(name="NZSL") - - try: - Gloss.objects.get(idgloss="Share:11").delete() - except ValueError: - pass - Gloss.objects.create( - dataset=dataset, - idgloss="Share:11", - nzsl_share_id="12345", - ) - - # Create user and add permissions - try: - user = User.objects.create_user(username="test", email=None, password="test") - csv_permission = Permission.objects.get(codename='import_csv') - user.user_permissions.add(csv_permission) - except IntegrityError: - user = User.objects.get(username="test") - - # Create client with change_gloss permission. 
- client = Client() - client.force_login(user) - s = client.session - s.update({ - "dataset_id": dataset.pk, - "glosses_new": csv_content - }) - s.save() - pprint("CLIENT SESSION") - pprint(client.session.items()) - response = client.post( - reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), - {"confirm": True} - ) - pprint(response.__dict__) - - # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) @@ -469,10 +342,6 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.tests: - do_tests() - exit() - process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 68310617f5a88ed89da474ac7e6053eab92fca11 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:30:49 +1100 Subject: [PATCH 156/222] Minimum functionality --- bin/test-videos-s3.py | 331 +++++------------------------------------- 1 file changed, 34 insertions(+), 297 deletions(-) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index 9dbc6359..b145227a 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -17,14 +17,42 @@ import copy import csv +# Magic required to allow this script to use Signbank Django classes +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = get_user_model() + +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + description="You need to run this in a venv that has all the right Python site-packages. You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
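+# (The tests in this script refuse to run unless --env resolves to 'dev' and
+# DATABASE_URL contains '@localhost', hence the safer default that follows.)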
) parser.add_argument( "--env", - default="uat", + default="dev", required=False, help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", ) @@ -40,46 +68,9 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--tests", - action="store_true", - default=False, - required=False, - help="Run remote tests instead of generating CSV output", -) - args = parser.parse_args() -if args.tests: - # Magic required to allow this script to use Signbank Django classes - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - # Globals CSV_DELIMITER = "," DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -128,247 +119,7 @@ def aws_cli(args_list): return output -# Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(): - print( - f"Getting raw list of video file info from NZSL Signbank ...", - file=sys.stderr, - ) - this_nzsl_raw_keys_dict = {} - # Column renaming is for readability - # Special delimiter because columns might contain commas - result = pg_cli( - [ - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg " - "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - ] - ) - - # Separate the NZSL db columns - # Write them to a dictionary, so we can do fast operations - for rawl in result.stdout.split("\n"): - rawl = rawl.strip() - if not rawl: - continue - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - video_key, - ] = rawl.split("|") - - # This sets the initial field ordering in the all_keys dictionary row - this_nzsl_raw_keys_dict[video_key] = [ - gloss_idgloss.replace(CSV_DELIMITER, ""), - gloss_created_at, - gloss_id, - video_id, - gloss_public.lower() == "t", - video_public.lower() == "t", - ] - - print( - f"{len(this_nzsl_raw_keys_dict)} rows retrieved", - file=sys.stderr, - ) - - return this_nzsl_raw_keys_dict - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - -# Get the keys present and absent across NZSL 
Signbank and S3, to dictionary -def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): - print( - "Getting keys present and absent across NZSL Signbank and S3 ...", - file=sys.stderr, - ) - this_all_keys_dict = {} - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - dict_row = this_nzsl_raw_keys_dict.get(video_key, None) - if dict_row: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT - ] + dict_row - else: - this_all_keys_dict[video_key] = [ - False, # NZSL Absent - True, # S3 PRESENT - ] + [""] * 6 - - # Find NZSL keys that are absent from S3 (present handled above) - for video_key, dict_row in this_nzsl_raw_keys_dict.items(): - if video_key not in this_s3_bucket_raw_keys_list: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - False, # S3 Absent - ] + dict_row - - return this_all_keys_dict - - -# Cases -# In S3 In NZSL Action -# Is Not Delete S3 Object -# Is Is Update ACL -# Not Is Review -def get_recommended_action(key_in_nzsl, key_in_s3): - if key_in_s3: - if key_in_nzsl: - return "Update ACL" - else: - return "Delete S3 Object" - return "Review" - - -# Get S3 object's ACL -def get_s3_canned_acl(video_key): - result = aws_cli( - [ - "s3api", - "get-object-acl", - "--output", - "text", - "--query", - "Grants[*].Permission", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - acls_grants = result.stdout.strip().split("\t") - - if len(acls_grants) > 1: - if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": - return "public-read" - elif acls_grants[0] == "FULL_CONTROL": - return "private" - - return "unknown" - - -# Get S3 object's LastModified date/time -def get_s3_lastmodified(video_key): - result = aws_cli( - [ - "s3api", - "head-object", - "--output", - "text", - "--query", - "LastModified", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - return result.stdout.strip() - - -def build_csv_header(): - return CSV_DELIMITER.join( - [ - "Action", - "S3 Video key", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Sbank Gloss", - "Sbank Gloss created at", - ] - ) - - -def build_csv_row( - video_key, - key_in_nzsl=False, - key_in_s3=False, - gloss_idgloss=None, - gloss_created_at=None, - gloss_id=None, - video_id=None, - gloss_public=False, - video_public=False, -): - # See signbank/video/models.py, line 59, function set_public_acl() - canned_acl_expected = "" - if key_in_nzsl: - canned_acl_expected = "public-read" if video_public else "private" - - lastmodified = "" - canned_acl = "" - if key_in_s3: - lastmodified = get_s3_lastmodified(video_key) - canned_acl = get_s3_canned_acl(video_key) - - action = get_recommended_action(key_in_nzsl, key_in_s3) - - return CSV_DELIMITER.join( - [ - action, - f"{video_key}", - f"{lastmodified}", - f"{canned_acl_expected}", - f"{canned_acl}", - f"{gloss_id}", - f"{video_id}", - f"{gloss_public}", - f"{video_public}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - ] - ) - - # Run some tests against the remote endpoints -# This is a test-harness for now -# Takes advantage of the fact we have a lot of setup infrastructure in this script already def do_tests(): # Debugging safety if args.env != "dev": @@ -450,17 +201,9 @@ def do_tests(): ) # test to see if we have to wait for thread - sleep(20) - - -# From the keys present in NZSL, get all their S3 information -def 
process_keys(this_all_keys_dict): - print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) - - print(build_csv_header()) - - for video_key, dict_row in this_all_keys_dict.items(): - print(build_csv_row(video_key, *dict_row)) + X_SECONDS=20 + print(f"Sleeping {X_SECONDS} seconds to allow threads to complete ...") + sleep(X_SECONDS) print(f"Env: {args.env}", file=sys.stderr) @@ -469,10 +212,4 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.tests: - do_tests() - exit() - -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) +do_tests() From 19a08811e52d0fa299459ea3be4697b665ff45d7 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:13:51 +1100 Subject: [PATCH 157/222] Fake key to handle FULL JOIN absent video keys --- bin/get-video-s3-acls.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 30e2dca6..453a3229 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,9 +12,8 @@ import argparse import re from time import sleep +from uuid import uuid4 from pprint import pprint -import copy -import csv parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -38,10 +37,18 @@ required=False, help=f"AWS client path (default: %(default)s)", ) +parser.add_argument( + "--dumpnzsl", + default=False, + required=False, + action="store_true", + help=f"Dump raw NZSL database output", +) args = parser.parse_args() # Globals CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") AWSCLI = args.awscli PGCLI = args.pgcli @@ -88,6 +95,15 @@ def aws_cli(args_list): return output +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -130,6 +146,9 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + # This sets the initial field ordering in the all_keys dictionary row this_nzsl_raw_keys_dict[video_key] = [ gloss_idgloss.replace(CSV_DELIMITER, ""), @@ -212,6 +231,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): # Is Not Delete S3 Object # Is Is Update ACL # Not Is Review +# Other Review def get_recommended_action(key_in_nzsl, key_in_s3): if key_in_s3: if key_in_nzsl: @@ -312,7 +332,7 @@ def build_csv_row( return CSV_DELIMITER.join( [ action, - f"{video_key}", + f"{filter_fakekey(video_key)}", f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", @@ -342,6 +362,10 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +if args.dumpnzsl: + pprint(get_nzsl_raw_keys_dict()) + exit() + process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 078b479cbe5389a5596f5cbbef322f8de8c3f337 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 
16:55:05 +1100 Subject: [PATCH 158/222] black --- bin/test-videos-s3.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index b145227a..c3fa9f89 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -131,7 +131,7 @@ def do_tests(): print(f"DATABASE_URL:{DATABASE_URL}") print("Running tests") - #s3 = boto3.client("s3") + # s3 = boto3.client("s3") # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) # get_nzsl_raw_keys_dict() # pprint(Gloss.objects.all()) @@ -181,7 +181,7 @@ def do_tests(): # Create user and add permissions try: user = User.objects.create_user(username="test", email=None, password="test") - csv_permission = Permission.objects.get(codename='import_csv') + csv_permission = Permission.objects.get(codename="import_csv") user.user_permissions.add(csv_permission) except IntegrityError: user = User.objects.get(username="test") @@ -190,18 +190,14 @@ def do_tests(): client = Client() client.force_login(user) s = client.session - s.update({ - "dataset_id": dataset.pk, - "glosses_new": csv_content - }) + s.update({"dataset_id": dataset.pk, "glosses_new": csv_content}) s.save() response = client.post( - reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), - {"confirm": True} + reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), {"confirm": True} ) # test to see if we have to wait for thread - X_SECONDS=20 + X_SECONDS = 20 print(f"Sleeping {X_SECONDS} seconds to allow threads to complete ...") sleep(X_SECONDS) From fa4689d6c04ba3970b9c0d4f2a859d06250676b5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 16:57:07 +1100 Subject: [PATCH 159/222] Rearranging --- bin/test-videos-s3.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index c3fa9f89..918cd980 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -1,4 +1,5 @@ #!/usr/bin/env -S python3 -u +# You need to run this in a venv that has all the right Python site-packages. # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres @@ -30,6 +31,12 @@ User = get_user_model() +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError + + from signbank.dictionary.models import ( Dataset, FieldChoice, @@ -41,10 +48,6 @@ ValidationRecord, ) from signbank.video.models import GlossVideo -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError parser = argparse.ArgumentParser( description="You need to run this in a venv that has all the right Python site-packages. You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" From 0aa68e01b1f8c8b94e87e3d2e4fd2dd751af1f43 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:56:49 +1100 Subject: [PATCH 160/222] S3 orphan resolution next pass --- bin/get-video-s3-acls.py | 118 +++++++++++++++++++++++++++++++++++++++ bin/test-videos-s3.py | 5 +- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 453a3229..dc291178 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -5,6 +5,8 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# For some commands you need to run this in a venv that has all the right Python site-packages. +# TODO Convert this script to a Django Management Command import os import sys @@ -15,6 +17,7 @@ from uuid import uuid4 from pprint import pprint + parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." @@ -44,8 +47,54 @@ action="store_true", help=f"Dump raw NZSL database output", ) +parser.add_argument( + "--pyenv", + default=False, + required=False, + action="store_true", + help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", +) +parser.add_argument( + "--orphans", + default=False, + required=False, + action="store_true", + help=f"Try to identify and match-up S3 orphans (requires --pyenv)", +) args = parser.parse_args() +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print ("Importing site-packages environment") + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + # Globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" @@ -355,6 +404,68 @@ def process_keys(this_all_keys_dict): for video_key, dict_row in this_all_keys_dict.items(): print(build_csv_row(video_key, *dict_row)) +def process_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + # NOTE This may actually be the wrong way around, we may want to go from + # orphaned S3 objects _back_ to glosses, but it depends on what Micky says + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record 
already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = Gloss.objects.get(id=gloss_id) + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # If these had S3 video candidates they should not have made it this far + # These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + gloss_name = gloss.idgloss.split(":")[0].strip() + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz + for test_key, [ key_nzsl_yes, key_s3_yes, *_ ] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + print(f"{gloss_id} {gloss.idgloss}") + print(test_key) + + + + + + print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) @@ -366,6 +477,13 @@ def process_keys(this_all_keys_dict): pprint(get_nzsl_raw_keys_dict()) exit() +if args.orphans: + if args.pyenv: + process_orphans() + else: + print("Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv") + exit() + process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index 918cd980..2eb27e24 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -35,8 +35,6 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.urls import reverse from django.db.utils import IntegrityError - - from signbank.dictionary.models import ( Dataset, FieldChoice, @@ -50,7 +48,8 @@ from signbank.video.models import GlossVideo parser = argparse.ArgumentParser( - description="You need to run this in a venv that has all the right Python site-packages. You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + description="You need to run this in a venv that has all the right Python site-packages. " + "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
)
parser.add_argument(
From 09e124b494d0633c64ae2b0894d64fbbd35397a0 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 15:57:09 +1100
Subject: [PATCH 161/222] black

---
 bin/get-video-s3-acls.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index dc291178..4ecd327e 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -66,7 +66,7 @@ if args.pyenv:
     # Magic required to allow this script to use Signbank Django classes
     # This goes away if this script becomes a Django Management Command
-    print ("Importing site-packages environment")
+    print("Importing site-packages environment")
     print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
     sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development")
@@ -404,6 +404,7 @@ def process_keys(this_all_keys_dict):
     for video_key, dict_row in this_all_keys_dict.items():
         print(build_csv_row(video_key, *dict_row))
 
+
 def process_orphans():
     all_keys_dict = create_all_keys_dict(
         get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
     )
@@ -449,7 +450,7 @@ def process_orphans():
 
         # We try to find the orphaned S3 object, if it exists
        # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz
-        for test_key, [ key_nzsl_yes, key_s3_yes, *_ ] in all_keys_dict.items():
+        for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items():
             if gloss_name in test_key:
                 if str(gloss_id) in test_key:
                     if key_nzsl_yes:
@@ -462,11 +463,6 @@ def process_orphans():
                     print(test_key)
 
 
-
-
-
-
-
 print(f"Env: {args.env}", file=sys.stderr)
 print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr)
 print(f"AWSCLI: {AWSCLI}", file=sys.stderr)
@@ -481,7 +477,9 @@ def process_orphans():
     if args.pyenv:
         process_orphans()
     else:
-        print("Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv")
+        print(
+            "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv"
+        )
     exit()
 
 process_keys(

From 12a309bb64227d835aa020a31be158bf56fa73f3 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:01:11 +1100
Subject: [PATCH 162/222] Comment

---
 bin/get-video-s3-acls.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index 4ecd327e..3e9a9ce8 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -411,8 +411,6 @@ def process_orphans():
     )
 
     # Traverse all the NZSL Signbank glosses that are missing S3 objects
-    # NOTE This may actually be the wrong way around, we may want to go from
-    # orphaned S3 objects _back_ to glosses, but it depends on what Micky says
     for video_key, [
         key_in_nzsl,
         key_in_s3,

From fd62f81a52d0942358a3c48c2b7cee88f31b6ea9 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:13:50 +1100
Subject: [PATCH 163/222] CSV orphans

---
 bin/get-video-s3-acls.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index 3e9a9ce8..7599182b 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -66,8 +66,8 @@ if args.pyenv:
     # Magic required to allow this script to use Signbank Django classes
     # This goes away if this script becomes a Django Management Command
-    print("Importing site-packages environment")
-    print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+    print("Importing site-packages environment", file=sys.stderr)
+    print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr)
     sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development")
     from django.core.wsgi import get_wsgi_application
@@ -410,6 +410,8 @@ def process_orphans():
         get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
     )
 
+    print("Gloss ID,Gloss,Suggested Video key")
+
     # Traverse all the NZSL Signbank glosses that are missing S3 objects
     for video_key, [
         key_in_nzsl,
@@ -446,6 +448,8 @@ def process_orphans():
 
         gloss_name = gloss.idgloss.split(":")[0].strip()
 
+        csv_rows = []
+
         # We try to find the orphaned S3 object, if it exists
         # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz
         for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items():
@@ -457,8 +461,10 @@ def process_orphans():
             if not key_s3_yes:
                 print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr)
                 continue
-            print(f"{gloss_id} {gloss.idgloss}")
-            print(test_key)
+            csv_rows.append([gloss_id,gloss.idgloss,test_key])
+    if csv_rows:
+        for c_row in csv_rows:
+            print(CSV_DELIMITER.join(c_row))

From 83d1c82da553789b73723bd7409c678958190402 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:14:02 +1100
Subject: [PATCH 164/222] black

---
 bin/get-video-s3-acls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index 7599182b..efe041cd 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -461,7 +461,7 @@ def process_orphans():
             if not key_s3_yes:
                 print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr)
                 continue
-            csv_rows.append([gloss_id,gloss.idgloss,test_key])
+            csv_rows.append([gloss_id, gloss.idgloss, test_key])
     if csv_rows:
         for c_row in csv_rows:
             print(CSV_DELIMITER.join(c_row))

From 2418d96a5c51494e735bc8d0f2e1cc2c4bae3955 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Fri, 1 Nov 2024 12:14:35 +1100
Subject: [PATCH 165/222] Script splitting

---
 bin/get-orphaned-videos.py | 491 +++++++++++++++++++++++++++++++++++++
 bin/test-videos-s3.py      | 213 ----------------
 2 files changed, 491 insertions(+), 213 deletions(-)
 create mode 100755 bin/get-orphaned-videos.py
 delete mode 100755 bin/test-videos-s3.py

diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py
new file mode 100755
index 00000000..efe041cd
--- /dev/null
+++ b/bin/get-orphaned-videos.py
@@ -0,0 +1,491 @@
+#!/usr/bin/env -S python3 -u
+# Bang line above passes '-u' to python, for unbuffered output
+# Permissions required:
+# psql - access to heroku app's postgres
+# aws s3 - NZSL IAM access
+# s3:GetObjectAcl permissions or READ_ACP access to the object
+# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html
+# For some commands you need to run this in a venv that has all the right Python site-packages.
+# TODO Convert this script to a Django Management Command
+
+import os
+import sys
+import subprocess
+import argparse
+import re
+from time import sleep
+from uuid import uuid4
+from pprint import pprint
+
+
+parser = argparse.ArgumentParser(
+    description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. "
+    "Postgres access details, eg. DATABASE_URL env var."
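A note on the pg_cli() / COPY pattern this new script is built around: psql prints the COPY rows to stdout with the chosen '|' delimiter, and the script later splits them by hand. A minimal sketch of the same parse using the stdlib csv module instead of str.split("|"), assuming only a DATABASE_URL env var and a psql binary on the path (the helper name is illustrative):

import csv
import io
import os
import subprocess

def run_copy_query(sql):
    # psql -c "COPY (...) TO STDOUT ..." emits one delimited row per line
    result = subprocess.run(
        ["psql", "-c", sql, os.environ["DATABASE_URL"]],
        capture_output=True,
        check=True,
        text=True,
    )
    # csv.reader copes with quoted fields that a plain split("|") would mishandle
    return list(csv.reader(io.StringIO(result.stdout), delimiter="|"))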
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +parser.add_argument( + "--dumpnzsl", + default=False, + required=False, + action="store_true", + help=f"Dump raw NZSL database output", +) +parser.add_argument( + "--pyenv", + default=False, + required=False, + action="store_true", + help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", +) +parser.add_argument( + "--orphans", + default=False, + required=False, + action="store_true", + help=f"Try to identify and match-up S3 orphans (requires --pyenv)", +) +args = parser.parse_args() + +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print("Importing site-packages environment", file=sys.stderr) + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + +# Globals +CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info 
from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +# Cases +# In S3 In NZSL Action +# Is Not Delete S3 Object +# Is Is Update ACL +# Not Is Review +# Other Review +def get_recommended_action(key_in_nzsl, key_in_s3): + if key_in_s3: + if key_in_nzsl: + return "Update ACL" + else: + return "Delete S3 Object" + return "Review" + + +# Get S3 object's ACL +def get_s3_canned_acl(video_key): + result = aws_cli( + [ + "s3api", + "get-object-acl", + "--output", + "text", + "--query", + "Grants[*].Permission", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + acls_grants = result.stdout.strip().split("\t") + + if 
len(acls_grants) > 1: + if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": + return "public-read" + elif acls_grants[0] == "FULL_CONTROL": + return "private" + + return "unknown" + + +# Get S3 object's LastModified date/time +def get_s3_lastmodified(video_key): + result = aws_cli( + [ + "s3api", + "head-object", + "--output", + "text", + "--query", + "LastModified", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + return result.stdout.strip() + + +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "Action", + "S3 Video key", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Sbank Gloss", + "Sbank Gloss created at", + ] + ) + + +def build_csv_row( + video_key, + key_in_nzsl=False, + key_in_s3=False, + gloss_idgloss=None, + gloss_created_at=None, + gloss_id=None, + video_id=None, + gloss_public=False, + video_public=False, +): + # See signbank/video/models.py, line 59, function set_public_acl() + canned_acl_expected = "" + if key_in_nzsl: + canned_acl_expected = "public-read" if video_public else "private" + + lastmodified = "" + canned_acl = "" + if key_in_s3: + lastmodified = get_s3_lastmodified(video_key) + canned_acl = get_s3_canned_acl(video_key) + + action = get_recommended_action(key_in_nzsl, key_in_s3) + + return CSV_DELIMITER.join( + [ + action, + f"{filter_fakekey(video_key)}", + f"{lastmodified}", + f"{canned_acl_expected}", + f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", + f"{gloss_idgloss}", + f"{gloss_created_at}", + ] + ) + + +# From the keys present in NZSL, get all their S3 information +def process_keys(this_all_keys_dict): + print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) + + print(build_csv_header()) + + for video_key, dict_row in this_all_keys_dict.items(): + print(build_csv_row(video_key, *dict_row)) + + +def process_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + print("Gloss ID,Gloss,Suggested Video key") + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = Gloss.objects.get(id=gloss_id) + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # If these had S3 video candidates they should not have made it this far + # These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + gloss_name = gloss.idgloss.split(":")[0].strip() + + csv_rows = [] + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. 
rapidfuzz + for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + csv_rows.append([gloss_id, gloss.idgloss, test_key]) + if csv_rows: + for c_row in csv_rows: + print(CSV_DELIMITER.join(c_row)) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +if args.dumpnzsl: + pprint(get_nzsl_raw_keys_dict()) + exit() + +if args.orphans: + if args.pyenv: + process_orphans() + else: + print( + "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" + ) + exit() + +process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py deleted file mode 100755 index 2eb27e24..00000000 --- a/bin/test-videos-s3.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env -S python3 -u -# You need to run this in a venv that has all the right Python site-packages. -# Bang line above passes '-u' to python, for unbuffered output -# Permissions required: -# psql - access to heroku app's postgres -# aws s3 - NZSL IAM access -# s3:GetObjectAcl permissions or READ_ACP access to the object -# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html - -import os -import sys -import subprocess -import argparse -import re -from time import sleep -from pprint import pprint -import boto3 -import copy -import csv - -# Magic required to allow this script to use Signbank Django classes -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth.models import Permission -from django.contrib.auth import get_user_model - -User = get_user_model() - -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError -from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) -from signbank.video.models import GlossVideo - -parser = argparse.ArgumentParser( - description="You need to run this in a venv that has all the right Python site-packages. " - "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." 
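Both the new script and the test script being deleted here define aws_cli() to retry failed AWS CLI calls forever with a fixed one-second sleep. A bounded variant with exponential backoff, sketched on the same subprocess pattern (the helper name and limits are illustrative, not from the patch):

import subprocess
import sys
from time import sleep

def aws_cli_bounded(args_list, max_tries=5):
    # Double the delay after each failure instead of retrying indefinitely
    delay = 1
    for attempt in range(1, max_tries + 1):
        try:
            return subprocess.run(
                ["aws"] + args_list,
                capture_output=True,
                check=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Attempt {attempt} failed with code {e.returncode}", file=sys.stderr)
            if attempt == max_tries:
                raise
            sleep(delay)
            delay *= 2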
-) -parser.add_argument( - "--env", - default="dev", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) -args = parser.parse_args() - - -# Globals -CSV_DELIMITER = "," -DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" - - -def pg_cli(args_list): - try: - return subprocess.run( - [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - exit() - - -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - -# Run some tests against the remote endpoints -def do_tests(): - # Debugging safety - if args.env != "dev": - print("Error: tests must be in 'dev' environment") - exit() - if DATABASE_URL.find("@localhost") < 0: - print("Error: database url must contain '@localhost'") - exit() - print(f"DATABASE_URL:{DATABASE_URL}") - - print("Running tests") - # s3 = boto3.client("s3") - # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) - # get_nzsl_raw_keys_dict() - # pprint(Gloss.objects.all()) - - # This is a cut and paste of the mock tests, but we're doing it "live" on dev - _csv_content = { - "id": "111", - "word": "Test", - "maori": "maori, maori 2", - "secondary": "test", - "notes": "a note", - "created_at": "2023-09-12 22:37:59 UTC", - "contributor_email": "ops@ackama.com", - "contributor_username": "Ackama Ops", - "agrees": "0", - "disagrees": "1", - "topic_names": "Test Topic|Test", - "videos": "/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBc2pFIiwiZXhwIjoiMjAyNC0xMS0wM1QyMzoyNzo1Ni4yNDNaIiwicHVyIjoiYmxvYl9pZCJ9fQ==--53448dc4efcf056e7ba7fe6b711d6b1ae551d171/Zimbabwe.mp4", - "illustrations": "/kiwifruit-2-6422.png", - "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", - "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), - } - file_name = "test.csv" - csv_content = [copy.deepcopy(_csv_content)] - csv_content[0]["id"] = "12345" - with open(file_name, "w") as file: - writer = csv.writer(file) - writer.writerow(csv_content[0].keys()) - for row in csv_content: - writer.writerow(row.values()) - data = open(file_name, "rb") - file = SimpleUploadedFile( - content=data.read(), name=data.name, content_type="content/multipart" - ) - dataset = Dataset.objects.get(name="NZSL") - - try: - Gloss.objects.get(idgloss="Share:11").delete() - except ValueError: - pass - Gloss.objects.create( - dataset=dataset, - idgloss="Share:11", - nzsl_share_id="12345", - ) - - # Create user and add permissions - try: - user = 
User.objects.create_user(username="test", email=None, password="test") - csv_permission = Permission.objects.get(codename="import_csv") - user.user_permissions.add(csv_permission) - except IntegrityError: - user = User.objects.get(username="test") - - # Create client with change_gloss permission. - client = Client() - client.force_login(user) - s = client.session - s.update({"dataset_id": dataset.pk, "glosses_new": csv_content}) - s.save() - response = client.post( - reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), {"confirm": True} - ) - - # test to see if we have to wait for thread - X_SECONDS = 20 - print(f"Sleeping {X_SECONDS} seconds to allow threads to complete ...") - sleep(X_SECONDS) - - -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) - -do_tests() From 7c8d4612031e8209b192e11288d6c0ed74c05203 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:28:08 +1100 Subject: [PATCH 166/222] Orphan-detection code removed --- bin/get-video-s3-acls.py | 118 --------------------------------------- 1 file changed, 118 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index efe041cd..eb5436be 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,7 +17,6 @@ from uuid import uuid4 from pprint import pprint - parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." @@ -47,54 +46,8 @@ action="store_true", help=f"Dump raw NZSL database output", ) -parser.add_argument( - "--pyenv", - default=False, - required=False, - action="store_true", - help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", -) -parser.add_argument( - "--orphans", - default=False, - required=False, - action="store_true", - help=f"Try to identify and match-up S3 orphans (requires --pyenv)", -) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo - # Globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" @@ -405,68 +358,6 @@ def process_keys(this_all_keys_dict): print(build_csv_row(video_key, *dict_row)) -def process_orphans(): - all_keys_dict = create_all_keys_dict( - 
get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() - ) - - print("Gloss ID,Gloss,Suggested Video key") - - # Traverse all the NZSL Signbank glosses that are missing S3 objects - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in all_keys_dict.items(): - - if not key_in_nzsl: - # This is an S3 object, not a Signbank record - continue - - if key_in_s3: - # This Signbank record already has an S3 object, all is well - continue - - # Business rule - if int(gloss_id) < 8000: - continue - - # The gloss_id is the only reliable retrieval key at the Signbank end - gloss = Gloss.objects.get(id=gloss_id) - video_path = gloss.get_video_path() - - # Skip any that already have a video path - # If these had S3 video candidates they should not have made it this far - # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: - continue - - gloss_name = gloss.idgloss.split(":")[0].strip() - - csv_rows = [] - - # We try to find the orphaned S3 object, if it exists - # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz - for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): - if gloss_name in test_key: - if str(gloss_id) in test_key: - if key_nzsl_yes: - print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) - continue - if not key_s3_yes: - print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) - continue - csv_rows.append([gloss_id, gloss.idgloss, test_key]) - if csv_rows: - for c_row in csv_rows: - print(CSV_DELIMITER.join(c_row)) - - print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) @@ -477,15 +368,6 @@ def process_orphans(): pprint(get_nzsl_raw_keys_dict()) exit() -if args.orphans: - if args.pyenv: - process_orphans() - else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. 
See --pyenv" - ) - exit() - process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 7ef56b8b08bad189166f615784efcf1ac97fc2b9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:01:29 +1100 Subject: [PATCH 167/222] Orphan detection script separated --- bin/get-orphaned-videos.py | 168 ++----------------------------------- 1 file changed, 7 insertions(+), 161 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index efe041cd..f18b9d3f 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -40,13 +40,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--dumpnzsl", - default=False, - required=False, - action="store_true", - help=f"Dump raw NZSL database output", -) parser.add_argument( "--pyenv", default=False, @@ -54,13 +47,6 @@ action="store_true", help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", ) -parser.add_argument( - "--orphans", - default=False, - required=False, - action="store_true", - help=f"Try to identify and match-up S3 orphans (requires --pyenv)", -) args = parser.parse_args() if args.pyenv: @@ -275,137 +261,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): return this_all_keys_dict -# Cases -# In S3 In NZSL Action -# Is Not Delete S3 Object -# Is Is Update ACL -# Not Is Review -# Other Review -def get_recommended_action(key_in_nzsl, key_in_s3): - if key_in_s3: - if key_in_nzsl: - return "Update ACL" - else: - return "Delete S3 Object" - return "Review" - - -# Get S3 object's ACL -def get_s3_canned_acl(video_key): - result = aws_cli( - [ - "s3api", - "get-object-acl", - "--output", - "text", - "--query", - "Grants[*].Permission", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - acls_grants = result.stdout.strip().split("\t") - - if len(acls_grants) > 1: - if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": - return "public-read" - elif acls_grants[0] == "FULL_CONTROL": - return "private" - - return "unknown" - - -# Get S3 object's LastModified date/time -def get_s3_lastmodified(video_key): - result = aws_cli( - [ - "s3api", - "head-object", - "--output", - "text", - "--query", - "LastModified", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - return result.stdout.strip() - - -def build_csv_header(): - return CSV_DELIMITER.join( - [ - "Action", - "S3 Video key", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Sbank Gloss", - "Sbank Gloss created at", - ] - ) - - -def build_csv_row( - video_key, - key_in_nzsl=False, - key_in_s3=False, - gloss_idgloss=None, - gloss_created_at=None, - gloss_id=None, - video_id=None, - gloss_public=False, - video_public=False, -): - # See signbank/video/models.py, line 59, function set_public_acl() - canned_acl_expected = "" - if key_in_nzsl: - canned_acl_expected = "public-read" if video_public else "private" - - lastmodified = "" - canned_acl = "" - if key_in_s3: - lastmodified = get_s3_lastmodified(video_key) - canned_acl = get_s3_canned_acl(video_key) - - action = get_recommended_action(key_in_nzsl, key_in_s3) - - return CSV_DELIMITER.join( - [ - action, - f"{filter_fakekey(video_key)}", - f"{lastmodified}", - f"{canned_acl_expected}", - f"{canned_acl}", - f"{gloss_id}", - f"{video_id}", - 
f"{gloss_public}", - f"{video_public}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - ] - ) - - -# From the keys present in NZSL, get all their S3 information -def process_keys(this_all_keys_dict): - print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) - - print(build_csv_header()) - - for video_key, dict_row in this_all_keys_dict.items(): - print(build_csv_row(video_key, *dict_row)) - - -def process_orphans(): +def find_orphans(): all_keys_dict = create_all_keys_dict( get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() ) @@ -473,19 +329,9 @@ def process_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.dumpnzsl: - pprint(get_nzsl_raw_keys_dict()) - exit() - -if args.orphans: - if args.pyenv: - process_orphans() - else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" - ) - exit() - -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) +if args.pyenv: + find_orphans() +else: + print( + "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" + ) From fcbde529cf54d61c8fed77a5e1dd5c63f53ff46f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:38:12 +1100 Subject: [PATCH 168/222] Removed --pyenv requirement, prior to management command --- bin/get-orphaned-videos.py | 68 +++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index f18b9d3f..97e4aabf 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -49,37 +49,36 @@ ) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model 
+ +User = get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," @@ -329,9 +328,4 @@ def find_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.pyenv: - find_orphans() -else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" - ) +find_orphans() From bfe08487de13bd13004c3493c8bd7131099ad726 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:42:21 +1100 Subject: [PATCH 169/222] Moved to management dir --- .../commands/get-orphaned-videos.py | 331 ++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100755 signbank/dictionary/management/commands/get-orphaned-videos.py diff --git a/signbank/dictionary/management/commands/get-orphaned-videos.py b/signbank/dictionary/management/commands/get-orphaned-videos.py new file mode 100755 index 00000000..97e4aabf --- /dev/null +++ b/signbank/dictionary/management/commands/get-orphaned-videos.py @@ -0,0 +1,331 @@ +#!/usr/bin/env -S python3 -u +# Bang line above passes '-u' to python, for unbuffered output +# Permissions required: +# psql - access to heroku app's postgres +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# For some commands you need to run this in a venv that has all the right Python site-packages. +# TODO Convert this script to a Django Management Command + +import os +import sys +import subprocess +import argparse +import re +from time import sleep +from uuid import uuid4 +from pprint import pprint + + +parser = argparse.ArgumentParser( + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
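This next commit copies the script verbatim into the management commands directory, but a file named get-orphaned-videos.py cannot be imported as a Python module (hyphens are invalid in module names), so Django would never discover it as a command; that is presumably part of why the move is reverted two commits later. The header's TODO points at the conventional shape instead, roughly sketched below (the module path, command name and option are illustrative, not from the patches):

# signbank/dictionary/management/commands/find_orphaned_videos.py (hypothetical)
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Find NZSL Signbank glosses whose video objects are missing from S3"

    def add_arguments(self, parser):
        parser.add_argument("--env", default="uat")

    def handle(self, *args, **options):
        # Django settings and models are already loaded here, so none of the
        # sys.path / WSGI bootstrap above is needed
        self.stdout.write(f"Running against {options['env']}")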
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +parser.add_argument( + "--pyenv", + default=False, + required=False, + action="store_true", + help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", +) +args = parser.parse_args() + +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo + +# Globals +CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS 
gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +def find_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + print("Gloss ID,Gloss,Suggested Video key") + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = Gloss.objects.get(id=gloss_id) + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # If these had S3 video candidates they should not have made it this far + # 
These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + gloss_name = gloss.idgloss.split(":")[0].strip() + + csv_rows = [] + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz + for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + csv_rows.append([gloss_id, gloss.idgloss, test_key]) + if csv_rows: + for c_row in csv_rows: + print(CSV_DELIMITER.join(c_row)) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +find_orphans() From 068244d7ccf207dbccb6e54deda4d8c3e5804248 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:00:02 +1100 Subject: [PATCH 170/222] Revert "Moved to management dir" This reverts commit bfe08487de13bd13004c3493c8bd7131099ad726. --- .../commands/get-orphaned-videos.py | 331 ------------------ 1 file changed, 331 deletions(-) delete mode 100755 signbank/dictionary/management/commands/get-orphaned-videos.py diff --git a/signbank/dictionary/management/commands/get-orphaned-videos.py b/signbank/dictionary/management/commands/get-orphaned-videos.py deleted file mode 100755 index 97e4aabf..00000000 --- a/signbank/dictionary/management/commands/get-orphaned-videos.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env -S python3 -u -# Bang line above passes '-u' to python, for unbuffered output -# Permissions required: -# psql - access to heroku app's postgres -# aws s3 - NZSL IAM access -# s3:GetObjectAcl permissions or READ_ACP access to the object -# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. -# TODO Convert this script to a Django Management Command - -import os -import sys -import subprocess -import argparse -import re -from time import sleep -from uuid import uuid4 -from pprint import pprint - - -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." 
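Worth noting while this copy is deleted: the maybe_fakekey() / filter_fakekey() pair that every version of the script carries exists because the merged dictionary is keyed on video_key, and the FULL JOIN yields empty keys for glosses that never had a video; without unique placeholders those rows would all collapse onto a single empty-string entry. A self-contained sketch of the trick:

from uuid import uuid4

FAKEKEY_PREFIX = "this_is_not_a_key_"

def maybe_fakekey(key):
    # Keyless rows get a unique placeholder so they survive as dict entries
    return key if key else FAKEKEY_PREFIX + str(uuid4())

def filter_fakekey(key):
    # Placeholders are rendered back as empty strings on output
    return "" if key.startswith(FAKEKEY_PREFIX) else key

rows = [("", "gloss 1"), ("", "gloss 2"), ("foo/bar.mp4", "gloss 3")]
merged = {maybe_fakekey(k): v for k, v in rows}
assert len(merged) == 3  # without placeholders the two empty keys would collide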
-) -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) -parser.add_argument( - "--pyenv", - default=False, - required=False, - action="store_true", - help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", -) -args = parser.parse_args() - -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth.models import Permission -from django.contrib.auth import get_user_model - -User = get_user_model() - -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError -from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) -from signbank.video.models import GlossVideo - -# Globals -CSV_DELIMITER = "," -FAKEKEY_PREFIX = "this_is_not_a_key_" -DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" - - -def pg_cli(args_list): - try: - return subprocess.run( - [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - exit() - - -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - -# Fake key is a hack to handle FULL JOIN -def maybe_fakekey(instring): - return instring if instring else FAKEKEY_PREFIX + str(uuid4()) - - -def filter_fakekey(instring): - return "" if instring.startswith(FAKEKEY_PREFIX) else instring - - -# Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(): - print( - f"Getting raw list of video file info from NZSL Signbank ...", - file=sys.stderr, - ) - this_nzsl_raw_keys_dict = {} - # Column renaming is for readability - # Special delimiter because columns might contain commas - result = pg_cli( - [ - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS 
gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg " - "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - ] - ) - - # Separate the NZSL db columns - # Write them to a dictionary, so we can do fast operations - for rawl in result.stdout.split("\n"): - rawl = rawl.strip() - if not rawl: - continue - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - video_key, - ] = rawl.split("|") - - # Hack to handle FULL JOIN - video_key = maybe_fakekey(video_key.strip()) - - # This sets the initial field ordering in the all_keys dictionary row - this_nzsl_raw_keys_dict[video_key] = [ - gloss_idgloss.replace(CSV_DELIMITER, ""), - gloss_created_at, - gloss_id, - video_id, - gloss_public.lower() == "t", - video_public.lower() == "t", - ] - - print( - f"{len(this_nzsl_raw_keys_dict)} rows retrieved", - file=sys.stderr, - ) - - return this_nzsl_raw_keys_dict - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - -# Get the keys present and absent across NZSL Signbank and S3, to dictionary -def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): - print( - "Getting keys present and absent across NZSL Signbank and S3 ...", - file=sys.stderr, - ) - this_all_keys_dict = {} - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - dict_row = this_nzsl_raw_keys_dict.get(video_key, None) - if dict_row: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT - ] + dict_row - else: - this_all_keys_dict[video_key] = [ - False, # NZSL Absent - True, # S3 PRESENT - ] + [""] * 6 - - # Find NZSL keys that are absent from S3 (present handled above) - for video_key, dict_row in this_nzsl_raw_keys_dict.items(): - if video_key not in this_s3_bucket_raw_keys_list: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - False, # S3 Absent - ] + dict_row - - return this_all_keys_dict - - -def find_orphans(): - all_keys_dict = create_all_keys_dict( - get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() - ) - - print("Gloss ID,Gloss,Suggested Video key") - - # Traverse all the NZSL Signbank glosses that are missing S3 objects - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in all_keys_dict.items(): - - if not key_in_nzsl: - # This is an S3 object, not a Signbank record - continue - - if key_in_s3: - # This Signbank record already has an S3 object, all is well - continue - - # Business rule - if int(gloss_id) < 8000: - continue - - # The gloss_id is the only reliable retrieval key at the Signbank end - gloss = Gloss.objects.get(id=gloss_id) - video_path = gloss.get_video_path() - - # Skip any that already have a video path - # If these had S3 video candidates they should not have made it this far - # 
These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: - continue - - gloss_name = gloss.idgloss.split(":")[0].strip() - - csv_rows = [] - - # We try to find the orphaned S3 object, if it exists - # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz - for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): - if gloss_name in test_key: - if str(gloss_id) in test_key: - if key_nzsl_yes: - print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) - continue - if not key_s3_yes: - print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) - continue - csv_rows.append([gloss_id, gloss.idgloss, test_key]) - if csv_rows: - for c_row in csv_rows: - print(CSV_DELIMITER.join(c_row)) - - -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) - -find_orphans() From a49f9df16347d3622e9ce075093722e737121799 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:00:06 +1100 Subject: [PATCH 171/222] Revert "Removed --pyenv requirement, prior to management command" This reverts commit fcbde529cf54d61c8fed77a5e1dd5c63f53ff46f. --- bin/get-orphaned-videos.py | 68 +++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index 97e4aabf..f18b9d3f 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -49,36 +49,37 @@ ) args = parser.parse_args() -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth.models import Permission -from django.contrib.auth import get_user_model - -User = get_user_model() - -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError -from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) -from signbank.video.models import GlossVideo +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print("Importing site-packages environment", file=sys.stderr) + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile 
import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," @@ -328,4 +329,9 @@ def find_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -find_orphans() +if args.pyenv: + find_orphans() +else: + print( + "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" + ) From 61158e981407a829ffc134b71da0081451734ce5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:37:56 +1100 Subject: [PATCH 172/222] Comments --- bin/get-orphaned-videos.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index f18b9d3f..1d53619e 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -240,19 +240,25 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: + # NZSL glossvideo record for this S3 key this_all_keys_dict[video_key] = [ True, # NZSL PRESENT True, # S3 PRESENT ] + dict_row else: + # S3 key with no corresponding NZSL glossvideo record this_all_keys_dict[video_key] = [ False, # NZSL Absent True, # S3 PRESENT ] + [""] * 6 - # Find NZSL keys that are absent from S3 (present handled above) + # Find NZSL keys that are absent from S3 (present in both handled above) for video_key, dict_row in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: + # gloss/glossvideo record with no corresponding S3 key + # Either: + # video_key is real, but the S3 object is missing + # video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object this_all_keys_dict[video_key] = [ True, # NZSL PRESENT False, # S3 Absent From 8f5b88afaa4b46d3c3ad594672020ee456815063 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:40:22 +1100 Subject: [PATCH 173/222] Comment --- bin/get-orphaned-videos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index 1d53619e..eeb7189e 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -302,8 +302,8 @@ def find_orphans(): gloss = Gloss.objects.get(id=gloss_id) video_path = gloss.get_video_path() - # Skip any that already have a video path - # If these had S3 video candidates they should not have made it this far + # Skip any that already have a video path. + # These should have an S3 object but don't. For some reason the video never made it to S3. 
# These will have to have their videos reinstated (separate operation) if len(video_path) > 0: continue From 917e9ad4968b4d1672db407780ff5d3fd6ae87cd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:44:58 +1100 Subject: [PATCH 174/222] refactor --- bin/get-orphaned-videos.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index eeb7189e..b5dae6ea 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -300,18 +300,16 @@ def find_orphans(): # The gloss_id is the only reliable retrieval key at the Signbank end gloss = Gloss.objects.get(id=gloss_id) + gloss_name = gloss.idgloss.split(":")[0].strip() video_path = gloss.get_video_path() - # Skip any that already have a video path. - # These should have an S3 object but don't. For some reason the video never made it to S3. + # Skip any that already have a video path + # These should have an S3 object but don't. For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) + # TODO If it's worth it, make a --param to output these if len(video_path) > 0: continue - gloss_name = gloss.idgloss.split(":")[0].strip() - - csv_rows = [] - # We try to find the orphaned S3 object, if it exists # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): @@ -323,10 +321,7 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - csv_rows.append([gloss_id, gloss.idgloss, test_key]) - if csv_rows: - for c_row in csv_rows: - print(CSV_DELIMITER.join(c_row)) + print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) print(f"Env: {args.env}", file=sys.stderr) From f7485239f6da127efd11c9b6d3f30695023924dc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:48:05 +1100 Subject: [PATCH 175/222] Comment --- bin/get-orphaned-videos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index b5dae6ea..627200bc 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -306,7 +306,6 @@ def find_orphans(): # Skip any that already have a video path # These should have an S3 object but don't. For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) - # TODO If it's worth it, make a --param to output these if len(video_path) > 0: continue From ad2733dfc11cf8c06f407ef5034a5f7f3189e901 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:52:46 +1100 Subject: [PATCH 176/222] Cleanups --- bin/get-orphaned-videos.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index 627200bc..74d9e3ee 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -1,4 +1,7 @@ #!/usr/bin/env -S python3 -u +# +# This script needs to be run in a pyenv virtualenv with the Django project installed. 
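+# For example (a sketch; the virtualenv and Heroku app names below are assumptions):
+#   pyenv activate signbank
+#   export AWS_PROFILE=nzsl
+#   export DATABASE_URL="$(heroku config:get DATABASE_URL --app nzsl-signbank-uat)"
+#   bin/get-orphaned-videos.py --env uat > orphans.csv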
+# # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres @@ -40,13 +43,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--pyenv", - default=False, - required=False, - action="store_true", - help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", -) args = parser.parse_args() if args.pyenv: @@ -304,7 +300,7 @@ def find_orphans(): video_path = gloss.get_video_path() # Skip any that already have a video path - # These should have an S3 object but don't. For some reason the video never made it to S3 + # These should have an S3 object but don't: For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) if len(video_path) > 0: continue @@ -329,9 +325,4 @@ def find_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.pyenv: - find_orphans() -else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" - ) +find_orphans() From 12ab098b4c66e4aae54817d90f4eb53d6911c658 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:07:14 +1100 Subject: [PATCH 177/222] rename --- bin/{get-orphaned-videos.py => find-orphaned-videos.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{get-orphaned-videos.py => find-orphaned-videos.py} (100%) diff --git a/bin/get-orphaned-videos.py b/bin/find-orphaned-videos.py similarity index 100% rename from bin/get-orphaned-videos.py rename to bin/find-orphaned-videos.py From 75f0a8f29b85dc3ed5c85eadc805a71c84978729 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:08:16 +1100 Subject: [PATCH 178/222] initial commit of orphan video repair script --- bin/repair-orphaned-videos.py | 328 ++++++++++++++++++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100755 bin/repair-orphaned-videos.py diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py new file mode 100755 index 00000000..74d9e3ee --- /dev/null +++ b/bin/repair-orphaned-videos.py @@ -0,0 +1,328 @@ +#!/usr/bin/env -S python3 -u +# +# This script needs to be run in a pyenv virtualenv with the Django project installed. +# +# Bang line above passes '-u' to python, for unbuffered output +# Permissions required: +# psql - access to heroku app's postgres +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# For some commands you need to run this in a venv that has all the right Python site-packages. +# TODO Convert this script to a Django Management Command + +import os +import sys +import subprocess +import argparse +import re +from time import sleep +from uuid import uuid4 +from pprint import pprint + + +parser = argparse.ArgumentParser( + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +args = parser.parse_args() + +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print("Importing site-packages environment", file=sys.stderr) + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + +# Globals +CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN 
video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + # NZSL glossvideo record for this S3 key + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + # S3 key with no corresponding NZSL glossvideo record + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present in both handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + # gloss/glossvideo record with no corresponding S3 key + # Either: + # video_key is real, but the S3 object is missing + # video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +def find_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + print("Gloss ID,Gloss,Suggested Video key") + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = 
Gloss.objects.get(id=gloss_id) + gloss_name = gloss.idgloss.split(":")[0].strip() + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # These should have an S3 object but don't: For some reason the video never made it to S3 + # These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz + for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +find_orphans() From f16ca20adc60747a86cb0dd5d75d7371a6375db1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:10:56 +1100 Subject: [PATCH 179/222] pyenv whoops --- bin/find-orphaned-videos.py | 61 +++++++++++++++++------------------ bin/repair-orphaned-videos.py | 61 +++++++++++++++++------------------ 2 files changed, 60 insertions(+), 62 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 74d9e3ee..3066bd20 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -45,37 +45,36 @@ ) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = 
get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 74d9e3ee..3066bd20 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -45,37 +45,36 @@ ) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," From 4951b52a298c12e3b16b50e74008402d5e3c5547 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:12:14 +1100 Subject: [PATCH 180/222] Repair script stripped --- bin/repair-orphaned-videos.py | 194 ---------------------------------- 1 file changed, 194 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 3066bd20..5b97a4b6 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -125,203 +125,9 @@ def aws_cli(args_list): return output -# Fake key is a hack to handle FULL JOIN -def maybe_fakekey(instring): - return instring if 
instring else FAKEKEY_PREFIX + str(uuid4()) - - -def filter_fakekey(instring): - return "" if instring.startswith(FAKEKEY_PREFIX) else instring - - -# Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(): - print( - f"Getting raw list of video file info from NZSL Signbank ...", - file=sys.stderr, - ) - this_nzsl_raw_keys_dict = {} - # Column renaming is for readability - # Special delimiter because columns might contain commas - result = pg_cli( - [ - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg " - "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - ] - ) - - # Separate the NZSL db columns - # Write them to a dictionary, so we can do fast operations - for rawl in result.stdout.split("\n"): - rawl = rawl.strip() - if not rawl: - continue - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - video_key, - ] = rawl.split("|") - - # Hack to handle FULL JOIN - video_key = maybe_fakekey(video_key.strip()) - - # This sets the initial field ordering in the all_keys dictionary row - this_nzsl_raw_keys_dict[video_key] = [ - gloss_idgloss.replace(CSV_DELIMITER, ""), - gloss_created_at, - gloss_id, - video_id, - gloss_public.lower() == "t", - video_public.lower() == "t", - ] - - print( - f"{len(this_nzsl_raw_keys_dict)} rows retrieved", - file=sys.stderr, - ) - - return this_nzsl_raw_keys_dict - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - -# Get the keys present and absent across NZSL Signbank and S3, to dictionary -def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): - print( - "Getting keys present and absent across NZSL Signbank and S3 ...", - file=sys.stderr, - ) - this_all_keys_dict = {} - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - dict_row = this_nzsl_raw_keys_dict.get(video_key, None) - if dict_row: - # NZSL glossvideo record for this S3 key - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT - ] + dict_row - else: - # S3 key with no corresponding NZSL glossvideo record - this_all_keys_dict[video_key] = [ - False, # NZSL Absent - True, # S3 PRESENT - ] + [""] * 6 - - # Find NZSL keys that are absent from S3 (present in both handled above) - for video_key, dict_row in this_nzsl_raw_keys_dict.items(): - if video_key not in this_s3_bucket_raw_keys_list: - # gloss/glossvideo record with no corresponding S3 key - # Either: - # video_key is real, but the S3 object is missing - # video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - False, # S3 Absent - ] + dict_row - - return 
this_all_keys_dict - - -def find_orphans(): - all_keys_dict = create_all_keys_dict( - get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() - ) - - print("Gloss ID,Gloss,Suggested Video key") - - # Traverse all the NZSL Signbank glosses that are missing S3 objects - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in all_keys_dict.items(): - - if not key_in_nzsl: - # This is an S3 object, not a Signbank record - continue - - if key_in_s3: - # This Signbank record already has an S3 object, all is well - continue - - # Business rule - if int(gloss_id) < 8000: - continue - - # The gloss_id is the only reliable retrieval key at the Signbank end - gloss = Gloss.objects.get(id=gloss_id) - gloss_name = gloss.idgloss.split(":")[0].strip() - video_path = gloss.get_video_path() - - # Skip any that already have a video path - # These should have an S3 object but don't: For some reason the video never made it to S3 - # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: - continue - - # We try to find the orphaned S3 object, if it exists - # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz - for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): - if gloss_name in test_key: - if str(gloss_id) in test_key: - if key_nzsl_yes: - print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) - continue - if not key_s3_yes: - print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) - continue - print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) - - print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -find_orphans() From aae1bd74e5fa4c4bf7a20400f1562496c6f51be4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:29:38 +1100 Subject: [PATCH 181/222] Import cleanup --- bin/find-orphaned-videos.py | 13 ------------- bin/repair-orphaned-videos.py | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 3066bd20..55221552 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -55,26 +55,13 @@ get_wsgi_application() -from django.contrib.auth.models import Permission from django.contrib.auth import get_user_model User = get_user_model() -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError from signbank.dictionary.models import ( - Dataset, - FieldChoice, Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, ) -from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 5b97a4b6..762f1fcd 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -13,6 +13,7 @@ import os import sys +import csv import subprocess import argparse import re @@ -25,6 +26,14 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) + +# Positional arguments +parser.add_argument( + "csv_filename", + help="Name of CSV file" +) + +# Optional arguments parser.add_argument( "--env", default="uat", @@ -55,24 +64,12 @@ get_wsgi_application() -from django.contrib.auth.models import Permission from django.contrib.auth import get_user_model User = get_user_model() -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError from signbank.dictionary.models import ( - Dataset, - FieldChoice, Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, ) from signbank.video.models import GlossVideo @@ -125,6 +122,10 @@ def aws_cli(args_list): return output +def read_csv(csv_filename): + pass + + print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) From 44a1bd8e5dc830e9871d5fd4332610f5d7401544 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:41:15 +1100 Subject: [PATCH 182/222] Syncing headers --- bin/find-orphaned-videos.py | 3 ++- bin/repair-orphaned-videos.py | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 55221552..467440c4 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -64,6 +64,7 @@ ) # Globals +GLOBAL_COLUMN_HEADINGS = ["Gloss ID", "Gloss", "Suggested Video key"] # Keep synced with other scripts CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -254,7 +255,7 @@ def find_orphans(): get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() ) - print("Gloss ID,Gloss,Suggested Video key") + print(CSV_DELIMITER.join(GLOBAL_COLUMN_HEADINGS)) # Traverse all the NZSL Signbank glosses that are missing S3 objects for video_key, [ diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 762f1fcd..d8bf5400 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -16,9 +16,7 @@ import csv import subprocess import argparse -import re from time import sleep -from uuid import uuid4 from pprint import pprint @@ -28,10 +26,7 @@ ) # Positional arguments -parser.add_argument( - "csv_filename", - help="Name of CSV file" -) +parser.add_argument("csv_filename", help="Name of CSV file") # Optional arguments parser.add_argument( @@ -74,6 +69,11 @@ from signbank.video.models import GlossVideo # Globals +GLOBAL_COLUMN_HEADINGS = [ + "Gloss ID", + "Gloss", + "Suggested Video key", +] # Keep synced with other scripts CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -123,7 +123,14 @@ def aws_cli(args_list): def read_csv(csv_filename): - pass + if csv_filename == "-": + f = sys.stdin.read().splitlines() + else: + f = open(csv_filename, "r") + csv_dict = csv.DictReader(f) + for row in csv_dict: + pprint(row) + # print(dict(row)) print(f"Env: {args.env}", file=sys.stderr) @@ -132,3 +139,4 @@ def read_csv(csv_filename): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +read_csv(args.csv_filename) From a23eadb4e918f5ae434a8331e02409fbab2c9bbc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:47:20 +1100 
Subject: [PATCH 183/222] cleanups --- bin/find-orphaned-videos.py | 9 +++++++-- bin/repair-orphaned-videos.py | 13 +++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 467440c4..c592fd77 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -63,8 +63,13 @@ Gloss, ) -# Globals -GLOBAL_COLUMN_HEADINGS = ["Gloss ID", "Gloss", "Suggested Video key"] # Keep synced with other scripts +# Keep synced with other scripts +GLOSS_ID_COLUMN = "Gloss ID" +GLOSS_COLUMN = "Gloss" +GLOSS_VIDEO_COLUMN = "Suggested Video key" +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] + +# Other globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index d8bf5400..b60e1021 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -68,12 +68,13 @@ ) from signbank.video.models import GlossVideo -# Globals -GLOBAL_COLUMN_HEADINGS = [ - "Gloss ID", - "Gloss", - "Suggested Video key", -] # Keep synced with other scripts +# Keep synced with other scripts +GLOSS_ID_COLUMN = "Gloss ID" +GLOSS_COLUMN = "Gloss" +GLOSS_VIDEO_COLUMN = "Suggested Video key" +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] + +# Other globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") From bc48b268618c993fa91d9497914543c3a431305c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:49:10 +1100 Subject: [PATCH 184/222] help message --- bin/repair-orphaned-videos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index b60e1021..5425121d 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -26,7 +26,7 @@ ) # Positional arguments -parser.add_argument("csv_filename", help="Name of CSV file") +parser.add_argument("csv_filename", help="Name of CSV file, or '-' for STDIN") # Optional arguments parser.add_argument( @@ -130,7 +130,7 @@ def read_csv(csv_filename): f = open(csv_filename, "r") csv_dict = csv.DictReader(f) for row in csv_dict: - pprint(row) + pprint(row[GLOSS_COLUMN]) # print(dict(row)) From 8b2aa15f03f45a60355b3d06d4c56a710f2750bf Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:54:51 +1100 Subject: [PATCH 185/222] refactor --- bin/repair-orphaned-videos.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 5425121d..7842d3bf 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -123,15 +123,24 @@ def aws_cli(args_list): return output +# Returns a list of dictionaries, one for each CSV row def read_csv(csv_filename): if csv_filename == "-": f = sys.stdin.read().splitlines() else: f = open(csv_filename, "r") - csv_dict = csv.DictReader(f) - for row in csv_dict: - pprint(row[GLOSS_COLUMN]) - # print(dict(row)) + return csv.DictReader(f) + + +def process_csv(): + csv_rows = read_csv(args.csv_filename) + for csv_row in csv_rows: + gloss_id = int(csv_row[GLOSS_ID_COLUMN]) + gloss_idgloss = csv_row[GLOSS_COLUMN] + video_key = csv_row[GLOSS_VIDEO_COLUMN] + print(gloss_id) + print(gloss_idgloss) + 
print(video_key) print(f"Env: {args.env}", file=sys.stderr) @@ -140,4 +149,5 @@ def read_csv(csv_filename): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -read_csv(args.csv_filename) + +process_csv() From 860235f84766506bfca8402dec383c43b7eec6f2 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:06:23 +1100 Subject: [PATCH 186/222] Basics working --- bin/repair-orphaned-videos.py | 70 +++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 7842d3bf..10120eea 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -19,6 +19,28 @@ from time import sleep from pprint import pprint +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth import get_user_model + +User = get_user_model() + +from signbank.dictionary.models import ( + FieldChoice, + Gloss, +) +from signbank.video.models import GlossVideo + +from django.core.exceptions import ObjectDoesNotExist + parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -49,25 +71,6 @@ ) args = parser.parse_args() -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth import get_user_model - -User = get_user_model() - -from signbank.dictionary.models import ( - Gloss, -) -from signbank.video.models import GlossVideo - # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" @@ -133,14 +136,35 @@ def read_csv(csv_filename): def process_csv(): + main_video_type = FieldChoice.objects.filter(field="video_type", english_name="main").first() + csv_rows = read_csv(args.csv_filename) for csv_row in csv_rows: - gloss_id = int(csv_row[GLOSS_ID_COLUMN]) + gloss_id = csv_row[GLOSS_ID_COLUMN] gloss_idgloss = csv_row[GLOSS_COLUMN] video_key = csv_row[GLOSS_VIDEO_COLUMN] - print(gloss_id) - print(gloss_idgloss) - print(video_key) + print(CSV_DELIMITER.join([gloss_id, gloss_idgloss, video_key])) + gloss_id = int(gloss_id) + + try: + gloss = Gloss.objects.get(id=gloss_id) + print(gloss) + except ObjectDoesNotExist as e: + print(e) + continue + + gloss_video = GlossVideo( + gloss=gloss, + dataset=gloss.dataset, + videofile=video_key, + title=video_key, + version=0, + is_public=False, + video_type=main_video_type + ) + print(gloss_video) + + print(f"Env: {args.env}", file=sys.stderr) From 81ef0cc25d9113c7d3e1b249e3734a7fe473cbc7 Mon Sep 17 
00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:08:08 +1100 Subject: [PATCH 187/222] More import cleanups --- bin/find-orphaned-videos.py | 36 +++++++++++++++++------------------ bin/repair-orphaned-videos.py | 8 ++++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index c592fd77..353015b5 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -20,6 +20,24 @@ from uuid import uuid4 from pprint import pprint +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth import get_user_model + +User = get_user_model() + +from signbank.dictionary.models import ( + Gloss, +) + parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -45,24 +63,6 @@ ) args = parser.parse_args() -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth import get_user_model - -User = get_user_model() - -from signbank.dictionary.models import ( - Gloss, -) - # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 10120eea..1f768a5a 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -136,7 +136,9 @@ def read_csv(csv_filename): def process_csv(): - main_video_type = FieldChoice.objects.filter(field="video_type", english_name="main").first() + main_video_type = FieldChoice.objects.filter( + field="video_type", english_name="main" + ).first() csv_rows = read_csv(args.csv_filename) for csv_row in csv_rows: @@ -160,13 +162,11 @@ def process_csv(): title=video_key, version=0, is_public=False, - video_type=main_video_type + video_type=main_video_type, ) print(gloss_video) - - print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) From 39bb18f7089354b7b28d8cee4be2955e1b44ef44 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:12:45 +1100 Subject: [PATCH 188/222] Warnings --- bin/repair-orphaned-videos.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 1f768a5a..1fec0e93 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -155,6 +155,13 @@ def process_csv(): print(e) continue + try: + GlossVideo.objects.get(videofile=video_key) + 
print(f"Error: GlossVideo already exists: {video_key}") + continue + except ObjectDoesNotExist: + pass + gloss_video = GlossVideo( gloss=gloss, dataset=gloss.dataset, @@ -166,6 +173,11 @@ def process_csv(): ) print(gloss_video) + # At this point the repair should be complete + # WARNING, it tries to save to the current storage medium, so this needs sorting out! + # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? + #gloss_video.save() + print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) From 9e6d570481146389771e510a19240f5f3d9f3217 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:16:02 +1100 Subject: [PATCH 189/222] Notes --- bin/repair-orphaned-videos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 1fec0e93..be8b23ab 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -173,8 +173,9 @@ def process_csv(): ) print(gloss_video) - # At this point the repair should be complete + # At this point we complete the repair # WARNING, it tries to save to the current storage medium, so this needs sorting out! + # save() is overridden in the GlossVideo model # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? #gloss_video.save() From f94518b376b6f6e5ec0cd99477813e3fe3aea6e6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:18:30 +1100 Subject: [PATCH 190/222] Notes --- bin/repair-orphaned-videos.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index be8b23ab..20430b56 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -175,8 +175,10 @@ def process_csv(): # At this point we complete the repair # WARNING, it tries to save to the current storage medium, so this needs sorting out! + # We absolutely DO NOT want it to try and save! # save() is overridden in the GlossVideo model # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? + # Yeah, starting to think that's the way to go, IF postgres will allow us to do so (constraints) #gloss_video.save() From d7c3bc28c1a8ff9322b46747f9bbefa78c1a049b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:48:10 +1100 Subject: [PATCH 191/222] First success --- bin/repair-orphaned-videos.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 20430b56..aadcbd36 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -40,6 +40,7 @@ from signbank.video.models import GlossVideo from django.core.exceptions import ObjectDoesNotExist +from django.db import models parser = argparse.ArgumentParser( @@ -172,6 +173,8 @@ def process_csv(): video_type=main_video_type, ) print(gloss_video) + # HOLY ****, this works! + gloss_video.save = models.Model.save # At this point we complete the repair # WARNING, it tries to save to the current storage medium, so this needs sorting out! @@ -179,7 +182,8 @@ def process_csv(): # save() is overridden in the GlossVideo model # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? 
# Yeah, starting to think that's the way to go, IF postgres will allow us to do so (constraints) - #gloss_video.save() + # HOLY ****, this works! + gloss_video.save(gloss_video) print(f"Env: {args.env}", file=sys.stderr) From 9684c694292e2dd487021d3d82bd1b659979386a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:17:06 +1100 Subject: [PATCH 192/222] Uses bulk_create() so that save() does not run --- bin/repair-orphaned-videos.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index aadcbd36..590cceab 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -158,7 +158,7 @@ def process_csv(): try: GlossVideo.objects.get(videofile=video_key) - print(f"Error: GlossVideo already exists: {video_key}") + print(f"Ignoring: GlossVideo already exists: {video_key}") continue except ObjectDoesNotExist: pass @@ -173,17 +173,12 @@ def process_csv(): video_type=main_video_type, ) print(gloss_video) - # HOLY ****, this works! - gloss_video.save = models.Model.save - # At this point we complete the repair - # WARNING, it tries to save to the current storage medium, so this needs sorting out! - # We absolutely DO NOT want it to try and save! - # save() is overridden in the GlossVideo model - # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? - # Yeah, starting to think that's the way to go, IF postgres will allow us to do so (constraints) - # HOLY ****, this works! - gloss_video.save(gloss_video) + # We cannot allow the GlossVideo save() method to run, as it has side-effects including + # trying to save the video file to the current storage medium (eg. S3) + createds = GlossVideo.objects.bulk_create([gloss_video]) + if len(createds) < 1: + print(f"Error: could not create {gloss_video}") print(f"Env: {args.env}", file=sys.stderr) From cd0143a65ee3487113acfea8fee2efa27a614219 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:23:46 +1100 Subject: [PATCH 193/222] Neatening and rename --- bin/{find-orphaned-videos.py => find-fixable-orphans.py} | 0 ...{repair-orphaned-videos.py => repair-fixable-orphans.py} | 6 ++---- 2 files changed, 2 insertions(+), 4 deletions(-) rename bin/{find-orphaned-videos.py => find-fixable-orphans.py} (100%) rename bin/{repair-orphaned-videos.py => repair-fixable-orphans.py} (95%) diff --git a/bin/find-orphaned-videos.py b/bin/find-fixable-orphans.py similarity index 100% rename from bin/find-orphaned-videos.py rename to bin/find-fixable-orphans.py diff --git a/bin/repair-orphaned-videos.py b/bin/repair-fixable-orphans.py similarity index 95% rename from bin/repair-orphaned-videos.py rename to bin/repair-fixable-orphans.py index 590cceab..b99830ef 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-fixable-orphans.py @@ -174,10 +174,8 @@ def process_csv(): ) print(gloss_video) # At this point we complete the repair - # We cannot allow the GlossVideo save() method to run, as it has side-effects including - # trying to save the video file to the current storage medium (eg. 
S3) - createds = GlossVideo.objects.bulk_create([gloss_video]) - if len(createds) < 1: + # We use bulk_create() because we cannot allow save() to run + if len(GlossVideo.objects.bulk_create([gloss_video])) < 1: print(f"Error: could not create {gloss_video}") From 36d251e0da290b0c1ef66314e34c265a65133eb7 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:26:48 +1100 Subject: [PATCH 194/222] Comments --- bin/find-fixable-orphans.py | 3 +++ bin/repair-fixable-orphans.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 353015b5..c649293b 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -2,6 +2,9 @@ # # This script needs to be run in a pyenv virtualenv with the Django project installed. # +# Finds orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. +# Essentially finds one form of import error. +# # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index b99830ef..3e98bd82 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -2,6 +2,10 @@ # # This script needs to be run in a pyenv virtualenv with the Django project installed. # +# Given a CSV file containing S3 objects that can be matched back to NZSL entries. +# Updates the database to repair the NZSL entries. +# Essentially repairs one form of import error. +# # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres From 964321f9d729e515cc6e6999a0a118be746c241d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:06:31 +1100 Subject: [PATCH 195/222] Added S3 dumper --- bin/get-video-s3-acls.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index eb5436be..5ffbefe8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -46,6 +46,13 @@ action="store_true", help=f"Dump raw NZSL database output", ) +parser.add_argument( + "--dumps3", + default=False, + required=False, + action="store_true", + help=f"Dump raw S3 keys output", +) args = parser.parse_args() # Globals @@ -368,6 +375,11 @@ def process_keys(this_all_keys_dict): pprint(get_nzsl_raw_keys_dict()) exit() +if args.dumps3: + pprint(get_s3_bucket_raw_keys_list()) + exit() + process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) + From bd6f86d369f30e934449ed696bbc366dac01a120 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:34:44 +1100 Subject: [PATCH 196/222] Boto3 conversion of get-video-s3-acls --- bin/get-video-s3-acls.py | 86 +++++----------------------------------- 1 file changed, 9 insertions(+), 77 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5ffbefe8..5021f17c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -16,6 +16,7 @@ from time import sleep from uuid import uuid4 from pprint import pprint +import boto3 parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" @@ -33,12 +34,6 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) parser.add_argument( "--dumpnzsl", default=False, @@ -59,7 +54,6 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -81,29 +75,6 @@ def pg_cli(args_list): exit() -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - # Fake key is a hack to handle FULL JOIN def maybe_fakekey(instring): return instring if instring else FAKEKEY_PREFIX + str(uuid4()) @@ -179,20 +150,10 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + s3_resource = boto3.resource('s3') + s3_resource_bucket = s3_resource.Bucket(s3_bucket) + this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", @@ -252,26 +213,12 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): - result = aws_cli( - [ - "s3api", - "get-object-acl", - "--output", - "text", - "--query", - "Grants[*].Permission", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - acls_grants = result.stdout.strip().split("\t") - + s3_client = boto3.client("s3") + acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)["Grants"] if len(acls_grants) > 1: - if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": + if acls_grants[0]["Permission"] == "FULL_CONTROL" and acls_grants[1]["Permission"] == "READ": return "public-read" - elif acls_grants[0] == "FULL_CONTROL": + elif acls_grants[0]["Permission"] == "FULL_CONTROL": return "private" return "unknown" @@ -279,21 +226,7 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - result = aws_cli( - [ - "s3api", - "head-object", - "--output", - "text", - "--query", - "LastModified", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - return result.stdout.strip() + return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"] def build_csv_header(): @@ -367,7 +300,6 @@ def process_keys(this_all_keys_dict): print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) From 
4b37c9346349b63fba7b09581537acc6ba9bf47c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:48:32 +1100 Subject: [PATCH 197/222] black --- bin/get-video-s3-acls.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5021f17c..316f8e82 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -151,9 +151,11 @@ def get_nzsl_raw_keys_dict(): def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - s3_resource = boto3.resource('s3') + s3_resource = boto3.resource("s3") s3_resource_bucket = s3_resource.Bucket(s3_bucket) - this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] + this_s3_bucket_raw_keys_list = [ + s3_object.key for s3_object in s3_resource_bucket.objects.all() + ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", @@ -214,9 +216,14 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): s3_client = boto3.client("s3") - acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)["Grants"] + acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ + "Grants" + ] if len(acls_grants) > 1: - if acls_grants[0]["Permission"] == "FULL_CONTROL" and acls_grants[1]["Permission"] == "READ": + if ( + acls_grants[0]["Permission"] == "FULL_CONTROL" + and acls_grants[1]["Permission"] == "READ" + ): return "public-read" elif acls_grants[0]["Permission"] == "FULL_CONTROL": return "private" @@ -226,7 +233,9 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"] + return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)[ + "LastModified" + ] def build_csv_header(): @@ -314,4 +323,3 @@ def process_keys(this_all_keys_dict): process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) - From 01ce2dd579c873d4cdd1be404224f84a1d58393f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:48:48 +1100 Subject: [PATCH 198/222] Boto3 conversion: find-fixable-orphans --- bin/find-fixable-orphans.py | 50 +++++-------------------------------- 1 file changed, 6 insertions(+), 44 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index c649293b..ca36c435 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -21,6 +21,7 @@ import re from time import sleep from uuid import uuid4 +import boto3 from pprint import pprint # Magic required to allow this script to use Signbank Django classes @@ -58,12 +59,6 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) args = parser.parse_args() # Keep synced with other scripts @@ -76,7 +71,6 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -98,29 +92,6 @@ def pg_cli(args_list): exit() -def aws_cli(args_list): - # Try indefinitely - output = None - while not 
output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - # Fake key is a hack to handle FULL JOIN def maybe_fakekey(instring): return instring if instring else FAKEKEY_PREFIX + str(uuid4()) @@ -196,20 +167,12 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + s3_resource = boto3.resource("s3") + s3_resource_bucket = s3_resource.Bucket(s3_bucket) + this_s3_bucket_raw_keys_list = [ + s3_object.key for s3_object in s3_resource_bucket.objects.all() + ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", @@ -316,7 +279,6 @@ def find_orphans(): print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) From ab294e57705d56240f08896959f06ff2992b64d8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:54:45 +1100 Subject: [PATCH 199/222] Boto3 conversion: repair-fixable-orphans.py --- bin/repair-fixable-orphans.py | 36 +---------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index 3e98bd82..144e1171 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -20,8 +20,6 @@ import csv import subprocess import argparse -from time import sleep -from pprint import pprint # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command @@ -68,12 +66,6 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) args = parser.parse_args() # Keep synced with other scripts @@ -86,7 +78,6 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -108,29 +99,6 @@ def pg_cli(args_list): exit() -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - # Returns a list of dictionaries, one for each CSV row def read_csv(csv_filename): if csv_filename == "-": @@ 
-155,7 +123,6 @@ def process_csv(): try: gloss = Gloss.objects.get(id=gloss_id) - print(gloss) except ObjectDoesNotExist as e: print(e) continue @@ -176,6 +143,7 @@ def process_csv(): is_public=False, video_type=main_video_type, ) + print(gloss) print(gloss_video) # At this point we complete the repair # We use bulk_create() because we cannot allow save() to run @@ -185,9 +153,7 @@ def process_csv(): print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) - process_csv() From 57e0b4611f196c8d6b28d484922140a2778f03f3 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:07:48 +1100 Subject: [PATCH 200/222] Added a public/published boolean column --- bin/find-fixable-orphans.py | 5 +++-- bin/repair-fixable-orphans.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index ca36c435..7f6b4b13 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -64,8 +64,9 @@ # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" +GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] # Other globals CSV_DELIMITER = "," @@ -274,7 +275,7 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) + print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, str(gloss_public), test_key])) print(f"Env: {args.env}", file=sys.stderr) diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index 144e1171..e795d03a 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -71,8 +71,9 @@ # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" +GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] # Other globals CSV_DELIMITER = "," From 587e01c1ac83f813651463cf43398d645a816d2a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:27:55 +1100 Subject: [PATCH 201/222] message --- bin/find-fixable-orphans.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 7f6b4b13..403d23ca 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -226,6 +226,7 @@ def find_orphans(): all_keys_dict = create_all_keys_dict( get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() ) + print("Finding fixable orphans", file=sys.stderr) print(CSV_DELIMITER.join(GLOBAL_COLUMN_HEADINGS)) From a240a3d2a9bbb5b02e4a5f3c4da624a065f049fd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:49:20 +1100 Subject: [PATCH 202/222] comments/black --- bin/find-fixable-orphans.py | 13 +++++++++++-- bin/get-video-s3-acls.py | 2 ++ bin/repair-fixable-orphans.py | 7 ++++++- 
3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 403d23ca..31f51fe4 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -66,7 +66,12 @@ GLOSS_COLUMN = "Gloss" GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [ + GLOSS_ID_COLUMN, + GLOSS_COLUMN, + GLOSS_PUBLIC_COLUMN, + GLOSS_VIDEO_COLUMN, +] # Other globals CSV_DELIMITER = "," @@ -276,7 +281,11 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, str(gloss_public), test_key])) + print( + CSV_DELIMITER.join( + [gloss_id, gloss.idgloss, str(gloss_public), test_key] + ) + ) print(f"Env: {args.env}", file=sys.stderr) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 316f8e82..656de197 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -215,6 +215,7 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): + # TODO pass in a boto client instead of recreating one each time s3_client = boto3.client("s3") acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ "Grants" @@ -233,6 +234,7 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): + # TODO pass in a boto client instead of recreating one each time return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)[ "LastModified" ] diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index e795d03a..ce948be2 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -73,7 +73,12 @@ GLOSS_COLUMN = "Gloss" GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [ + GLOSS_ID_COLUMN, + GLOSS_COLUMN, + GLOSS_PUBLIC_COLUMN, + GLOSS_VIDEO_COLUMN, +] # Other globals CSV_DELIMITER = "," From 76e81b5a5719fbe699186e22f9299ee92e470bee Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:50:26 +1100 Subject: [PATCH 203/222] Unused imports removed --- bin/find-fixable-orphans.py | 3 --- bin/get-video-s3-acls.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 31f51fe4..90555cfc 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -18,11 +18,8 @@ import sys import subprocess import argparse -import re -from time import sleep from uuid import uuid4 import boto3 -from pprint import pprint # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 656de197..e0851953 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,8 +12,6 @@ import sys import subprocess import argparse -import re -from time import sleep from uuid import uuid4 from pprint import pprint import boto3 From 1d2a86a197ef020df3366464747a458833ad6886 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:44:11 +1100 
Subject: [PATCH 204/222] Initial review commits/black --- bin/get-video-s3-acls.py | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e0851953..00b820f9 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -54,6 +54,8 @@ DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +S3_CLIENT = boto3.client("s3") +S3_RESOURCE = boto3.resource("s3") def pg_cli(args_list): @@ -124,7 +126,23 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # Hack to handle FULL JOIN + """ + Hack to handle FULL JOIN. + We are storing data rows in a dictionary, indexed by video_key. + Because we are doing a FULL JOIN on the NZSL Signbank database, + we also get rows where there are gloss entries that do not have + a corresponding video_glossvideo. + (These are erroneous and one of the reasons this script exists, + to find them.) + Consequently there is no video_key, and we cannot use it to index + the data row. + Instead, we create a fake video_key that is unique and, theoretically, + impossible for anything else to try and use. It also has a 'safe', + easily filtered prefix, which means later code can easily tell + a fake key from a real key. + Always having a key, in this way, means that code, eg. loops, + that depends on there being a dictionary key axis will not break. + """ video_key = maybe_fakekey(video_key.strip()) # This sets the initial field ordering in the all_keys dictionary row @@ -149,8 +167,7 @@ def get_nzsl_raw_keys_dict(): def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - s3_resource = boto3.resource("s3") - s3_resource_bucket = s3_resource.Bucket(s3_bucket) + s3_resource_bucket = S3_RESOURCE.Bucket(s3_bucket) this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] @@ -172,6 +189,9 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): this_all_keys_dict = {} # Find S3 keys that are present in NZSL, or absent + # TODO This could be changed to use pop(), so that on each pass we are left + # with a smaller subset of the rows, which we can search faster. If the + # database becomes very large in future this could save a lot of processing. 
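(Aside, not part of any patch in this series: a minimal sketch of the pop() variant the TODO above is suggesting. The helper name is hypothetical, and it assumes the [key_in_nzsl, key_in_s3, *data] row layout these scripts use elsewhere; popping each matched row shrinks the NZSL dict as the S3 pass proceeds, so whatever is left afterwards is exactly the set of NZSL-only keys.)

    # Hypothetical sketch only, not part of this patch series
    def create_all_keys_dict_via_pop(nzsl_raw_keys_dict, s3_bucket_raw_keys_list):
        all_keys_dict = {}
        remaining = dict(nzsl_raw_keys_dict)  # copy, so the caller's dict is untouched
        for video_key in s3_bucket_raw_keys_list:
            dict_row = remaining.pop(video_key, None)  # match once, then discard
            if dict_row:
                all_keys_dict[video_key] = [True, True] + dict_row  # in NZSL and in S3
            else:
                all_keys_dict[video_key] = [False, True]  # in S3 only
        # Anything never popped was in NZSL but not in S3
        for video_key, dict_row in remaining.items():
            all_keys_dict[video_key] = [True, False] + dict_row
        return all_keys_dict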
for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: @@ -196,13 +216,14 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): return this_all_keys_dict -# Cases -# In S3 In NZSL Action -# Is Not Delete S3 Object -# Is Is Update ACL -# Not Is Review -# Other Review def get_recommended_action(key_in_nzsl, key_in_s3): + """ + Cases + In S3 In NZSL Action + Is Not Delete S3 Object + Is Is Update ACL + Not -- Review + """ if key_in_s3: if key_in_nzsl: return "Update ACL" @@ -213,9 +234,7 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): - # TODO pass in a boto client instead of recreating one each time - s3_client = boto3.client("s3") - acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ + acls_grants = S3_CLIENT.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ "Grants" ] if len(acls_grants) > 1: @@ -232,10 +251,7 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - # TODO pass in a boto client instead of recreating one each time - return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)[ - "LastModified" - ] + return S3_CLIENT.head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"] def build_csv_header(): From 749bb20d3a902a5c865c537875ca55f132ab7045 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:44:34 +1100 Subject: [PATCH 205/222] Script renamings --- ...-orphans.py => find-fixable-s3-orphans.py} | 35 +++++++++++++------ ...rphans.py => repair-fixable-s3-orphans.py} | 12 +++++++ 2 files changed, 37 insertions(+), 10 deletions(-) rename bin/{find-fixable-orphans.py => find-fixable-s3-orphans.py} (88%) rename bin/{repair-fixable-orphans.py => repair-fixable-s3-orphans.py} (95%) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-s3-orphans.py similarity index 88% rename from bin/find-fixable-orphans.py rename to bin/find-fixable-s3-orphans.py index 90555cfc..0b886714 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -23,8 +23,6 @@ # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application @@ -72,11 +70,14 @@ # Other globals CSV_DELIMITER = "," -FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +# Hack to handle FULL JOIN +# See get_nzsl_raw_keys_dict() +FAKEKEY_PREFIX = "this_is_not_a_key_" + def pg_cli(args_list): try: @@ -146,7 +147,23 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # Hack to handle FULL JOIN + """ + Hack to handle FULL JOIN. + We are storing data rows in a dictionary, indexed by video_key. + Because we are doing a FULL JOIN on the NZSL Signbank database, + we also get rows where there are gloss entries that do not have + a corresponding video_glossvideo. + (These are erroneous and one of the reasons this script exists, + to find them.) 
+ Consequently there is no video_key, and we cannot use it to index + the data row. + Instead, we create a fake video_key that is unique and, theoretically, + impossible for anything else to try and use. It also has a 'safe', + easily filtered prefix, which means later code can easily tell + a fake key from a real key. + Always having a key, in this way, means that code, eg. loops, + that depends on there being a dictionary key axis will not break. + """ video_key = maybe_fakekey(video_key.strip()) # This sets the initial field ordering in the all_keys dictionary row @@ -194,6 +211,9 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): this_all_keys_dict = {} # Find S3 keys that are present in NZSL, or absent + # TODO This could be changed to use pop(), so that on each pass we are left + # with a smaller subset of the rows, which we can search faster. If the + # database becomes very large in future this could save a lot of processing. for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: @@ -252,19 +272,14 @@ def find_orphans(): # This Signbank record already has an S3 object, all is well continue - # Business rule - if int(gloss_id) < 8000: - continue - # The gloss_id is the only reliable retrieval key at the Signbank end gloss = Gloss.objects.get(id=gloss_id) gloss_name = gloss.idgloss.split(":")[0].strip() - video_path = gloss.get_video_path() # Skip any that already have a video path # These should have an S3 object but don't: For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: + if gloss.glossvideo_set.exists(): continue # We try to find the orphaned S3 object, if it exists diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-s3-orphans.py similarity index 95% rename from bin/repair-fixable-orphans.py rename to bin/repair-fixable-s3-orphans.py index ce948be2..84648b49 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -66,6 +66,13 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) +parser.add_argument( + "--dryrun", + default=False, + required=False, + action="store_true", + help=f"Don't actually make any changes, just output what would happen", +) args = parser.parse_args() # Keep synced with other scripts @@ -151,6 +158,11 @@ def process_csv(): ) print(gloss) print(gloss_video) + + if args.dryrun: + print("Dry run, no changes") + continue + # At this point we complete the repair # We use bulk_create() because we cannot allow save() to run if len(GlossVideo.objects.bulk_create([gloss_video])) < 1: From 48a320755b3c444932f2185a55b95f334cfa4182 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:11:43 +1100 Subject: [PATCH 206/222] OSV ignore GHSA-rrqc-c2jx-6jgv to suppress build warnings (We have a Django upgrade in progress anyway that will address this vuln) --- .osv-detector.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.osv-detector.yml b/.osv-detector.yml index 2d4acb90..59841218 100644 --- a/.osv-detector.yml +++ b/.osv-detector.yml @@ -9,3 +9,4 @@ ignore: - GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc) - GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2) - GHSA-9mvj-f7w8-pvh2 # 
Bootstrap Cross-Site Scripting (XSS) vulnerability (https://github.com/advisories/GHSA-9mvj-f7w8-pvh2) + - GHSA-rrqc-c2jx-6jgv # Django allows enumeration of user e-mail addresses From 760dd8e0e5c8097e098856cc4bda2b5be4bd9d42 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:24:49 +1100 Subject: [PATCH 207/222] Do not orphan-test fake keys --- bin/find-fixable-s3-orphans.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/find-fixable-s3-orphans.py b/bin/find-fixable-s3-orphans.py index 0b886714..28494514 100755 --- a/bin/find-fixable-s3-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -285,6 +285,8 @@ def find_orphans(): # We try to find the orphaned S3 object, if it exists # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if test_key.startswith(FAKEKEY_PREFIX): + continue if gloss_name in test_key: if str(gloss_id) in test_key: if key_nzsl_yes: From 781beddf34e91c3f8a670553486684458820e82c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:45:50 +1100 Subject: [PATCH 208/222] Use csv.writer() for get_ script --- bin/get-video-s3-acls.py | 62 +++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 00b820f9..ee6d70d8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -15,6 +15,7 @@ from uuid import uuid4 from pprint import pprint import boto3 +import csv parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -255,21 +256,19 @@ def get_s3_lastmodified(video_key): def build_csv_header(): - return CSV_DELIMITER.join( - [ - "Action", - "S3 Video key", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Sbank Gloss", - "Sbank Gloss created at", - ] - ) + return [ + "Action", + "S3 Video key", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Sbank Gloss", + "Sbank Gloss created at", + ] def build_csv_row( @@ -296,31 +295,30 @@ def build_csv_row( action = get_recommended_action(key_in_nzsl, key_in_s3) - return CSV_DELIMITER.join( - [ - action, - f"{filter_fakekey(video_key)}", - f"{lastmodified}", - f"{canned_acl_expected}", - f"{canned_acl}", - f"{gloss_id}", - f"{video_id}", - f"{gloss_public}", - f"{video_public}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - ] - ) + return [ + action, + f"{filter_fakekey(video_key)}", + f"{lastmodified}", + f"{canned_acl_expected}", + f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", + f"{gloss_idgloss}", + f"{gloss_created_at}", + ] # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) - print(build_csv_header()) + out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) + out.writerow(build_csv_header()) for video_key, dict_row in this_all_keys_dict.items(): - print(build_csv_row(video_key, *dict_row)) + out.writerow(build_csv_row(video_key, *dict_row)) print(f"Env: {args.env}", file=sys.stderr) From 
04c1cc986fdd89bb0bb562691c055d5670dfe333 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:57:14 +1100 Subject: [PATCH 209/222] Other scripts now using csv.writerow() also --- bin/find-fixable-s3-orphans.py | 10 ++++------ bin/repair-fixable-s3-orphans.py | 7 +++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/find-fixable-s3-orphans.py b/bin/find-fixable-s3-orphans.py index 28494514..2be5967f 100755 --- a/bin/find-fixable-s3-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -20,6 +20,7 @@ import argparse from uuid import uuid4 import boto3 +import csv # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command @@ -250,7 +251,8 @@ def find_orphans(): ) print("Finding fixable orphans", file=sys.stderr) - print(CSV_DELIMITER.join(GLOBAL_COLUMN_HEADINGS)) + out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) + out.writerow(GLOBAL_COLUMN_HEADINGS) # Traverse all the NZSL Signbank glosses that are missing S3 objects for video_key, [ @@ -295,11 +297,7 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - print( - CSV_DELIMITER.join( - [gloss_id, gloss.idgloss, str(gloss_public), test_key] - ) - ) + out.writerow([gloss_id, gloss.idgloss, str(gloss_public), test_key]) print(f"Env: {args.env}", file=sys.stderr) diff --git a/bin/repair-fixable-s3-orphans.py b/bin/repair-fixable-s3-orphans.py index 84648b49..67c7321d 100755 --- a/bin/repair-fixable-s3-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -17,9 +17,9 @@ import os import sys -import csv import subprocess import argparse +import csv # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command @@ -127,11 +127,14 @@ def process_csv(): ).first() csv_rows = read_csv(args.csv_filename) + + out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) + for csv_row in csv_rows: gloss_id = csv_row[GLOSS_ID_COLUMN] gloss_idgloss = csv_row[GLOSS_COLUMN] video_key = csv_row[GLOSS_VIDEO_COLUMN] - print(CSV_DELIMITER.join([gloss_id, gloss_idgloss, video_key])) + out.writerow([gloss_id, gloss_idgloss, video_key]) gloss_id = int(gloss_id) try: From 87db6a24008ae4ca678e12809dce8e779443e49a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:03:30 +1100 Subject: [PATCH 210/222] Dry run mode made default, flag changed to --commit --- bin/repair-fixable-s3-orphans.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/repair-fixable-s3-orphans.py b/bin/repair-fixable-s3-orphans.py index 67c7321d..32c76b96 100755 --- a/bin/repair-fixable-s3-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -67,11 +67,11 @@ help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( - "--dryrun", + "--commit", default=False, required=False, action="store_true", - help=f"Don't actually make any changes, just output what would happen", + help=f"Actually make changes, instead of just outputting what would happen (default)", ) args = parser.parse_args() @@ -162,8 +162,8 @@ def process_csv(): print(gloss) print(gloss_video) - if args.dryrun: - print("Dry run, no changes") + if not args.commit: + print("Dry run, no changes (use --commit flag to make changes)") continue # At this point we complete the repair @@ -176,5 
+176,6 @@ def process_csv(): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +print(f"Mode: {'Commit' if args.commit else 'Dry-run'}") process_csv() From 556e709fec8af3d56e62c766b5e423bfa4b02f2f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:51:31 +1100 Subject: [PATCH 211/222] moved get script --- .../dictionary/management/commands}/get-video-s3-acls.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {bin => signbank/dictionary/management/commands}/get-video-s3-acls.py (100%) diff --git a/bin/get-video-s3-acls.py b/signbank/dictionary/management/commands/get-video-s3-acls.py similarity index 100% rename from bin/get-video-s3-acls.py rename to signbank/dictionary/management/commands/get-video-s3-acls.py From da0befbdd45828ff5994df3bb35a0edd4a0f5e74 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:52:02 +1100 Subject: [PATCH 212/222] rename get script for consistency --- .../commands/{get-video-s3-acls.py => get_video_s3_acls.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename signbank/dictionary/management/commands/{get-video-s3-acls.py => get_video_s3_acls.py} (100%) diff --git a/signbank/dictionary/management/commands/get-video-s3-acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py similarity index 100% rename from signbank/dictionary/management/commands/get-video-s3-acls.py rename to signbank/dictionary/management/commands/get_video_s3_acls.py From 20f8bf428347976f982eb4e37a53ddb13949e633 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:52:44 +1100 Subject: [PATCH 213/222] changed permissions on get script for consistency --- signbank/dictionary/management/commands/get_video_s3_acls.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 signbank/dictionary/management/commands/get_video_s3_acls.py diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py old mode 100755 new mode 100644 From 695b398702736635e74f977a8b8eacdca0c559aa Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:34:29 +1100 Subject: [PATCH 214/222] get_video_s3_acls -> Management Command --- .../management/commands/get_video_s3_acls.py | 107 +++++++++--------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py index ee6d70d8..9804759d 100644 --- a/signbank/dictionary/management/commands/get_video_s3_acls.py +++ b/signbank/dictionary/management/commands/get_video_s3_acls.py @@ -6,57 +6,25 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html # For some commands you need to run this in a venv that has all the right Python site-packages. 
-# TODO Convert this script to a Django Management Command +from django.core.management.base import BaseCommand import os import sys import subprocess -import argparse from uuid import uuid4 from pprint import pprint import boto3 import csv -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." -) -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--dumpnzsl", - default=False, - required=False, - action="store_true", - help=f"Dump raw NZSL database output", -) -parser.add_argument( - "--dumps3", - default=False, - required=False, - action="store_true", - help=f"Dump raw S3 keys output", -) -args = parser.parse_args() # Globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" S3_CLIENT = boto3.client("s3") S3_RESOURCE = boto3.resource("s3") +PGCLI = "/usr/bin/psql" +AWS_S3_BUCKET = "" def pg_cli(args_list): @@ -165,10 +133,10 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) +def get_s3_bucket_raw_keys_list(): + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) - s3_resource_bucket = S3_RESOURCE.Bucket(s3_bucket) + s3_resource_bucket = S3_RESOURCE.Bucket(AWS_S3_BUCKET) this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] @@ -321,19 +289,56 @@ def process_keys(this_all_keys_dict): out.writerow(build_csv_row(video_key, *dict_row)) -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +class Command(BaseCommand): + help = "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
+ + def add_arguments(self, parser): + parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", + ) + parser.add_argument( + "--pgcli", + default=PGCLI, + required=False, + help=f"Postgres client path (default: %(default)s)", + ) + parser.add_argument( + "--dumpnzsl", + default=False, + required=False, + action="store_true", + help=f"Dump raw NZSL database output", + ) + parser.add_argument( + "--dumps3", + default=False, + required=False, + action="store_true", + help=f"Dump raw S3 keys output", + ) + + def handle(self, *args, **options): + global PGCLI, AWS_S3_BUCKET + PGCLI = options["pgcli"] + AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}" + + print(f"Env: {options['env']}", file=sys.stderr) + print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) + print(f"PGCLI: {PGCLI}", file=sys.stderr) + print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.dumpnzsl: - pprint(get_nzsl_raw_keys_dict()) - exit() + if options["dumpnzsl"]: + pprint(get_nzsl_raw_keys_dict()) + exit() -if args.dumps3: - pprint(get_s3_bucket_raw_keys_list()) - exit() + if options["dumps3"]: + pprint(get_s3_bucket_raw_keys_list()) + exit() -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) + process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) + ) From 4d79d329d5855ec10cfa83883e6e9f135523d81d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:38:54 +1100 Subject: [PATCH 215/222] Comments --- .../management/commands/get_video_s3_acls.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py index 9804759d..f4273f97 100644 --- a/signbank/dictionary/management/commands/get_video_s3_acls.py +++ b/signbank/dictionary/management/commands/get_video_s3_acls.py @@ -5,7 +5,6 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. from django.core.management.base import BaseCommand import os @@ -290,8 +289,11 @@ def process_keys(this_all_keys_dict): class Command(BaseCommand): - help = "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." + help = ( + "Gets all S3 bucket video objects and recommends actions for them. " + "You must setup: (1) An AWS auth means, eg. AWS_PROFILE env var. " + "(2) Postgres access details, eg. DATABASE_URL env var." 
+    )
 
     def add_arguments(self, parser):
         parser.add_argument(
@@ -340,5 +342,7 @@ def handle(self, *args, **options):
             exit()
 
         process_keys(
-            create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list())
+            create_all_keys_dict(
+                get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
+            )
         )

From 4caa11a027a69c649fe4ca31d2216cfced30958b Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:52:06 +1100
Subject: [PATCH 216/222] Comments

---
 signbank/dictionary/management/commands/get_video_s3_acls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py
index f4273f97..fa23ebd7 100644
--- a/signbank/dictionary/management/commands/get_video_s3_acls.py
+++ b/signbank/dictionary/management/commands/get_video_s3_acls.py
@@ -188,8 +188,8 @@ def get_recommended_action(key_in_nzsl, key_in_s3):
     """
     Cases
     In S3   In NZSL   Action
-    Is      Not       Delete S3 Object
     Is      Is        Update ACL
+    Is      Not       Delete S3 Object
     Not     --        Review
     """
     if key_in_s3:

From 75e82cfb8f47974ff94c50adbc7564de168980e0 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:55:15 +1100
Subject: [PATCH 217/222] Comments

---
 signbank/dictionary/management/commands/get_video_s3_acls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py
index fa23ebd7..51396c01 100644
--- a/signbank/dictionary/management/commands/get_video_s3_acls.py
+++ b/signbank/dictionary/management/commands/get_video_s3_acls.py
@@ -290,7 +290,7 @@ class Command(BaseCommand):
     help = (
-        "Gets all S3 bucket video objects and recommends actions for them. "
+        "Get all S3 bucket video objects and recommend actions for them. "
         "You must setup: (1) An AWS auth means, eg. AWS_PROFILE env var. "
         "(2) Postgres access details, eg. DATABASE_URL env var."
) From 967daaf13c0de995b56c8ea5cb83a17b2e47491c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:23:36 +1100 Subject: [PATCH 218/222] Moved remaining commands --- .../dictionary/management/commands}/find-fixable-s3-orphans.py | 0 .../dictionary/management/commands}/repair-fixable-s3-orphans.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {bin => signbank/dictionary/management/commands}/find-fixable-s3-orphans.py (100%) rename {bin => signbank/dictionary/management/commands}/repair-fixable-s3-orphans.py (100%) diff --git a/bin/find-fixable-s3-orphans.py b/signbank/dictionary/management/commands/find-fixable-s3-orphans.py similarity index 100% rename from bin/find-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/find-fixable-s3-orphans.py diff --git a/bin/repair-fixable-s3-orphans.py b/signbank/dictionary/management/commands/repair-fixable-s3-orphans.py similarity index 100% rename from bin/repair-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/repair-fixable-s3-orphans.py From 2ae11cd46093b163ffbe45e93bfb4394e5eab79c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:24:48 +1100 Subject: [PATCH 219/222] Renamed --- ...-orphans.py => find_fixable_s3_orphans.py} | 73 +++++++++---------- ...rphans.py => repair_fixable_s3_orphans.py} | 0 2 files changed, 34 insertions(+), 39 deletions(-) rename signbank/dictionary/management/commands/{find-fixable-s3-orphans.py => find_fixable_s3_orphans.py} (84%) rename signbank/dictionary/management/commands/{repair-fixable-s3-orphans.py => repair_fixable_s3_orphans.py} (100%) diff --git a/signbank/dictionary/management/commands/find-fixable-s3-orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py similarity index 84% rename from signbank/dictionary/management/commands/find-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/find_fixable_s3_orphans.py index 2be5967f..308ac5e2 100755 --- a/signbank/dictionary/management/commands/find-fixable-s3-orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -1,7 +1,5 @@ #!/usr/bin/env -S python3 -u # -# This script needs to be run in a pyenv virtualenv with the Django project installed. -# # Finds orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. # Essentially finds one form of import error. # @@ -11,24 +9,15 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. 
-# TODO Convert this script to a Django Management Command +from django.core.management.base import BaseCommand import os import sys import subprocess -import argparse from uuid import uuid4 import boto3 import csv -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() from django.contrib.auth import get_user_model @@ -39,23 +28,6 @@ ) -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." -) -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -args = parser.parse_args() # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" @@ -72,8 +44,8 @@ # Other globals CSV_DELIMITER = "," DATABASE_URL = os.getenv("DATABASE_URL", "") -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +PGCLI = "/usr/bin/psql" +AWS_S3_BUCKET = "" # Hack to handle FULL JOIN # See get_nzsl_raw_keys_dict() @@ -186,11 +158,11 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) +def get_s3_bucket_raw_keys_list(): + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) s3_resource = boto3.resource("s3") - s3_resource_bucket = s3_resource.Bucket(s3_bucket) + s3_resource_bucket = s3_resource.Bucket(AWS_S3_BUCKET) this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] @@ -300,9 +272,32 @@ def find_orphans(): out.writerow([gloss_id, gloss.idgloss, str(gloss_public), test_key]) -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +class Command(BaseCommand): + help = ( "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
) + + def add_arguments(self, parser): + parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", + ) + parser.add_argument( + "--pgcli", + default=PGCLI, + required=False, + help=f"Postgres client path (default: %(default)s)", + ) + + def handle(self, *args, **options): + global PGCLI, AWS_S3_BUCKET + PGCLI = options["pgcli"] + AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}" + + print(f"Env: {options['env']}", file=sys.stderr) + print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) + print(f"PGCLI: {PGCLI}", file=sys.stderr) + print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -find_orphans() + find_orphans() diff --git a/signbank/dictionary/management/commands/repair-fixable-s3-orphans.py b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py similarity index 100% rename from signbank/dictionary/management/commands/repair-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/repair_fixable_s3_orphans.py From cbe56c8465684f54d4cf27426a74486a7a4477c4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:25:17 +1100 Subject: [PATCH 220/222] black --- .../management/commands/find_fixable_s3_orphans.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py index 308ac5e2..f11df71c 100755 --- a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -28,7 +28,6 @@ ) - # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" @@ -273,8 +272,10 @@ def find_orphans(): class Command(BaseCommand): - help = ( "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." ) + help = ( + "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." + ) def add_arguments(self, parser): parser.add_argument( From 1fb9978d26f4b345cec343fd4a46d3e51ea5ec41 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:31:05 +1100 Subject: [PATCH 221/222] find_fixable_s3_orphans.py -> Management Command --- .../management/commands/find_fixable_s3_orphans.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py index f11df71c..081e9622 100755 --- a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -17,15 +17,7 @@ from uuid import uuid4 import boto3 import csv - - -from django.contrib.auth import get_user_model - -User = get_user_model() - -from signbank.dictionary.models import ( - Gloss, -) +from signbank.dictionary.models import Gloss # Keep synced with other scripts @@ -273,6 +265,7 @@ def find_orphans(): class Command(BaseCommand): help = ( + "Find orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. " "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) From 4f1934a770aa0dafb2afa1bbc98228ae9d1809e5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:02:02 +1100 Subject: [PATCH 222/222] black and cleanups --- .../commands/find_fixable_s3_orphans.py | 5 +- .../commands/repair_fixable_s3_orphans.py | 119 +++++++++--------- 2 files changed, 58 insertions(+), 66 deletions(-) diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py index 081e9622..6fcc73c2 100755 --- a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -34,14 +34,11 @@ # Other globals CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = "/usr/bin/psql" AWS_S3_BUCKET = "" -# Hack to handle FULL JOIN -# See get_nzsl_raw_keys_dict() -FAKEKEY_PREFIX = "this_is_not_a_key_" - def pg_cli(args_list): try: diff --git a/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py index 32c76b96..06085051 100755 --- a/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py @@ -1,7 +1,5 @@ #!/usr/bin/env -S python3 -u # -# This script needs to be run in a pyenv virtualenv with the Django project installed. -# # Given a CSV file containing S3 objects that can be matched back to NZSL entries. # Updates the database to repair the NZSL entries. # Essentially repairs one form of import error. @@ -12,68 +10,19 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. -# TODO Convert this script to a Django Management Command +from django.core.management.base import BaseCommand import os import sys import subprocess -import argparse import csv - -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth import get_user_model - -User = get_user_model() - from signbank.dictionary.models import ( FieldChoice, Gloss, ) from signbank.video.models import GlossVideo - from django.core.exceptions import ObjectDoesNotExist -from django.db import models - - -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." 
-) -# Positional arguments -parser.add_argument("csv_filename", help="Name of CSV file, or '-' for STDIN") - -# Optional arguments -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--commit", - default=False, - required=False, - action="store_true", - help=f"Actually make changes, instead of just outputting what would happen (default)", -) -args = parser.parse_args() # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" @@ -91,8 +40,10 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +PGCLI = "/usr/bin/psql" +AWS_S3_BUCKET = "" +DO_COMMIT = False +CSV_INPUT_FILENAME = "-" def pg_cli(args_list): @@ -126,7 +77,7 @@ def process_csv(): field="video_type", english_name="main" ).first() - csv_rows = read_csv(args.csv_filename) + csv_rows = read_csv(CSV_INPUT_FILENAME) out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) @@ -162,7 +113,7 @@ def process_csv(): print(gloss) print(gloss_video) - if not args.commit: + if not DO_COMMIT: print("Dry run, no changes (use --commit flag to make changes)") continue @@ -172,10 +123,54 @@ def process_csv(): print(f"Error: could not create {gloss_video}") -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -print(f"Mode: {'Commit' if args.commit else 'Dry-run'}") +class Command(BaseCommand): + help = ( + f"Given a CSV file containing S3 objects that can be matched back to NZSL entries: " + f"Update the database to repair the NZSL entries. " + f"CSV Column headings {GLOBAL_COLUMN_HEADINGS}. " + f"You must have setup: An AWS auth means, eg. AWS_PROFILE env var. " + f"Postgres access details, eg. DATABASE_URL env var." + ) + + def add_arguments(self, parser): + # Positional arguments + parser.add_argument( + "csv_filename", help="Name of CSV input file, or '-' for STDIN" + ) + + # Optional arguments + parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", + ) + parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", + ) + parser.add_argument( + "--commit", + default=DO_COMMIT, + required=False, + action="store_true", + help=f"Actually make changes, instead of just outputting what would happen (default)", + ) -process_csv() + def handle(self, *args, **options): + global PGCLI, AWS_S3_BUCKET, CSV_INPUT_FILENAME, DO_COMMIT + PGCLI = options["pgcli"] + AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}" + CSV_INPUT_FILENAME = options["csv_filename"] + DO_COMMIT = options["commit"] + + print(f"Env: {options['env']}", file=sys.stderr) + print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) + print(f"PGCLI: {PGCLI}", file=sys.stderr) + print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + print(f"Input file: {options['csv_filename']}", file=sys.stderr) + print(f"Mode: {'Commit' if DO_COMMIT else 'Dry-run'}", file=sys.stderr) + + process_csv()
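(Aside: for reference, a hypothetical end-to-end run of the three management commands as they stand at the end of this series. Command names, flags, and the dry-run default are as defined in the patches above; the project's manage.py entry point, the file names, and the redirections are illustrative assumptions. DATABASE_URL and AWS_PROFILE must be set; status messages go to stderr and CSV rows to stdout, so the redirects capture only the CSV.)

    python manage.py get_video_s3_acls --env production > s3-acls.csv
    python manage.py find_fixable_s3_orphans --env production > orphans.csv
    python manage.py repair_fixable_s3_orphans orphans.csv --env production           # dry run (the default)
    python manage.py repair_fixable_s3_orphans orphans.csv --env production --commit  # actually repair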